From c9eebcb4541d961ab390f0bd0a22e2c89f1bcc78 Mon Sep 17 00:00:00 2001 From: Haohang Huang <31998628+symphonylyh@users.noreply.github.com> Date: Tue, 5 Aug 2025 00:47:41 -0700 Subject: [PATCH] [TRTLLM-6674][feat] (Breaking Change) Hopper SWA non-cyclic kernels + KV reuse + Spec Dec (#6379) Signed-off-by: Haohang Huang <31998628+symphonylyh@users.noreply.github.com> Signed-off-by: symphonylyh <31998628+symphonylyh@users.noreply.github.com> --- .../fmha_v2/src/fmha/gmem_tile_qkv_packed.h | 44 ------ cpp/kernels/fmha_v2/src/fmha/mask.h | 4 +- .../fmha_v2/src/fmha/warpspec/compute.h | 2 +- cpp/kernels/fmha_v2/src/fmha/warpspec/dma.h | 57 +------- .../fmha_v2/src/fmha/warpspec/epilogue.h | 2 +- .../fmha_v2/src/fused_multihead_attention.cpp | 2 +- ..._multihead_flash_attention_kernel_noloop.h | 4 +- ...head_flash_attention_kernel_noloop_tiled.h | 4 +- cpp/kernels/xqa/defines.h | 4 + cpp/kernels/xqa/mha.cu | 1 - cpp/kernels/xqa/mha_sm90.cu | 125 ++++++++++++------ cpp/kernels/xqa/test/refAttention.cpp | 9 ++ cpp/kernels/xqa/test/test.cpp | 12 +- cpp/kernels/xqa/utils.cuh | 11 ++ cpp/tensorrt_llm/common/attentionOp.cpp | 5 +- cpp/tensorrt_llm/common/attentionOp.h | 3 +- ...28_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp | 3 + ..._k_v_128_softcapping_tma_ws_sm90.cubin.cpp | 3 + ...6_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp | 3 + ...8_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp | 4 +- ...16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp | 4 +- ...q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp | 4 +- ...d_kv_128_softcapping_tma_ws_sm90.cubin.cpp | 4 +- ...128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp | 4 +- ..._128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp | 4 +- ...ntion_bf16_64_128_S_qkv_128_sm90.cubin.cpp | 4 +- ...4_128_S_qkv_128_softcapping_sm90.cubin.cpp | 4 +- ..._qkv_128_softcapping_tma_ws_sm90.cubin.cpp | 4 +- ...f16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp | 4 +- ..._bf16_64_32_S_q_paged_kv_64_sm86.cubin.cpp | 4 +- ...ention_bf16_64_32_S_qkv_128_sm89.cubin.cpp | 4 +- ...ention_bf16_64_32_S_qkv_128_sm90.cubin.cpp | 4 +- ...64_32_S_qkv_128_softcapping_sm90.cubin.cpp | 4 +- ...m3_64_256_S_q_kv_128_tma_ws_sm90.cubin.cpp | 4 +- ...q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp | 4 +- ...d_kv_128_softcapping_tma_ws_sm90.cubin.cpp | 4 +- ...256_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp | 4 +- ..._256_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp | 4 +- ...4_64_256_output_bf16_tma_ws_sm90.cubin.cpp | 4 +- ..._qkv_128_softcapping_tma_ws_sm90.cubin.cpp | 4 +- ...4m3_64_256_S_qkv_128_tma_ws_sm90.cubin.cpp | 4 +- ...e4m3_fp32_128_128_S_q_kv_32_sm89.cubin.cpp | 4 +- ...e4m3_fp32_128_128_S_q_kv_64_sm89.cubin.cpp | 4 +- ...p32_128_128_S_q_paged_kv_32_sm89.cubin.cpp | 4 +- ...p32_128_128_S_q_paged_kv_40_sm89.cubin.cpp | 4 +- ...p32_128_128_S_q_paged_kv_48_sm89.cubin.cpp | 4 +- ...p32_128_128_S_q_paged_kv_64_sm89.cubin.cpp | 4 +- ..._e4m3_fp32_128_128_S_qkv_32_sm89.cubin.cpp | 4 +- ..._e4m3_fp32_128_128_S_qkv_40_sm89.cubin.cpp | 4 +- ..._e4m3_fp32_128_128_S_qkv_48_sm89.cubin.cpp | 4 +- ..._e4m3_fp32_128_128_S_qkv_64_sm89.cubin.cpp | 4 +- ..._e4m3_fp32_64_32_S_q_kv_128_sm89.cubin.cpp | 4 +- ...n_e4m3_fp32_64_32_S_q_kv_72_sm89.cubin.cpp | 4 +- ...fp32_64_32_S_q_paged_kv_104_sm89.cubin.cpp | 4 +- ...fp32_64_32_S_q_paged_kv_128_sm89.cubin.cpp | 4 +- ...fp32_64_32_S_q_paged_kv_160_sm89.cubin.cpp | 4 +- ..._q_paged_kv_192_output_bf16_sm89.cubin.cpp | 4 +- ...fp32_64_32_S_q_paged_kv_192_sm89.cubin.cpp | 4 +- ...fp32_64_32_S_q_paged_kv_256_sm89.cubin.cpp | 4 +- ..._fp32_64_32_S_q_paged_kv_72_sm89.cubin.cpp | 4 +- 
..._fp32_64_32_S_q_paged_kv_80_sm89.cubin.cpp | 4 +- ..._fp32_64_32_S_q_paged_kv_96_sm89.cubin.cpp | 4 +- ...n_e4m3_fp32_64_32_S_qkv_104_sm89.cubin.cpp | 4 +- ...8_sage_64_32_32_output_bf16_sm89.cubin.cpp | 4 +- ...8_sage_64_32_32_output_fp16_sm89.cubin.cpp | 4 +- ...n_e4m3_fp32_64_32_S_qkv_128_sm89.cubin.cpp | 4 +- ...n_e4m3_fp32_64_32_S_qkv_160_sm89.cubin.cpp | 4 +- ...64_32_S_qkv_192_output_bf16_sm89.cubin.cpp | 4 +- ...n_e4m3_fp32_64_32_S_qkv_192_sm89.cubin.cpp | 4 +- ...n_e4m3_fp32_64_32_S_qkv_256_sm89.cubin.cpp | 4 +- ...on_e4m3_fp32_64_32_S_qkv_72_sm89.cubin.cpp | 4 +- ...0_sage_64_32_32_output_bf16_sm89.cubin.cpp | 4 +- ...0_sage_64_32_32_output_fp16_sm89.cubin.cpp | 4 +- ...on_e4m3_fp32_64_32_S_qkv_80_sm89.cubin.cpp | 4 +- ...on_e4m3_fp32_64_32_S_qkv_96_sm89.cubin.cpp | 4 +- ...aged_kv_192x128_output_bf16_sm89.cubin.cpp | 4 +- ..._64_64_S_q_paged_kv_192x128_sm89.cubin.cpp | 4 +- ...aged_kv_576x512_output_bf16_sm89.cubin.cpp | 4 +- ..._64_64_S_q_paged_kv_576x512_sm89.cubin.cpp | 4 +- ...4_S_qkv_192x128_output_bf16_sm89.cubin.cpp | 4 +- ...m3_fp32_64_64_S_qkv_192x128_sm89.cubin.cpp | 4 +- ...p16_128_128_S_q_paged_kv_64_sm80.cubin.cpp | 4 +- ...28_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp | 3 + ..._k_v_128_softcapping_tma_ws_sm90.cubin.cpp | 3 + ...6_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp | 3 + ...8_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp | 4 +- ...16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp | 4 +- ...q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp | 4 +- ...p16_64_128_S_q_paged_kv_128_sm80.cubin.cpp | 4 +- ...d_kv_128_softcapping_tma_ws_sm90.cubin.cpp | 4 +- ...128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp | 4 +- ..._128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp | 4 +- ...ntion_fp16_64_128_S_qkv_128_sm90.cubin.cpp | 4 +- ...4_128_S_qkv_128_softcapping_sm90.cubin.cpp | 4 +- ..._qkv_128_softcapping_tma_ws_sm90.cubin.cpp | 4 +- ...p16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp | 4 +- ...ention_fp16_64_32_S_qkv_128_sm90.cubin.cpp | 4 +- ...64_32_S_qkv_128_softcapping_sm90.cubin.cpp | 4 +- ...28_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp | 3 + ..._k_v_128_softcapping_tma_ws_sm90.cubin.cpp | 3 + ...2_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp | 3 + ...8_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp | 4 +- ...32_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp | 4 +- ...q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp | 4 +- ...d_kv_128_softcapping_tma_ws_sm90.cubin.cpp | 4 +- ...128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp | 4 +- ..._128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp | 4 +- ..._fp16_fp32_64_128_S_qkv_128_sm90.cubin.cpp | 4 +- ...4_128_S_qkv_128_softcapping_sm90.cubin.cpp | 4 +- ..._qkv_128_softcapping_tma_ws_sm90.cubin.cpp | 4 +- ...p32_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp | 4 +- ...n_fp16_fp32_64_32_S_qkv_128_sm90.cubin.cpp | 4 +- ...64_32_S_qkv_128_softcapping_sm90.cubin.cpp | 4 +- .../fmhaRunner.cpp | 15 --- .../decoderMaskedMultiheadAttentionTemplate.h | 46 +++---- .../decoderXQAImplCommon.h | 16 +-- .../decoderXQAImplJIT/compileEngine.cpp | 3 +- .../decoderXQAImplJIT/decoderXQAImplJIT.cpp | 10 +- .../decoderXQAImplJIT/kernelUtils.cpp | 6 - .../nvrtcWrapper/include/nvrtcWrapper.h | 3 + .../nvrtcWrapper/src/nvrtcWrapper.cpp | 3 +- .../decoderXQARunner.cpp | 2 +- .../xqaParams.h | 2 + .../unfusedAttentionKernels_2_template.h | 60 ++------- cpp/tensorrt_llm/thop/attentionOp.cpp | 5 +- docker/Makefile | 2 + docker/README.md | 5 + .../_torch/attention_backend/flashinfer.py | 5 + .../_torch/attention_backend/trtllm.py | 7 +- .../_torch/attention_backend/vanilla.py | 4 +- tensorrt_llm/_torch/models/modeling_gemma3.py | 2 +- 
tensorrt_llm/_torch/models/modeling_llama.py | 4 + tensorrt_llm/commands/build.py | 4 +- .../defs/accuracy/test_cli_flow.py | 4 + .../accuracy/test_disaggregated_serving.py | 10 +- .../defs/accuracy/test_llm_api_pytorch.py | 29 ++-- tests/integration/test_lists/waives.txt | 1 + tests/unittest/llmapi/test_llm.py | 1 + .../trt/attention/test_gpt_attention.py | 5 + .../trt/attention/test_gpt_attention_IFB.py | 6 + 140 files changed, 461 insertions(+), 467 deletions(-) create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp diff --git a/cpp/kernels/fmha_v2/src/fmha/gmem_tile_qkv_packed.h b/cpp/kernels/fmha_v2/src/fmha/gmem_tile_qkv_packed.h index 7e05ef3caf..8f54c52b0b 100644 --- a/cpp/kernels/fmha_v2/src/fmha/gmem_tile_qkv_packed.h +++ b/cpp/kernels/fmha_v2/src/fmha/gmem_tile_qkv_packed.h @@ -796,7 +796,6 @@ struct Gmem_tile_contiguous_kv template inline __device__ void load(Smem_tile& smem_tile) { - // TODO(perkzz): add remap_kv_row for sliding window attention. uint32_t preds[LDGS]; #pragma unroll for (int ii = 0; ii < LDGS; ++ii) @@ -1091,42 +1090,6 @@ struct Gmem_tile_paged_kv } } - //////////////////////////////////////////////////////////////////////////////////////////////////// - // Remap the row to the one in cyclic kv cache. - inline __device__ void remap_kv_row(int& row) - { - // Sliding window attention + chunked context needs special handling. - if constexpr (SLIDING_WINDOW_ATTENTION) - { - // For chunked context (i.e. separate q and kv layout), the kv cache might be overwritten - // after last chunk is processed. - // To deal with this issue, the new tokens' kv will be appended to the kv cache first, and - // overwrite the kv cache after FMHA is done. - // The kv input layout is like: [cyclic kv cache] + [new tokens' kv]. - // There are two possible cases: - // 1. The kv cache hasn't been overwritten while processing previous chunks, so we can take - // it normally, where we have full kv cache. - // 2. The kv cache has been overwritten while processing previous chunks. we need to mask - // out the tokens in the kv cache based on the sliding window size. It needs to track the - // last kv cache token's position in a circular way. 
- - // Remap the kv row when kv cache has been overwritten in a circular way. - if (past_seqlen_ > sliding_window_size_) - { - // Map the kv row to the new tokens' kv. - if (row >= past_seqlen_) - { - row = sliding_window_size_ + (row - past_seqlen_); - } - else - { - // Map the kv row to the cyclic kv cache. - row = row % sliding_window_size_; - } - } - } - } - // Load data from memory. template inline __device__ void load(Smem_tile& smem_tile) @@ -1144,13 +1107,6 @@ struct Gmem_tile_paged_kv for (int ii = 0; ii < LDGS; ++ii) { int row_idx = row_ + ii * (int) ROWS_PER_LDG; - - // Remap row_idx if sliding window attention is used. - // This will be removed later as the remapping will be handled by the kvCacheManger in TRTLLM. -#ifdef GENERATE_CUBIN - remap_kv_row(row_idx); -#endif - int paged_kv_block_idx = (row_idx >> paged_kv_log2_block_size_); char const* local_kv_ptr = reinterpret_cast(paged_kv_block_pool_ptr_ + params_kv_block_size_in_bytes_ * paged_kv_global_block_offsets_[paged_kv_block_idx]); diff --git a/cpp/kernels/fmha_v2/src/fmha/mask.h b/cpp/kernels/fmha_v2/src/fmha/mask.h index acb7aece48..f55ff0dfe3 100644 --- a/cpp/kernels/fmha_v2/src/fmha/mask.h +++ b/cpp/kernels/fmha_v2/src/fmha/mask.h @@ -478,7 +478,7 @@ struct Mask : public Mask inline __device__ bool is_valid(int row, int col) const { // Is it a valid position in the sequence, i.e. are we in the lower triangle? - return (row >= col) && (col >= max(0, row - sliding_window_size_)); + return (row >= col) && (col >= max(0, row + 1 - sliding_window_size_)); } // The sliding window size. @@ -946,7 +946,7 @@ struct Mask_hopper : public Mask_hopper= max(0, row - sliding_window_size_); + return col <= row && col >= max(0, row + 1 - sliding_window_size_); } // The sliding window size for attention. diff --git a/cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h b/cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h index b95316e184..65e56dbf5d 100644 --- a/cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h +++ b/cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h @@ -288,7 +288,7 @@ struct Compute // The kv_left_mask_end is the start of the chunk. kv_left_mask_end = div_up(is_chunked_attention ? ((tile_offset_end >> params.log2_chunked_attention_size) << params.log2_chunked_attention_size) - : (tile_offset_end - params.sliding_window_size), + : (tile_offset_end + 1 - params.sliding_window_size), STEP_KV); } diff --git a/cpp/kernels/fmha_v2/src/fmha/warpspec/dma.h b/cpp/kernels/fmha_v2/src/fmha/warpspec/dma.h index 42d766bfc9..12e73bedf1 100644 --- a/cpp/kernels/fmha_v2/src/fmha/warpspec/dma.h +++ b/cpp/kernels/fmha_v2/src/fmha/warpspec/dma.h @@ -199,7 +199,7 @@ struct DMA // The kv_offset_start. int kv_offset_start = is_chunked_attention ? ((q_step_offset >> params.log2_chunked_attention_size) << params.log2_chunked_attention_size) - : max(0, q_step_offset - params.sliding_window_size); + : max(0, q_step_offset + 1 - params.sliding_window_size); kv_idx_start = kv_offset_start / STEP_KV; } @@ -388,51 +388,6 @@ struct DMA elect_one_, {-1, -1, -1, -1, -1, -1, -1, -1}); } - // Calculate the start tile idx. - inline __device__ int remap_kv_tile_idx( - int kv_tile_idx, int num_kv_cache_tiles, int past_kv_length, int sliding_window_size) - { - - // The remapped kv tile idx. - int remapped_kv_tile_idx = kv_tile_idx; - // This will be removed later as the remapping will be handled by the kvCacheManger in TRTLLM. -#ifdef GENERATE_CUBIN - // Sliding window attention + chunked context needs special handling. 
- if constexpr (SLIDING_OR_CHUNKED_ATTENTION) - { - // For chunked context (i.e. separate q and kv layout), the kv cache might be - // overwritten after last chunk is processed. - // To deal with this issue, the new tokens' kv will be appended to the kv cache first, - // and overwrite the kv cache after FMHA is done. - // The kv input layout is like: [cyclic kv cache] + [new tokens' kv]. - // There are two possible cases: - // 1. The kv cache hasn't been overwritten while processing previous chunks, so we can - // take it normally, where we have full kv cache. - // 2. The kv cache has been overwritten while processing previous chunks. we need to - // mask out the tokens in the kv cache based on the sliding window size. It needs - // to track the last kv cache token's position in a circular way. - - // Remap the kv tile index when kv cache has been overwritten in a circular way. - if (past_kv_length > sliding_window_size) - { - // Map the kv tile index to the new tokens' kv. - if (kv_tile_idx * STEP_KV >= past_kv_length) - { - remapped_kv_tile_idx - = num_kv_cache_tiles + int((kv_tile_idx * STEP_KV - past_kv_length) / STEP_KV); - } - else - { - // Map the kv tile index to the cyclic kv cache. - remapped_kv_tile_idx = kv_tile_idx % num_kv_cache_tiles; - } - } - } -#endif - // Return the remapped kv tile idx. - return remapped_kv_tile_idx; - } - // Support contiguous Q + contiguous/paged KV separate cache. inline __device__ void run_separate_q_and_kv( bert::Fused_multihead_attention_params_v2 const& params, Shared* shared) @@ -560,24 +515,20 @@ struct DMA // Iterate over the kv tiles for this q step. for (int kv_step_idx = kv_idx_start; kv_step_idx < kv_idx_end; kv_step_idx++) { - // Remap the kv tile idx if sliding window attention is enabled. - // Sliding_window_size should be multiple of STEP_KV. - int remapped_kv_step_idx = remap_kv_tile_idx(kv_step_idx, params.sliding_window_size / STEP_KV, - past_kv_length, params.sliding_window_size); // The barrier id. int bar_id; // Load paged kv input. if constexpr (PAGED_KV_INPUT) { - bar_id = load_paged_kv(bidh_kv, remapped_kv_step_idx * STEP_KV, num_valid_kv_blocks, + bar_id = load_paged_kv(bidh_kv, kv_step_idx * STEP_KV, num_valid_kv_blocks, params.paged_kv_cache.mTokensPerBlockLog2, params.blocks_per_tma_load, params.blocks_per_tma_load_log2, params.paged_kv_cache.mMaxBlocksPerSeq, paged_block_offsets, desc_k, desc_v, shared, cbw_k, cbw_v, cbw_v_scratch); } else { - bar_id = load_kv(bidh_kv, remapped_kv_step_idx * STEP_KV, desc_k, desc_v, shared, cbw_k, - cbw_v, cbw_v_scratch); + bar_id = load_kv( + bidh_kv, kv_step_idx * STEP_KV, desc_k, desc_v, shared, cbw_k, cbw_v, cbw_v_scratch); } // Opportunistically hide headinfo in the shadow of UTMALDGs of the QKV tensor diff --git a/cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h b/cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h index 3e32efed59..217e8c0872 100644 --- a/cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h +++ b/cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h @@ -134,7 +134,7 @@ struct Softmax_base else { // The sliding window start is the max of 0 and row - sliding_window_size. 
- return max(0, row - sliding_window_size_); + return max(0, row + 1 - sliding_window_size_); } } diff --git a/cpp/kernels/fmha_v2/src/fused_multihead_attention.cpp b/cpp/kernels/fmha_v2/src/fused_multihead_attention.cpp index e2640241db..6d9811ac07 100644 --- a/cpp/kernels/fmha_v2/src/fused_multihead_attention.cpp +++ b/cpp/kernels/fmha_v2/src/fused_multihead_attention.cpp @@ -1578,7 +1578,7 @@ int main(int argc, char** argv) } else { - valid = valid && (si >= std::max(int(so - sliding_window_size), 0)); + valid = valid && (si >= std::max(int(so + 1 - sliding_window_size), 0)); } } if (is_mtp) diff --git a/cpp/kernels/fmha_v2/src/fused_multihead_flash_attention_kernel_noloop.h b/cpp/kernels/fmha_v2/src/fused_multihead_flash_attention_kernel_noloop.h index a77bb34c1d..cb611f62d5 100644 --- a/cpp/kernels/fmha_v2/src/fused_multihead_flash_attention_kernel_noloop.h +++ b/cpp/kernels/fmha_v2/src/fused_multihead_flash_attention_kernel_noloop.h @@ -175,10 +175,10 @@ inline __device__ void device_flash_attention_nl(Params const& params) int const kv_loop_end = ((valid_seqlen + Cta_tile_p::N - 1) / Cta_tile_p::N) * Cta_tile_p::N; int const kv_loop_start = mask_sliding_window - ? (max(0, q_sequence_start - params.sliding_window_size) / Cta_tile_p::N) * Cta_tile_p::N + ? (max(0, q_sequence_start + 1 - params.sliding_window_size) / Cta_tile_p::N) * Cta_tile_p::N : 0; int const sliding_window_mask_end = mask_sliding_window - ? (max(0, q_sequence_start + Cta_tile_p::M - 1 - params.sliding_window_size) / Cta_tile_p::N) * Cta_tile_p::N + ? (max(0, q_sequence_start + Cta_tile_p::M - params.sliding_window_size) / Cta_tile_p::N) * Cta_tile_p::N : 0; static_assert(Cta_tile_p::M >= Cta_tile_p::N, ""); diff --git a/cpp/kernels/fmha_v2/src/fused_multihead_flash_attention_kernel_noloop_tiled.h b/cpp/kernels/fmha_v2/src/fused_multihead_flash_attention_kernel_noloop_tiled.h index 5f6dc79f93..e945e2885c 100644 --- a/cpp/kernels/fmha_v2/src/fused_multihead_flash_attention_kernel_noloop_tiled.h +++ b/cpp/kernels/fmha_v2/src/fused_multihead_flash_attention_kernel_noloop_tiled.h @@ -176,10 +176,10 @@ inline __device__ void device_flash_attention_nl_tiled(Params const& params) int const kv_loop_end = ((valid_seqlen + Cta_tile_p::N - 1) / Cta_tile_p::N) * Cta_tile_p::N; int const kv_loop_start = mask_sliding_window - ? (max(0, q_sequence_start - params.sliding_window_size) / Cta_tile_p::N) * Cta_tile_p::N + ? (max(0, q_sequence_start + 1 - params.sliding_window_size) / Cta_tile_p::N) * Cta_tile_p::N : 0; int const sliding_window_mask_end = mask_sliding_window - ? (max(0, q_sequence_start + Cta_tile_p::M - 1 - params.sliding_window_size) / Cta_tile_p::N) * Cta_tile_p::N + ? (max(0, q_sequence_start + Cta_tile_p::M - params.sliding_window_size) / Cta_tile_p::N) * Cta_tile_p::N : 0; // Move K and V tiles. 
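The fmha_v2 hunks above all make the same off-by-one correction: the sliding-window lower bound moves from row - sliding_window_size to row + 1 - sliding_window_size, so a window of size W now admits exactly W keys ending at (and including) the current token, rather than W + 1. Below is a minimal standalone sketch of the predicate these hunks converge on, assuming illustrative names (isValidSwa, slidingWindowSize) rather than the kernels' own masks and tile math.

#include <algorithm>
#include <cassert>

// Causal sliding-window validity: a query at position `row` may attend to a
// key at position `col` iff col <= row and col falls within the last
// `slidingWindowSize` positions ending at row. The "+ 1" is what limits the
// window to exactly slidingWindowSize keys.
bool isValidSwa(int row, int col, int slidingWindowSize)
{
    int const winBeg = std::max(0, row + 1 - slidingWindowSize);
    return row >= col && col >= winBeg;
}

int main()
{
    // With a window of 4, token 10 attends to keys 7..10 (4 keys), not 6..10.
    assert(isValidSwa(10, 7, 4));
    assert(!isValidSwa(10, 6, 4));
    // Early rows clamp the lower bound at 0, so short prefixes are unaffected.
    assert(isValidSwa(2, 0, 4));
    return 0;
}
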
diff --git a/cpp/kernels/xqa/defines.h b/cpp/kernels/xqa/defines.h index 5a983fcc60..6b536f63a3 100644 --- a/cpp/kernels/xqa/defines.h +++ b/cpp/kernels/xqa/defines.h @@ -162,6 +162,10 @@ static_assert(CACHE_ELEM_ENUM != 0); #define OPTIMIZE_FOR_LATENCY 1 #endif +#ifndef IS_SPEC_DEC_TREE +#define IS_SPEC_DEC_TREE 1 // by default, SPEC_DEC expects a tree-based draft token structure +#endif + #define DBG_BATCH_SIZE 2 #define DBG_SEQ_LEN 256 * 4 + 3 #define DBG_NB_CTAS_PER_SEQ 8 diff --git a/cpp/kernels/xqa/mha.cu b/cpp/kernels/xqa/mha.cu index 1acdb9852f..b0ae663c78 100644 --- a/cpp/kernels/xqa/mha.cu +++ b/cpp/kernels/xqa/mha.cu @@ -1592,7 +1592,6 @@ CUBIN_EXPORT __global__ #endif uint32_t const cacheSeqLen = getCacheSeqLen(cacheList, idxReq); - static_assert(!(allowSlidingWindow && useSpecDec), "Sliding window is not yet supported in spec-dec mode"); #if SLIDING_WINDOW bool const rtIsReallySliding = (cacheSeqLen > slidingWinSize); uint32_t const nbTotalSkipTokens = rtIsReallySliding ? cacheSeqLen - slidingWinSize : 0; #else diff --git a/cpp/kernels/xqa/mha_sm90.cu b/cpp/kernels/xqa/mha_sm90.cu index e21714c0c5..276c7185cf 100644 --- a/cpp/kernels/xqa/mha_sm90.cu +++ b/cpp/kernels/xqa/mha_sm90.cu @@ -62,7 +62,7 @@ constexpr uint32_t gemm0NbThrds = gmmaWarpGrpSize * gemm0NbGmmaGrps; constexpr uint32_t gemm0NbWarps = gmmaWarpsPerGrp * gemm0NbGmmaGrps; #if SPEC_DEC && !SWAP_AB inline constexpr uint32_t ctaNbQHeads = Q_HEADS_PER_CTA; -inline constexpr uint32_t inputTokensPerCta = exactDiv(ctaNbQHeads, headGrpSize); +inline constexpr uint32_t inputTokensPerCta = ctaNbQHeads / headGrpSize; constexpr uint32_t ctaNbValidQHeads = ctaNbQHeads; #elif SPEC_DEC && SWAP_AB inline constexpr uint32_t inputTokensPerCta = specDecQLen; @@ -347,21 +347,19 @@ __device__ inline uint32_t getInputTokOffset(SpecDecParams const& params, uint32 return (params.qCuSeqLens == nullptr) ?
params.qSeqLen * idxReq : params.qCuSeqLens[idxReq]; } -static_assert(!allowSlidingWindow, "SpecDec is not supported for sliding window"); - struct SpecDec { static inline constexpr uint32_t tileSize = gemm0CtaTileNbTokens; - static inline constexpr uint32_t ctaMaxQSeqLen = exactDiv(ctaNbQHeads, headGrpSize); + static inline constexpr uint32_t ctaMaxQSeqLen = (ctaNbQHeads / headGrpSize); using TileMaskRow = Vec; - __device__ inline SpecDec(SpecDecParams const& params, uint32_t idxReq, uint32_t ctaIdxY, uint32_t seqLen) + __device__ inline SpecDec(SpecDecParams const& params, uint32_t idxReq, uint32_t idxInputSubSeq, uint32_t seqLen) : params(params) - , ctaIdxY(ctaIdxY) + , idxInputSubSeq(idxInputSubSeq) , seqLen(seqLen) { inputSeqLen = getInputSeqLen(params, idxReq); - baseOffset = divUp(params.qSeqLen, 32U) * (getInputTokOffset(params, idxReq) + ctaMaxQSeqLen * ctaIdxY); + baseOffset = divUp(params.qSeqLen, 32U) * (getInputTokOffset(params, idxReq) + ctaMaxQSeqLen * idxInputSubSeq); } __device__ inline uint32_t unmaskedSeqLen() const @@ -371,8 +369,8 @@ struct SpecDec __device__ inline bool needMask(uint32_t idxTile, uint32_t idxQTokInCta) const { - return tileSize * (idxTile + 1) > unmaskedSeqLen() && ctaMaxQSeqLen * ctaIdxY + idxQTokInCta < inputSeqLen - && params.mask != nullptr; + return tileSize * (idxTile + 1) > unmaskedSeqLen() + && ctaMaxQSeqLen * idxInputSubSeq + idxQTokInCta < inputSeqLen && params.mask != nullptr; } __device__ inline int32_t maskColBeg(uint32_t idxTile) const @@ -408,14 +406,17 @@ struct SpecDec } SpecDecParams const& params; - uint32_t const ctaIdxY; + uint32_t const idxInputSubSeq; uint32_t const seqLen; uint32_t inputSeqLen; uint32_t baseOffset; }; -__device__ void warpGrpApplyMask( - Gemm0Acc& acc, SpecDec const& specDec, uint32_t cacheSeqLen, uint32_t idxTile, uint32_t warpRank); +__device__ void warpGrpApplyMask(Gemm0Acc& acc, SpecDec const& specDec, +#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE + int32_t tok0WinBeg, +#endif + uint32_t cacheSeqLen, uint32_t idxTile, uint32_t warpRank); #endif #if SWAP_AB @@ -684,9 +685,31 @@ CUBIN_EXPORT __global__ uint32_t const cacheSeqLen = getCacheSeqLen(cacheList, idxReq); static_assert(gemm0CtaTileNbTokens == gemm1CtaTileNbTokens); constexpr uint32_t tileSize = gemm0CtaTileNbTokens; - static_assert(!(allowSlidingWindow && useSpecDec), "Sliding window is not yet supported in spec-dec mode"); -#if SLIDING_WINDOW +#if SPEC_DEC + uint32_t const idxInputSubSeq = blockIdx.x; + uint32_t const inputSeqLen = reqInputTokEnd - reqInputTokBeg; + uint32_t const ctaTokOffset = inputTokensPerCta * idxInputSubSeq; + uint32_t const ctaNbValidTokens = mha::min(uint32_t{inputTokensPerCta}, inputSeqLen - ctaTokOffset); + + if (ctaTokOffset >= inputSeqLen) + { + return; + } +#else + uint32_t const idxInputSubSeq = 0; + uint32_t const inputSeqLen = 1; + uint32_t const ctaTokOffset = 0; + uint32_t const ctaNbValidTokens = 1; +#endif +#if SLIDING_WINDOW && SPEC_DEC && !IS_SPEC_DEC_TREE + // get the actual start position depending on ctaTokOffset, which is the draft token position per CTA + uint32_t const tok0SeqLen = cacheSeqLen - inputSeqLen + 1 + ctaTokOffset; + int32_t const tok0WinBeg = int32_t(tok0SeqLen) - int32_t(slidingWinSize); + uint32_t const nbTotalSkipTokens = mha::max(0, tok0WinBeg); +#elif SLIDING_WINDOW bool const rtIsReallySliding = (cacheSeqLen > slidingWinSize); + // if SPEC_DEC && SLIDING_WINDOW && IS_SPEC_DEC_TREE, it should not do sliding + assert(!SPEC_DEC || !rtIsReallySliding); uint32_t const nbTotalSkipTokens = 
rtIsReallySliding ? cacheSeqLen - slidingWinSize : 0; #else constexpr bool rtIsReallySliding = false; @@ -720,21 +743,6 @@ CUBIN_EXPORT __global__ { return; } -#if SPEC_DEC - uint32_t const idxInputSubSeq = blockIdx.x; - uint32_t const inputSeqLen = reqInputTokEnd - reqInputTokBeg; - uint32_t const ctaTokOffset = inputTokensPerCta * idxInputSubSeq; - uint32_t const ctaNbValidTokens = mha::min(uint32_t{inputTokensPerCta}, inputSeqLen - ctaTokOffset); - if (ctaTokOffset >= inputSeqLen) - { - return; - } -#else - uint32_t const idxInputSubSeq = 0; - uint32_t const inputSeqLen = 1; - uint32_t const ctaTokOffset = 0; - uint32_t const ctaNbValidTokens = 1; -#endif uint32_t const ctaInputTokBeg = reqInputTokBeg + ctaTokOffset; auto const warpIdx = getWarpIdx(uint3{128, 1, 3}); auto const wid = warpIdx.z * 4 + warpIdx.x; @@ -886,10 +894,13 @@ CUBIN_EXPORT __global__ #endif // apply qkScale acc = acc * qkScale; - // apply mask #if SPEC_DEC - warpGrpApplyMask(acc, specDec, cacheSeqLen, idxKTile, warpRank); + warpGrpApplyMask(acc, specDec, +#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE + tok0WinBeg, +#endif + cacheSeqLen, idxKTile, warpRank); #else bool const isFirstTile = (idxKTile == nbSkipLeadingTiles); bool const needMaskLeading = (rtIsReallySliding && isFirstTile && tile0NbSkipTokens > 0); @@ -1342,7 +1353,6 @@ CUBIN_EXPORT __global__ kTilePartLoader.loadPages(idxKTile); #if USE_INPUT_KV || ENABLE_PDL == 2 #if SPEC_DEC - static_assert(SLIDING_WINDOW == 0); bool const anyNewTokens = (gemm0CtaTileNbTokens * (idxKTile + 1) > cacheSeqLen - inputSeqLen); #else bool const anyNewTokens = (gemm0CtaTileNbTokens * (idxKTile + 1) >= cacheSeqLen); @@ -1411,7 +1421,6 @@ CUBIN_EXPORT __global__ vTileLoader.loadPages(idxVTile); #if USE_INPUT_KV || ENABLE_PDL == 2 #if SPEC_DEC - static_assert(SLIDING_WINDOW == 0); bool const anyNewTokens = (gemm0CtaTileNbTokens * (idxVTile + 1) > cacheSeqLen - inputSeqLen); #else bool const anyNewTokens = (gemm0CtaTileNbTokens * (idxVTile + 1) >= cacheSeqLen); @@ -1838,8 +1847,11 @@ __device__ inline GMemKVCacheHead& KVTilePartLoader::getHead(uint32_t pos) #if SWAP_AB #if SPEC_DEC -__device__ inline void warpGrpApplyMask( - Gemm0Acc& acc, SpecDec const& specDec, uint32_t cacheSeqLen, uint32_t idxTile, uint32_t warpRank) +__device__ inline void warpGrpApplyMask(Gemm0Acc& acc, SpecDec const& specDec, +#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE + int32_t tok0WinBeg, +#endif + uint32_t cacheSeqLen, uint32_t idxTile, uint32_t warpRank) { constexpr uint32_t tileSize = gemm0CtaTileNbTokens; static_assert(SPEC_Q_SEQ_LEN <= sizeof(MaskType) * 8, "not implemented"); @@ -2215,22 +2227,40 @@ __device__ inline RegRowWiseVec computeWarpGrpRowMax_sync( } #if SPEC_DEC -__device__ inline void warpGrpApplyMask( - Gemm0Acc& acc, SpecDec const& specDec, uint32_t cacheSeqLen, uint32_t idxTile, uint32_t warpRank) +__device__ inline void warpGrpApplyMask(Gemm0Acc& acc, SpecDec const& specDec, +#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE + int32_t tok0WinBeg, +#endif + uint32_t cacheSeqLen, uint32_t idxTile, uint32_t warpRank) { - static_assert(!SLIDING_WINDOW, "SpecDec is not supported for sliding window"); constexpr uint32_t tileSize = gemm0CtaTileNbTokens; + auto const inputSeqLen = specDec.inputSeqLen; + auto const idxInputSubSeq = specDec.idxInputSubSeq; + constexpr uint64_t fullMask = ~uint64_t{0}; + static_assert(tileSize == sizeof(fullMask) * 8); +#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE + uint32_t const ctaTokOffset = inputTokensPerCta * idxInputSubSeq; + Range const tileRange = {tileSize * idxTile, 
tileSize * idxTile + tileSize}; + Range const maxMaskOutRange = {0, mha::max(0, tok0WinBeg) + (inputTokensPerCta - 1)}; + bool const ctaNeedBegMask = tileRange.beg < maxMaskOutRange.end; + assert(ctaNeedBegMask == overlap(tileRange, maxMaskOutRange)); + int32_t const tok0NbMaskOut = int32_t(tok0WinBeg) - int32_t(tileSize * idxTile); +#else + constexpr bool ctaNeedBegMask = false; + uint64_t const begMask = fullMask; + int32_t const tok0NbMaskOut = -2147483648; +#endif uint32_t const offset = tileSize * idxTile; uint32_t const nbValidCols = mha::min(offset < cacheSeqLen ? cacheSeqLen - offset : 0U, tileSize); bool const ctaNeedEndMask = (nbValidCols < tileSize); bool const ctaNeedSpecDecMask = specDec.needMask(idxTile, 0); - bool const needMask = ctaNeedEndMask || ctaNeedSpecDecMask; + bool const needMask = ctaNeedBegMask || ctaNeedEndMask || ctaNeedSpecDecMask; if (!needMask) { return; } static_assert(tileSize == 64, "not implemented"); - auto const endMask = (~uint64_t{0} >> (tileSize - nbValidCols)); + auto const endMask = fullMask >> (tileSize - nbValidCols); uint32_t const idxInQuad = laneId() % 4; uint32_t const idxQuad = laneId() / 4; @@ -2241,10 +2271,19 @@ __device__ inline void warpGrpApplyMask( for (uint32_t i = 0; i < GmmaAccCoreMat::rows; i++) { uint32_t const row = gmma::instM * m + gmma::instM / 4 * warpRank + 8 * i + idxQuad; - auto const specDecMask = specDec.needMask(idxTile, row / headGrpSize) - ? specDec.loadTileMaskRow(idxTile, row / headGrpSize) + uint32_t const idxQTokInCta = row / headGrpSize; + bool const isQTokValid + = (headGrpSize * inputTokensPerCta == ctaNbQHeads) || (idxQTokInCta < inputTokensPerCta); + auto const specDecMask = (isQTokValid && specDec.needMask(idxTile, idxQTokInCta)) + ? specDec.loadTileMaskRow(idxTile, idxQTokInCta) : SpecDec::TileMaskRow{~0U, ~0U}; - auto const mask = endMask & reinterpret_cast(specDecMask); +#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE + int32_t const begNbMaskOut = tok0NbMaskOut + int32_t(idxQTokInCta); + uint64_t const begMask = (begNbMaskOut > 0 ? fullMask << begNbMaskOut : fullMask); +#else + uint64_t const begMask = fullMask; +#endif + auto const mask = begMask & endMask & reinterpret_cast(specDecMask); if (mask == ~uint64_t{0}) { continue; diff --git a/cpp/kernels/xqa/test/refAttention.cpp b/cpp/kernels/xqa/test/refAttention.cpp index 3ad281bd61..d8f1a688f5 100644 --- a/cpp/kernels/xqa/test/refAttention.cpp +++ b/cpp/kernels/xqa/test/refAttention.cpp @@ -155,7 +155,16 @@ Eigen::Matrix refAttenti { qF32[i] = toF32Head(q[i]); } +#if SPEC_DEC && SLIDING_WINDOW + // In Spec-dec + SLIDING WINDOW mode, only allow linear tree or !rtIsReallySliding. + // the token starting position is seqLen - qSeqLen + 1 + assert(!IS_SPEC_DEC_TREE || seqLen - qSeqLen + 1 < slidingWinSize); + uint32_t const tok0SeqLen = seqLen - qSeqLen + 1 + q_len; + uint32_t const seqBeg + = (int32_t(tok0SeqLen) < int32_t(slidingWinSize) ? 0 : int32_t(tok0SeqLen) - int32_t(slidingWinSize)); +#else uint32_t const seqBeg = (seqLen < slidingWinSize ? 
0 : seqLen - slidingWinSize); +#endif gemm0Acc.leftCols(seqBeg).fill(-INFINITY); for (uint32_t j = seqBeg; j < seqLen; j++) { diff --git a/cpp/kernels/xqa/test/test.cpp b/cpp/kernels/xqa/test/test.cpp index 04219dcb11..19bea5491d 100644 --- a/cpp/kernels/xqa/test/test.cpp +++ b/cpp/kernels/xqa/test/test.cpp @@ -130,7 +130,7 @@ template #endif #endif void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, bool verbose = false, - bool saveData = false, uint32_t ctxLen = ~0U, uint32_t slidingWinSize = std::numeric_limits::max()) + bool saveData = false, uint32_t ctxLen = ~0U, uint32_t slidingWinSize = 1U << 30) { #if IS_MLA if (nbKHeads != 1) @@ -363,6 +363,8 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, { #if IS_MLA || SPEC_Q_SEQ_LEN hostMask[tokenIdx * qSeqLen + kvPosIdx] = (tokenIdx >= kvPosIdx); +#elif !IS_SPEC_DEC_TREE + hostMask[tokenIdx * qSeqLen + kvPosIdx] = tokenIdx >= kvPosIdx; #else hostMask[tokenIdx * qSeqLen + kvPosIdx] = maskDist(rng); #endif @@ -1038,6 +1040,14 @@ TEST(RefCheck, llama_V2_70b_3) runTest<8, HEAD_GROUP_SIZE, Q_SEQ_LEN>(8, 1028, runPerfTest, runCheckTest); runTest<8, HEAD_GROUP_SIZE, Q_SEQ_LEN>(8, 2048, runPerfTest, runCheckTest); runTest<8, HEAD_GROUP_SIZE, Q_SEQ_LEN>(8, 4096, runPerfTest, runCheckTest); + runTest<8, HEAD_GROUP_SIZE, Q_SEQ_LEN>(8, 2048, runPerfTest, runCheckTest); + +#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE + runTest<4, HEAD_GROUP_SIZE, Q_SEQ_LEN>(4, 2039, false, runCheckTest, true, false, ~0U, 1024); + runTest<8, HEAD_GROUP_SIZE, Q_SEQ_LEN>(8, 63, false, runCheckTest, true, false, ~0U, 61); + runTest<1, HEAD_GROUP_SIZE, Q_SEQ_LEN>(8, 2, false, true, true, false, ~0U, 1); + +#endif } #endif diff --git a/cpp/kernels/xqa/utils.cuh b/cpp/kernels/xqa/utils.cuh index 07773ac29e..2ec5b40995 100644 --- a/cpp/kernels/xqa/utils.cuh +++ b/cpp/kernels/xqa/utils.cuh @@ -1056,3 +1056,14 @@ public: private: uint32_t mTic; }; + +// [beg, end) +struct Range +{ + uint32_t beg, end; +}; + +constexpr bool overlap(Range a, Range b) +{ + return a.beg < b.end && b.beg < a.end; +} diff --git a/cpp/tensorrt_llm/common/attentionOp.cpp b/cpp/tensorrt_llm/common/attentionOp.cpp index be64673122..6e1498ba71 100644 --- a/cpp/tensorrt_llm/common/attentionOp.cpp +++ b/cpp/tensorrt_llm/common/attentionOp.cpp @@ -197,6 +197,7 @@ bool AttentionOp::convertMMHAParamsToXQAParams(tensorrt_llm::kernels::XQAParams& xqaParams.multi_block_mode = common::getEnvForceDeterministicAttention() ? false : mMultiBlockMode; // Medusa mode will have multiple query tokens. xqaParams.multi_query_tokens = mIsSpecDecodingEnabled && mUseSpecDecoding; + xqaParams.is_spec_dec_tree = mIsSpecDecTree; if (mKVCacheQuantMode.hasInt8KvCache()) { @@ -1723,10 +1724,6 @@ int AttentionOp::enqueueContext(EnqueueContextParams const& params, cudaStrea // Run the fmha kernel. mFmhaDispatcher->run(fmhaParams); sync_check_cuda_error(stream); - // The kv cache might need to be updated after FMHA (only when sliding window attention + chunked context is - // used together). Reuse the preprocessingParams. 
- invokeKvCachePostprocessing(preprocessingParams, stream); - sync_check_cuda_error(stream); if (mCpSize > 1 && mAttnTpSize > 1 && mAttnCpSize == 1) { diff --git a/cpp/tensorrt_llm/common/attentionOp.h b/cpp/tensorrt_llm/common/attentionOp.h index b738fdaf2f..fb71c06d57 100644 --- a/cpp/tensorrt_llm/common/attentionOp.h +++ b/cpp/tensorrt_llm/common/attentionOp.h @@ -391,6 +391,7 @@ public: bool mHasFullAttentionMask = false; bool mIsSpecDecodingEnabled = false; bool mUseSpecDecoding = false; + bool mIsSpecDecTree = true; bool mSpecDecodingIsGenerationLengthVariable = false; int32_t mSpecDecodingMaxGenerationLength = 1; bool mIsMLAEnabled = false; @@ -440,7 +441,7 @@ public: mBlockSparseParams.data(), mPagedKVCache, mTokensPerBlock, mKVCacheQuantMode.value(), mTpSize, mTpRank, mUnfuseQkvGemm, (int32_t) mType, mMaxContextLength, mQKVBiasEnabled, mCrossAttention, mMaxDistance, mPosShiftEnabled, mPagedContextFMHA, mFP8ContextFMHA, mDenseContextFMHA, mHasFullAttentionMask, - mIsSpecDecodingEnabled, mUseSpecDecoding, mSpecDecodingIsGenerationLengthVariable, + mIsSpecDecodingEnabled, mUseSpecDecoding, mIsSpecDecTree, mSpecDecodingIsGenerationLengthVariable, mSpecDecodingMaxGenerationLength, mIsMLAEnabled, mIsGenerationMLA, mUseGenFlashMLA, mMLAParams.data(), mCpSize, mCpRank, mCpGroup, mNumAttnHeads, mNumAttnKVHeads, mNumKVHeadsOrigin, mAttnTpSize, mAttnTpRank, mAttnCpSize, mAttnCpRank, mUlyssesMQABroadcast, mEnableContextFMHA, mFMHAForceFP32Acc, mMultiBlockMode, diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp new file mode 100644 index 0000000000..0acae9aa71 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a0671e7cbbed9f51dc0c47e4b970e2f72067d629ff6562c9d65f9cd55c68578 +size 361861 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp new file mode 100644 index 0000000000..4cb6bcd1c1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ec9817bebb07483ce29d8d91c45d35c2c05f0101bfa70146fba5a6576a6b825 +size 1091614 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp new file mode 100644 index 0000000000..470904148a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0540cdb398818ec54a60c34b462c158e169347db73d244d633669d74211696ba +size 1467312 diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp index 8331dbce4d..281985341d 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3fff0dfc8b05bdfd41b9f00d65567ff8a96f36e56a75b31e5c48835b7d9c90f6 -size 693780 +oid sha256:69bdfba64f1faff30ed8389a28b7b9ef37c0d180b1df643722b280011c8f74e8 +size 692990 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp index 652139d105..8b8738474d 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9fa28c23d82290a782267b18eaa36a545213045d493a72513e3a65305c0fb080 -size 672452 +oid sha256:c8173308813999ab64ba8236016b23fbfd3f3f1501f61290bf71ea027ead2920 +size 642456 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp index a3c98f01b2..6ca952af64 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70b101d8936e175391d8051967ff5733a144118ff8793b29b612eac92abc581e -size 423439 +oid sha256:f41ae066b01b2a9c3b5165535f743461a9a1d559f6fcd0a00a04c554f8a50962 +size 414757 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp index ee0ce30744..1a973c5d2e 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26ae7817cbed824212d92c0eb8b25d0f6b9d6281e4d4b6e95e9b6d6d2f5f0faf -size 1236860 +oid sha256:ab0be8e667d459e13135f96469613f1c095e47187b24e5d40c7c57583351a076 +size 1194236 diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp index e65389452d..8faf85254d 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97dcf2a904ca8ce22f2282644a53986b03f7c0d7948803d2b2b401d6a6dfb5a9 -size 1719120 +oid sha256:03d86280f76994e2e01d43747cb5c811496b8340d031ebb0c3bdd46437422994 +size 1654394 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp index 23274d5f72..53f3032a30 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8a9578f22279c7f83f0126eada9fb14a959e3e841efd641b780be06d5e7ebde -size 375277 +oid sha256:35c5715bcb1a16c343f3a28be105fb6fee1bbca24cf832f71a7d0f20cf9a0b3e +size 365015 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90.cubin.cpp index f8d1e75b2f..89a4eaa580 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e8f883e1814759b4e4e643edb51465f132f27dd77392e9403908cd954eccb19e -size 1137402 +oid sha256:a3335a8d4b2c0ca63f006c3f957d57aa3f808ef06d4adda322c311a333286d84 +size 1126352 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp index 8cf6386b36..9cb2eb33c2 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb96a6fdcae7f8e19516c4bc4064ccd759906a8b0052e5148fd01e59c37e2f4f -size 652776 +oid sha256:fdc0bf099862d352b3b765e117437240a82e4749d3efd104881647dd4ea14562 +size 644092 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp index 6f8890117c..153555cbe4 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:93fb97424b5abb3f807b300bc67bc37f14355831d0ff1ffa2d5d9c0fd872731d -size 1137390 +oid sha256:ccd938df8f78af4eae306c6e9e669599c2baf6f095f956318470063c560fbd3c +size 1091610 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp index 7e031d3bf8..cab205493a 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a6803c454338b0a0c548204701ba4411ab55602b42cd2122140b5db09cd19660 -size 1537558 +oid sha256:ce4d35ab4c7b65476f0dcec635db1791fcb718afd6b3531338712f5b2bc9aa84 +size 1460204 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_q_paged_kv_64_sm86.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_q_paged_kv_64_sm86.cubin.cpp index 397d8f56d2..ab21a448f5 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_q_paged_kv_64_sm86.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_q_paged_kv_64_sm86.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8396a30929e67e906ac438e011acdd1eac5e2bd2fa887c2f6ae8aa0f5b6ccda8 -size 514281 +oid sha256:d088ce37b21d335ba1f92034cf97f78fc968d7fecaa0c4f9ec83a0d5165f1d99 +size 482709 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm89.cubin.cpp index 18ba9e9449..2fa6ba246e 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c51433d1240dc1d8ab205f89b8cb7f83d93e0224850433610fd95555ecf6222 -size 665822 +oid sha256:40653ec672098e2cb1f94c473fa67852efcf6b49a6e8109e4fcf39422281acb4 +size 657930 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90.cubin.cpp index 7ad270f386..ebdb0563ef 100644 --- 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:60f4a4656af5bbeb2c8552bf9f9c7cd779586a4cb5cc9f6cbb1e38d8b279226d -size 684322 +oid sha256:96348957990518db6f51af7c681a71e625dede568cc8f8303dd2de8ad09bfc28 +size 677218 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp index 2f1dde1db8..7cd5b267e0 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61dcb9e691d97658eb41885a1801dc84a2818b7b9939163864c60b2f2f698d01 -size 370981 +oid sha256:4687df80ac2fa9454b0564b0a80d78cfaedc2c7796c8f3a1010dd7ebbf722c83 +size 369401 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90.cubin.cpp index 2b9e46c7a0..f4da9b9d86 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d188489645839f22b23f7ab60024a38784246dd3cdebb2860afba4b17e555987 -size 981870 +oid sha256:d8b9985065f5f2c62b74c05f8eed02b1909c96656b26fbd7779cc57a2146b037 +size 947140 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp index 536b3a60f9..8ffdb6589d 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5bc5c98f5bb68ce8457192a8deb66fd33bd4e18181f6543a80ffee90f9fa889c -size 610511 +oid sha256:23599e63b07ad966df921daf3cb97a9ed5cde27eeda0fd96ba5abd835b48f89a +size 590779 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp index 9ba28ff3ec..1153714c7e 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:38facf3787477a775cb81819dd32adc2b14302a6e245ea1bd39a7c79a27f6be1 -size 1922792 +oid sha256:cd1c452565583b20913d835de9b14c2f19c0cc431bc926ea6c92295362a85bca +size 1813864 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp index 079d5342e2..b6383dcbd5 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:49d610072be65cb35753c025a6e34d297cb8b00763e31f032f8068fd49e82746 -size 2606330 +oid sha256:b20de2c6bb3081564ddfbf7ece80fb2c17e66f4e7ff0e0969da4e4655e90d1ec +size 2407418 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp index ece0d7125e..3713748af5 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:78b4569d41bffce532654f3b0641599049004acba634be1965685863f4485949 -size 570241 +oid sha256:33a0e8bb2391128e688e5c6356f09a5ed189ce5c1bcdeef4efc0ce0415dc2849 +size 555245 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90.cubin.cpp index 779c844357..795d4d68fc 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12660d6342b533a1023650fe1c40ed8df1e303878035422e4995697de1abce6b -size 692632 +oid sha256:4b014f41b1cfdf6ed2729778841213a36440191eb3c087346a02c21510bd3f0e +size 665794 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp index f32216bae9..5c8dbe22b2 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ff17dcd50d76036338dc9f3d009b6b10f5d2b8a338342fef9018dd73a79f1b7a -size 1804378 +oid sha256:bd77afeb7dcd1ff8d6be80788b20e92e4fbc8c3026ba12d1d522c99316754a7c +size 1740442 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90.cubin.cpp index a65367f707..ee1a46c9bc 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:760cc23fd160128f4be3fd1dd6f6ef4bf18551106404b146b7f374af3fb81c4d -size 2338732 +oid sha256:b674707d02aac297b66d523de8b11618ca1598c49eeaf7ce9b1c9d516ce95c4b +size 2247958 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_32_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_32_sm89.cubin.cpp index e4141dd2d3..349c2efdfe 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_32_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_32_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de60062494c933226d989901d7fc15d886fd5a84c124f1c01fe583cb45281801 -size 601899 +oid sha256:7556f88488e05ee669e763b839afa1b7690060cfa9d8482d419c0ca336df9352 +size 595585 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_64_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_64_sm89.cubin.cpp index 8906ad11fe..2ccc55f144 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_64_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_64_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:367458885389381731b08889460600b9a4e9542cc979a38ad05d6ca3992744b3 -size 912898 +oid sha256:ac9d879aa0c70967bb3a79cd7034998baf43a544c0dd4444ebddeb76e78df5ae +size 908162 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_32_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_32_sm89.cubin.cpp index 292e1a9232..ec1ef8aae9 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_32_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_32_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:87b40dfd9d1ab2258d7de80a89820e686e87243ab43f7dd20990c871d4202841 -size 1408612 +oid sha256:4e781c0278fc46142f578ae51bfeb38767e89d9c25b92023215948f99dd1d3ed +size 1371512 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_40_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_40_sm89.cubin.cpp index c9db86ef9b..d904de0acb 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_40_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_40_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea80c0c776d59d68b5a47ed7ba0fc8e37ea38ab189419519795ca57dd7589304 -size 1475704 +oid sha256:d608e9e3ec460d2a38f43067a7d7a2dd408e068db690806bbafb11007e175336 +size 1419662 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_48_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_48_sm89.cubin.cpp index 398204974d..798e8482b4 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_48_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_48_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b3c7887870f3defa8c2595868c2c8b40afb2ca0b090dc241ad8a34c754857ab4 -size 1475704 +oid sha256:9c1e1d300866c6425c2495e550230051debdca0a7eb85874ae33c0c2de8a81cb +size 1419662 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_64_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_64_sm89.cubin.cpp index ead5c96759..bbcce09e72 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_64_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_64_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b797da09627dbf7661ccad3e8b7fd741330f008b3f8e033b7a3c7787a7233e1d -size 2003768 +oid sha256:132d83639e34af1b431abdcb3f09542d0389030b85752e18a3ae221ead7d24a3 +size 1965880 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_32_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_32_sm89.cubin.cpp index 4faeb657b9..83287a0376 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_32_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_32_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c55e36802f8679e988ed6fac295314367dd9914c5ff457b7c4c5437ab8b53a41 -size 1391232 +oid sha256:4a96710f6c691580c2363c187a75fd436f5e6be732810a1a45182ce72dc52d1e +size 1380182 diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_40_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_40_sm89.cubin.cpp index 85f6542b68..0062377934 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_40_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_40_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d9a65aa870c5057349809ae2cc7e03837e37ac3ef2e5633d19e69c444358c96 -size 1409386 +oid sha256:a6339f008f451d030aa36a6b3fac7179e7534f7f2474d641fa0ebfbf487074e7 +size 1401494 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_48_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_48_sm89.cubin.cpp index 15b05089cf..0d719af97a 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_48_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_48_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:76cbfb5a29797bbeb2adad93c0c1e0fd4c1c544a6c12faa2a825cdb4eff1dff2 -size 1409386 +oid sha256:57ebcae2b70fc28881f2b3969868d64c203ef4a9cbc9588a9e28051c5f5b6849 +size 1401494 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_64_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_64_sm89.cubin.cpp index ea60da2843..ceab132d42 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_64_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_64_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61c16947041287198b160091a89f1677ebe7babed9c9da6f6625436f7b526a6f -size 1946134 +oid sha256:5e2a4ce1b944feb2b3ed535943089a2d5968bf523b149885df78f7fa4bd7e835 +size 1935872 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_128_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_128_sm89.cubin.cpp index bccbb4b8d8..2780675d9d 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_128_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_128_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f1114bbd784a3ea000d86f00e35086435d50c430ed695448a306cfc4bd54f60c -size 309055 +oid sha256:f5d456b30f89ad05ba5b852fabcffb3f8269913d83ef8c0e4e319f2243dee54d +size 305897 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_72_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_72_sm89.cubin.cpp index 4d09371f99..2aa3fd4b0a 
100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_72_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_72_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3c8905ae4aafc41cce6557456bdf08d7ae6eb5a93286ccbf5d0b745fb33cd298 -size 293267 +oid sha256:85593d3c2fecb6842a72952c6dcbde19a70e6b26245829d279ca50bb391eb636 +size 290109 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_104_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_104_sm89.cubin.cpp index 41214fa51d..b050acbb5a 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_104_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_104_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e373ec7eb583a0803821145ec16f2ecf1a173c70f0796207750e51b97c72d604 -size 528501 +oid sha256:69cd61bd8334d2109067ef0460a91b8dba4c2cb07392eb636d72d025ccb15bf9 +size 498507 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_128_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_128_sm89.cubin.cpp index a946012b6b..e741d50f4c 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_128_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_128_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2805c97b33142d036c8fc510d603e5c0d6d74174ae1f15b04feeedf44f0b5ab6 -size 702156 +oid sha256:0427b7729ce3cfa652a4595d04f936a947febec8f2c96ce33eed7cbaaa05613e +size 668214 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_160_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_160_sm89.cubin.cpp index ce6524aa57..eee064e280 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_160_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_160_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:111f7cebf93583b831e5714ab597ef6cf9afe9a215a5a9bb1cedf04176f4129b -size 761356 +oid sha256:321bcd81b8965c8dfc08682f775508ae18e3ff711490ee8dff5fe56c20f74843 +size 711628 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_output_bf16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_output_bf16_sm89.cubin.cpp index 7e03d88b7e..33f4d9cab3 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_output_bf16_sm89.cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_output_bf16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b44d7f8e5db9b0fd8ccdd905124faf5a703c89c6de326367ba200697fb518fa -size 806372 +oid sha256:aa77d3789c0ca314689125ec303a8af76554120a708a4b63395c69b7aad07f04 +size 752698 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_sm89.cubin.cpp index 053f856fb3..3138343090 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:664ed6e91ccd091fb4733b55a2799d4562df876ef4e3be8ca79e6d0b55bace4a -size 803980 +oid sha256:aa35aa70d0fa304c776c076a1a189d32a054d3f696dac5d99018085d1108c73b +size 748726 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_256_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_256_sm89.cubin.cpp index ec8103b8a1..ca7815f710 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_256_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_256_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98431cb031d4d41035fd7a5a253fbf4b23214ba9e8689749ad23de925d97b0eb -size 999734 +oid sha256:d1a702d456b5acf279487dd810e3e33efdd1c7bd82530ceb5a32ad30ec30396c +size 946060 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_72_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_72_sm89.cubin.cpp index ebaa17c5c6..8bb9403c51 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_72_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_72_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48ab14dd4c3e988db85530381833b1753fc8579a8716df1a81799d122ecc19cd -size 520607 +oid sha256:558aa7d42de329c49361c94c4baef16738304b21b6adbe675d77c7819ef37660 +size 489823 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_80_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_80_sm89.cubin.cpp index fe3765594a..0754f76695 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_80_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_80_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 
-oid sha256:a4aa5c1c533f5ce60a50110a6bbfa2af6cd7a0488776cb1fd491ce594b0f94f4 -size 520607 +oid sha256:7b5baa6048e6c33e74c6d343eb7c76252ff2e534fe467b3189af12b5d64af37c +size 489823 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_96_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_96_sm89.cubin.cpp index 69da730357..68de134acb 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_96_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_96_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b0dae8957de096f310cfe6bb977babbe745e7542072920a454a60b9ad05c4318 -size 530867 +oid sha256:e17cb191ad092e6db255ea503e49ea883ed56322fc58ed8d68710f6687376c1f +size 500083 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_104_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_104_sm89.cubin.cpp index 29a11c7b0b..3ebcc110ec 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_104_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_104_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:849c37d9f772de883d6fa358161f977216d48932ef8a27cec2cfe931c9880e06 -size 500861 +oid sha256:bfca5660a931e08941347f7a0aefa82c214940e8eaa6b6d89cfded621f34a490 +size 496125 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_bf16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_bf16_sm89.cubin.cpp index b1e2e33414..c0c882331e 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_bf16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_bf16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:189df2e89d79e1969521dcb124bcd71f274493e369b2809fc5ed552e8be1977b -size 184391 +oid sha256:fffd2cd799953808034d7e7b89a57d4fede24db124bfb0d3938188177acbdfeb +size 182023 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_fp16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_fp16_sm89.cubin.cpp index 76ed2ade98..458aa250b4 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_fp16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_fp16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:43ae547cc799f0c688c19daee4bf357d6d2fe2c06d894bcded7ac40e699caced -size 
184391 +oid sha256:19ada3a5d449542f103077db8d193bc2293a8f48ccee201e366473964287314c +size 182023 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sm89.cubin.cpp index 344fd44626..65edc3e52a 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:39c941a13e14d0cbfcd19e1d11f75047227aaf992d60b56e45f063f92ff80cc8 -size 667412 +oid sha256:b9c32124cd708aab7da30637d85437da0af9bf2157d163c19c6fe14498698cda +size 661096 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_160_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_160_sm89.cubin.cpp index 50293ac4e5..8213475b06 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_160_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_160_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:868ce05564bbf9e23a3f6562bd75d537d1c5e901eeb0bbecb24261bcc7d23370 -size 676094 +oid sha256:7f248fd42759509c61d20f912ae74dc3a85448a9c8386370ea92492ed9031e80 +size 672936 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_output_bf16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_output_bf16_sm89.cubin.cpp index 7f2a34961d..75bd11ff6e 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_output_bf16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_output_bf16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:66d791187f871dc70a6b90cd9d60dc3db06d60c2beaefb3d75c2ff1f949d5458 -size 726636 +oid sha256:190fd946ddc7e1b5e9ca2172ec1de39c6288829773d9ce29fe98374256eff566 +size 721900 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_sm89.cubin.cpp index 13085d8c66..ed5e241d9e 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a065d8c65f022875bb49bdc9aa853061149ff2cdfcaf1f8cdf8a3efe456e8a5 -size 723454 +oid sha256:b7cd5976c836bcd75c0cadfe968050ac60bf89b93df021ad6c1681e159c497c5 +size 717928 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_256_sm89.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_256_sm89.cubin.cpp index b5ec7f76b4..44ce0c307f 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_256_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_256_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:212ffad34a9b3002c1ab7e590bbadf1c94cb9847acbb479c311e9057c4e4c44b -size 932628 +oid sha256:7c536d725e1d9ebd2cb836dfe3993edcc81101534db6b7f1943c8a9443838bf4 +size 927892 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_72_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_72_sm89.cubin.cpp index 2099dc8665..0216db308c 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_72_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_72_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e70aa7f7c6f8e41c5f142fd268a88fd0390f59ac9aad56b8be062a05f8f49ff8 -size 638994 +oid sha256:b5907da5a2f68c010d44bbbd0d780e097f9625be15b2f85e8dd1f00dd4c31ff9 +size 631890 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_bf16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_bf16_sm89.cubin.cpp index b43312dbda..c63b37264a 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_bf16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_bf16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0cc18b1e3835a7cc42648d1bd0b63507020427299027667f9dd4faef37450ab -size 169391 +oid sha256:9cf14c71134a89ed6ffc83c0b7db06ed10e22b55294dc15ddf7f016427f01033 +size 159919 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_fp16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_fp16_sm89.cubin.cpp index bb9d123fad..7d1ac80867 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_fp16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_fp16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:90e97d06799b33f0f4ed6c68aa43616f4f2e013680909ca56d2e514a4481f0cf -size 169391 +oid sha256:f2b83c70dbc8ab0b3695dab3f4d2069b7ee7119e9140d7860b8c19f59a498589 +size 159919 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sm89.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sm89.cubin.cpp index 8e7857f9ec..4041bfc97a 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c48f3c39368e774c4f3c281b7422e0b90e08321fa29591882c7071a635e1c3c6 -size 489019 +oid sha256:fc8369f5701dceea91d429a713ddcbb4ecb0ad08d3c9042688557ead5f00e9da +size 483493 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_96_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_96_sm89.cubin.cpp index 686a996434..f0afe3fcf1 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_96_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_96_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5edbd9d472583367857e998d65097561a9b36bc68ba1ae94f3b79940c7cb6f3 -size 501649 +oid sha256:4e9fffff2d13d49613e5f9334a010ca9bcde43b3bb55a792fd97fe2c867760dc +size 496123 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_192x128_output_bf16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_192x128_output_bf16_sm89.cubin.cpp index dc1b346d23..03a4b33cef 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_192x128_output_bf16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_192x128_output_bf16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9eeb56a178049dbe0869030e20eeb608423fd5e34e3720230e5ed4373717b91a -size 238849 +oid sha256:dd3041ba5a52263f7f02d64f1911c50e346151bf529e865c1abf22583abd3e21 +size 443285 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_192x128_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_192x128_sm89.cubin.cpp index c0b56e6cf0..6984f3c170 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_192x128_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_192x128_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00c69c0bfcb04dcd381677913781984ffafa3980922807faa94f125c01d7b901 -size 238035 +oid sha256:12482099b086249163085e6e3421a61f6e304f865aaf56dd15382614be5e48e7 +size 441683 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_576x512_output_bf16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_576x512_output_bf16_sm89.cubin.cpp index 
d8dde7184a..2bb4cc2582 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_576x512_output_bf16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_576x512_output_bf16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cade6eee7a6be594da0a65e270954a11af436082b02bdd036aeddf9486812996 -size 298837 +oid sha256:bfea1ea1627eaef7b614db08bad00bda8b611c8e466c858e050c0ce2aee2eafb +size 298049 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_576x512_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_576x512_sm89.cubin.cpp index 394e497b75..7e76c5e13d 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_576x512_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_576x512_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:470b274928968dc99c7cc1299cb906a9c38c2e5ddb556591047677e8b968b2c9 -size 298025 +oid sha256:f828600699faa3a0474085cbbe88d2e0ac7c8e056c976b81a882c3a72682e527 +size 296445 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_qkv_192x128_output_bf16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_qkv_192x128_output_bf16_sm89.cubin.cpp index c4a5aff2bd..1c1f7bdc42 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_qkv_192x128_output_bf16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_qkv_192x128_output_bf16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6d9c45c07e5f4513fa4666178709a7051042e1fa791d0ddfe9540802ddf36194 -size 231731 +oid sha256:2d4b297922065ecb79b4a1278d048b253b57601d011fc5833a32f9fc1b78e58e +size 427485 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_qkv_192x128_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_qkv_192x128_sm89.cubin.cpp index 6ba4c09f1e..68394c07c1 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_qkv_192x128_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_qkv_192x128_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:682a0bc5821e74d56736641ecd8a7ccb1a7d7352183eda62a56edaa280d99004 -size 230917 +oid sha256:3fd5305445c9856fbd5d9dfaffdd7f87b9014638f33fb63fb2cb4fce9893b20b +size 425883 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_128_128_S_q_paged_kv_64_sm80.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_128_128_S_q_paged_kv_64_sm80.cubin.cpp index 8fd17c8d5b..51778ad0e9 100644 --- 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_128_128_S_q_paged_kv_64_sm80.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_128_128_S_q_paged_kv_64_sm80.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2dbba9a30ed262e3096c4e7d7c3e4fdadd3e073e41894e8258de9274e08979d7 -size 1615406 +oid sha256:2b7fee97097f799830df2bcb1c782c7ea9018243cbd5cd0e0f47ec299b49db79 +size 1524634 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp new file mode 100644 index 0000000000..537871847d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ac2f9270988bc02329ce11ef3413395b2b8cdc55fcf4911d170536c6e618317 +size 403697 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp new file mode 100644 index 0000000000..6bf814ac8a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1234cf31a3a6b84ed25fa0ad6c4df9b53f673f6bac2f639a66086ba50f8717ba +size 1120818 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp new file mode 100644 index 0000000000..3bebbebcf1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fff300932a16d30844e317ace515a178f159c483e436f6955983b96c5c424c6 +size 1549402 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp index b9e28a17c5..ef64a37682 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dbd51135c48812f21f53811b57057cabbef6c7a8a7833c411d8f8c47a2285c65 -size 724564 +oid sha256:ed10767ec913d314936fc5dbd1fd70c5381a622bf3fcf1590f837da6d3285bca +size 723774 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp index 7a93dfaa65..d0bc52f131 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9ca2010bc714808c4e62ad7a66ae070e18bd40f678f46663b5f46d964283e6c -size 704814 +oid sha256:7e7a7a9653a9c4e4e9b0514fc1d70abbb4521c7edbede52568d17d0779d62ffb +size 671662 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp index a16884caed..3056a533d6 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aff65d92093547c644da83b9800c8d8393f1a9d530f809b6bb35138afbe669c8 -size 454223 +oid sha256:1e18db0cd4de65e76e30f219d24ec00095fb16005882c43322182c5fa3f59032 +size 445541 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80.cubin.cpp index 91712bb82c..50d7f1bece 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3242c721b07ab2f56698b11c16f2766b61f1a27c8c30e9458e5179a71340cf76 -size 1377818 +oid sha256:9aceb502c1a95f58f1eab515cf2aeac92be6d255ef405008a4fd871fd54e9ba6 +size 1242842 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp index 5d684d6316..1a74df1288 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cd323cec032400ab6c820d02d9e1c6da22ad0b627a0bf6bf51de0c0ab4aad99c -size 1260540 +oid sha256:ec96248452f638bb9ca50d3630dd67caf71322c01b17aff301c4a98eb7e27974 +size 1215548 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp index 138e82ec0c..e03f7c2575 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3adf59ee5801afeed6c1a51c6ca6bf504e534c3c277dd58c91d1818e13c726be -size 1790160 +oid sha256:dabc44860e81532e9b7ecb35773d0ad409d45361e20c9510d24387039999a7c3 +size 1720698 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp index 481792268b..b1d87c1278 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e17333a518382c1d0980c8c8c4500df358846c602db5f7f2c413f135f3ff263e -size 416321 +oid sha256:0d9c8d1fe282f46c12898ed4851a2640cb33ba5d75c5fe9da8a988f818a0e733 +size 407639 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90.cubin.cpp index 62e54f7ecc..2a12ddb711 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5654ec576d9e76bec93bbc11dfc7142bf4e57d1bc718e8c76e1b8a9c9dced0dc -size 1108986 +oid sha256:849a280994b3fa1f18ca6c3866a16a68a9b02831f134f8dfcf0d34502c1d6772 +size 1102672 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp index b485cdcf2e..a2c78e856d 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09f3e9c7de20a1fd78f68d32b4be0301a8426ea8b61c90a361968e143a409dee -size 633042 +oid sha256:4e209b01409585433406f8392c77a7398270ee1b58446b728cf74faa6fe1bf9a +size 629884 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp index 84b753442a..61bbc8d762 100644 --- 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22a85bd4725e2ca09a3f45519b9abd3d353f5de8cb5994f40213f5dca233e0ad -size 1162650 +oid sha256:0a22bb0202916831eced0a44acbab769d5647937155e0a2b5e6d0d0cb83c726f +size 1122394 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp index 0445af1cfa..e0170f8db7 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c373d9294f2adc0601433f57e1369eef8ec03a6fc0c0a514b5338ed313e6a6e2 -size 1620438 +oid sha256:582d17d48c7a751a345f74cc8c74f9b8c05278ddfc185da4906310a4973a9bdb +size 1547030 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90.cubin.cpp index 81125e7086..456d75f72f 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c70a136dfd55771b4218b60536d034f6dbcf285353ce8ea75c8fc93d33d09450 -size 609335 +oid sha256:70f02b7329eef7ceeb73dd43c3bf8f6ea6132c593bba6dbbed720d8b8ff0c287 +size 603809 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp index 8e7059ad2b..0c0712acaf 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0af8defec56bebfe634eafe3825626e91301937a1beafd5e2cb61d28e18e86dd -size 333093 +oid sha256:f67d4e70c39bf379ed0f3ef73a3690ac64efaee1e7134c793a760924c270f046 +size 329935 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp new file mode 100644 index 0000000000..f35d06ef06 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:c2c284c6cb66207bd204bd1b6abe45aa8bf2e0c92631681861df237b8f849a46 +size 363451 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp new file mode 100644 index 0000000000..73d9547cf2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3bede327d80be420e7bf011ee1a4156365afff7020bbf5a8434da18cb19fb23 +size 1093202 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp new file mode 100644 index 0000000000..998e46d1f1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ee7695bd5bb0a03eafe29a497060d84caec96ca4d159e99e4f02b99977dd2a6 +size 1469690 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp index 813ec5559e..a76bf3814f 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9e05e42418d14593b3d990875c8d813441176118804a2b6d79bc19c420ad176d -size 695368 +oid sha256:cecca7ad5c652989a3008c8219177811ab9c7d617adbbc9ed8548141803c66f5 +size 694578 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp index 131f465927..71a5743dd9 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3eee694dc657713c85cd5daefb80742ec9789cf01846683d490ecc237863aeda -size 674040 +oid sha256:bd6847c0e897eb794a9b1ff67e64358527fe64c3e01fc214545cf76ec60edc6d +size 644046 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp index 61f3af8c37..ea50fb0631 100644 --- 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8baad0ecf9c9f2afcff799f063c24c3d1475f45f4097977bacdfea37fd9fc6db -size 424239 +oid sha256:118cc6d4a5e3e12ce0f2727361fd1d52d1a49c67d0bd1837c24e528c064a0dd7 +size 415557 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp index ef55d9b350..285c32ec70 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:693859c24beb3519f369aa92d5b3097fa7323b5f9e911dd508c029f0289bef17 -size 1238450 +oid sha256:36d6c97af5fb15f32cd1ff13f53dd98a7d670cb80ee766765f42cc453f730812 +size 1195826 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp index 5644a54c5b..bd266daa63 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e4ae887df4aaa7f402cc3fc9e44bff89b4211d6b9ad8875a99e44362e188557 -size 1722286 +oid sha256:7775bbc1b43487236cf7570d2ed900f1c9830eab70aac1fa9dc59c439cc0c687 +size 1657562 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp index 755f0195b6..2d3c2887be 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97d53942b6dd1ad8bd7596ffba97f79b5f9c932beb5553a22d7aeaa1f16299f9 -size 376865 +oid sha256:199b1ff3cc3d0ff04477ff8f1e6390dd62b3a7c9dd264cc73ce6c716af20a0f9 +size 366603 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90.cubin.cpp index f03bac6ad1..e0073c3730 100644 --- 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eaf758af72cf17bca3eca50fa0062fe64a354297bc02a4948226e33bbdcb5bb2 -size 1139780 +oid sha256:2e743b470f9607abcbc8b71e7ef67455e6104daf3a80d0bd012a96ecf90a8f18 +size 1128730 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_sm90.cubin.cpp index 1723635712..1553e77aee 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:13ac9af1a09a4c5ff6eddd9565840aaac11e6072dac3c7a1bb5377705b5d120b -size 653574 +oid sha256:366aa4e9f3263f73c4e76c0ea8008c0449b6d89bcade761500af949912786e32 +size 644892 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp index 55070baa1f..cd0531dde0 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c35488ad990365bc5f50b7b2bfad2572f48ee9060345435e817384d41b4f3b13 -size 1138980 +oid sha256:5b8a8d76e17a24afd7af1dc5e112828f98ace78e3f85a7efaadb0cf1937085cc +size 1093198 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp index 1ca06ff0c6..54fd20f69c 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f0be66ba8c48682577dee9a7a75a5fdd9e363332881a6400c643a38d7dea16ca -size 1539936 +oid sha256:aeffa2db467fbae3ace85fae9f31e2b8a7c0923ab349ade42318ae6f55249ac8 +size 1462582 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90.cubin.cpp index f76871460c..673041f7af 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90.cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90.cubin.cpp
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ce5bcf4c0194abce62b39cd408d5a449e3725badf28d51510e7775df30d0ccd9
-size 685912
+oid sha256:ffc92513e64631c33290f1e88e5666f5b85251506d527745c493f2e90da39de4
+size 678808
diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90.cubin.cpp
index daf415f99a..c39e7fa450 100644
--- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90.cubin.cpp
+++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90.cubin.cpp
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fe521017d6cb30dc5f434b809068533a31db662dfa8d19af927ff79761230c62
-size 371779
+oid sha256:faad8cb1e44f5e16f61720966d2a6c9e782461c209cd8000263b50d42093444d
+size 370201
diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp
index a0f68d8080..a0197d8083 100644
--- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp
+++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp
@@ -634,21 +634,6 @@ void FusedMHARunnerV2::run(MHARunnerParams runnerParams)
     {
         setTmaDescriptors(runnerParams);
     }
-    // Check if the sliding window size is valid or not.
-    if (mFixedParams.attentionInputLayout == AttentionInputLayout::Q_PAGED_KV
-        && mLaunchParams.attention_mask_type == ContextAttentionMaskType::SLIDING_OR_CHUNKED_CAUSAL)
-    {
-        uint32_t q_step = 0, kv_step = 0;
-        xmmaKernel->getStepSize(q_step, kv_step, mKernelParams, mLaunchParams);
-        // The sliding window size needs to be multiple of kv_step, so that the paged context fmha can read the cyclic
-        // kv cache correctly.
-        if (runnerParams.kvSeqLen > runnerParams.slidingWindowSize)
-        {
-            TLLM_CHECK_WITH_INFO(mKernelParams.sliding_window_size % kv_step == 0,
-                "The sliding window size doesn't work with paged context fmha kv_step_size = %d.", kv_step);
-        }
-    }
-
     // Select the kernel and run it.
     xmmaKernel->run(mKernelParams, mLaunchParams, runnerParams.stream);
 }
diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h
index 6ac3f46bf4..744029c177 100644
--- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h
+++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h
@@ -1335,7 +1335,7 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
     // Note that the maximum sequence length supported by the model might be greater than this.
     // Note max_attention_window_size is maximum of cyclic_attention_window_size among all layers.
     // By default, you can assume that they are the same.
-    auto const cyclic_kv_cache_len = static_cast(params.cyclic_attention_window_size);
+    auto const cyclic_kv_cache_len = params.cyclic_attention_window_size;
     // The chunked attention size.
     auto const chunked_attention_size = static_cast(params.chunked_attention_size);
     // The number of sink tokens in kv cache to support streamingllm
@@ -1363,7 +1363,8 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
 #ifndef MMHA_USE_FP32_ACCUM_FOR_LOGITS
     if (sizeof(Tk) != 4)
     {
-        auto const max_timesteps = min(timestep, min(cyclic_kv_cache_len, chunked_attention_size));
+        auto const max_timesteps
+            = min(timestep, min(static_cast(cyclic_kv_cache_len), chunked_attention_size));
         logits_smem_ += divUp(max_timesteps + 1, 4u) * 16;
     }
     Tk* logits_smem = reinterpret_cast(logits_smem_);
@@ -1489,21 +1490,18 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
     int const tlength = DO_CROSS_ATTENTION ? params.memory_length_per_sample[batch_beam_idx] - 1
         : (params.length_per_sample ? (params.length_per_sample[batch_beam_idx] - 1) : static_cast(timestep));
-    // We will use cyclic kv cache when it exceeds the limit.
-    // The length position for storing new key and value.
-    int const cyclic_tlength = kvCacheBuffer.getKVTokenIdx(tlength);
     // When enable cyclic kv cache and one more block mode, we need to shift the index to the actual index in the
     // sequence. Otherwise, if the token is not the sink token, we need to add the bubblen length to the index.
     bool const enable_use_seq_idx_kv = kvCacheBuffer.mEnableOneMoreBlock && tlength > cyclic_kv_cache_len;
     int const shift_for_cyclic_kv = (enable_use_seq_idx_kv) ? tlength - cyclic_kv_cache_len : kvCacheBuffer.mBubbleLen;
     int const shift_for_cyclic_k = (enable_use_seq_idx_kv) ? tlength - cyclic_kv_cache_len : pastKCache.mBubbleLen;
     // The actual kv cache length.
-    // tlength is the past length actually.
-    int kv_loop_length = min(tlength, cyclic_kv_cache_len);
-    // The bound of the kv token idx (kv_loop_length = 0 should not happen ideally, but add here for safety).
-    int const kv_token_idx_bound = max(kv_loop_length - 1, 0);
+    // Minus 1 because the current token is also included in the attention window.
+    int kv_loop_length = min(tlength, cyclic_kv_cache_len - 1);
+    // The bound of the kv token idx (tlength = 0 should not happen ideally, but add here for safety).
+    int const kv_token_idx_bound = max(tlength - 1, 0);
     // The kv_token_start_offset. All tokens before kv_token_start_offset will be fully masked.
-    int kv_token_start_offset = 0;
+    int kv_token_start_offset = max(tlength - cyclic_kv_cache_len + 1, 0);
     // Only consider the current attention chunk if the chunked attention is used.
     if (params.chunked_attention_size_log2 > 0)
     {
@@ -1515,8 +1513,6 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
     // as context kv cache might be overwritten by the new kv cache
     int const beam0_context_length
         = HAS_BEAMS && tlength > cyclic_kv_cache_len ? 0 : params.input_lengths[batch_beam_idx];
-    // The position of the current timestep, and it is used to apply the position embedding
-    int current_pos_idx = (!POS_SHIFT || DO_CROSS_ATTENTION) ? tlength : kv_loop_length;
     // The offset in the Q and K buffer also accounts for the batch.
     auto const qk_vec_idx = tidx * QK_VEC_SIZE;
@@ -1557,7 +1553,7 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
         {
             mmha::update_rotary_base_n_scale(rotary_embedding_base, rotary_embedding_scale,
                 params.rotary_embedding_scale_type, params.rotary_embedding_dim, params.rotary_embedding_max_positions,
-                current_pos_idx);
+                tlength);
             // Query
             // The stride between tokens. We may be able to always use params.stride.
             uint32_t q_stride = params.stride ? static_cast(params.stride) : (num_heads * Dh);
@@ -1581,8 +1577,8 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
     if constexpr (DO_CROSS_ATTENTION)
     {
         auto const k_idx = QK_VEC_SIZE * tidx;
-        int const inBlockIdx = pastKCache.getKVLocalIdx(cyclic_tlength, hi_kv, Dh, k_idx);
-        Tcache* k_cache = reinterpret_cast(pastKCache.getKBlockPtr(batch_beam_idx, cyclic_tlength));
+        int const inBlockIdx = pastKCache.getKVLocalIdx(tlength, hi_kv, Dh, k_idx);
+        Tcache* k_cache = reinterpret_cast(pastKCache.getKBlockPtr(batch_beam_idx, tlength));

         if constexpr (ENABLE_8BITS_K_CACHE)
         {
@@ -1673,18 +1669,19 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
             if (HANDLE_KV)
             {
                 apply_rotary_embedding(q, k, tidx, params.rotary_embedding_dim, rotary_embedding_base,
-                    rotary_embedding_scale, current_pos_idx, rotary_embedding_inv_freq_cache);
+                    rotary_embedding_scale, tlength, rotary_embedding_inv_freq_cache);
             }
             else
             {
                 apply_rotary_embedding(q, tidx, params.rotary_embedding_dim, rotary_embedding_base, rotary_embedding_scale,
-                    current_pos_idx, rotary_embedding_inv_freq_cache);
+                    tlength, rotary_embedding_inv_freq_cache);
             }
             break;
         }
         case PositionEmbeddingType::kLONG_ROPE:
         case PositionEmbeddingType::kROPE_M:
         case PositionEmbeddingType::kROPE_GPT_NEOX:
+        case PositionEmbeddingType::kYARN:
         {
             bool const do_rotary = is_valid_qk_vec && QK_VEC_SIZE * tidx < params.rotary_embedding_dim;
@@ -1697,9 +1694,10 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
             int const smem_pitch = half_rotary_dim; // TODO: adjust for bank conflicts
             assert(half_rotary_dim % QK_VEC_SIZE == 0);

+            int position_idx = tlength;
             if (params.position_embedding_type == PositionEmbeddingType::kROPE_M && params.mrope_position_deltas != nullptr)
             {
-                current_pos_idx += params.mrope_position_deltas[batch_idx];
+                position_idx += params.mrope_position_deltas[batch_idx];
             }

             if (do_rotary)
@@ -1726,7 +1724,7 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
                     mmha::vec_from_smem_transpose(k, k_smem_, transpose_idx, smem_pitch);

                     mmha::apply_rotary_embedding(q, k, transpose_idx / tidx_factor, params.rotary_embedding_dim,
-                        rotary_embedding_base, rotary_embedding_scale, current_pos_idx, rotary_embedding_inv_freq_cache,
+                        rotary_embedding_base, rotary_embedding_scale, position_idx, rotary_embedding_inv_freq_cache,
                         rotary_embedding_m_scale, params.rotary_cogvlm_vision_start, params.rotary_cogvlm_vision_length);

                     mmha::write_smem_transpose(k, k_smem_, transpose_idx, smem_pitch);
@@ -1734,7 +1732,7 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
                 else
                 {
                     mmha::apply_rotary_embedding(q, transpose_idx / tidx_factor, params.rotary_embedding_dim,
-                        rotary_embedding_base, rotary_embedding_scale, current_pos_idx, rotary_embedding_inv_freq_cache,
+                        rotary_embedding_base, rotary_embedding_scale, position_idx, rotary_embedding_inv_freq_cache,
                         rotary_embedding_m_scale, params.rotary_cogvlm_vision_start, params.rotary_cogvlm_vision_length);
                 }
                 mmha::write_smem_transpose(q, q_smem_, transpose_idx, smem_pitch);
@@ -2182,9 +2180,9 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
         // Trigger the stores to global memory.
Qk_vec_k k_vec = *reinterpret_cast(&k_smem[qk_vec_idx]); auto const k_idx = QK_VEC_SIZE * tidx; - int const inBlockIdx = kvCacheBuffer.getKVLocalIdx(cyclic_tlength, hi_kv, Dh, k_idx); + int const inBlockIdx = kvCacheBuffer.getKVLocalIdx(tlength, hi_kv, Dh, k_idx); // The base pointer for the value in the cache buffer. - Tcache* k_cache = reinterpret_cast(kvCacheBuffer.getKBlockPtr(batch_beam_idx, cyclic_tlength)); + Tcache* k_cache = reinterpret_cast(kvCacheBuffer.getKBlockPtr(batch_beam_idx, tlength)); if constexpr (ENABLE_8BITS_KV_CACHE) { @@ -2391,9 +2389,9 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske // One group of threads computes the product(s) for the current timestep. if (vo == kv_loop_length % V_PER_ITER && is_valid_vi && (!MULTI_BLOCK_FLAG || (c_tile == current_step_ctile_idx))) { - int const inBlockIdx = kvCacheBuffer.getKVLocalIdx(cyclic_tlength, hi_kv, Dh, vi); + int const inBlockIdx = kvCacheBuffer.getKVLocalIdx(tlength, hi_kv, Dh, vi); // The base pointer for the value in the cache buffer. - Tcache* v_cache_base = reinterpret_cast(kvCacheBuffer.getVBlockPtr(batch_beam_idx, cyclic_tlength)); + Tcache* v_cache_base = reinterpret_cast(kvCacheBuffer.getVBlockPtr(batch_beam_idx, tlength)); V_vec_k v; if (DO_CROSS_ATTENTION) diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h index bc6bbf49d8..d2be7b328d 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h @@ -88,21 +88,21 @@ struct XQAKernelRuntimeHasher size_t operator()(XQAKernelRuntimeHashKey const& s) const { size_t key = s.kv_data_type; - key <<= 16; + key <<= 16; // 16 key ^= s.head_size; - key <<= 8; + key <<= 8; // 24 key ^= s.num_q_heads_per_kv; - key <<= 8; + key <<= 8; // 32 key ^= s.beam_size; - key <<= 6; + key <<= 6; // 38 key ^= s.m_tilesize; - key <<= 10; + key <<= 10; // 48 key ^= s.tokens_per_page; - key <<= 1; + key <<= 1; // 49 key ^= s.paged_kv_cache; - key <<= 1; + key <<= 1; // 50 key ^= s.multi_query_tokens; - key <<= 1; + key <<= 1; // 51 key ^= s.is_fp8_output; key <<= 8; key ^= static_cast(s.position_embedding_type.value_or(static_cast(-1))); diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp index 5191c59e7a..f0c71f3766 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp @@ -103,7 +103,8 @@ CubinObj CompileEngine::compile() const // If applyRoPEInXqaKernel, no scratch is needed for storing intermediate RoPE result. Use input KV instead of // scratch in this case. 
/*use_input_kv=*/applyRoPEInXqaKernel, - /*rope_style=*/ropeStyle}; + /*rope_style=*/ropeStyle, + /*is_spec_dec_tree=*/mXqaParams.is_spec_dec_tree}; if (context.kernel_type == TLLM_XQA_JIT_MLA) { auto const& c = context; diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp index 9406141471..3da27ff38c 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp @@ -84,6 +84,12 @@ bool DecoderXQAImplJIT::mayHavePerfGain(XQAParams const& xqaParams) const // Always use at least 1 block regardless of history length multi_block_count = std::max(1, history_length / kMinHistoryTokensPerBlock); } + // Disable XQA for sliding window when cyclic_attention_window_size <= 256. + if (xqaParams.max_past_kv_length + 1 > xqaParams.cyclic_attention_window_size + && xqaParams.cyclic_attention_window_size <= 256) + { + return false; + } int block_count = num_kv_heads * batch_size * multi_block_count; return static_cast(block_count) * kEnableMinBlockFactor >= static_cast(mRunner->mMultiProcessorCount); } @@ -394,7 +400,9 @@ void DecoderXQAImplJIT::runImpl(XQAParams const& xqaParams, KVCacheBuffer const& else { appendParam(&launchParams.num_k_heads); - bool const allowSlidingWindow = !isSpecDec; + bool const allowSlidingWindow + = !(isSpecDec && xqaParams.is_spec_dec_tree); // sliding windows does not support spec dec with tree-based + // token, only chained tokens if (allowSlidingWindow) { appendParam(&launchParams.slidingWindowSize); diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.cpp index d0548c4018..c19b482b30 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.cpp @@ -153,12 +153,6 @@ bool supportConfigHMMA(XQAParams const& xqaParams, int SM, bool forConfigurePlug { return false; } - // @fixme: should work but it triggers illegal mem address in invokeQKVPreprocessing. - // Hopper XQA is fine because it does not use invokeQKVPreprocessing. - if (xqaParams.max_past_kv_length + 1 > xqaParams.cyclic_attention_window_size) - { - return false; - } } if (xqaParams.head_size % 16 != 0 || xqaParams.head_size < 16 || xqaParams.head_size > 256) { diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/include/nvrtcWrapper.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/include/nvrtcWrapper.h index dc04deb683..ab9e93f0d4 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/include/nvrtcWrapper.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/include/nvrtcWrapper.h @@ -63,6 +63,9 @@ extern "C" bool fp8_output; bool use_input_kv; tllmXqaJitRopeStyle rope_style; // useful only when use_input_kv is true. + + bool is_spec_dec_tree + = true; // useful only when multi_query_tokens, should be true unless using linear tree in spec-dec. } tllmXqaJitContext; // tllmXqaJitProgram is an opaque handle for a program. 
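
The `is_spec_dec_tree` flag added above to `tllmXqaJitContext` (and plumbed through `XQAParams` and the JIT compile path) gates sliding-window support for speculative decoding: the runner keeps the sliding-window parameter only when the draft tokens form a linear chain, while tree-shaped drafts fall back to the non-sliding-window path. A minimal standalone sketch of that gating condition follows; the struct and helper names are illustrative only, and just the boolean expression mirrors the `allowSlidingWindow` logic added in `decoderXQAImplJIT.cpp`.

```cpp
// Hedged sketch of the intended gating, not the actual runner code.
#include <iostream>

struct SpecDecConfig
{
    bool multiQueryTokens; // spec-dec mode with multiple query tokens per step
    bool isSpecDecTree;    // true for tree-shaped drafts, false for a linear chain
};

bool slidingWindowAllowed(SpecDecConfig const& cfg)
{
    // Mirrors: allowSlidingWindow = !(isSpecDec && is_spec_dec_tree).
    return !(cfg.multiQueryTokens && cfg.isSpecDecTree);
}

int main()
{
    // Chained drafts keep sliding-window attention enabled (prints 1),
    // tree drafts disable it (prints 0).
    std::cout << slidingWindowAllowed({/*multiQueryTokens=*/true, /*isSpecDecTree=*/false})
              << slidingWindowAllowed({/*multiQueryTokens=*/true, /*isSpecDecTree=*/true})
              << std::endl;
    return 0;
}
```
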
diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/src/nvrtcWrapper.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/src/nvrtcWrapper.cpp index 55312c31b1..c37d47295f 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/src/nvrtcWrapper.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/src/nvrtcWrapper.cpp @@ -205,10 +205,11 @@ tllmXqaJitStatus getMacroFlags(tllmXqaJitContext const* context, std::vectormulti_query_tokens ? "0" : "1"; + macros["SLIDING_WINDOW"] = context->multi_query_tokens && context->is_spec_dec_tree ? "0" : "1"; macros["LOW_PREC_OUTPUT"] = context->fp8_output ? "1" : "0"; macros["USE_INPUT_KV"] = context->use_input_kv ? "1" : "0"; macros["ROPE_STYLE"] = std::to_string(int(context->rope_style)); + macros["IS_SPEC_DEC_TREE"] = context->is_spec_dec_tree ? "1" : "0"; // Without these macros, NVRTC uses precompiled headers for cuda_fp16.h etc. // Linking might fail due to ABI incompatibility. diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.cpp index fda797800e..94800fcfe1 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.cpp @@ -82,7 +82,7 @@ DecoderXQAImpl* DecoderXQARunner::getImplFromXQAParams(XQAParams const& xqaParam // Hopper XQA supports spec dec with JIT, but only for E4M3 kv cache data type. Only allow 64%grpSize==0 for // now. bool const supportedByHopperXqa - = (smVersion == 90 && xqaParams.kv_cache_data_type == XQADataType::DATA_TYPE_E4M3 && 64 % grpSize == 0); + = (smVersion == 90 && xqaParams.kv_cache_data_type == XQADataType::DATA_TYPE_E4M3 && grpSize <= 64); bool const supportedBySm120Mla = (smVersion == 120 && xqaParams.isMLA() && xqaParams.kv_cache_data_type == XQADataType::DATA_TYPE_E4M3); return (supportedByHopperXqa || supportedBySm120Mla) ? 
mJITImpl.get() : mPrecompiledImpl.get(); diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h index ee82da0c88..4c1ab13f05 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h @@ -91,6 +91,8 @@ struct XQAParams int max_distance = 0; bool multi_block_mode; bool multi_query_tokens = false; + bool is_spec_dec_tree + = true; // by default, XQA spec-dec expect tree-based draft token, only affective when multi_query_tokens = true float const* logn_scaling_ptr = nullptr; // for logn scaling in XQA diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h index b4951e8c23..65e55c65fb 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h @@ -530,32 +530,10 @@ __global__ void applyBiasRopeUpdateKVCache(QKVPreprocessingParams(logn_scale, q); } auto const channelIdx{tidx}; - auto const tokenIdxLowerBound - = max(cache_seq_len - params.cyclic_kv_cache_len + params.sink_token_len, params.sink_token_len); - bool const useKVCache = params.kv_cache_buffer.data != nullptr; - bool valid_kv_cache_pos = useKVCache // In KV-cache-less mode. No need to store KV values - && (token_idx_in_seq >= tokenIdxLowerBound || token_idx_in_seq < params.sink_token_len); - auto token_idx_in_kv_cache = token_idx_in_seq; - // Additional kv cache blocks will be allocated if sliding window attention and paged kv context fmha - // (!STORE_QKV) are used together as the original kv cache cannot be overwritten. In this case, new tokens' - // kv will just be appended to the kv cache instead of overwriting it in a circular way. And the kv cache - // will be overwritten after FMHA kernels. - if constexpr (STORE_QKV || GEN_PHASE) - { - // Write the new tokens' kv to the cyclic kv cache. - token_idx_in_kv_cache = params.kv_cache_buffer.getKVTokenIdx(token_idx_in_seq); - } - else - { - // Write the new tokens' kv to the temporary kv cache (write linearly to the cyclic kv cache first, then - // the temporary kv cache). - valid_kv_cache_pos = useKVCache; - if (past_seq_len >= params.cyclic_kv_cache_len) - { - token_idx_in_kv_cache = params.cyclic_kv_cache_len + local_token_idx; - } - } + bool const useKVCache = params.kv_cache_buffer.data != nullptr; + auto token_idx_in_kv_cache = token_idx_in_seq; + bool valid_kv_cache_pos = useKVCache; // Make sure pairs of q or v vecs have been read before write. // One block will handle single head. @@ -945,32 +923,8 @@ __global__ void applyBiasRopeUpdateKVCacheV2(QKVPreprocessingParams= tokenIdxLowerBound); - // Additional kv cache blocks will be allocated if sliding window attention and paged kv context fmha - // (!STORE_QKV) are used together as the original kv cache cannot be overwritten. In this case, new tokens' kv - // will just be appended to the kv cache instead of overwriting it in a circular way. And the kv cache will be - // overwritten after FMHA kernels. - if constexpr (STORE_QKV || GEN_PHASE) - { - bool const cyclic_kv_cache = cache_seq_len > params.cyclic_kv_cache_len; - - // Write the new tokens' kv to the cyclic kv cache. - token_idx_in_kv_cache - = cyclic_kv_cache ? 
(token_idx_in_kv_cache % params.cyclic_kv_cache_len) : token_idx_in_kv_cache; - } - else - { - // Write the new tokens' kv to the temporary kv cache (write linearly to the cyclic kv cache first, then the - // temporary kv cache). - valid_kv_cache_pos = useKVCache; - if (past_seq_len >= params.cyclic_kv_cache_len) - { - token_idx_in_kv_cache = params.cyclic_kv_cache_len + token_idx_in_seq; - } - } + bool valid_kv_cache_pos = useKVCache; auto kDst = useKVCache ? reinterpret_cast(params.kv_cache_buffer.getKBlockPtr(batch_idx, token_idx_in_kv_cache)) @@ -1111,7 +1065,8 @@ __global__ void applyBiasRopeUpdateKVCacheV2(QKVPreprocessingParams \ @@ -1275,7 +1230,8 @@ void kernelV1Dispatch(QKVPreprocessingParams params, cudaStrea config.attrs = attrs; \ if (params.position_embedding_type == PositionEmbeddingType::kROPE_GPT_NEOX \ || params.position_embedding_type == PositionEmbeddingType::kLONG_ROPE \ - || params.position_embedding_type == PositionEmbeddingType::kROPE_M) \ + || params.position_embedding_type == PositionEmbeddingType::kROPE_M \ + || params.position_embedding_type == PositionEmbeddingType::kYARN) \ { \ cudaLaunchKernelEx(&config, \ applyBiasRopeUpdateKVCacheV2 k, torch: op->mAttentionChunkSize = attention_chunk_size; - TORCH_CHECK(spec_decoding_bool_params.size() == 2, - "Expecting 2 bools for spec-dec mode, is_spec_decoding_enabled and use_spec_decoding."); + TORCH_CHECK(spec_decoding_bool_params.size() == 3, + "Expecting 3 bools for spec-dec mode, is_spec_decoding_enabled, use_spec_decoding, and is_spec_dec_tree."); op->mIsSpecDecodingEnabled = spec_decoding_bool_params[0]; // is_spec_decoding_enabled op->mUseSpecDecoding = spec_decoding_bool_params[1]; // use_spec_decoding + op->mIsSpecDecTree = spec_decoding_bool_params[2]; // is_spec_dec_tree if (is_mla_enable) { diff --git a/docker/Makefile b/docker/Makefile index 8382d96088..8432710af4 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -130,6 +130,7 @@ else endif SOURCE_DIR ?= $(shell readlink -f ..) 
CODE_DIR ?= /code/tensorrt_llm +EXTRA_VOLUMES ?= CCACHE_DIR ?= $(CODE_DIR)/cpp/.ccache CONAN_DIR ?= $(CODE_DIR)/cpp/.conan USER_CACHE_DIR ?= $(HOME_DIR)/.cache @@ -151,6 +152,7 @@ endif docker run $(DOCKER_RUN_OPTS) $(DOCKER_RUN_ARGS) \ $(GPU_OPTS) \ --volume $(SOURCE_DIR):$(CODE_DIR) \ + $(EXTRA_VOLUMES) \ $(if $(and $(filter 1,$(LOCAL_USER)),$(shell [ -w "$(USER_CACHE_DIR)" ] && echo 1)),--volume $(USER_CACHE_DIR):/home/$(USER_NAME)/.cache:rw) \ --env "CCACHE_DIR=$(CCACHE_DIR)" \ --env "CCACHE_BASEDIR=$(CODE_DIR)" \ diff --git a/docker/README.md b/docker/README.md index fa1b80a9fd..275de142a3 100644 --- a/docker/README.md +++ b/docker/README.md @@ -44,6 +44,11 @@ Containers can be started with the local user instead of `root` by appending `LO make -C docker devel_run LOCAL_USER=1 ``` +Extra docker volumes can be mounted in addition to the code repository by appending `EXTRA_VOLUMES=` to the run target: +```bash +make -C docker devel_run LOCAL_USER=1 EXTRA_VOLUMES="-v /pathA:/pathA -v /pathB:/pathB" +``` + Specific CUDA architectures supported by the `wheel` can be specified with `CUDA_ARCHS`: ```bash diff --git a/tensorrt_llm/_torch/attention_backend/flashinfer.py b/tensorrt_llm/_torch/attention_backend/flashinfer.py index 483dff5c98..b8bf330488 100644 --- a/tensorrt_llm/_torch/attention_backend/flashinfer.py +++ b/tensorrt_llm/_torch/attention_backend/flashinfer.py @@ -582,6 +582,11 @@ class FlashInferAttention(AttentionBackend[FlashInferAttentionMetadata]): if output is None: output = torch.empty_like(q) + # FlashInfer's sliding window attention is inclusive, while the attention window size defined in TRTLLM is exclusive. + # So we need to subtract 1 from the attention window size for a consistent behavior. + if attention_window_size is not None: + attention_window_size = attention_window_size - 1 + self.forward_impl(q=q, k=k, v=v, diff --git a/tensorrt_llm/_torch/attention_backend/trtllm.py b/tensorrt_llm/_torch/attention_backend/trtllm.py index d39c3e259a..a833515020 100644 --- a/tensorrt_llm/_torch/attention_backend/trtllm.py +++ b/tensorrt_llm/_torch/attention_backend/trtllm.py @@ -67,6 +67,7 @@ class TrtllmAttentionWrapper: v_head_dim: Optional[int] attention_chunk_size: Optional[int] use_spec_decoding: bool + is_spec_dec_tree: bool spec_decoding_position_offsets: Optional[torch.Tensor] spec_decoding_packed_mask: Optional[torch.Tensor] spec_decoding_generation_lengths: Optional[torch.Tensor] @@ -177,6 +178,7 @@ class TrtllmAttentionWrapper: softmax_stats_tensor: Optional[torch.Tensor] = None, is_spec_decoding_enabled: bool = False, use_spec_decoding: bool = False, + is_spec_dec_tree: bool = False, spec_decoding_position_offsets: Optional[torch.Tensor] = None, spec_decoding_packed_mask: Optional[torch.Tensor] = None, spec_decoding_generation_lengths: Optional[torch.Tensor] = None, @@ -258,6 +260,7 @@ class TrtllmAttentionWrapper: ) self.is_spec_decoding_enabled = is_spec_decoding_enabled self.use_spec_decoding = use_spec_decoding + self.is_spec_dec_tree = is_spec_dec_tree self.spec_decoding_position_offsets = spec_decoding_position_offsets self.spec_decoding_packed_mask = spec_decoding_packed_mask self.spec_decoding_generation_lengths = spec_decoding_generation_lengths @@ -405,7 +408,8 @@ class TrtllmAttentionWrapper: self.rotary_embedding_original_max_positions ] spec_decoding_bool_params = [ - self.is_spec_decoding_enabled, self.use_spec_decoding + self.is_spec_decoding_enabled, self.use_spec_decoding, + self.is_spec_dec_tree ] spec_decoding_tensor_params = [ 
self.spec_decoding_generation_lengths, @@ -1165,6 +1169,7 @@ class TrtllmAttention(AttentionBackend[TrtllmAttentionMetadata]): softmax_stats_tensor=softmax_stats_tensor, is_spec_decoding_enabled=metadata.is_spec_decoding_enabled, use_spec_decoding=metadata.use_spec_decoding, + is_spec_dec_tree=metadata.is_spec_dec_tree, spec_decoding_position_offsets=metadata. spec_decoding_position_offsets, spec_decoding_packed_mask=metadata.spec_decoding_packed_mask, diff --git a/tensorrt_llm/_torch/attention_backend/vanilla.py b/tensorrt_llm/_torch/attention_backend/vanilla.py index 3397ded646..125527455a 100644 --- a/tensorrt_llm/_torch/attention_backend/vanilla.py +++ b/tensorrt_llm/_torch/attention_backend/vanilla.py @@ -44,13 +44,11 @@ def generate_sliding_window_mask(batch_size: int, target_length: int, cache_position: torch.Tensor, device: torch.device, attention_window_size: int): - # TRTLLM's sliding window attention is inclusive. - effective_window_size = attention_window_size + 1 attention_mask_1 = torch.arange( target_length, device=device).unsqueeze(0) <= cache_position.unsqueeze(-1) attention_mask_2 = torch.arange(target_length, device=device).unsqueeze( - 0) > cache_position.unsqueeze(-1) - effective_window_size + 0) > cache_position.unsqueeze(-1) - attention_window_size attention_mask = attention_mask_1 & attention_mask_2 attention_mask = attention_mask[None, None, :, :].expand(batch_size, 1, -1, -1) diff --git a/tensorrt_llm/_torch/models/modeling_gemma3.py b/tensorrt_llm/_torch/models/modeling_gemma3.py index 9ed0a71da1..ccbe0165ca 100644 --- a/tensorrt_llm/_torch/models/modeling_gemma3.py +++ b/tensorrt_llm/_torch/models/modeling_gemma3.py @@ -68,7 +68,7 @@ class Gemma3Attention(Attention): rope_params.theta = config.rope_local_base_freq rope_params.scale_type = RotaryScalingType.none rope_params.scale = 1.0 - self.attention_window_size = config.sliding_window - 1 # Gemma3 sliding window isn't inclusive. + self.attention_window_size = config.sliding_window pos_embd_params = PositionalEmbeddingParams( type=PositionEmbeddingType.rope_gpt_neox, rope=rope_params, diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py index 55bcac82bd..6ec6557961 100644 --- a/tensorrt_llm/_torch/models/modeling_llama.py +++ b/tensorrt_llm/_torch/models/modeling_llama.py @@ -14,6 +14,7 @@ from tensorrt_llm._torch.distributed import (AllReduce, AllReduceFusionOp, AllReduceParams, MoEAllReduce) from tensorrt_llm._torch.models.checkpoints.base_weight_mapper import \ BaseWeightMapper +from tensorrt_llm._utils import get_sm_version from tensorrt_llm.functional import PositionEmbeddingType from tensorrt_llm.logger import logger from tensorrt_llm.lora_manager import HfLoraLoader @@ -70,6 +71,9 @@ class Llama4Attention(Attention): # This is safe to do because we limit seqlen to 8k for # non TRTLLM backends. 
attention_chunk_size = None + elif get_sm_version() <= 90 and model_config.spec_config is not None: + # pre-Blackwell spec-dec kernel does not support + attention_chunk_size = None super().__init__( hidden_size=config.hidden_size, diff --git a/tensorrt_llm/commands/build.py b/tensorrt_llm/commands/build.py index e6b55f6e04..9374883a9c 100644 --- a/tensorrt_llm/commands/build.py +++ b/tensorrt_llm/commands/build.py @@ -349,8 +349,8 @@ def build_model( model_config.logits_dtype = logits_dtype architecture = model_config.architecture - assert not build_config.plugin_config.streamingllm or architecture == "LlamaForCausalLM", \ - "StreamingLLM is only supported in the llama model." + assert not build_config.plugin_config.streamingllm, \ + "StreamingLLM is no longer supported because attention sink cannot work with the non-cyclic kv cache kernel & runtime changes." assert not build_config.plugin_config.pp_reduce_scatter or architecture == "MixtralForCausalLM", \ "PP reduce scatter is only supported in the mixtral model." diff --git a/tests/integration/defs/accuracy/test_cli_flow.py b/tests/integration/defs/accuracy/test_cli_flow.py index b30ec9c912..d41e3a246d 100644 --- a/tests/integration/defs/accuracy/test_cli_flow.py +++ b/tests/integration/defs/accuracy/test_cli_flow.py @@ -529,6 +529,10 @@ class TestLlama7B(CliFlowAccuracyTestHarness): f"--quant_ckpt_path={llm_models_root()}/int4-quantized-gptq-awq/llama-7b-4bit-gs128.safetensors" ]) + @pytest.mark.skip( + reason= + "Waived for now because attention sink cannot work with the non-cyclic kv cache kernel & runtime changes." + ) def test_streamingllm(self): self.run(extra_acc_spec="streamingllm", extra_build_args=["--streamingllm=enable"], diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index da71961389..ab3ffb50f8 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -516,12 +516,12 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness): } } ctx_server_config["kv_cache_config"] = { - "max_attention_window": [512, 512, 512, 512, 512, 32768], - "enable_block_reuse": False + # "max_attention_window": [512, 512, 512, 512, 512, 32768], + "enable_block_reuse": True } gen_server_config["kv_cache_config"] = { - "max_attention_window": [512, 512, 512, 512, 512, 32768], - "enable_block_reuse": False + # "max_attention_window": [512, 512, 512, 512, 512, 32768], + "enable_block_reuse": True } disaggregated_server_config = { "hostname": "localhost", @@ -541,6 +541,8 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness): self.MODEL_PATH) as llm: task = GSM8K(self.MODEL_NAME) task.evaluate(llm) + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) @pytest.mark.timeout(3600) diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 3a9d53ce6e..a7a3ecd57e 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -757,7 +757,7 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness): MODEL_PATH = f"{llm_models_root()}/gemma/gemma-3-1b-it/" # NOTE: Disable block reuse for SWA window model. 
- kv_cache_config = KvCacheConfig(enable_block_reuse=False) + kv_cache_config = KvCacheConfig(enable_block_reuse=True) def test_auto_dtype(self): # Disabling kv cache reuse as a WAR to deal with gaps in kernel support for Gemma3's non-inclusive sliding window size. @@ -788,24 +788,25 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness): task.evaluate(llm) def test_auto_dtype_vswa(self): - # NOTE: Test with VSWA kv cache config. - self.kv_cache_config.max_attention_window = [ - 512, 512, 512, 512, 512, 32768 - ] # Gemma3 1B attention window size pattern + # # NOTE: Test with VSWA kv cache config. + # self.kv_cache_config.max_attention_window = [ + # 512, 512, 512, 512, 512, 32768 + # ] # Gemma3 1B attention window size pattern + # # TODO: uncomment to use the real window pattern when optimal KV cache allocation is supported with LLM(self.MODEL_PATH, kv_cache_config=self.kv_cache_config) as llm: task = GSM8K(self.MODEL_NAME) task.evaluate(llm) + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) - @pytest.mark.skip( - reason= - "remove this skip after the kernel support mentioned in this nvbug is fixed: https://nvbugspro.nvidia.com/bug/5338620" - ) def test_auto_dtype_chunked_prefill(self): - # NOTE: Test with VSWA kv cache config. - self.kv_cache_config.max_attention_window = [ - 512, 512, 512, 512, 512, 32768 - ] # Gemma3 1B attention window size pattern + # # NOTE: Test with VSWA kv cache config. + # self.kv_cache_config.max_attention_window = [ + # 512, 512, 512, 512, 512, 32768 + # ] # Gemma3 1B attention window size pattern + # # TODO: uncomment to use the real window pattern when optimal KV cache allocation is supported + # chunked prefill case or more features extra_llm_config = dict( enable_chunked_prefill=True, @@ -816,6 +817,8 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness): **extra_llm_config) as llm: task = GSM8K(self.MODEL_NAME) task.evaluate(llm) + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) class TestMixtral8x7B(LlmapiAccuracyTestHarness): diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 13175b3050..7b04a7c6d5 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -227,6 +227,7 @@ examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-floa examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5354936) examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-8b-disable_fp8-tp16pp1-build] SKIP (https://nvbugs/5247243) examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-8b-disable_fp8-tp16pp1-infer] SKIP (https://nvbugs/5247243) +examples/test_llama.py::test_llm_llama_1gpu_streaming_llm[ailab-deepseek-coder-6.7b-instruct] SKIP (https://nvbugs/5435714) test_e2e.py::test_openai_multinodes_chat_tp16pp1 SKIP (https://nvbugs/5112075) examples/test_qwen.py::test_llm_hf_qwen_quantization_1gpu[qwen2_vl_7b_instruct-fp8-bfloat16] SKIP (https://nvbugs/5322488) accuracy/test_cli_flow.py::TestSantacoder::test_auto_dtype SKIP (https://nvbugs/5234043) diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index 6b612b7db7..2b7c606bf4 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -708,6 +708,7 @@ def test_generate_with_beam_search(llm_for_sampling_params: LLM): check_output(outputs, references) +@pytest.mark.skip(reason="https://nvbugs/5435714") @force_ampere @pytest.mark.part0 def 
test_generate_with_streaming_llm(): diff --git a/tests/unittest/trt/attention/test_gpt_attention.py b/tests/unittest/trt/attention/test_gpt_attention.py index afc592cb7e..cbe5c1309e 100644 --- a/tests/unittest/trt/attention/test_gpt_attention.py +++ b/tests/unittest/trt/attention/test_gpt_attention.py @@ -453,6 +453,11 @@ class TestFunctional(unittest.TestCase): tokens_per_block = 128 if paged_kv_cache else -1 streamingllm = sink_token_len > 0 + if streamingllm: + pytest.skip( + "Waived for now because attention sink cannot work with the non-cyclic kv cache kernel & runtime changes." + ) + def _construct_execution( session, input_tensor, weight, bias, past_key_value, host_kv_cache_block_offsets, host_kv_cache_pool_pointers, diff --git a/tests/unittest/trt/attention/test_gpt_attention_IFB.py b/tests/unittest/trt/attention/test_gpt_attention_IFB.py index 41e0e015de..68c45583ab 100644 --- a/tests/unittest/trt/attention/test_gpt_attention_IFB.py +++ b/tests/unittest/trt/attention/test_gpt_attention_IFB.py @@ -206,6 +206,12 @@ class TestFunctional(unittest.TestCase): pytest.skip("Beam search is not supported in this test yet") tokens_per_block = 128 + streamingllm = sink_token_len > 0 + + if streamingllm: + pytest.skip( + "Waived for now because attention sink cannot work with the non-cyclic kv cache kernel & runtime changes." + ) remove_input_padding = True
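
Taken together with the backend changes above (the `- 1` adjustment applied before calling FlashInfer, the reworked mask in `generate_sliding_window_mask`, and Gemma3 passing `config.sliding_window` unmodified), the convention after this patch appears to be that `attention_window_size` counts the attended tokens including the current one, i.e. it is exclusive of any extra token. A minimal visibility-rule sketch under that assumption follows; the function name and `main` are illustrative and not part of the patch.

```cpp
// Hedged sketch: sliding-window visibility check assumed by the updated
// TRT-LLM convention (window = number of attended tokens, current included).
#include <cassert>

bool isVisibleInWindow(int qPos, int kvPos, int window)
{
    // Causal constraint: no attention to future tokens.
    if (kvPos > qPos)
    {
        return false;
    }
    // The oldest visible position is qPos - window + 1, so exactly `window`
    // tokens (including the current one) can be attended.
    return kvPos >= qPos - window + 1;
}

int main()
{
    // With window = 4 and qPos = 10, positions 7..10 are visible (4 tokens).
    assert(isVisibleInWindow(10, 7, 4));
    assert(!isVisibleInWindow(10, 6, 4));
    return 0;
}
```
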