Merge branch 'main' into fix_spec_gate

Signed-off-by: Zheyu Fu <zheyuf@nvidia.com>
Zheyu Fu 2025-12-21 19:38:26 -08:00 committed by GitHub
commit b51ee2bb0d
3012 changed files with 10972 additions and 5610 deletions

View File

@ -68,6 +68,7 @@ option(USING_OSS_CUTLASS_MOE_GEMM "Using open sourced Cutlass moe gemm kernel"
ON)
option(USING_OSS_CUTLASS_ALLREDUCE_GEMM
"Using open sourced Cutlass AR gemm kernel" ON)
option(SKIP_SOFTMAX_STAT "Enable Statistics of Skip-Softmax" OFF)
message(STATUS "ENABLE_NVSHMEM is ${ENABLE_NVSHMEM}")
@ -360,6 +361,11 @@ else()
$<$<COMPILE_LANGUAGE:CUDA>:ENABLE_NVSHMEM=0>)
endif()
if(SKIP_SOFTMAX_STAT)
add_compile_definitions("SKIP_SOFTMAX_STAT")
message(STATUS "SKIP_SOFTMAX_STAT is enabled")
endif()
# Fix linking issue with TRT 10, the detailed description about `--mcmodel` can
# be found in
# https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html#index-mcmodel_003dmedium-1

View File

@ -69,6 +69,11 @@ PREPROCESSOR_FLAGS += -DUSE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE
# Do we want to use half accumulation for flash attention
PREPROCESSOR_FLAGS += -DHALF_ACCUMULATION_FOR_FLASH_ATTENTION
# Print the resulting sparsity for a given threshold in Skip-Softmax attention
# Note: inside TRT-LLM you only need to run "python scripts/build_wheel.py -D SKIP_SOFTMAX_STAT=ON ..." to use it.
# Turn this on manually only if you want to build and run the unittest (bin/fmha.exe) with SKIP_SOFTMAX_STAT.
# PREPROCESSOR_FLAGS += -DSKIP_SOFTMAX_STAT
# Add FLAGS when generating cubins.
ifdef GENERATE_CUBIN
PREPROCESSOR_FLAGS += -DGENERATE_CUBIN

View File

@ -154,7 +154,9 @@ spec_fields = (
'head_size_v',
'sage_block_sizes',
'output_dtype',
'is_mtp')
'is_mtp',
'enable_skip_softmax',
)
kernel_spec = namedtuple('kernel_spec', spec_fields)
kernel_spec.__new__.__defaults__ = (
1, # ctas_per_head
@ -179,7 +181,9 @@ kernel_spec.__new__.__defaults__ = (
0, # head size of V
None, # sage_block_sizes
None, # output_dtype, same as dtype by default.
False) # use MTP or not
False, # use MTP or not
False, # enable skip softmax
)
generate_cu_trtllm = os.environ.get('GENERATE_CU_TRTLLM',
'False').lower() == 'true'
@ -1435,6 +1439,7 @@ using Ktraits = {kernel_traits_header}
USE_TMA_STORE,
{enable_attn_logit_softcapping_flag},
{return_softmax_stats_flag},
{enable_skip_softmax_flag},
{output_dtype_},
{sage_block_size_q},
{sage_block_size_k},
@ -1458,6 +1463,7 @@ using Ktraits_causal = {kernel_traits_header}
USE_TMA_STORE,
{enable_attn_logit_softcapping_flag},
{return_softmax_stats_flag},
{enable_skip_softmax_flag},
{output_dtype_}>;
using Ktraits_sliding_or_chunked_causal = {kernel_traits_header}
@ -1478,6 +1484,7 @@ using Ktraits_sliding_or_chunked_causal = {kernel_traits_header}
USE_TMA_STORE && false,
{enable_attn_logit_softcapping_flag},
{return_softmax_stats_flag},
{enable_skip_softmax_flag},
{output_dtype_}>;
using Ktraits_custom_mask = {kernel_traits_header}
@ -1498,6 +1505,7 @@ using Ktraits_custom_mask = {kernel_traits_header}
USE_TMA_STORE && false,
{enable_attn_logit_softcapping_flag},
{return_softmax_stats_flag},
{enable_skip_softmax_flag},
{output_dtype_}>;
////////////////////////////////////////////////////////////////////////////////////////////////////
@ -1835,6 +1843,8 @@ def encode_name(kernel_spec):
if kernel_spec.enable_attn_logit_softcapping:
feature_tags += '_softcapping'
if kernel_spec.enable_skip_softmax:
feature_tags += '_skipSoftmax'
if kernel_spec.sage_block_sizes:
feature_tags += f"_sage_{'_'.join(map(str, kernel_spec.sage_block_sizes))}"
if kernel_spec.output_dtype:
@ -2131,6 +2141,8 @@ def get_kernel_code(kspec, kname, lname):
return_softmax_stats_flag = pythonBoolean2cpp[kspec.return_softmax_stats]
enable_skip_softmax_flag = pythonBoolean2cpp[kspec.enable_skip_softmax]
# needed by warpspec kernels.
fp8_kernel = kspec.dtype in ["e4m3", "e4m3_fp32"]
kernel_traits_header = "fmha::ws::Kernel_traits_Hopper_qgmma_e4m3_fp32<" if fp8_kernel \
@ -2331,6 +2343,8 @@ def get_api_code(specs_names):
f'&& sage_block_size_k == {sage_block_size_k} ' \
f'&& sage_block_size_v == {sage_block_size_v} '
il_check += '&& enable_skip_softmax ' if kspec.enable_skip_softmax else '&& !enable_skip_softmax '
il_check += '&& params.use_int8_scale_max ' if kspec.has_scale_max else '&& !params.use_int8_scale_max '
slen = kspec.seq_len * kspec.ctas_per_head if not kspec.flash_attention else 0
@ -2607,6 +2621,7 @@ const bool warp_specialization = launch_params.warp_specialization
const bool use_tma = launch_params.use_tma;
const bool use_flash_attention = launch_params.flash_attention;
const bool enable_attn_logit_softcapping = launch_params.enable_attn_logit_softcapping;
const bool enable_skip_softmax = launch_params.enable_skip_softmax;
const int attention_input_layout = static_cast<int>(launch_params.attention_input_layout);
// tiled variant uses ldgsts
const bool use_tiled = launch_params.use_granular_tiling;
@ -2785,6 +2800,8 @@ def get_kernel_traits_code(specs_names):
enable_attn_logit_softcapping_flag = pythonBoolean2cpp[
kspec.enable_attn_logit_softcapping]
enable_skip_softmax_flag = pythonBoolean2cpp[kspec.enable_skip_softmax]
tmp = dict(locals(), **kspec._asdict())
if effective_sm < 90:
@ -2903,7 +2920,8 @@ def get_kernel_traits_code(specs_names):
{input_layout_flag},
__use_tma_store__ /* USE_TMA_STORE */,
{enable_attn_logit_softcapping_flag},
{return_softmax_stats_flag}>;
{return_softmax_stats_flag},
{enable_skip_softmax_flag}>;
printf("%s %d %d %s %d %d\\n",
\"{kname}\",
@ -3062,9 +3080,16 @@ def get_kernel_traits_code(specs_names):
# For now:
# 1. Hopper head_size 128 kernel uses cubins for performance regressions.
# 2. Hopper sm89 with e4m3/e4m3_fp32 dtype uses cubins for accuracy regressions (will be fixed).
# 3. The skip-softmax attention feature never uses cubins.
# You should set the condition `use_cubin_header` to false if you have modified the source codes of those kernels that use cubins.
# This ensures that the kernels will be recompiled using the updated source code rather than relying on precompiled cubins.
def use_cubin_header(sm, head_size, dtype, output_dtype=None):
def use_cubin_header(sm,
head_size,
dtype,
output_dtype=None,
enable_skip_softmax=False):
if enable_skip_softmax:
return False
if 'e4m3' in dtype and output_dtype in ['bf16', 'fp16']:
return False
return (sm == 90 and head_size == 128) or (sm == 89 and 'e4m3' in dtype)
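For illustration, the cubin-selection rule above can be mirrored and exercised standalone; this is a minimal Python sketch that restates the logic in the hunk (the sample inputs below are illustrative, not taken from the generator):

def use_cubin_header(sm, head_size, dtype, output_dtype=None, enable_skip_softmax=False):
    # Skip-softmax kernels are always compiled from source, never loaded from cubins.
    if enable_skip_softmax:
        return False
    # FP8 kernels with fp16/bf16 outputs are also compiled from source.
    if 'e4m3' in dtype and output_dtype in ['bf16', 'fp16']:
        return False
    # Hopper head_size-128 and sm89 e4m3 kernels keep using cubins.
    return (sm == 90 and head_size == 128) or (sm == 89 and 'e4m3' in dtype)

assert use_cubin_header(90, 128, 'bf16')
assert not use_cubin_header(90, 128, 'bf16', enable_skip_softmax=True)
assert not use_cubin_header(89, 64, 'e4m3', output_dtype='bf16')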
@ -3079,7 +3104,8 @@ def get_cubin_header(kernel_traits, specs_names):
launchers_dict = {}
for kspec, fname, lname, kname in specs_names:
if generate_cu_trtllm and not use_cubin_header(
kspec.sm, kspec.head_size, kspec.dtype, kspec.output_dtype):
kspec.sm, kspec.head_size, kspec.dtype, kspec.output_dtype,
kspec.enable_skip_softmax):
continue
name = fname.replace('.', '_')
data = 'extern unsigned char cubin_{name}_cubin[];'.format(name=name)
@ -3111,8 +3137,9 @@ def get_cubin_header(kernel_traits, specs_names):
'q_kv_', '').replace('q_paged_kv_', '').replace(
'q_k_v_', '').replace('ws_', '').replace(
'softcapping_',
'').replace('sage_',
'').replace('output_', ''))
'').replace('sage_', '').replace(
'skipSoftmax_',
'').replace('output_', ''))
flash_attention = 'flash_attention' in kname
warp_specialization = 'tma_ws' in kname
toks = tname.split('_')
@ -3209,6 +3236,8 @@ def get_cubin_header(kernel_traits, specs_names):
return_softmax_stats_flag = pythonBoolean2cpp[sm != '90' or (
sm == '90' and '_softmax' in kname)]
enable_skip_softmax_flag = pythonBoolean2cpp['_skipSoftmax' in kname]
# meta_unroll_step
meta_unroll_step = unroll_step if ('_nl' in kname
or '_ws' in kname) else '0'
@ -3235,7 +3264,8 @@ def get_cubin_header(kernel_traits, specs_names):
def get_lname_from_kname(kname: str) -> str:
if use_cubin_header(int(sm), int(head_size), prec.lower(),
output_prec.lower()):
output_prec.lower(),
enable_skip_softmax_flag):
return 'nullptr'
lname = kname.replace('_kernel', '')
mask_types = [
@ -3253,15 +3283,15 @@ def get_cubin_header(kernel_traits, specs_names):
{sage_block_sizes[0]}, {sage_block_sizes[1]}, {sage_block_sizes[2]}, kSM_{sm}, {cubin_name}, \
{cubin_name}_len, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {attention_mask_type_value}, \
{attention_input_layout_value}, {is_il}, {is_flash_atten}, {is_warp_specialization}, {is_fp32_accu}, \
{is_alibi_supported}, {is_tiled}, {has_softcapping_scale}, {return_softmax_stats_flag}, {lname}}}\
'''.format(**locals()) if use_cubin_header(int(sm),
int(head_size), prec.lower(),
output_prec.lower()) else '''\
{is_alibi_supported}, {is_tiled}, {has_softcapping_scale}, {return_softmax_stats_flag}, {enable_skip_softmax_flag}, {lname}}}\
'''.format(**locals()) if use_cubin_header(int(sm), int(head_size),
prec.lower(), output_prec.lower(),
enable_skip_softmax_flag) else '''\
{{ DATA_TYPE_{prec}, DATA_TYPE_{output_prec}, {seq_len}, {q_step}, {kv_step}, {head_size}, {head_size_v}, \
{sage_block_sizes[0]}, {sage_block_sizes[1]}, {sage_block_sizes[2]}, kSM_{sm}, nullptr, \
0, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {attention_mask_type_value}, \
{attention_input_layout_value}, {is_il}, {is_flash_atten}, {is_warp_specialization}, {is_fp32_accu}, \
{is_alibi_supported}, {is_tiled}, {has_softcapping_scale}, {return_softmax_stats_flag}, {lname}}}\
{is_alibi_supported}, {is_tiled}, {has_softcapping_scale}, {return_softmax_stats_flag}, {enable_skip_softmax_flag}, {lname}}}\
'''.format(**locals())
else:
code = '''\
@ -3269,7 +3299,7 @@ def get_cubin_header(kernel_traits, specs_names):
{sage_block_sizes[0]}, {sage_block_sizes[1]}, {sage_block_sizes[2]}, kSM_{sm}, {cubin_name}, \
{cubin_name}_len, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {attention_mask_type_value}, \
{attention_input_layout_value}, {is_il}, {is_flash_atten}, {is_warp_specialization}, {is_fp32_accu}, \
{is_alibi_supported}, {is_tiled}, {has_softcapping_scale}, {return_softmax_stats_flag}}}\
{is_alibi_supported}, {is_tiled}, {has_softcapping_scale}, {return_softmax_stats_flag}, {enable_skip_softmax_flag}}}\
'''.format(**locals())
if sm in metadata_v2_dict:
metadata_v2_dict[sm].append(code)
@ -3377,7 +3407,8 @@ static const struct FusedMultiHeadAttentionKernelMetaInfoV2
bool mAlibiSupported;
bool mTiled;
bool mEnableAttnLogitSoftcapping;
bool mReturnSoftmaxStats;{launcher_line}
bool mReturnSoftmaxStats;
bool mEnableSkipSoftmax;{launcher_line}
}} sMhaKernelMetaInfosV2[] = {{
{metadata_v2}
}};
@ -3438,6 +3469,7 @@ static const struct TestMetaV2
bool mTiled;
bool mEnableAttnLogitSoftcapping;
bool mReturnSoftmaxStats;
bool mEnableSkipSoftmax;
}} metaV2[] = {{
{metadata_v2}
}};
@ -3484,7 +3516,8 @@ struct FusedMultiHeadAttentionKernelMetaInfoV2
bool mAlibiSupported;
bool mTiled;
bool mEnableAttnLogitSoftcapping;
bool mReturnSoftmaxStats;{launcher_line}
bool mReturnSoftmaxStats;
bool mEnableSkipSoftmax;{launcher_line}
}};
extern const FusedMultiHeadAttentionKernelMetaInfoV2 sMhaKernelMetaInfosV2[];
@ -3580,7 +3613,8 @@ struct FusedMultiHeadAttentionKernelMetaInfoV2
bool mAlibiSupported;
bool mTiled;
bool mEnableAttnLogitSoftcapping;
bool mReturnSoftmaxStats;{launcher_line}
bool mReturnSoftmaxStats;
bool mEnableSkipSoftmax;{launcher_line}
}};
extern const FusedMultiHeadAttentionKernelMetaInfoV2 sMhaKernelMetaInfosV2[] = {{
@ -3637,7 +3671,7 @@ extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_
return '\n'.join(lines)
target = "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_causal_sm80_kernel_nl_tiled"
new_line = '{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_80, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_causal_sm80_kernel_nl_tiled", 81920, 128, 64, 1, 2, false, true, false, false, true, true, false, true, nullptr},'
new_line = '{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_80, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_causal_sm80_kernel_nl_tiled", 81920, 128, 64, 1, 2, false, true, false, false, true, true, false, true, false, nullptr},'
result = modify_kernel_line(result, target, new_line)
# make sure only one empty line at the end
@ -3801,7 +3835,10 @@ def enumerate_hgmma_ldgsts_kernels(specs, sm=90, dtype='fp16'):
# Note this will be used in TRT-LLM.
def enumerate_hgmma_flash_warpspec_kernels(specs, sm=90, dtype='fp16'):
def enumerate_hgmma_flash_warpspec_kernels(specs,
sm=90,
dtype='fp16',
enable_skip_softmax=False):
scheduling_mode = int(os.getenv('SCHEDULING_MODE', '1'))
@ -3851,7 +3888,8 @@ def enumerate_hgmma_flash_warpspec_kernels(specs, sm=90, dtype='fp16'):
enable_attn_logit_softcapping=enable_attn_logit_softcapping,
return_softmax_stats=return_softmax,
scheduling_mode=scheduling_mode,
input_layout=input_layout))
input_layout=input_layout,
enable_skip_softmax=enable_skip_softmax))
specs.append(
kernel_spec(
@ -3883,7 +3921,8 @@ def enumerate_hgmma_flash_warpspec_kernels(specs, sm=90, dtype='fp16'):
enable_attn_logit_softcapping=enable_attn_logit_softcapping,
return_softmax_stats=return_softmax,
scheduling_mode=scheduling_mode,
input_layout=input_layout))
input_layout=input_layout,
enable_skip_softmax=enable_skip_softmax))
specs.append(
kernel_spec(
@ -3915,7 +3954,8 @@ def enumerate_hgmma_flash_warpspec_kernels(specs, sm=90, dtype='fp16'):
enable_attn_logit_softcapping=enable_attn_logit_softcapping,
return_softmax_stats=return_softmax,
scheduling_mode=scheduling_mode,
input_layout=input_layout))
input_layout=input_layout,
enable_skip_softmax=enable_skip_softmax))
'''
smem size = (q_step * d * q_buffers * NUM_COMPUTE_GROUPS
+ (kv_step * d + kv_step * dv) * kv_buffers) * ele_size
@ -3967,7 +4007,8 @@ def enumerate_qgmma_flash_warpspec_kernels(specs,
sm=90,
dtype='e4m3',
sage_block_sizes=None,
output_dtype=None):
output_dtype=None,
enable_skip_softmax=False):
scheduling_mode = int(os.getenv('SCHEDULING_MODE', '1'))
@ -4021,7 +4062,8 @@ def enumerate_qgmma_flash_warpspec_kernels(specs,
scheduling_mode=scheduling_mode,
input_layout=input_layout,
sage_block_sizes=sage_block_sizes,
output_dtype=output_dtype))
output_dtype=output_dtype,
enable_skip_softmax=enable_skip_softmax))
# 64 < D <=128: KV_STEP = 128
specs.append(
@ -4056,7 +4098,8 @@ def enumerate_qgmma_flash_warpspec_kernels(specs,
scheduling_mode=scheduling_mode,
input_layout=input_layout,
sage_block_sizes=sage_block_sizes,
output_dtype=output_dtype))
output_dtype=output_dtype,
enable_skip_softmax=enable_skip_softmax))
# 128 < D <=256: KV_STEP = 128
specs.append(
@ -4092,7 +4135,8 @@ def enumerate_qgmma_flash_warpspec_kernels(specs,
scheduling_mode=scheduling_mode,
input_layout=input_layout,
sage_block_sizes=sage_block_sizes,
output_dtype=output_dtype))
output_dtype=output_dtype,
enable_skip_softmax=enable_skip_softmax))
if not skip_mla_combination:
# context MLA (192x128)
@ -6374,13 +6418,21 @@ def enumerate_kernels():
enumerate_igmma_kernels(specs, sm=90)
enumerate_qgmma_kernels(specs, sm=90)
# need to add bf16 kernels if needed
enumerate_hgmma_flash_warpspec_kernels(specs, sm=90, dtype='fp16')
enumerate_hgmma_flash_warpspec_kernels(specs, sm=90, dtype='bf16')
enumerate_qgmma_flash_warpspec_kernels(specs, sm=90, dtype='e4m3')
enumerate_qgmma_flash_warpspec_kernels(specs,
sm=90,
dtype='e4m3',
output_dtype="bf16")
for enable_skip_softmax in [False, True]:
if enable_skip_softmax and 'DISABLE_SKIP_SOFTMAX' in os.environ:
continue
enumerate_hgmma_flash_warpspec_kernels(
specs, sm=90, dtype='fp16', enable_skip_softmax=enable_skip_softmax)
enumerate_hgmma_flash_warpspec_kernels(
specs, sm=90, dtype='bf16', enable_skip_softmax=enable_skip_softmax)
enumerate_qgmma_flash_warpspec_kernels(
specs, sm=90, dtype='e4m3', enable_skip_softmax=enable_skip_softmax)
enumerate_qgmma_flash_warpspec_kernels(
specs,
sm=90,
dtype='e4m3',
output_dtype="bf16",
enable_skip_softmax=enable_skip_softmax)
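In effect, the loop above generates every warp-specialized Hopper kernel twice, once without and once with skip-softmax, unless the DISABLE_SKIP_SOFTMAX environment variable is set. A hedged, equivalent sketch of that gating (the helper name is illustrative):

import os

# Hypothetical helper mirroring the loop above: which skip-softmax variants to emit.
def skip_softmax_variants():
    if 'DISABLE_SKIP_SOFTMAX' in os.environ:
        return [False]           # baseline kernels only
    return [False, True]         # baseline kernels plus skip-softmax-enabled kernels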
# For now SageAttention only needs BF16
# block_size_q should be divisible by 64

View File

@ -256,7 +256,8 @@ struct Compute
actual_kv_seqlen, alibi_head_scale, \
USE_CUSTOM_MASK ? (head_info.mask_sum_s + q_step_idx * STEP_Q + local_q_tile_offset) \
: (q_step_idx * STEP_Q + head_info.q_tile_offset), \
kv_step_idx * STEP_KV, sage_scale_row, cbr, cbr_v, mutex_accessor, kv_step_idx == kv_idx_end - 1);
kv_step_idx * STEP_KV, sage_scale_row, cbr, cbr_v, mutex_accessor, \
&shared->skip_softmax_votes[kv_step_idx & 1][warpgroup_id], kv_step_idx == kv_idx_end - 1);
////////////////////////////////////////////////////////////////////////////////////////////////
@ -360,6 +361,12 @@ struct Compute
// Contiguous QKV FMHA assumes q, and kv have the same sequence length.
int const actual_kv_seqlen = SEPARATE_Q_KV_BUFFER ? head_info.actual_kv_seqlen : actual_q_seqlen;
// Update threshold of Skip-Softmax
if constexpr (Kernel_traits::ENABLE_SKIP_SOFTMAX)
{
softmax.skip_softmax_threshold = params.skip_softmax_threshold_scale_factor / actual_kv_seqlen;
}
// Calculate the alibi head_scaling_factor.
float alibi_head_scale
= APPLY_ALIBI ? get_alibi_head_scaling_factor<AlibiParams>(head_info.bidh, params.alibi_params) : 0.f;
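The threshold set in the block above is the per-sequence value later compared against in compute_and_update_scale: a KV tile becomes a skip candidate when exp(local_max - global_max) < skip_softmax_threshold_scale_factor / actual_kv_seqlen. A small worked example in Python (the scale factor and sequence length are illustrative):

import math

skip_softmax_threshold_scale_factor = 0.01   # illustrative; a non-positive value disables the feature
actual_kv_seqlen = 4096

skip_softmax_threshold = skip_softmax_threshold_scale_factor / actual_kv_seqlen
print(skip_softmax_threshold)                # ~2.44e-06

# A row whose running max is far below the global max contributes roughly
# exp(local_max - global_max) to the softmax sum; below the threshold it can be skipped.
local_max, global_max = -20.0, -2.0
print(math.exp(local_max - global_max) < skip_softmax_threshold)   # True -> skip candidate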
@ -513,6 +520,13 @@ struct Compute
}
}
}
#ifdef SKIP_SOFTMAX_STAT
if (tidx == 0)
{
atomicAdd(params.skip_softmax_total_blocks, softmax.total_blocks);
atomicAdd(params.skip_softmax_skipped_blocks, softmax.skipped_blocks);
}
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////
@ -522,8 +536,15 @@ struct Compute
Compute_tile_o& ctile_o, float (&p_max)[Mma_tile_p::CORES_M], float (&p_sum)[Mma_tile_p::CORES_M],
int const tidx, int const actual_kv_seqlen, float const alibi_head_scale, int const row_offset,
int const col_offset, int const sage_scale_row, Circular_buffer_q_reader& cbr, Circular_buffer_kv_reader& cbr_v,
OrderedMutexAccessor& mutex, bool complete = false)
OrderedMutexAccessor& mutex, uint32_t* skip_softmax_vote, bool complete = false)
{
// Skip-softmax vote initialization
if (tidx == 0)
{
// Note that we need a named_barrier_wait in compute_single_tile to make sure this initialization happens before the voting.
*skip_softmax_vote = 1;
}
// load the scales of K/V from global memory
#define LOAD_SCALES_KV(dst, which, blocks_per_step, block_size) \
if constexpr (block_size > 0) \
@ -557,6 +578,10 @@ struct Compute
// Ctile_p is only used once by each n step.
ctile_p.clear();
// If skip_softmax is enabled, make sure there is no race between the initialization and the writing of
// skip_softmax_vote.
named_barrier_wait(Kernel_traits::SKIP_SOFTMAX_BARRIER_ID + threadIdx.x / 128, 128);
// BMM1 (Q x K').
warpgroup_arrive();
@ -626,8 +651,22 @@ struct Compute
softmax.apply_alibi_and_mask<APPLY_MASK>(
ctile_p, params.alibi_params, alibi_head_scale, actual_kv_seqlen, row_offset, col_offset);
// Softmax Exp, max/sum, and update scales.
softmax.compute_and_update_scale<IS_FIRST_COL>(p_max, p_sum);
// Softmax Exp, max/sum, and update scales. If it returns false, we skip the rest.
if (!softmax.compute_and_update_scale<IS_FIRST_COL>(p_max, p_sum, skip_softmax_vote))
{
if constexpr (ENABLE_MUTEX && Kernel_traits::ELEMENT_BYTES == 1)
{
// Notify another warpgroup to execute QGMMA.
mutex.named_bar_arrive();
}
// Need to wait for V; otherwise compute-sanitizer synccheck will fail.
int ready2 = cbr_v.peek();
if (!ready2)
{
cbr_v.wait();
}
return;
}
// experiments show that here is the best place to load scales of V
float scales_v[SAGE_BLOCKS_PER_STEP_V];

View File

@ -17,6 +17,8 @@
#pragma once
#include "fmha/hopper/arrive_wait.h"
#include <fmha/softmax.h>
#include <fmha/traits.h>
#include <fmha/utils.h>
@ -104,6 +106,12 @@ struct Softmax_base
CHECK_IF_NEG_INF_EXISTS = SLIDING_OR_CHUNKED_ATTENTION || USE_CUSTOM_MASK
};
// There are 2 warpgroups, so barrier IDs 0x3 and 0x4 are used
enum
{
SKIP_SOFTMAX_BARRIER = Kernel_traits::SKIP_SOFTMAX_BARRIER_ID
};
// Ctor.
template <typename Params>
inline __device__ Softmax_base(Params params, int tidx)
@ -114,6 +122,11 @@ struct Softmax_base
, log2_chunked_attention_size_(params.log2_chunked_attention_size)
, packed_mask_ptr_{reinterpret_cast<uint32_t*>(params.packed_mask_ptr)}
, params_packed_mask_stride_in_bytes_{params.packed_mask_stride_in_bytes}
#ifdef SKIP_SOFTMAX_STAT
, total_blocks(0)
, skipped_blocks(0)
#endif
, skip_softmax_threshold(0)
{
int warp = tidx / 32;
@ -330,24 +343,22 @@ struct Softmax_base
}
// Calculate max/sum, and update flash-attention scales.
// Returns false if skipped due to the skip-softmax attention feature.
template <bool IS_FIRST_COL>
inline __device__ void compute_and_update_scale(
float (&global_max)[Mma_tile_p::CORES_M], float (&global_sum)[Mma_tile_p::CORES_M])
inline __device__ bool compute_and_update_scale(
float (&global_max)[Mma_tile_p::CORES_M], float (&global_sum)[Mma_tile_p::CORES_M], uint32_t* skip_softmax_vote)
{
float const scale = reinterpret_cast<float const&>(scale_bmm1_);
// whether this warpgroup skips the softmax
constexpr bool may_skip = Kernel_traits::ENABLE_SKIP_SOFTMAX && !IS_FIRST_COL;
bool skip = may_skip;
// Row-wise max of current tile.
#pragma unroll
for (int mi = 0; mi < Mma_tile_p::CORES_M; mi++)
{
if (IS_FIRST_COL)
{
local_max_[mi] = elt_[mi][0];
}
else
{
local_max_[mi] = fmaxf(global_max[mi], elt_[mi][0]);
}
local_max_[mi] = elt_[mi][0];
#pragma unroll
for (int ni = 1; ni < Mma_tile_p::CORES_N * 2; ni++)
{
@ -355,6 +366,56 @@ struct Softmax_base
}
local_max_[mi] = fmaxf(__shfl_xor_sync(uint32_t(-1), local_max_[mi], 1), local_max_[mi]);
local_max_[mi] = fmaxf(__shfl_xor_sync(uint32_t(-1), local_max_[mi], 2), local_max_[mi]);
if constexpr (may_skip)
{
// AND(&) the CORES_M results, so that `skip` indicates whether to skip
// all CORES_M(=2) rows
if constexpr (!EXP2F_OPTIMIZATION)
{
skip &= expf(local_max_[mi] - global_max[mi]) < skip_softmax_threshold;
}
else
{
skip &= exp2f((local_max_[mi] - global_max[mi]) * scale) < skip_softmax_threshold;
}
}
if (!IS_FIRST_COL)
{
local_max_[mi] = fmaxf(local_max_[mi], global_max[mi]);
}
}
if constexpr (Kernel_traits::ENABLE_SKIP_SOFTMAX)
{
#ifdef SKIP_SOFTMAX_STAT
total_blocks++;
#endif
if constexpr (may_skip)
{
// AND(&) the results together within a warp, so that `skip` indicates whether to skip
// all 16 rows managed by this warp.
// Each group of 4 threads (e.g. T0~T3) has the same `skip`, so mask 0x11111111 would suffice
// instead of 0xffffffff, but the performance is the same.
skip = __all_sync(0xffffffff, skip);
if (threadIdx.x % 32 == 0)
{
// The leader of each warp votes.
atomicAnd(skip_softmax_vote, uint32_t(skip));
}
// WG0 uses 0x3 barrier, WG1 uses 0x4 barrier
named_barrier_wait(SKIP_SOFTMAX_BARRIER + threadIdx.x / 128, 128);
skip = *((uint32_t volatile*) skip_softmax_vote);
if (skip)
{
#ifdef SKIP_SOFTMAX_STAT
skipped_blocks++;
#endif
return false;
}
}
}
// Softmax Exp.
@ -436,6 +497,7 @@ struct Softmax_base
global_max[mi] = max_new;
}
}
return true;
}
// Update flash attention scales and pack elements for BMM2.
@ -513,6 +575,13 @@ struct Softmax_base
float correction_[Mma_tile_p::CORES_M];
// The packed mask.
uint4 packed_mask_;
// Skip softmax when exp(local_max - global_max) < skip_softmax_threshold.
float skip_softmax_threshold;
#ifdef SKIP_SOFTMAX_STAT
// Statistics of skip-softmax
uint32_t total_blocks;
uint32_t skipped_blocks;
#endif
};
////////////////////////////////////////////////////////////////////////////////////////////////////
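To make the cooperative skip decision above concrete: every thread ANDs the per-row condition over its CORES_M rows, __all_sync then ANDs it across the 32 lanes of a warp, each warp leader atomicAnds the result into the shared skip_softmax_vote, and after the named barrier all 128 threads of the warpgroup read the same verdict. A host-side Python sketch of that reduction (thread counts match the 4-warp warpgroup; the per-thread predicate values are illustrative):

# Simulate the skip vote for one KV step of one 128-thread warpgroup (4 warps x 32 lanes).
import random

WARPS, LANES = 4, 32
random.seed(0)

# Per-thread predicate: "exp(local_max - global_max) < skip_softmax_threshold for all of my rows".
thread_skip = [[random.random() < 0.97 for _ in range(LANES)] for _ in range(WARPS)]

vote = 1                                  # *skip_softmax_vote = 1, written by thread 0 of the warpgroup
for warp in range(WARPS):
    warp_skip = all(thread_skip[warp])    # __all_sync(0xffffffff, skip)
    vote &= int(warp_skip)                # atomicAnd() performed by the warp leader

# After named_barrier_wait, every thread reads the same value: the step is skipped only if all agree.
print('skip this KV step:', bool(vote))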
@ -868,9 +937,10 @@ struct Softmax<Hopper_qgmma_e4m3_fp32_traits, Kernel_traits>
}
// Calculate max/sum, and update flash-attention scales.
// Returns false if skipped due to the skip-softmax attention feature.
template <bool IS_FIRST_COL>
inline __device__ void compute_and_update_scale(
float (&global_max)[Mma_tile_p::CORES_M], float (&global_sum)[Mma_tile_p::CORES_M])
inline __device__ bool compute_and_update_scale(
float (&global_max)[Mma_tile_p::CORES_M], float (&global_sum)[Mma_tile_p::CORES_M], uint32_t* skip_softmax_vote)
{
float const scale = reinterpret_cast<float const&>(this->scale_bmm1_);
float(&local_max_)[Mma_tile_p::CORES_M] = this->local_max_;
@ -878,18 +948,15 @@ struct Softmax<Hopper_qgmma_e4m3_fp32_traits, Kernel_traits>
float(&correction_)[Mma_tile_p::CORES_M] = this->correction_;
float(&elt_)[Mma_tile_p::CORES_M][Mma_tile_p::CORES_N * 2] = this->elt_;
// whether this warpgroup skips the softmax
constexpr bool may_skip = Kernel_traits::ENABLE_SKIP_SOFTMAX && !IS_FIRST_COL;
bool skip = may_skip;
// Row-wise max of current tile.
#pragma unroll
for (int mi = 0; mi < Mma_tile_p::CORES_M; mi++)
{
if (IS_FIRST_COL)
{
local_max_[mi] = elt_[mi][0];
}
else
{
local_max_[mi] = fmaxf(global_max[mi], elt_[mi][0]);
}
local_max_[mi] = elt_[mi][0];
#pragma unroll
for (int ni = 1; ni < Mma_tile_p::CORES_N * 2; ni++)
{
@ -897,6 +964,56 @@ struct Softmax<Hopper_qgmma_e4m3_fp32_traits, Kernel_traits>
}
local_max_[mi] = fmaxf(__shfl_xor_sync(uint32_t(-1), local_max_[mi], 1), local_max_[mi]);
local_max_[mi] = fmaxf(__shfl_xor_sync(uint32_t(-1), local_max_[mi], 2), local_max_[mi]);
if constexpr (may_skip)
{
// AND(&) the CORES_M results, so that `skip` indicates whether to skip
// all CORES_M(=2) rows
if constexpr (!EXP2F_OPTIMIZATION)
{
skip &= expf(local_max_[mi] - global_max[mi]) < this->skip_softmax_threshold;
}
else
{
skip &= exp2f((local_max_[mi] - global_max[mi]) * scale) < this->skip_softmax_threshold;
}
}
if (!IS_FIRST_COL)
{
local_max_[mi] = fmaxf(local_max_[mi], global_max[mi]);
}
}
if constexpr (Kernel_traits::ENABLE_SKIP_SOFTMAX)
{
#ifdef SKIP_SOFTMAX_STAT
this->total_blocks++;
#endif
if constexpr (may_skip)
{
// AND(&) the results together within a warp, so that `skip` indicates whether to skip
// all 16 rows managed by this warp.
// Each group of 4 threads (e.g. T0~T3) has the same `skip`, so mask 0x11111111 would suffice
// instead of 0xffffffff, but the performance is the same.
skip = __all_sync(0xffffffff, skip);
if (threadIdx.x % 32 == 0)
{
// The leader of each warp votes.
atomicAnd(skip_softmax_vote, uint32_t(skip));
}
// WG0 uses 0x3 barrier, WG1 uses 0x4 barrier
named_barrier_wait(Base::SKIP_SOFTMAX_BARRIER + threadIdx.x / 128, 128);
skip = *((uint32_t volatile*) skip_softmax_vote);
if (skip)
{
#ifdef SKIP_SOFTMAX_STAT
this->skipped_blocks++;
#endif
return false;
}
}
}
// Softmax Exp.
@ -987,6 +1104,7 @@ struct Softmax<Hopper_qgmma_e4m3_fp32_traits, Kernel_traits>
global_max[mi] = max_new;
}
}
return true;
}
// Update flash attention scales and pack elements for BMM2.

View File

@ -71,6 +71,8 @@ template <
bool ENABLE_BMM1_SOFTCAPPING_SCALE_ = false,
// Save softmax stats ?
bool RETURN_SOFTMAX_STATS_ = false,
// Enable skip softmax attention feature
bool ENABLE_SKIP_SOFTMAX_ = false,
// The output type (only used by fp8 kernels).
typename OutputType = typename Instruction_traits<STEP_Q_, STEP_KV_, 0, false, false>::A_type,
// The sage attention block size for Q, K and V
@ -290,6 +292,12 @@ struct Kernel_traits
USE_CUSTOM_MASK = ATTENTION_MASK_TYPE_ == 3
};
// Are we enabling skip softmax attention feature?
enum
{
ENABLE_SKIP_SOFTMAX = ENABLE_SKIP_SOFTMAX_
};
static_assert(!USE_CUSTOM_MASK || STEP_KV == 64 || STEP_KV == 128 || STEP_KV == 256, "Not implemented!");
// Apply the exp2f optimization (fuse bmm1_scale and -max into FMAs).
@ -384,6 +392,8 @@ struct Kernel_traits
// Named barrier ids
static constexpr int DMA_SYNC_BARRIER_ID = 0x1;
static constexpr int MMA_SYNC_BARRIER_ID = 0x2;
// There are 2 warpgroups, so barrier IDs 0x3 and 0x4 are used for skip-softmax
static constexpr int SKIP_SOFTMAX_BARRIER_ID = 0x3;
// How many threads get involved in the dma group.
enum
@ -518,6 +528,10 @@ struct Kernel_traits
// Mutex
OrderedMutex compute_mutex;
// The 4 warps of a warpgroup vote into an atomic variable in shared memory
// to decide whether to skip this STEP_KV. Double-buffered to avoid races between consecutive STEP_KVs.
uint32_t skip_softmax_votes[2][NUM_COMPUTE_GROUPS];
inline __device__ void init(int tid0)
{
@ -580,6 +594,8 @@ template < // The step size in query sequence dimension (M of BMM1 and BMM2).
bool ENABLE_BMM1_SOFTCAPPING_SCALE_ = false,
// Save softmax stats ?
bool RETURN_SOFTMAX_STATS_ = false,
// Enable skip softmax attention feature
bool ENABLE_SKIP_SOFTMAX_ = false,
// The output type (only used by fp8 kernels).
typename OutputType = e4m3_t,
// The sage attention block size for Q, K and V
@ -588,14 +604,15 @@ struct Kernel_traits_Hopper_qgmma_e4m3_fp32
: public Kernel_traits<Hopper_qgmma_e4m3_fp32_traits, STEP_Q_, STEP_KV_, D_, DV_, Q_BUFFERS_, KV_BUFFERS_,
NUM_COMPUTE_GROUPS_, DMA2COMPUTE_DEPTH_, ATTENTION_MASK_TYPE_, HEADS_INTERLEAVED_, APPLY_ALIBI_,
ENABLE_MUTEX_, SCHEDULING_MODE_, INPUT_LAYOUT_, USE_TMA_STORE_, ENABLE_BMM1_SOFTCAPPING_SCALE_,
RETURN_SOFTMAX_STATS_, OutputType, SAGE_BLOCK_SIZE_Q_, SAGE_BLOCK_SIZE_K_, SAGE_BLOCK_SIZE_V_>
RETURN_SOFTMAX_STATS_, ENABLE_SKIP_SOFTMAX_, OutputType, SAGE_BLOCK_SIZE_Q_, SAGE_BLOCK_SIZE_K_,
SAGE_BLOCK_SIZE_V_>
{
// Base class.
using Base = Kernel_traits<Hopper_qgmma_e4m3_fp32_traits, STEP_Q_, STEP_KV_, D_, DV_, Q_BUFFERS_, KV_BUFFERS_,
NUM_COMPUTE_GROUPS_, DMA2COMPUTE_DEPTH_, ATTENTION_MASK_TYPE_, HEADS_INTERLEAVED_, APPLY_ALIBI_, ENABLE_MUTEX_,
SCHEDULING_MODE_, INPUT_LAYOUT_, USE_TMA_STORE_, ENABLE_BMM1_SOFTCAPPING_SCALE_, RETURN_SOFTMAX_STATS_,
OutputType, SAGE_BLOCK_SIZE_Q_, SAGE_BLOCK_SIZE_K_, SAGE_BLOCK_SIZE_V_>;
ENABLE_SKIP_SOFTMAX_, OutputType, SAGE_BLOCK_SIZE_Q_, SAGE_BLOCK_SIZE_K_, SAGE_BLOCK_SIZE_V_>;
enum
{
@ -693,6 +710,10 @@ struct Kernel_traits_Hopper_qgmma_e4m3_fp32
// Mutex
OrderedMutex compute_mutex;
// The 4 warps of a warpgroup vote into an atomic variable in shared memory
// to decide whether to skip this STEP_KV. Double-buffered to avoid races between consecutive STEP_KVs.
uint32_t skip_softmax_votes[2][Base::NUM_COMPUTE_GROUPS];
inline __device__ void init(int tid0)
{
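The first dimension of the vote array above is indexed with kv_step_idx & 1 (see the Compute changes earlier in this diff), so the vote being re-initialized for one KV step cannot race with the vote still being read for the previous step. A tiny Python sketch of that double-buffered indexing (the step count is illustrative):

NUM_COMPUTE_GROUPS = 2
skip_softmax_votes = [[1] * NUM_COMPUTE_GROUPS for _ in range(2)]   # double buffer

warpgroup_id = 0
for kv_step_idx in range(4):
    slot = skip_softmax_votes[kv_step_idx & 1]   # alternate between the two buffers on consecutive steps
    slot[warpgroup_id] = 1                       # re-initialize before the warps vote on this step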

View File

@ -276,7 +276,8 @@ static inline void set_params(bert::Fused_multihead_attention_params_v2& params,
// scale factors
float const scale_bmm1, float const scale_softmax, float const scale_bmm2, float const softcapping_scale_bmm1,
// flags
bool const use_int8_scale_max, bool const interleaved, bool const is_s_padded, bool const has_alibi)
bool const use_int8_scale_max, bool const interleaved, bool const is_s_padded, bool const has_alibi,
float const skip_softmax_threshold_scale_factor)
{
memset(&params, 0, sizeof(params));
@ -421,6 +422,9 @@ static inline void set_params(bert::Fused_multihead_attention_params_v2& params,
params.enable_i2f_trick
= -double(1 << 22) * double(scale_bmm2) <= -128.f && double(1 << 22) * double(scale_bmm2) >= 127.f;
}
// Skip-softmax attention
params.skip_softmax_threshold_scale_factor = skip_softmax_threshold_scale_factor;
}
////////////////////////////////////////////////////////////////////////////////////////////////////
@ -429,7 +433,7 @@ static inline void determine_launch_params(Launch_params& launch_params, Data_ty
const size_t d, const Attention_mask_type attention_mask_type, const Attention_input_layout input_layout,
bool const interleaved, bool const ignore_b1opt, bool const force_unroll, bool const use_tma,
bool const force_non_flash_attention, bool const force_non_warp_specialization,
bool const force_non_granular_tiling, bool const force_fp32_acc,
bool const force_non_granular_tiling, bool const force_fp32_acc, float const skip_softmax_threshold_scale_factor,
// device props
const cudaDeviceProp props)
{
@ -470,6 +474,9 @@ static inline void determine_launch_params(Launch_params& launch_params, Data_ty
"are not supported on Ada currently.\n");
launch_params.use_granular_tiling = false;
}
// Enable skip softmax attention or not.
launch_params.enable_skip_softmax = skip_softmax_threshold_scale_factor > 0.f;
}
////////////////////////////////////////////////////////////////////////////////////////////////////
@ -589,6 +596,9 @@ int main(int argc, char** argv)
// Use attention sinks (added to the denominator of softmax)
bool use_attention_sinks = false;
// Skip-softmax attention
float skip_softmax_threshold_scale_factor = 0;
// Read the parameters from the command-line.
for (int ii = 1; ii < argc; ++ii)
{
@ -885,6 +895,10 @@ int main(int argc, char** argv)
{
use_attention_sinks = true;
}
else if (!strcmp(argv[ii], "-skip-softmax-threshold-scale-factor") && ++ii < argc)
{
skip_softmax_threshold_scale_factor = strtof(argv[ii], nullptr);
}
else
{
fprintf(stderr, "Unrecognized option: %s. Aborting!\n", argv[ii]);
@ -1057,7 +1071,7 @@ int main(int argc, char** argv)
Launch_params launch_params;
determine_launch_params(launch_params, data_type, sm, s, d, attention_mask_type, input_layout, interleaved,
ignore_b1opt, force_unroll, use_tma, force_non_flash_attention, force_non_warp_specialization,
force_non_granular_tiling, force_fp32_acc, props);
force_non_granular_tiling, force_fp32_acc, skip_softmax_threshold_scale_factor, props);
// The Q, K and V matrices are packed into one big matrix of size S x B x H x 3 x D.
const size_t qkv_size = s * b * h * (2 * d + dv);
@ -1713,7 +1727,13 @@ int main(int argc, char** argv)
tokens_per_block, qkv_d_view, q_d, k_d, v_d, contiguous_kv_d, kv_cache_pool_ptr, kv_cache_block_offsets_d,
packed_mask_d, cu_mask_rows_d, attention_sinks_d, cu_seqlens_d, cu_q_seqlens_d, o_d_view, p_d, s_d,
softmax_stats_ptr, scale_bmm2_d, scale_bmm1, scale_softmax, scale_bmm2, softcapping_scale_bmm1,
use_int8_scale_max, interleaved, is_s_padded, has_alibi);
use_int8_scale_max, interleaved, is_s_padded, has_alibi, skip_softmax_threshold_scale_factor);
#ifdef SKIP_SOFTMAX_STAT
FMHA_CHECK_CUDA(cudaMalloc(&params_v2.skip_softmax_total_blocks, sizeof(uint32_t)));
FMHA_CHECK_CUDA(cudaMalloc(&params_v2.skip_softmax_skipped_blocks, sizeof(uint32_t)));
FMHA_CHECK_CUDA(cudaMemset(params_v2.skip_softmax_total_blocks, 0, sizeof(uint32_t)));
FMHA_CHECK_CUDA(cudaMemset(params_v2.skip_softmax_skipped_blocks, 0, sizeof(uint32_t)));
#endif
// total number of tokens is needed to set TMA desc on the host.
launch_params.total_q_seqlen = q_seqlens[b];
@ -2101,6 +2121,18 @@ int main(int argc, char** argv)
non_fused_elapsed / fused_elapsed, total_flops / (fused_elapsed / float(runs) / 1e-9),
total_bytes / (fused_elapsed / float(runs) / 1e-6));
}
#ifdef SKIP_SOFTMAX_STAT
if (skip_softmax_threshold_scale_factor > 0)
{
uint32_t total_blocks, skipped_blocks;
FMHA_CHECK_CUDA(
cudaMemcpy(&total_blocks, params_v2.skip_softmax_total_blocks, sizeof(uint32_t), cudaMemcpyDeviceToHost));
FMHA_CHECK_CUDA(cudaMemcpy(
&skipped_blocks, params_v2.skip_softmax_skipped_blocks, sizeof(uint32_t), cudaMemcpyDeviceToHost));
printf("Skip-Softmax .: %u / %u = %.2f%%\n", skipped_blocks, total_blocks,
total_blocks ? 100.f * skipped_blocks / total_blocks : 0.f);
}
#endif
#if defined(DEBUG_HAS_PRINT_BUFFER)
FMHA_CHECK_CUDA(cuda_memcpy_d2h(print_buffer.data(), params.print_ptr, print_buffer.size(), DATA_TYPE_FP32));
@ -2141,6 +2173,11 @@ int main(int argc, char** argv)
FMHA_CHECK_CUDA(cudaFree(kv_cache_block_offsets_d));
FMHA_CHECK_CUDA(cudaFree(contiguous_kv_d));
FMHA_CHECK_CUDA(cudaFree(softmax_stats_d));
FMHA_CHECK_CUDA(cudaFree(attention_sinks_d));
#ifdef SKIP_SOFTMAX_STAT
FMHA_CHECK_CUDA(cudaFree(params_v2.skip_softmax_total_blocks));
FMHA_CHECK_CUDA(cudaFree(params_v2.skip_softmax_skipped_blocks));
#endif
free(qkv_h);
free(mask_h);

View File

@ -283,6 +283,16 @@ struct Fused_multihead_attention_params_v2 : Fused_multihead_attention_params_ba
float* scales;
} q, k, v;
} sage;
// Skip softmax when exp(local_max - global_max) < skip_softmax_threshold_scale_factor / seqlen.
// A positive value means skip-softmax is enabled.
float skip_softmax_threshold_scale_factor = 0;
#ifdef SKIP_SOFTMAX_STAT
// Statistics of skip-softmax; pointers to device memory for output
uint32_t* skip_softmax_total_blocks;
uint32_t* skip_softmax_skipped_blocks;
#endif
};
#endif
@ -322,6 +332,8 @@ struct Fused_multihead_attention_launch_params
// hardware properties to determine how to launch blocks
int multi_processor_count = 0;
int device_l2_cache_size = 0;
// skip softmax attention
bool enable_skip_softmax = false;
};
////////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -177,4 +177,13 @@ struct Fused_multihead_attention_params_v2
float* scales;
} q, k, v;
} sage;
// Skip softmax when exp(local_max - global_max) < skip_softmax_threshold_scale_factor / seqlen.
// A positive value means skip-softmax is enabled.
float skip_softmax_threshold_scale_factor = 0;
#ifdef SKIP_SOFTMAX_STAT
// Statistics of skip-softmax; pointers to device memory for output
uint32_t* skip_softmax_total_blocks;
uint32_t* skip_softmax_skipped_blocks;
#endif
};

View File

@ -296,7 +296,8 @@ bool AttentionOp::convertMMHAParamsToXQAParams(tensorrt_llm::kernels::XQAParams&
// Parameters for sparse attention
xqaParams.sparse_params = mRuntimeSparseAttentionParams;
xqaParams.use_sparse_attention = useTllmGenSparseAttention();
// Skip softmax threshold.
xqaParams.skip_softmax_threshold_scale_factor = mSkipSoftmaxThresholdScaleFactorDecode;
// Cross attention parameters.
xqaParams.encoder_input_lengths = generationsParams.encoder_input_lengths;
@ -1313,6 +1314,8 @@ int AttentionOp::mlaGeneration(
fmhaParams.sparse_params = mRuntimeSparseAttentionParams;
}
// MLA does not support skip-softmax attention right now
// Run the fmha kernel
mDecoderFMHARunner->run(fmhaParams);
}
@ -1885,6 +1888,18 @@ int AttentionOp::enqueueContext(EnqueueContextParams<T> const& params, cudaStrea
fmhaParams.sparse_params = mRuntimeSparseAttentionParams;
}
// Skip-softmax attention parameters
fmhaParams.skipSoftmaxThresholdScaleFactor = mSkipSoftmaxThresholdScaleFactorPrefill;
#ifdef SKIP_SOFTMAX_STAT
fmhaParams.skipSoftmaxTotalBlocks = mSkipSoftmaxTotalBlocks;
fmhaParams.skipSoftmaxSkippedBlocks = mSkipSoftmaxSkippedBlocks;
#else
if (tensorrt_llm::common::getEnvPrintSkipSoftmaxStat())
{
TLLM_THROW("To print skip softmax stat, please run build_wheel.py with -DSKIP_SOFTMAX_STAT");
}
#endif
if (mAttentionChunkSize)
{
fmhaParams.chunkedAttentionSize = *mAttentionChunkSize;

View File

@ -494,6 +494,14 @@ public:
// See [Chunked Attention] in _torch/modules/attention.py
std::optional<int64_t> mAttentionChunkSize = std::nullopt;
// Skip softmax threshold scale factor.
float mSkipSoftmaxThresholdScaleFactorPrefill = 0;
float mSkipSoftmaxThresholdScaleFactorDecode = 0;
#ifdef SKIP_SOFTMAX_STAT
uint32_t* mSkipSoftmaxTotalBlocks;
uint32_t* mSkipSoftmaxSkippedBlocks;
#endif
[[nodiscard]] auto data() const
{
return std::make_tuple(mLayerIdx, mNumHeads, mVisionStart, mVisionLength, mNumKVHeads, mHeadSize,
@ -510,7 +518,8 @@ public:
mMLAParams.data(), mCpSize, mCpRank, mCpGroup, mNumAttnHeads, mNumAttnKVHeads, mNumKVHeadsOrigin,
mAttnTpSize, mAttnTpRank, mAttnCpSize, mAttnCpRank, mUlyssesMQABroadcast, mEnableContextFMHA,
mFMHAForceFP32Acc, mMultiBlockMode, mEnableXQA, mUseKVCache, mSkipAttn, mFuseFp4Quant,
mNbMultiBlockSemaphores, mAttentionChunkSize.value_or(-1));
mNbMultiBlockSemaphores, mAttentionChunkSize.value_or(-1), mSkipSoftmaxThresholdScaleFactorPrefill,
mSkipSoftmaxThresholdScaleFactorDecode);
};
private:

View File

@ -554,6 +554,11 @@ bool getEnvEplbForceGdrcopy()
return getBoolEnv("TRTLLM_EPLB_FORCE_GDRCOPY");
}
bool getEnvPrintSkipSoftmaxStat()
{
return getBoolEnv("TRTLLM_PRINT_SKIP_SOFTMAX_STAT");
}
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -161,6 +161,8 @@ bool getEnvKVCacheTransferAllBlocksForWindow();
bool getEnvEplbForceGdrcopy();
bool getEnvPrintSkipSoftmaxStat();
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f6509dd36fb92554c6078595951a8de698d7bdaa07b9b817bfcdd255d4303bca
size 687070
oid sha256:4f1f3679968b8f6dea77f53534af9eb1348b6f476d4c3880833b41dd4cc9c803
size 687860

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b22d606e19b52047ae67319d61f138562f2b81df08ccde3f8fa04f040d408d7a
size 669688
oid sha256:a0d7061b400ab387309af00ae12f7a840b5abb91757183f415ca18329bbdb358
size 670478

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2a70e335677a1b0f9d98267fe7701735e42f105720403489276d48a4247ea1b5
size 423835
oid sha256:4a91ff0238b0c8f1d40f8441f22a60a2c64d344b8550de68737292ff449d1d7e
size 426203

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8289200bf78517033295966e9dbdf5c647da9aa7089669ff473ba436fef6a798
size 1230152
oid sha256:4d094c39dbdd372166facb297a4a91be80fb231bf3cca89afa97e61cc725f67e
size 1228572

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:97cc5f8d42d92332a92fa216847bbacccc7ef9f9d5208bd26585cd702d03fe57
size 1725040
oid sha256:1fe830d32459fd9a25d54e1d00a98720afd938d9e9042e2b5903f969e991d72d
size 1721882

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1264927817c08da144e387a7258f6c6fe424c0ff159f3ab0d6ffa3c4e3947598
size 375671
oid sha256:09af1ef9197c628c4a31cc58276ee6dcfad03f751069a78b5242594f93ea8c97
size 378039

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:950fb45e94ffc8e2ec9f5a4b682075be55cb85d6415b3eeb172ce2cf7d53220d
size 1140954
oid sha256:9e93bb514c30bc5a4cda8f402a386ab85d079f9b97aeff04788cf3c8a8cc87a6
size 1137008

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ba97e1bf342788eaf74a78f542f870d3967214aed98b98600fae772aad5bad5f
size 653960
oid sha256:0dc47824dfc41004c5b243ce9f40eefeee15c69b88474e33ec13137ef56604e8
size 651592

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:337cc83d1880b1496e2f054285472b693c181e081819f425ddf2ea45a5dfe9f4
size 1130682
oid sha256:c0f042eabb29ee9db7ddf9791840337a7544653b295e4b2a5068b7f80bcd8251
size 1128314

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:859ffffa18f1c9c8068a1cfedec487c2e0eab84af2c3720eaa7bb2a044ea16f6
size 1534006
oid sha256:7a9d887dd0acea6d82a25e0dda908f4c5421eaa1ddbfeeb49d382c079156d67e
size 1535586

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:02bc55faacb50d0501c590ed11b40d802b374618cbde58db725cc67495762064
size 698136
oid sha256:22a7eaab8e44194acd83621e5546f164ad9cbeda8b67867f864a235036a03931
size 690242

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:510d6c9942dea4bef976c2307fc63f1d7341d78ad8b41cca3bf80bae0a377575
size 380847
oid sha256:e22fe2dde7f5542975db7517b37cdce0eaa656fed2bc58378b37a872c54a43ef
size 374533

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d0e0d34e15f533f756ac4ad6ef8889e5ed7556d859b6263509f608f2e7194e0a
size 964134

View File

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6fd7941b92a10c3116b3d93b50ce94d90627ed020e1aa4263b2c46926db60250
size 1008328

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:04439f4bdd5bf15dce0d59e455545236ed5b98c963a9b491c40d473eb766a04f
size 988580
oid sha256:ec624d7dceea5234b9dd4e43125f271e46ed4f2a4118837a23e00eb89571dcb2
size 985422

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:46413d67059a15237e0f7f26b4d75c1965d135c4b28a1effe3b6f40a51dbe543
size 606983

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0c526229c1eea9eec08dd2c4a6d9f2052e54d6ece9f4fdf0b9a73f371e72ae36
size 614063
oid sha256:d33f3798292038d22d4e61732da397b3466a8983892fcc14448f63e5705d2dd0
size 629062

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2d07d4142403fc5d3004e6831b12f1cf3236c397e61448cbe49e7c7e47a5aef4
size 2482034

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:26232545325ecf363f12b49db62c92a1294dc155ea22cb6e6593fc920b734aec
size 1862432

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ba18343579abe3aee6e75545c8ec25a244d24864ff69c23146ee2a13b5eccdd4
size 1916872
oid sha256:41df1bdb745c0efd7961c44dbcd30a1bad202103d301ca785b5b7cdef3cd0ce9
size 1882140

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0e6ba601df471fff8f5beb1bdb9af6b8f41f57903ee762bb042b023917882d95
size 2608304
oid sha256:053ddc81e3885a583adb9bfbfea6a263f023638a2162430dc62faeba1b101d37
size 2527002

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:25f59e66bbafb18273bf7fc34eade730ef12e805e59abb6ef345b6d8574b8eb8
size 565135

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:91906c0234a97455f5733ec6e359d61e9b9a0481aa72fd5eec72ae5cc04b8d22
size 571425
oid sha256:2194a3863b3dd880c0788260e6368d8453050e7c02e7055eb8d9b87f4ce32429
size 588001

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:19a154061aa471d1ef60d6a6e6cc32afe7d5fc9e0856059c22220d333f855318
size 2291002

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6233042269b80360ec9b76dff1204188338c85d4c647c234df21405138e8e584
size 704076
oid sha256:3fbf61a84913ea7865981c9d2df49a2c4db4aff6959e0864ba619878af8894dd
size 641720

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:73c371164cb89be22699cfc3973d6b3bc03a892fed504f74e89f17b7130deb12
size 1765330

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ee37ada8d3e1d32b5b7227008e29a73e1b2e2dcfcd9d63a25f818a607445d4ca
size 1798458
oid sha256:17b06132679a9db8eb555012bfb53fe941ea092126af235837deff4848b3b13b
size 1786618

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1c69925c289bbda6bb7579eb6c84d1432612a96485ee97bdc04dcbba035c93da
size 2342284
oid sha256:f2ffd14c273aeb544cf055e6b984f25b03116eb30d067c98bf00af306ec55962
size 2335970

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0f310dc88b134cfee3c3ef703bb764c175bfeacbef3845ad8e75fbf3bbe9d75c
size 604267
oid sha256:0bb606a262a25c8cdb18ee9beff02931a133ebebe7777600479241d291825b9e
size 602689

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:547ad9a31f84c26651688c8911e566c9a05ac88283de8d54c8031017a4f51105
size 917634
oid sha256:90c07881943263544ffc233453b9b5613351e07fdef3dd21bb893134fecc304f
size 916844

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:05b540e55a0124ab1599f69dae20172b17ef20688a24edc8db841f90a1952e8f
size 1384932
oid sha256:b6d7ee26961634f6a7b62d8adae6c97927e85d9fbc8182ef0b1d59ee9d5e2cfb
size 1378616

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3214745c4089e516ff3e8d5584773b2edb697151f3872d3a0fb7def131ccb48e
size 1432292
oid sha256:962dfabab27f08bbc8be8e9b8ad3e91f56917a66ef96f7fab4d16f6e1959ed4a
size 1426766

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:105d01d606d4e52b8646cedba3de42be7edb4936326d5bf803fc31f687da060f
size 1432292
oid sha256:951fba3609c416eb6f510801cea84799fe0cfcaa1eff2e2fc2819a0f3baf27ef
size 1426766

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9d4f00e3d3d9da69f8dab35b2ccaae523ae3776d5f0a9dc9847ec9a913001aa6
size 1976932
oid sha256:462a75cf4729835b28cd2ed018f7f8efbf7483e2b3e17c2212f126ee923de7dd
size 1971406

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7ef6b870160c17dd50a94e854ff3e80dea49a4d00e00cd8afb44fea987971f42
size 1395968
oid sha256:112ccd86150a8a1c3089ebd51f936acebd9ed43f43a98f6cf64b84aaab880b84
size 1392810

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c29f502fd3045dde0fed26bfdfd02d982088c45adbf35d256192f5a3acd4b745
size 1417280
oid sha256:627d836744c606dc8fe38739714c866df81d6fa5bb24be5c1492198ce2980f54
size 1414122

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a4e4efc619f4f40ed4545d93fef155ad1e91b78278dac39a93ac57e6232d8854
size 1417280
oid sha256:234eb49ca0767ba2eb21328a056aae01d58924c34f3a88ae65f0ca0eb7f5a63d
size 1414122

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9f5f6e59a6ea43f83e1ee80fe304ee0b2315b149b91e61b1f959d91bf1d0da0c
size 1954816
oid sha256:d44e58432a6a36e099da6186eac14cc458717a99643467176217f8dc23b677cd
size 1950080

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:afe2b2f66132680635127950d28848efdc6f9d09760ddaa747773549eca03c4e
oid sha256:6da2ec5063988ec246e767476637010ec3721b69afe9df9c7b1aae679a5ee64f
size 309055

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c5bc3108c268b2d7de40a2924c6211180ff7b70b0244d21fa94d0d482a0ca1a3
size 296423
oid sha256:0b1bbefa7f2d6966d5a12474cc98970054dc1070198504d9cfb23e67aa93cbdd
size 295635

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bf15a32bbbba2a60f79935da04afd0a6e30a842a82834d8a1dc5065743514aae
oid sha256:fcf4fa1a5f6471aee9e193ef4a20cd2d8ab707b270b909da55c12af7151cd4da
size 504821

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:87b5a8853cc0e07726e781f22efd704d3855add2d9af71bd40566fbb4d5d81b1
size 676898
oid sha256:650e03222f3cf2bd313b076e4ea6ed8d6cda4aa65f937e425919d6e370e05776
size 678476

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8c5d711863c33b007d0771d82cc64fa3e067d375cf07b0b3c2a35ab628863335
size 713996
oid sha256:d0152f6ba5ce04b2daa7672ec379ef7e17c652865b8b5d3b07037f3cc9da2578
size 715574

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2793d457837759f626cb80f35524725896b4f627443029259b201a9b56596839
oid sha256:5e5c9dd60b38610f5bb2be8242bcf4d8a83ff07cacad2ad5840bb1c39d8ce5de
size 756644

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8cf91a1a28b8a82be78030364d508c1b5020f98a29a04a7b36625cd0a5e110e3
size 758198
oid sha256:26636e4e4faa0ead121a48d28ed92ed8fe596cbb33c2475e059df25213994ad4
size 754252

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:43198c508411ad462f51d7305c3c486cb89e910e6395744b6ee2fc0f18033eba
size 957900
oid sha256:7cb58307f7d64ff48883a6aa8d080dc407b41854e0674186113b1c2267b4a09a
size 957110

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:24bf19e4329af834296cd06ba38386e072275f431d77282866c4a869c93e28c2
size 498505
oid sha256:6a5b811c7fad2ca3e96109d1eb3f0a616972ac1e515029a6b0ccdfc2da49b86f
size 496137

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a7189aad0d5f9074d819978392ab5189dda6eec7d8357d1aac6dd6dd13db1fe3
size 498505
oid sha256:5a1660e00015f318ecbd4a94b152df85350501c5ff2b785bf853a54341ff3be0
size 496137

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:95019b3b08b2a1edf7a6060fbd4374be3bdb9761403d508d5501ad04d7ec4948
size 508767
oid sha256:3e24f0a97fc1e34502bef25a6e352850a56acf352c5c91f2c9143817b91480ef
size 505609

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:02cd29bc83b5bbc1dba474f9291bf7703489265bb95a7e3c15ef3250fc560727
size 504807
oid sha256:2e9c07ad0ed4e3dc628a0f869e8fb251c7d888bc1f2293813a490d98cac3de66
size 503229

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fa854e128ad1030a6111c558882ec825cf3f2cec3dd5478efc7af7f6c7def6b2
size 186759
oid sha256:85a6f1698c9121d3d46a655196426f685eff59fcafd1ce7a0726c9c534e4c185
size 185969

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e92e5a6878df5e5f8d599498dd161222e51e904fa92ce00bfb83573dc3267f4e
size 186759
oid sha256:b3ba5d378487514688d8243c233304bfc474e2605eee39ba041f2d66bbee9cac
size 185969

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0feae28ea044d43628c20de7546f27ea63b86ce401e34c5d0b83b5ee55b2179d
size 671358
oid sha256:b71a96899d5f82ef3d739a038786796c21e730336aa903f2a0b349debd98582e
size 670568

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3ee51f64ca9f4aa9ce726813ff714dbf02f41b67238c5a0079be2b81938ea470
size 681620
oid sha256:f898fc6ec92441e61de5a80e7efda5d700112a9753f9b7b79ceee5e505241725
size 672936

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f197cb5fb0cbe8ae3830b60f1b5caf29c9a51030c18ec4433cd91522302d6f3d
size 726636
oid sha256:812e90f56446d8595de0189638db259aa5d59c7b2110a5e7317209af578f4811
size 728214

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e53abec3bc15e7a401ef2f33d4cc603a16de09310b84a21f0da29b97635cec36
size 728190
oid sha256:b0f2cffd832b0c8186b490dc6673f4fe959e23cc5d42a4e47b07844295c4ac3d
size 725822

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:43c7eed764f0558af9142e804dda4a3e3e6fe2e32ef12648c838e70d4f16a8e5
size 942888
oid sha256:53ca2064f14a0739bf86cce9f2675417dc3f3344eaa7f3224873f28a5b07bd9f
size 936574

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a34940bb17fb8b47898fcffe4647b89d1cb1cc0db2c6daa707b41e3c27d77974
size 642940
oid sha256:73ebcc6bb3182ba0f3fdc6cdeeb30ffa5899b55ac51a90e30ce82b03e4c4e383
size 619260

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:95b4a97f34e27a549c0e45b5fcd4f70ea886528485d3f81399a702d6efa4af7b
size 164655
oid sha256:d03230a130470ae336de35f8406b18d3e5ac8f67d89b06f250deb296478582aa
size 171759

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6ab7a8f5ecf3eff80160a50c400b2aaaa770dcdcb8f0d0ba1f6dd4907882954e
size 164655
oid sha256:74d5ad7353a2bdc7b3b9f78e12dcbbe6d1f763fcede6dd67859ee685bacf456d
size 171759

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a800789202ce7b2b45f2c5fb802757736d1826ca1806776cb52104dd57594a15
size 642940
oid sha256:b149f7e48a1b929559b949ba13a63f32c1e79a1b46d4c22525ea4f8a78f83dd0
size 619260

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:842bc25f77747de3d2dc62188106809fe661c034ba34325dae56f26ab17c5e77
size 500069
oid sha256:f1ca65530dd837ec88348a09517ba942572ba6da39d10f98ced2670944aad7a8
size 489019

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fda4fe88bc5228428f8e8e0a2f0dd77efe80fef8c37f123c11eb46cc83ebc2e5
size 715486
oid sha256:15824f4d720f5fdaab3dc4428d236e6a1d117da368252588c40df39991fef589
size 720222

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2df4194edb6df203d67fd984749e48467f94e4ef5fd1e8f70b652cd571ce411a
size 700472
oid sha256:e4b3c0421958a2765e073ee6bb892271cc3386915b08a9e9f7faa83d0c0da2d2
size 698894

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3fd8099f7f2044b092a7c8d333ac24f6c7bfc5f68941a6d9567f61ee42b2a1af
size 456987
oid sha256:1dbabdbf3f6dc0cbb4a0b1beb88997c33dd8342a3d5c8aa21bdcdb234431efe1
size 458565

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:31e17e46754dababe6432e5b39480a467649804d3bbaadf21e17f2ad4be92338
size 1249946
oid sha256:f10e2e699e969f1246ec7255addee2d3af254183a02994ede81917b7d4236447
size 1252314

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:643db350b63b6a0b61c7a539e764d9ef3318ca87b029564d01dfbd6795cec37d
size 1265672
oid sha256:fc636fc9f49244335817c0863c7d917c50be20ed6c477bb4cd415229239614ad
size 1248306

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d7e1a558e10eb20d43c421dde8608f750c984fde61df7f3b40d4b783c9ca65f4
size 1800816
oid sha256:f265c9ef95df4aeab96a1030d057ec6d28cb1d62a64e5dfadb1d2a0b041f3b8f
size 1790554

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:82eda655528a11824c39dd4a8383230a43c5010b3b07de66a189cf757de0f50b
size 416717
oid sha256:5a966bc2bfeadeb3c98fabd16f2e4fd873438b163a2444d2c408481ebd9061f7
size 418295

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:411f56f27f634640f8be0601dab59bc4f377d639653b0a0434d68bd0209f1252
size 1116484
oid sha256:dbfdd2e41aed8319810cb0b11eae510365f8193bb8efc3c6eb432face74c61df
size 1117274

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2fad29994b66ce57ac2ca5d5a8d7c34f79b5c9836f10c46f05c96bb4323dcf6c
size 640540
oid sha256:26a4948d90b87a212155feaf9107595717b18e02f76b0a17eaae7da1198d60c5
size 641330

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:77d5db630efc5423aed2d68a4ed9b17c0ec008d9aafbe1dc40a1d40dc9d6d1d8
size 1168570
oid sha256:fda34fb9af0c3da4a27afe65cbabaedc20d7397641d7ecf742bbbec6a765d5de
size 1179620

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c4ef5092061cccedf9161c46391af469fbf2f539b3b0ac33abb4107ae937e85e
size 1631884
oid sha256:d1d684b7f0b1c68fcf8117c5326fca8a50a7f6e6b2228905f4b0e7a7d35f79a3
size 1646882

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6cbb1e5e5ad3d96c3cd1c252bddcfb63ded6b176944b220d219c21337e4dc503
size 618414
oid sha256:8871bb4a42aa388a81a2715d1f7f9468b763da17d6034692d2c788b92665585e
size 614465

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ce99295c7cf1c2f5f083ce5c89cd1f3bc9413f4d312ba19aa1a3a4cf7ab34500
size 338223
oid sha256:97aadedec04a46bc71c549003416d2f547f4e50fdf3828c73ab8536563aca99a
size 336645

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:23581eecd2559333655f5fa591debf45f206e4886e7e01974ca6948b24f49e6d
size 688658
oid sha256:37a95b7847ba2b77c8ccef3051ce7cf27216a47d6541074230c76176bf2e6084
size 689448

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:85d585b303ab8f2d87fdcb4fde7a60785a023ae0b5059c2625a964d8ca3198f1
oid sha256:2e4788c3c3fb03defe163e31675eb4fd965ef7dc665af3ee9aa73009dd9f9dde
size 672066

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:aeec57989671707758378702be6f934669173b6a6cd572de050129fe52ad00d6
size 424633
oid sha256:4b7b96c1f392966bbaa090f54df017a41852e25d76930847643606bf91ee4ad7
size 427001

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b93c716b918a663df3399b7ffd05b77d2dc6bb0ebc5d5722ef46d9ae198d95c4
size 1231740
oid sha256:385e79fbc4b57a29c809fae1185217ddbcc657cc9ff23fc57ff35034668f2274
size 1230162

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:72b5d9098bb48ab12e149a07a969693b6d1bed5c21267c9a6146427b7adf37b6
size 1727418
oid sha256:934396e10b06250e6bf436e2f47747f4038f98fe084792e05edb5923f2e34e01
size 1724260

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:91fd342608928df24da3ff3acb4e498a1e51d110c53c9d7257c470c151a27b54
size 377259
oid sha256:1b164663219eda6704b876c10b9794ffe473b76a9e69efc556a91fa86012067d
size 379627

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0b01f483fafdc17b16c664a939367a71f59883a2886f015ad969c194c1cc4a29
size 1142542
oid sha256:3044f3dca75e5992142ea2bf43f27736212452e449a9480c6f6207c99533b103
size 1138596

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9a043dada38b23b1abddac2b437fad372ee1f34f5104e0ddb2e7028772cddde0
size 654758
oid sha256:a807e0ae3448e63fa7c37f6de16e10b488c59399faf0e2dd9584f1a78149b983
size 652390

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:309f0fa8fd451c63c731ea1808a9a008656a6fbb0c461b9b040adcdf2f646c6f
size 1132270
oid sha256:0f5bd71775f0189439f2f0e00fdeed773b46b544d520805d24a5b2a781355efa
size 1129902

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:174be71c01c900552f2d4a54604012806db84e946f92e6f671a0dd3fd7d5df0d
size 1536384
oid sha256:af61c7d59a3c590488fa42805536f50a36454b73decb626e8ac7958993d60c0f
size 1537964

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:92a0b66977279106585c101e883818375f99246c5df4d4d5ca1d7be3c1efc094
size 700514
oid sha256:f58e340db03cdbc494325003bb34a9efffff5d1b2d6f57dcc8948de51b240420
size 691832

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dad21f7e77e13e768419d1674bd986ba914d4f765e809606061b5343d604f42c
size 381647
oid sha256:3436f09614ae18411e04a17cb93933a3ee41439a897bd2999dd8efbfc34c466d
size 375331

Some files were not shown because too many files have changed in this diff.