TensorRT-LLMs/cpp/kernels/fmha_v2/test_sm80_configs.sh
qsang-nv 0fd59d64ab
infra: open source fmha v2 kernels (#4185)
* add fmha repo

Signed-off-by: Qidi Sang <200703406+qsang-nv@users.noreply.github.com>

* fix format

Signed-off-by: Qidi Sang <200703406+qsang-nv@users.noreply.github.com>

* fix code style

Signed-off-by: Qidi Sang <200703406+qsang-nv@users.noreply.github.com>

* fix header

Signed-off-by: Qidi Sang <200703406+qsang-nv@users.noreply.github.com>

* fix header kernel_traits.h

Signed-off-by: Qidi Sang <200703406+qsang-nv@users.noreply.github.com>

* add .gitignore file

Signed-off-by: Qidi Sang <200703406+qsang-nv@users.noreply.github.com>

* add SLIDING_WINDOW_ATTENTION

Signed-off-by: Qidi Sang <200703406+qsang-nv@users.noreply.github.com>

* fix style

Signed-off-by: Qidi Sang <200703406+qsang-nv@users.noreply.github.com>

* fix format

Signed-off-by: Qidi Sang <200703406+qsang-nv@users.noreply.github.com>

* update setup.py

Signed-off-by: Qidi Sang <200703406+qsang-nv@users.noreply.github.com>

* update build_wheel.py

Signed-off-by: Qidi Sang <200703406+qsang-nv@users.noreply.github.com>

---------

Signed-off-by: Qidi Sang <200703406+qsang-nv@users.noreply.github.com>
Signed-off-by: qsang-nv <200703406+qsang-nv@users.noreply.github.com>
2025-05-15 10:56:34 +08:00

270 lines
12 KiB
Bash
Executable File

# FP16: different b, fixed and var.seqlen.
# Each "<s> <d>" pair is run four times: with -min-s equal to -s and with
# -min-s 1, and for each of those with -b 1 and -b 128.
for fmha_cfg in "64 64" "96 64" "128 64" "128 32" "128 16" "256 64" "256 32" "256 16" "384 64"; do
    fmha_s=${fmha_cfg% *}   # text before the space: value passed to -s
    fmha_d=${fmha_cfg#* }   # text after the space: value passed to -d
    for fmha_min_s in "$fmha_s" 1; do
        for fmha_b in 1 128; do
            bin/fmha.exe -v 0 -runs 1 -s "$fmha_s" -d "$fmha_d" -min-s "$fmha_min_s" -b "$fmha_b"
        done
    done
done
# FP16: different b, fixed and var.seqlen (longer sequence length for flash attention).
# NOTE: every run passes -epsilon 0.02 because HALF_ACCUMULATION_FOR_FLASH_ATTENTION
# has larger epsilon.
# Each entry is "<s> <d> <b>": the run is repeated with -min-s equal to -s and
# with -min-s 1, and for each of those with -b 1 and with the listed <b>.
for fmha_cfg in \
    "512 512 32" "512 256 32" "512 128 32" \
    "512 64 128" "512 32 128" "512 16 128" \
    "1024 64 128" "1024 32 128" "1024 16 128" \
    "2048 64 64" "2048 32 64" "2048 16 64"; do
    fmha_s=${fmha_cfg%% *}      # first field: value passed to -s
    fmha_rest=${fmha_cfg#* }    # remaining "<d> <b>"
    fmha_d=${fmha_rest% *}      # second field: value passed to -d
    fmha_big_b=${fmha_cfg##* }  # third field: the larger -b value
    for fmha_min_s in "$fmha_s" 1; do
        for fmha_b in 1 "$fmha_big_b"; do
            bin/fmha.exe -v 0 -runs 1 -s "$fmha_s" -d "$fmha_d" -min-s "$fmha_min_s" -b "$fmha_b" -epsilon 0.02
        done
    done
done
# INT8: different b, fixed and var.seqlen.
# Same sweep shape as the FP16 runs, with -int8 added: each "<s> <d>" pair is
# run with -min-s equal to -s and with -min-s 1, each at -b 1 and -b 128.
for fmha_cfg in \
    "128 64" "128 32" "128 16" \
    "192 64" \
    "256 64" "256 32" "256 16" \
    "384 64" \
    "512 64" "512 32" "512 16"; do
    fmha_s=${fmha_cfg% *}   # text before the space: value passed to -s
    fmha_d=${fmha_cfg#* }   # text after the space: value passed to -d
    for fmha_min_s in "$fmha_s" 1; do
        for fmha_b in 1 128; do
            bin/fmha.exe -v 0 -runs 1 -int8 -s "$fmha_s" -d "$fmha_d" -min-s "$fmha_min_s" -b "$fmha_b"
        done
    done
done
# INT8 Interleaved (-il -int8), always with -d 64.
# Each -s value is run with -min-s equal to -s and with -min-s 1, each at
# -b 1 and -b 128.
for fmha_s in 128 192 256 384; do
    for fmha_min_s in "$fmha_s" 1; do
        for fmha_b in 1 128; do
            bin/fmha.exe -v 0 -runs 1 -il -int8 -s "$fmha_s" -d 64 -min-s "$fmha_min_s" -b "$fmha_b"
        done
    done
done
# FP16 with -v1: different b (no -min-s is passed). All commands in this section are commented out.
#bin/fmha.exe -v 0 -runs 1 -s 64 -d 64 -v1 -b 1
#bin/fmha.exe -v 0 -runs 1 -s 64 -d 64 -v1 -b 128
#
#bin/fmha.exe -v 0 -runs 1 -s 96 -d 64 -v1 -b 1
#bin/fmha.exe -v 0 -runs 1 -s 96 -d 64 -v1 -b 128
#
#bin/fmha.exe -v 0 -runs 1 -s 128 -d 64 -v1 -b 1
#bin/fmha.exe -v 0 -runs 1 -s 128 -d 64 -v1 -b 128
#
#bin/fmha.exe -v 0 -runs 1 -s 128 -d 32 -v1 -b 1
#bin/fmha.exe -v 0 -runs 1 -s 128 -d 32 -v1 -b 128
#
#bin/fmha.exe -v 0 -runs 1 -s 128 -d 16 -v1 -b 1
#bin/fmha.exe -v 0 -runs 1 -s 128 -d 16 -v1 -b 128
#
#bin/fmha.exe -v 0 -runs 1 -s 256 -d 64 -v1 -b 1
#bin/fmha.exe -v 0 -runs 1 -s 256 -d 64 -v1 -b 128
#
#bin/fmha.exe -v 0 -runs 1 -s 256 -d 32 -v1 -b 1
#bin/fmha.exe -v 0 -runs 1 -s 256 -d 32 -v1 -b 128
#
#bin/fmha.exe -v 0 -runs 1 -s 256 -d 16 -v1 -b 1
#bin/fmha.exe -v 0 -runs 1 -s 256 -d 16 -v1 -b 128
#
#bin/fmha.exe -v 0 -runs 1 -s 384 -d 64 -v1 -b 1
#bin/fmha.exe -v 0 -runs 1 -s 384 -d 64 -v1 -b 128
#
#bin/fmha.exe -v 0 -runs 1 -s 512 -d 64 -v1 -b 1
#bin/fmha.exe -v 0 -runs 1 -s 512 -d 64 -v1 -b 128
#
#bin/fmha.exe -v 0 -runs 1 -s 512 -d 32 -v1 -b 1
#bin/fmha.exe -v 0 -runs 1 -s 512 -d 32 -v1 -b 128
#
#bin/fmha.exe -v 0 -runs 1 -s 512 -d 16 -v1 -b 1
#bin/fmha.exe -v 0 -runs 1 -s 512 -d 16 -v1 -b 128
#
## INT8 with -v1: different b (no -min-s or -il is passed, so these are not interleaved runs)
#
#bin/fmha.exe -v 0 -runs 1 -int8 -s 128 -d 64 -v1 -b 1
#bin/fmha.exe -v 0 -runs 1 -int8 -s 128 -d 64 -v1 -b 128
#
#bin/fmha.exe -v 0 -runs 1 -int8 -s 128 -d 32 -v1 -b 1
#bin/fmha.exe -v 0 -runs 1 -int8 -s 128 -d 32 -v1 -b 128
#
#bin/fmha.exe -v 0 -runs 1 -int8 -s 128 -d 16 -v1 -b 1
#bin/fmha.exe -v 0 -runs 1 -int8 -s 128 -d 16 -v1 -b 128
#
#bin/fmha.exe -v 0 -runs 1 -int8 -s 192 -d 64 -v1 -b 1
#bin/fmha.exe -v 0 -runs 1 -int8 -s 192 -d 64 -v1 -b 128
#
#bin/fmha.exe -v 0 -runs 1 -int8 -s 256 -d 64 -v1 -b 1
#bin/fmha.exe -v 0 -runs 1 -int8 -s 256 -d 64 -v1 -b 128
#
#bin/fmha.exe -v 0 -runs 1 -int8 -s 256 -d 32 -v1 -b 1
#bin/fmha.exe -v 0 -runs 1 -int8 -s 256 -d 32 -v1 -b 128
#
#bin/fmha.exe -v 0 -runs 1 -int8 -s 256 -d 16 -v1 -b 1
#bin/fmha.exe -v 0 -runs 1 -int8 -s 256 -d 16 -v1 -b 128
#
#bin/fmha.exe -v 0 -runs 1 -int8 -s 384 -d 64 -v1 -b 1
#bin/fmha.exe -v 0 -runs 1 -int8 -s 384 -d 64 -v1 -b 128
#
#bin/fmha.exe -v 0 -runs 1 -int8 -s 512 -d 64 -v1 -b 1
#bin/fmha.exe -v 0 -runs 1 -int8 -s 512 -d 64 -v1 -b 128
#
#bin/fmha.exe -v 0 -runs 1 -int8 -s 512 -d 32 -v1 -b 1
#bin/fmha.exe -v 0 -runs 1 -int8 -s 512 -d 32 -v1 -b 128
#
#bin/fmha.exe -v 0 -runs 1 -int8 -s 512 -d 16 -v1 -b 1
#bin/fmha.exe -v 0 -runs 1 -int8 -s 512 -d 16 -v1 -b 128
# Multi-CTA versions for longer sequence length (experimental).
# Runs -s 1024 -b 16 -h 16 for -d 64 and -d 32, first without and then with
# a trailing -int8 flag.
for fmha_dtype in "" -int8; do
    for fmha_d in 64 32; do
        # ${var:+...} adds the flag only when it is non-empty, so the first
        # pass sends no extra argument at all.
        bin/fmha.exe -v 0 -runs 1 -s 1024 -d "$fmha_d" -b 16 -h 16 ${fmha_dtype:+"$fmha_dtype"}
    done
done