/*
 * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#pragma once

namespace fmha
{

// Parameters for ALiBi (Attention with Linear Biases).
struct AlibiParams
{
    // Round x down to the largest power of two <= x: the OR-shifts propagate
    // the highest set bit into every lower bit position, and x - (x >> 1)
    // then keeps only that highest bit.
    constexpr static int round_down_to_power_two(int x)
    {
        x = x | (x >> 1);
        x = x | (x >> 2);
        x = x | (x >> 4);
        x = x | (x >> 8);
        x = x | (x >> 16);
        return x - (x >> 1);
    }
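    // Worked example (illustrative): x = 20 is 0b10100; the OR-shifts smear it
    // to 0b11111 = 31, and 31 - (31 >> 1) = 16, the largest power of two that
    // does not exceed 20.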

    AlibiParams() = default;

    // h_pow_2 is the head count rounded down to a power of two; the slope
    // basis -4 / h_pow_2 is consumed downstream when computing per-head
    // ALiBi slopes.
    AlibiParams(int h, float scale_after_alibi = 1.f)
        : scale_after_alibi(scale_after_alibi)
    {
        h_pow_2 = round_down_to_power_two(h);
        alibi_neg4_div_h = -4.0f / h_pow_2;
    }

    // Tensor-parallel construction: h and s are this rank's head count and
    // sequence length, and the offsets locate the rank's shard in the global
    // head and sequence dimensions.
    AlibiParams(int h, int s, int tp_size, int rank, float scale_after_alibi = 1.f)
        : AlibiParams(h * tp_size, scale_after_alibi)
    {
        head_idx_offset = h * rank;
        sequence_pos_offset = s * rank;
    }

    int h_pow_2{};
    float alibi_neg4_div_h{};
    float scale_after_alibi{};
    // Could be simplified to a single `int rank`, deriving the others as
    // `num_heads * rank` and `s * rank` at runtime, but that would make
    // assumptions about the layout downstream (e.g. downstream may only split
    // across the head dimension, in which case s would be the full sequence).
    int head_idx_offset = 0;
    int sequence_pos_offset = 0;
};
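
// Compile-time sanity checks (an illustrative addition, not part of the
// original header) of round_down_to_power_two: powers of two map to
// themselves, anything else maps to the next power of two below.
static_assert(AlibiParams::round_down_to_power_two(1) == 1, "");
static_assert(AlibiParams::round_down_to_power_two(16) == 16, "");
static_assert(AlibiParams::round_down_to_power_two(20) == 16, "");
static_assert(AlibiParams::round_down_to_power_two(96) == 64, "");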

} // namespace fmha
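
// ---------------------------------------------------------------------------
// Usage sketch (illustrative, not part of this header): a minimal,
// hypothetical translation unit constructing AlibiParams both ways. The
// include path and all values here are assumptions for the example.
//
//   #include "alibi_params.h" // hypothetical path to this header
//   #include <cstdio>
//
//   int main()
//   {
//       // 20 heads: h_pow_2 = 16, so alibi_neg4_div_h = -4.0f / 16 = -0.25f.
//       fmha::AlibiParams params(/*h=*/20);
//       std::printf("h_pow_2=%d base=%f\n", params.h_pow_2, params.alibi_neg4_div_h);
//
//       // Tensor-parallel: 8 heads per rank over tp_size = 4 ranks (32 heads
//       // total). Rank 2 gets head_idx_offset = 8 * 2 = 16 and
//       // sequence_pos_offset = 1024 * 2 = 2048.
//       fmha::AlibiParams tp(/*h=*/8, /*s=*/1024, /*tp_size=*/4, /*rank=*/2);
//       std::printf("head_off=%d seq_off=%d\n", tp.head_idx_offset, tp.sequence_pos_offset);
//       return 0;
//   }
// ---------------------------------------------------------------------------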