mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
[TRTLLM-8777][feat] Update DeepGEMM to the latest commit to include optimizations for DeepSeek-v3.2 (#9380)
Signed-off-by: Fanrong Li <23290157+lfr-0531@users.noreply.github.com>
This commit is contained in:
parent
786d308b88
commit
5a99c9734d
2
3rdparty/CMakeLists.txt
vendored
2
3rdparty/CMakeLists.txt
vendored
@ -39,7 +39,7 @@ FetchContent_Declare(
|
||||
FetchContent_Declare(
|
||||
deepgemm
|
||||
GIT_REPOSITORY https://github.com/ruoqianguo/DeepGEMM
|
||||
GIT_TAG 9fa5965e265e27995f539e0dd73a06351a8a9eaf
|
||||
GIT_TAG 6cb8161516302550785d9af924d2778afef1f3f6 # swapab_sm100 branch
|
||||
GIT_SUBMODULES_RECURSE
|
||||
ON
|
||||
SOURCE_SUBDIR
|
||||
|
||||
@ -308,9 +308,9 @@ def test_deepgemm_fp8_mqa_logits_basic():
|
||||
"""
|
||||
torch.manual_seed(0)
|
||||
|
||||
num_heads, head_dim = 32, 128
|
||||
seq_len = 512
|
||||
seq_len_kv = 1024
|
||||
num_heads, head_dim = 64, 128
|
||||
seq_len = 2048
|
||||
seq_len_kv = 4096
|
||||
#[seq_len, num_heads, head_dim]
|
||||
q = torch.randn(
|
||||
seq_len,
|
||||
@ -335,8 +335,8 @@ def test_deepgemm_fp8_mqa_logits_basic():
|
||||
)
|
||||
# ks[i] -> ke[i] for each q[i]
|
||||
ks = torch.zeros(seq_len, dtype=torch.int, device="cuda")
|
||||
ke = torch.arange(seq_len, dtype=torch.int, device="cuda") + (
|
||||
seq_len_kv - seq_len) + 1 # +1 for exclusive end
|
||||
ke = torch.arange(seq_len, dtype=torch.int,
|
||||
device="cuda") + (seq_len_kv - seq_len)
|
||||
|
||||
# Convert to FP8
|
||||
q_fp8 = q.to(torch.float8_e4m3fn)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user