Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)
[https://nvbugs/5455651][fix] Make ngram use XQA attention on Blackwell (#6873)
Signed-off-by: Michael Iovine <miovine@nvidia.com>
Signed-off-by: Mike Iovine <miovine@nvidia.com>
Signed-off-by: Mike Iovine <mike.iovine7@gmail.com>
parent 26f413ad90
commit 078e907b16
@@ -91,11 +91,13 @@ class SpeculativeDecodingMode(IntEnum):
         any spec dec mode that uses the SpecExecutor.
         """
 
-        # Fixme: only trtllm attention backend supports eagle3 generation-phase kernels on blackwell.
-        return ((self.is_eagle3() or self.is_draft_target())
-                and not (issubclass(attention_backend, TrtllmAttention)
-                         and get_sm_version() == 100)
-                ) or self.is_ngram() or self.is_user_provided()
+        if self.use_one_engine():
+            # 1-model has separate logic for handling draft tokens
+            return False
+
+        # The special XQA generation kernels only exist with the TRTLLM backend on blackwell.
+        return not issubclass(attention_backend,
+                              TrtllmAttention) or get_sm_version() != 100
 
     def attention_need_spec_dec_mode(self):
         """
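For readers outside the repo, below is a minimal self-contained sketch of the new branch logic. The hunk does not show the enclosing method's signature, so the function name extend_ctx, the stub backend classes, and the hard-coded SM version are assumptions for illustration; only the branch structure mirrors the diff (SM version 100 corresponds to Blackwell).

# Minimal sketch; names outside the diff are hypothetical stand-ins.

class AttentionBackend:
    """Stand-in for the generic attention backend base class."""


class TrtllmAttention(AttentionBackend):
    """Stand-in for the TRTLLM attention backend."""


def get_sm_version() -> int:
    # Stub: the real helper queries the GPU. SM 100 is Blackwell.
    return 100


def extend_ctx(use_one_engine: bool, attention_backend: type) -> bool:
    if use_one_engine:
        # 1-model has separate logic for handling draft tokens.
        return False
    # The special XQA generation kernels only exist with the TRTLLM
    # backend on Blackwell; every other combination falls back.
    return (not issubclass(attention_backend, TrtllmAttention)
            or get_sm_version() != 100)


# TRTLLM backend on Blackwell: XQA spec-dec kernels are usable.
assert extend_ctx(False, TrtllmAttention) is False
# Any other backend falls back to extending the context.
assert extend_ctx(False, AttentionBackend) is True
# 1-model flow short-circuits regardless of backend.
assert extend_ctx(True, TrtllmAttention) is False

Compared with the removed code, the check no longer enumerates eagle3, draft-target, ngram, and user-provided modes individually: the 1-model path is handled first, and the remaining backend/SM check applies uniformly to all modes, which is what lets ngram reach the XQA attention path on Blackwell, as the commit title states.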