[https://nvbugs/5455651][fix] Make ngram use XQA attention on Blackwell (#6873)

Signed-off-by: Michael Iovine <miovine@nvidia.com>
Signed-off-by: Mike Iovine <miovine@nvidia.com>
Signed-off-by: Mike Iovine <mike.iovine7@gmail.com>
commit 078e907b16 (parent 26f413ad90)
Author: Mike Iovine <miovine@nvidia.com>
Date:   2025-08-14 18:36:19 -04:00
Committer: GitHub

@@ -91,11 +91,13 @@ class SpeculativeDecodingMode(IntEnum):
         any spec dec mode that uses the SpecExecutor.
         """
-        # Fixme: only trtllm attention backend supports eagle3 generation-phase kernels on blackwell.
-        return ((self.is_eagle3() or self.is_draft_target())
-                and not (issubclass(attention_backend, TrtllmAttention)
-                         and get_sm_version() == 100)
-                ) or self.is_ngram() or self.is_user_provided()
+        if self.use_one_engine():
+            # 1-model has separate logic for handling draft tokens
+            return False
+        # The special XQA generation kernels only exist with the TRTLLM backend on blackwell.
+        return not issubclass(attention_backend,
+                              TrtllmAttention) or get_sm_version() != 100
+
     def attention_need_spec_dec_mode(self):
         """