From 078e907b16864cb760dad9092043c4c7e1adcadf Mon Sep 17 00:00:00 2001
From: Mike Iovine
Date: Thu, 14 Aug 2025 18:36:19 -0400
Subject: [PATCH] [https://nvbugs/5455651][fix] Make ngram use XQA attention on Blackwell (#6873)

Signed-off-by: Michael Iovine
Signed-off-by: Mike Iovine
Signed-off-by: Mike Iovine
---
 tensorrt_llm/_torch/speculative/interface.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/tensorrt_llm/_torch/speculative/interface.py b/tensorrt_llm/_torch/speculative/interface.py
index acc68b4b94..f7cdd92a56 100644
--- a/tensorrt_llm/_torch/speculative/interface.py
+++ b/tensorrt_llm/_torch/speculative/interface.py
@@ -91,11 +91,13 @@ class SpeculativeDecodingMode(IntEnum):
         any spec dec mode that uses the SpecExecutor.
         """
 
-        # Fixme: only trtllm attention backend supports eagle3 generation-phase kernels on blackwell.
-        return ((self.is_eagle3() or self.is_draft_target())
-                and not (issubclass(attention_backend, TrtllmAttention)
-                         and get_sm_version() == 100)
-                ) or self.is_ngram() or self.is_user_provided()
+        if self.use_one_engine():
+            # 1-model has separate logic for handling draft tokens
+            return False
+
+        # The special XQA generation kernels only exist with the TRTLLM backend on blackwell.
+        return not issubclass(attention_backend,
+                              TrtllmAttention) or get_sm_version() != 100
 
     def attention_need_spec_dec_mode(self):
         """
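
For reference, the sketch below restates the decision the patched code makes. It is plain Python with no TensorRT-LLM imports; the function name and boolean parameters are illustrative stand-ins (assumptions, not the real API) for SpeculativeDecodingMode.use_one_engine(), issubclass(attention_backend, TrtllmAttention), and get_sm_version() in the diff above.

# Illustrative sketch only: names here are hypothetical stand-ins for the
# real TensorRT-LLM methods referenced in the patch.
def spec_mode_extends_ctx(uses_one_engine: bool, is_trtllm_backend: bool,
                          sm_version: int) -> bool:
    if uses_one_engine:
        # 1-model modes handle draft tokens through their own path.
        return False
    # The XQA generation kernels exist only for the TRTLLM backend on
    # Blackwell (SM 100); every other combination keeps the old behavior.
    return not (is_trtllm_backend and sm_version == 100)

# With the patch, ngram (a 2-model mode) no longer returns True
# unconditionally: on Blackwell with the TRTLLM backend it now returns
# False, so the XQA attention path is used.
assert spec_mode_extends_ctx(False, True, 100) is False
assert spec_mode_extends_ctx(False, True, 90) is True
assert spec_mode_extends_ctx(False, False, 100) is True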