From 078e907b16864cb760dad9092043c4c7e1adcadf Mon Sep 17 00:00:00 2001
From: Mike Iovine
Date: Thu, 14 Aug 2025 18:36:19 -0400
Subject: [PATCH] [https://nvbugs/5455651][fix] Make ngram use XQA attention on Blackwell (#6873)

Signed-off-by: Michael Iovine
Signed-off-by: Mike Iovine
Signed-off-by: Mike Iovine
---
 tensorrt_llm/_torch/speculative/interface.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/tensorrt_llm/_torch/speculative/interface.py b/tensorrt_llm/_torch/speculative/interface.py
index acc68b4b94..f7cdd92a56 100644
--- a/tensorrt_llm/_torch/speculative/interface.py
+++ b/tensorrt_llm/_torch/speculative/interface.py
@@ -91,11 +91,13 @@ class SpeculativeDecodingMode(IntEnum):
         any spec dec mode that uses the SpecExecutor.
         """
 
-        # Fixme: only trtllm attention backend supports eagle3 generation-phase kernels on blackwell.
-        return ((self.is_eagle3() or self.is_draft_target())
-                and not (issubclass(attention_backend, TrtllmAttention)
-                         and get_sm_version() == 100)
-                ) or self.is_ngram() or self.is_user_provided()
+        if self.use_one_engine():
+            # 1-model has separate logic for handling draft tokens
+            return False
+
+        # The special XQA generation kernels only exist with the TRTLLM backend on blackwell.
+        return not issubclass(attention_backend,
+                              TrtllmAttention) or get_sm_version() != 100
 
     def attention_need_spec_dec_mode(self):
         """
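
For reference, the sketch below restates the decision the patched code makes. It is plain Python with no TensorRT-LLM imports; the function name and boolean parameters are illustrative stand-ins (assumptions, not the real API) for SpeculativeDecodingMode.use_one_engine(), issubclass(attention_backend, TrtllmAttention), and get_sm_version() in the diff above.

# Illustrative sketch only: names here are hypothetical stand-ins for the
# real TensorRT-LLM methods referenced in the patch.
def spec_mode_extends_ctx(uses_one_engine: bool, is_trtllm_backend: bool,
                          sm_version: int) -> bool:
    if uses_one_engine:
        # 1-model modes handle draft tokens through their own path.
        return False
    # The XQA generation kernels exist only for the TRTLLM backend on
    # Blackwell (SM 100); every other combination keeps the old behavior.
    return not (is_trtllm_backend and sm_version == 100)

# With the patch, ngram (a 2-model mode) no longer returns True
# unconditionally: on Blackwell with the TRTLLM backend it now returns
# False, so the XQA attention path is used.
assert spec_mode_extends_ctx(False, True, 100) is False
assert spec_mode_extends_ctx(False, True, 90) is True
assert spec_mode_extends_ctx(False, False, 100) is True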