Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)
[https://nvbugs/5455651][fix] Make ngram use XQA attention on Blackwell (#6873)
Signed-off-by: Michael Iovine <miovine@nvidia.com>
Signed-off-by: Mike Iovine <miovine@nvidia.com>
Signed-off-by: Mike Iovine <mike.iovine7@gmail.com>
parent 26f413ad90
commit 078e907b16
@@ -91,11 +91,13 @@ class SpeculativeDecodingMode(IntEnum):
         any spec dec mode that uses the SpecExecutor.
         """
 
-        # Fixme: only trtllm attention backend supports eagle3 generation-phase kernels on blackwell.
-        return ((self.is_eagle3() or self.is_draft_target())
-                and not (issubclass(attention_backend, TrtllmAttention)
-                         and get_sm_version() == 100)
-                ) or self.is_ngram() or self.is_user_provided()
+        if self.use_one_engine():
+            # 1-model has separate logic for handling draft tokens
+            return False
+
+        # The special XQA generation kernels only exist with the TRTLLM backend on blackwell.
+        return not issubclass(attention_backend,
+                              TrtllmAttention) or get_sm_version() != 100
 
     def attention_need_spec_dec_mode(self):
         """
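For readers outside the repo, below is a minimal self-contained sketch of the new branch logic. The hunk does not show the enclosing method's signature, so the function name extend_ctx, the stub backend classes, and the hard-coded SM version are assumptions for illustration; only the branch structure mirrors the diff (SM version 100 corresponds to Blackwell).

# Minimal sketch; names outside the diff are hypothetical stand-ins.

class AttentionBackend:
    """Stand-in for the generic attention backend base class."""


class TrtllmAttention(AttentionBackend):
    """Stand-in for the TRTLLM attention backend."""


def get_sm_version() -> int:
    # Stub: the real helper queries the GPU. SM 100 is Blackwell.
    return 100


def extend_ctx(use_one_engine: bool, attention_backend: type) -> bool:
    if use_one_engine:
        # 1-model has separate logic for handling draft tokens.
        return False
    # The special XQA generation kernels only exist with the TRTLLM
    # backend on Blackwell; every other combination falls back.
    return (not issubclass(attention_backend, TrtllmAttention)
            or get_sm_version() != 100)


# TRTLLM backend on Blackwell: XQA spec-dec kernels are usable.
assert extend_ctx(False, TrtllmAttention) is False
# Any other backend falls back to extending the context.
assert extend_ctx(False, AttentionBackend) is True
# 1-model flow short-circuits regardless of backend.
assert extend_ctx(True, TrtllmAttention) is False

Compared with the removed code, the check no longer enumerates eagle3, draft-target, ngram, and user-provided modes individually: the 1-model path is handled first, and the remaining backend/SM check applies uniformly to all modes, which is what lets ngram reach the XQA attention path on Blackwell, as the commit title states.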