From f6fff181422b7ac0f6de10ea41ebb4cbda8c5a25 Mon Sep 17 00:00:00 2001
From: "Yueh-Ting (eop) Chen"
Date: Wed, 4 Feb 2026 19:21:50 +0800
Subject: [PATCH] [https://nvbugs/5624818][fix] Work around accuracy issue by
 enforcing paged_context_fmha on Hopper for fmha_v2 (#11192)

Signed-off-by: eopXD
---
 tensorrt_llm/_torch/attention_backend/trtllm.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tensorrt_llm/_torch/attention_backend/trtllm.py b/tensorrt_llm/_torch/attention_backend/trtllm.py
index 3ba32dc9ac..47f433e1fc 100644
--- a/tensorrt_llm/_torch/attention_backend/trtllm.py
+++ b/tensorrt_llm/_torch/attention_backend/trtllm.py
@@ -1549,6 +1549,11 @@ class TrtllmAttention(AttentionBackend[TrtllmAttentionMetadata]):
             or metadata.runtime_features.has_speculative_draft_tokens
         ) if metadata.runtime_features else False

+        # This is a workaround for https://nvbugs/5624818
+        # Paged context FMHA is forced on SM90 for correctness
+        if get_sm_version() == 90:
+            use_paged_context_fmha = True
+
         return self.wrapper.is_nvfp4_output_kernel_available(
             tokens_per_block=metadata.tokens_per_block,
             attention_mask=attention_mask,
@@ -1648,6 +1653,11 @@ class TrtllmAttention(AttentionBackend[TrtllmAttentionMetadata]):
             or metadata.runtime_features.has_speculative_draft_tokens
         ) if metadata.runtime_features else False

+        # This is a workaround for https://nvbugs/5624818
+        # Paged context FMHA is forced on SM90 for correctness
+        if get_sm_version() == 90:
+            use_paged_context_fmha = True
+
         if self.is_mla_enable:
             # Context MLA uses separate qkv instead of paged_context_fmha
             use_paged_context_fmha = False
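
Note (not part of the patch): the sketch below is a minimal, self-contained illustration of the guard this patch adds in both hunks. The real code lives inside TrtllmAttention in tensorrt_llm/_torch/attention_backend/trtllm.py and calls tensorrt_llm's get_sm_version(); here a hypothetical detect_sm_version() stub and a simplified resolve_paged_context_fmha() helper stand in so the logic can run on its own. It is an assumption-laden sketch of the intent, not the library's actual API.

    # Minimal sketch of the SM90 workaround, under the assumptions stated above.

    def detect_sm_version() -> int:
        """Hypothetical stand-in for tensorrt_llm's get_sm_version()."""
        return 90  # pretend we are running on Hopper (SM90)


    def resolve_paged_context_fmha(chunked_prefill: bool,
                                   cache_reuse: bool,
                                   has_draft_tokens: bool) -> bool:
        # Normally the flag is derived from the runtime features ...
        use_paged_context_fmha = chunked_prefill or cache_reuse or has_draft_tokens

        # ... but the workaround for https://nvbugs/5624818 forces it on SM90,
        # mirroring the lines added in both hunks of this patch.
        if detect_sm_version() == 90:
            use_paged_context_fmha = True

        return use_paged_context_fmha


    if __name__ == "__main__":
        # Even with every runtime feature disabled, SM90 still takes the paged path.
        print(resolve_paged_context_fmha(False, False, False))  # True on SM90

The design point of the patch is that the override is applied unconditionally on Hopper, before the nvfp4-output-kernel check in the first hunk and before the MLA special case in the second (where MLA then turns the flag back off).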