From 2b4ef3a014bb042ec84fc6673211c693c6547cec Mon Sep 17 00:00:00 2001
From: Pengbo Wang <221450789+pengbowang-nv@users.noreply.github.com>
Date: Wed, 4 Feb 2026 16:26:24 +0800
Subject: [PATCH] [https://nvbugs/5815025][fix] Fix spec-dec mode flag and related cpp requirements (#10996)

Signed-off-by: Pengbo Wang <221450789+pengbowang-nv@users.noreply.github.com>
Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com>
---
 cpp/tensorrt_llm/common/attentionOp.cpp        |  8 --------
 tensorrt_llm/_torch/pyexecutor/model_engine.py |  8 ++------
 tensorrt_llm/_torch/speculative/interface.py   | 14 ++++++--------
 tests/integration/test_lists/waives.txt        |  1 -
 4 files changed, 8 insertions(+), 23 deletions(-)

diff --git a/cpp/tensorrt_llm/common/attentionOp.cpp b/cpp/tensorrt_llm/common/attentionOp.cpp
index a31750ce6e..33a92a8c03 100644
--- a/cpp/tensorrt_llm/common/attentionOp.cpp
+++ b/cpp/tensorrt_llm/common/attentionOp.cpp
@@ -1271,14 +1271,6 @@ int AttentionOp::mlaGeneration(
             mXqaDispatcher->run(xqaParams, kv_cache_buffer, kv_scale_cache_buffer);
             return 0;
         }
-        else if (mIsSpecDecodingEnabled && mUseSpecDecoding)
-        {
-            TLLM_CHECK_WITH_INFO(false, "No available XQA kernels are found for speculative decoding mode.");
-        }
-        else if (mFuseFp4Quant)
-        {
-            TLLM_CHECK_WITH_INFO(false, "No available kernels are found for FP4 output.");
-        }
     }

     // Use FMHA otherwise.
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
index 866bc4cf05..cb9971b9ca 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_engine.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -3445,13 +3445,9 @@ class PyTorchModelEngine(ModelEngine):
                 no_cache=kv_cache_manager is None)

             # attn_metadata now depends on spec_metadata since it determines the shape/content of spec_dec parameter Tensors
-            enable_mla = is_mla(self.model.model_config.pretrained_config)
             is_spec_dec_mode = spec_metadata.spec_dec_mode.attention_need_spec_dec_mode(
-                spec_resource_manager,
-                self.is_draft_model,
-                self.attn_backend,
-                self.model_is_wrapped,
-                is_mla=enable_mla)
+                spec_resource_manager, self.is_draft_model, self.attn_backend,
+                self.model_is_wrapped)
             attn_metadata.update_spec_dec_param(
                 batch_size=scheduled_requests.batch_size,
                 is_spec_decoding_enabled=is_spec_dec_mode,
diff --git a/tensorrt_llm/_torch/speculative/interface.py b/tensorrt_llm/_torch/speculative/interface.py
index ea623bc442..523a78129f 100644
--- a/tensorrt_llm/_torch/speculative/interface.py
+++ b/tensorrt_llm/_torch/speculative/interface.py
@@ -164,12 +164,11 @@ class SpeculativeDecodingMode(IntEnum):
                            TrtllmAttention) or not xqa_supported

     def attention_need_spec_dec_mode(
-            self,
-            spec_resource_manager: Optional[BaseResourceManager],
-            is_draft_model: bool,
-            attention_backend: Type[AttentionBackend],
-            use_chain_drafter: bool,  # CDL
-            is_mla: bool,
+        self,
+        spec_resource_manager: Optional[BaseResourceManager],
+        is_draft_model: bool,
+        attention_backend: Type[AttentionBackend],
+        use_chain_drafter: bool,  # CDL
     ):
         """
         If true, the attention backend kernel needs to run in spec-dec mode (multi-token query mode).
@@ -182,8 +181,7 @@ class SpeculativeDecodingMode(IntEnum):
         is_trtllm_attention = issubclass(attention_backend, TrtllmAttention)

         # Always use the multi-token query mode for 1-model if the kernels are available.
-        xqa_supported = not is_mla or get_sm_version() < 120
-        use_case_1 = self.use_one_engine() and xqa_supported
+        use_case_1 = self.use_one_engine()
         # For 2-model, we need to enable it when we process multiple tokens at once. This occurs with
         # the target model (verification) or on the first draft for CDL based speculation.
         use_case_2 = not self.use_one_engine() and (
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 27abc344a5..6ca72b1253 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -246,7 +246,6 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_m
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_model] SKIP (https://nvbugs/5756028)
 accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized SKIP (https://nvbugs/5785465)
 accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8 SKIP (https://nvbugs/5785485)
-accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True-torch_compile=False] SKIP (https://nvbugs/5787892)
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=False] SKIP (https://nvbugs/5787892)
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=False] SKIP (https://nvbugs/5795918)
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=True] SKIP (https://nvbugs/5800591)
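
The behavioural core of this patch: attention_need_spec_dec_mode() drops its is_mla
parameter and the SM-version gate, so for 1-model speculation the attention backend now
always requests multi-token query (spec-dec) mode, and on the C++ side the removed
TLLM_CHECK_WITH_INFO branches let mlaGeneration fall through to the FMHA path ("Use FMHA
otherwise.") instead of asserting. Below is a minimal, self-contained Python sketch of the
flag change; ToySpecDecMode is a hypothetical stand-in for illustration only, not the real
SpeculativeDecodingMode class.

    # Toy model of the spec-dec flag decision before/after this patch (stand-in, not TRT-LLM code).
    from enum import IntEnum

    class ToySpecDecMode(IntEnum):
        ONE_MODEL = 0
        TWO_MODEL = 1

        def use_one_engine(self) -> bool:
            return self is ToySpecDecMode.ONE_MODEL

        def attention_need_spec_dec_mode(
            self,
            spec_resource_manager,    # unused in this sketch
            is_draft_model: bool,
            attention_backend,        # unused in this sketch
            use_chain_drafter: bool,  # CDL
        ) -> bool:
            # Before: self.use_one_engine() and xqa_supported, where
            #         xqa_supported = not is_mla or get_sm_version() < 120.
            # After:  the 1-model case is unconditional.
            return self.use_one_engine()

    # 1-model speculation now always enables spec-dec mode, regardless of MLA or SM version.
    print(ToySpecDecMode.ONE_MODEL.attention_need_spec_dec_mode(None, False, None, False))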