From 4b8ba7ad61faea39a9b380183757fba88a49b730 Mon Sep 17 00:00:00 2001
From: Mike Iovine <6158008+mikeiovine@users.noreply.github.com>
Date: Fri, 9 May 2025 10:45:14 -0400
Subject: [PATCH] [fix][nvbug/5244009] Fix llama 4 test lists/scout accuracy
 issue (#4069)

[fix] Fix llama 4 test lists

Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com>
---
 tensorrt_llm/_torch/modules/attention.py             | 9 ++++-----
 tests/integration/test_lists/test-db/l0_dgx_h200.yml | 4 +---
 tests/integration/test_lists/waives.txt              | 2 --
 3 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/tensorrt_llm/_torch/modules/attention.py b/tensorrt_llm/_torch/modules/attention.py
index b5075e81df..7fa18f5f6b 100644
--- a/tensorrt_llm/_torch/modules/attention.py
+++ b/tensorrt_llm/_torch/modules/attention.py
@@ -119,13 +119,12 @@ class Attention(nn.Module):
         self.o_lora = LoraLayer([LoraModuleType.ATTENTION_DENSE],
                                 [self.hidden_size])
 
-        self.use_qk_norm = (
-            config.pretrained_config
-            and (config.pretrained_config.model_type == 'qwen3'
-                 or config.pretrained_config.model_type == 'qwen3_moe'))
+        use_qk_norm = (config.pretrained_config and
+                       (config.pretrained_config.model_type == 'qwen3'
+                        or config.pretrained_config.model_type == 'qwen3_moe'))
         attn_cls = get_attention_backend(self.attn_backend)
         self.enable_rope_fusion = attn_cls.support_fused_rope(
-        ) and not self.use_qk_norm
+        ) and not use_qk_norm
         self.attn = create_attention(
             self.attn_backend,
             self.layer_idx,
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h200.yml b/tests/integration/test_lists/test-db/l0_dgx_h200.yml
index 97cac4d7f0..b36682d060 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_h200.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_h200.yml
@@ -17,6 +17,4 @@ l0_dgx_h200:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4--attention_dp-cuda_graph-overlap_scheduler]
   # - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] # OOM
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[latency] # 1h
-  - unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[tp8-pp1-trtllm-scout-enable_graph]
-  - unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[tp1-ep1-pp8-trtllm-scout-enable_graph]
-  # - unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[tp8-trtllm-maverick] # 3h will timeout
+  - unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[pp1-ep1-enable_graph-tp8-trtllm-scout]
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index e9bba817bf..e252a4b2d5 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -472,8 +472,6 @@ examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padd
 examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058)
 examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] SKIP (https://nvbugs/5234058)
 examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-RobertaForQuestionAnswering-bert/roberta-base-squad2] SKIP (https://nvbugs/5234058)
-unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[tp1-ep1-pp8-trtllm-scout-enable_graph] SKIP (https://nvbugs/5260488)
-unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[tp8-pp1-trtllm-scout-enable_graph] SKIP (https://nvbugs/5260488)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-fp8kv-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5261055, https://nvbugs/5170160)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5261055, https://nvbugs/5170160)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-attention_dp] SKIP (https://nvbugs/5261055, https://nvbugs/5170160)