From 4b8ba7ad61faea39a9b380183757fba88a49b730 Mon Sep 17 00:00:00 2001
From: Mike Iovine <6158008+mikeiovine@users.noreply.github.com>
Date: Fri, 9 May 2025 10:45:14 -0400
Subject: [PATCH] [fix][nvbug/5244009] Fix llama 4 test lists/scout accuracy
 issue (#4069)

[fix] Fix llama 4 test lists

Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com>
---
 tensorrt_llm/_torch/modules/attention.py             | 9 ++++-----
 tests/integration/test_lists/test-db/l0_dgx_h200.yml | 4 +---
 tests/integration/test_lists/waives.txt              | 2 --
 3 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/tensorrt_llm/_torch/modules/attention.py b/tensorrt_llm/_torch/modules/attention.py
index b5075e81df..7fa18f5f6b 100644
--- a/tensorrt_llm/_torch/modules/attention.py
+++ b/tensorrt_llm/_torch/modules/attention.py
@@ -119,13 +119,12 @@ class Attention(nn.Module):
         self.o_lora = LoraLayer([LoraModuleType.ATTENTION_DENSE],
                                 [self.hidden_size])
 
-        self.use_qk_norm = (
-            config.pretrained_config
-            and (config.pretrained_config.model_type == 'qwen3'
-                 or config.pretrained_config.model_type == 'qwen3_moe'))
+        use_qk_norm = (config.pretrained_config and
+                       (config.pretrained_config.model_type == 'qwen3'
+                        or config.pretrained_config.model_type == 'qwen3_moe'))
         attn_cls = get_attention_backend(self.attn_backend)
         self.enable_rope_fusion = attn_cls.support_fused_rope(
-        ) and not self.use_qk_norm
+        ) and not use_qk_norm
         self.attn = create_attention(
             self.attn_backend,
             self.layer_idx,
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h200.yml b/tests/integration/test_lists/test-db/l0_dgx_h200.yml
index 97cac4d7f0..b36682d060 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_h200.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_h200.yml
@@ -17,6 +17,4 @@ l0_dgx_h200:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4--attention_dp-cuda_graph-overlap_scheduler]
   # - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] # OOM
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[latency] # 1h
-  - unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[tp8-pp1-trtllm-scout-enable_graph]
-  - unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[tp1-ep1-pp8-trtllm-scout-enable_graph]
-  # - unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[tp8-trtllm-maverick] # 3h will timeout
+  - unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[pp1-ep1-enable_graph-tp8-trtllm-scout]
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index e9bba817bf..e252a4b2d5 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -472,8 +472,6 @@ examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padd
 examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058)
 examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] SKIP (https://nvbugs/5234058)
 examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-RobertaForQuestionAnswering-bert/roberta-base-squad2] SKIP (https://nvbugs/5234058)
-unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[tp1-ep1-pp8-trtllm-scout-enable_graph] SKIP (https://nvbugs/5260488)
-unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[tp8-pp1-trtllm-scout-enable_graph] SKIP (https://nvbugs/5260488)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-fp8kv-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5261055, https://nvbugs/5170160)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5261055, https://nvbugs/5170160)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-attention_dp] SKIP (https://nvbugs/5261055, https://nvbugs/5170160)