From 582dec5bb5de63931efd430addd1eeb0e45f4cfc Mon Sep 17 00:00:00 2001
From: Bo Li <22713281+bobboli@users.noreply.github.com>
Date: Wed, 14 Jan 2026 20:03:01 +0800
Subject: [PATCH] [https://nvbugs/5774869][infra] Use 2 GPUs to test skip
 softmax attention on H100. (#10420)

Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com>
---
 .../accuracy/references/longbench_v1.yaml     |  6 +--
 .../defs/accuracy/test_llm_api_pytorch.py     | 41 +++++++++++++++++--
 .../test_lists/test-db/l0_b200.yml            |  1 -
 .../test_lists/test-db/l0_dgx_h100.yml        |  3 ++
 .../test_lists/test-db/l0_h100.yml            |  4 --
 tests/integration/test_lists/waives.txt       |  3 --
 6 files changed, 44 insertions(+), 14 deletions(-)

diff --git a/tests/integration/defs/accuracy/references/longbench_v1.yaml b/tests/integration/defs/accuracy/references/longbench_v1.yaml
index c638ab92bb..e54288d094 100644
--- a/tests/integration/defs/accuracy/references/longbench_v1.yaml
+++ b/tests/integration/defs/accuracy/references/longbench_v1.yaml
@@ -1,8 +1,8 @@
 Qwen3/Qwen3-30B-A3B-Instruct-2507:
   # Skip Softmax Attention ref accuracy
   - extra_acc_spec: "target_sparsity=0.0"
-    accuracy: 47.22
+    accuracy: 47.357
   - extra_acc_spec: "target_sparsity=0.5"
-    accuracy: 47.22
+    accuracy: 47.102
   - extra_acc_spec: "target_sparsity=0.9"
-    accuracy: 45.90
+    accuracy: 46.169
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index c3d6c87435..d5835744a7 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -3884,7 +3884,6 @@ class TestQwen3_30B_A3B_Instruct_2507(LlmapiAccuracyTestHarness):
     MODEL_PATH = f"{llm_models_root()}/{MODEL_NAME}"
 
     @skip_pre_hopper
-    # @pytest.mark.skip_less_device_memory(140000)  # Only test for H200, B200
     @pytest.mark.parametrize(
         "target_sparsity,thr_prefill,thr_decode",
         [
@@ -3903,10 +3902,11 @@ class TestQwen3_30B_A3B_Instruct_2507(LlmapiAccuracyTestHarness):
                 "prefill": thr_prefill,
                 "decode": thr_decode,
             })
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.85)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75,
+                                        enable_block_reuse=False)
 
         if get_sm_version() >= 100:
-            pytest.skip("Bug to be fixed on Blackwell")
+            pytest.skip("https://nvbugs/5783509: Bug to be fixed on Blackwell")
 
         with LLM(self.MODEL_PATH,
                  attn_backend="TRTLLM",
@@ -3918,6 +3918,58 @@ class TestQwen3_30B_A3B_Instruct_2507(LlmapiAccuracyTestHarness):
             task.evaluate(llm,
                           extra_acc_spec=f"target_sparsity={target_sparsity}")
 
+    @skip_pre_hopper
+    @pytest.mark.skip_less_device(2)
+    @pytest.mark.parametrize(
+        "target_sparsity,thr_prefill,thr_decode",
+        [
+            (0.0, 0.0, 0.0),
+            (0.5, 85.97384174442398, 55.48258322852407),
+            (0.9, 1418.142868970396, 863.147841750025),
+        ],
+        ids=[
+            "target_sparsity_0.0", "target_sparsity_0.5", "target_sparsity_0.9"
+        ],
+    )
+    def test_skip_softmax_attention_2gpus(self, target_sparsity: float,
+                                          thr_prefill: float,
+                                          thr_decode: float):
+        """Evaluate skip softmax attention on LongBench V1 using two GPUs.
+
+        Two-GPU counterpart of ``test_skip_softmax_attention``: the model is
+        loaded with TP=2, EP=2 and attention DP enabled.
+
+        Args:
+            target_sparsity: Sparsity level; forwarded to the evaluation as
+                ``extra_acc_spec`` to pick the matching reference accuracy.
+            thr_prefill: Prefill entry of ``threshold_scale_factor``.
+            thr_decode: Decode entry of ``threshold_scale_factor``.
+        """
+        # Same Blackwell limitation as the single-GPU variant of this test.
+        if get_sm_version() >= 100:
+            pytest.skip("https://nvbugs/5783509: Bug to be fixed on Blackwell")
+
+        sparse_attention_config = SkipSoftmaxAttentionConfig(
+            threshold_scale_factor={
+                "prefill": thr_prefill,
+                "decode": thr_decode,
+            })
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75,
+                                        enable_block_reuse=False)
+
+        with LLM(self.MODEL_PATH,
+                 attn_backend="TRTLLM",
+                 max_batch_size=256,
+                 max_num_tokens=100000,
+                 tensor_parallel_size=2,
+                 moe_expert_parallel_size=2,
+                 enable_attention_dp=True,
+                 kv_cache_config=kv_cache_config,
+                 sparse_attention_config=sparse_attention_config) as llm:
+            task = LongBenchV1(self.MODEL_NAME)
+            task.evaluate(llm,
+                          extra_acc_spec=f"target_sparsity={target_sparsity}")
+
 
 class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-4-mini-instruct"
diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml
index 809810c6d9..02616d7eda 100644
--- a/tests/integration/test_lists/test-db/l0_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_b200.yml
@@ -55,7 +55,6 @@ l0_b200:
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM]
-  - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.0]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9]
   - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass]
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
index becc112325..91bf2542b7 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -48,6 +48,9 @@ l0_dgx_h100:
   - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-2]
   # llmapi
   - unittest/llmapi/test_mpi_session.py::test_llmapi_launch_multiple_tasks
+  # ------------- Skip softmax attention tests ---------------
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.5]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.9]
 - condition:
     ranges:
       system_gpu_count:
diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml
index adae47f626..994c43a1fc 100644
--- a/tests/integration/test_lists/test-db/l0_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_h100.yml
@@ -78,10 +78,6 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_dummy_load_format
-  # Waive known failures in https://nvbugs/5774869
-  # - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.0] TIMEOUT (90)
-  # - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5] TIMEOUT (90)
-  # - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9] TIMEOUT (90)
   - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=False-eagle3_one_model=False]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=True-eagle3_one_model=True]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=False-eagle3_one_model=True]
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index edeb4af670..ab77f07e41 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -318,9 +318,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=False] SKIP (https://nvbugs/5794796)
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=False] SKIP (https://nvbugs/5794796)
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency] SKIP (https://nvbugs/5775544)
-accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5] SKIP (https://nvbugs/5774869)
-accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.0] SKIP (https://nvbugs/5774869)
-accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9] SKIP (https://nvbugs/5774869)
 triton_server/test_triton.py::test_llava_onevision[llava_onevision] SKIP (https://nvbugs/5775205)
 triton_server/test_triton.py::test_gpt_ib_lad[gpt-ib-lad] SKIP (https://nvbugs/5775223)
 unittest/_torch/modules/test_fused_moe.py::test_fused_moe_fp8_blockwise_cute_dsl_multi_gpu[MoEWeightLoadingMode.FUSED_GATE_UP_PROJ-DefaultMoeRoutingMethod-1] SKIP (https://nvbugs/5775256)