From 582dec5bb5de63931efd430addd1eeb0e45f4cfc Mon Sep 17 00:00:00 2001 From: Bo Li <22713281+bobboli@users.noreply.github.com> Date: Wed, 14 Jan 2026 20:03:01 +0800 Subject: [PATCH] [https://nvbugs/5774869][infra] Use 2 GPUs to test skip softmax attention on H100. (#10420) Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com> --- .../accuracy/references/longbench_v1.yaml | 6 +-- .../defs/accuracy/test_llm_api_pytorch.py | 41 +++++++++++++++++-- .../test_lists/test-db/l0_b200.yml | 1 - .../test_lists/test-db/l0_dgx_h100.yml | 3 ++ .../test_lists/test-db/l0_h100.yml | 4 -- tests/integration/test_lists/waives.txt | 3 -- 6 files changed, 44 insertions(+), 14 deletions(-) diff --git a/tests/integration/defs/accuracy/references/longbench_v1.yaml b/tests/integration/defs/accuracy/references/longbench_v1.yaml index c638ab92bb..e54288d094 100644 --- a/tests/integration/defs/accuracy/references/longbench_v1.yaml +++ b/tests/integration/defs/accuracy/references/longbench_v1.yaml @@ -1,8 +1,8 @@ Qwen3/Qwen3-30B-A3B-Instruct-2507: # Skip Softmax Attention ref accuracy - extra_acc_spec: "target_sparsity=0.0" - accuracy: 47.22 + accuracy: 47.357 - extra_acc_spec: "target_sparsity=0.5" - accuracy: 47.22 + accuracy: 47.102 - extra_acc_spec: "target_sparsity=0.9" - accuracy: 45.90 + accuracy: 46.169 diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index c3d6c87435..d5835744a7 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -3884,7 +3884,6 @@ class TestQwen3_30B_A3B_Instruct_2507(LlmapiAccuracyTestHarness): MODEL_PATH = f"{llm_models_root()}/{MODEL_NAME}" @skip_pre_hopper - # @pytest.mark.skip_less_device_memory(140000) # Only test for H200, B200 @pytest.mark.parametrize( "target_sparsity,thr_prefill,thr_decode", [ @@ -3903,10 +3902,11 @@ class TestQwen3_30B_A3B_Instruct_2507(LlmapiAccuracyTestHarness): "prefill": thr_prefill, "decode": thr_decode, }) - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.85) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75, + enable_block_reuse=False) if get_sm_version() >= 100: - pytest.skip("Bug to be fixed on Blackwell") + pytest.skip("https://nvbugs/5783509: Bug to be fixed on Blackwell") with LLM(self.MODEL_PATH, attn_backend="TRTLLM", @@ -3918,6 +3918,41 @@ class TestQwen3_30B_A3B_Instruct_2507(LlmapiAccuracyTestHarness): task.evaluate(llm, extra_acc_spec=f"target_sparsity={target_sparsity}") + @pytest.mark.parametrize( + "target_sparsity,thr_prefill,thr_decode", + [ + (0.0, 0.0, 0.0), + (0.5, 85.97384174442398, 55.48258322852407), + (0.9, 1418.142868970396, 863.147841750025), + ], + ids=[ + "target_sparsity_0.0", "target_sparsity_0.5", "target_sparsity_0.9" + ], + ) + def test_skip_softmax_attention_2gpus(self, target_sparsity: float, + thr_prefill: float, + thr_decode: float): + sparse_attention_config = SkipSoftmaxAttentionConfig( + threshold_scale_factor={ + "prefill": thr_prefill, + "decode": thr_decode, + }) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75, + enable_block_reuse=False) + + with LLM(self.MODEL_PATH, + attn_backend="TRTLLM", + max_batch_size=256, + max_num_tokens=100000, + tensor_parallel_size=2, + moe_expert_parallel_size=2, + enable_attention_dp=True, + kv_cache_config=kv_cache_config, + sparse_attention_config=sparse_attention_config) as llm: + task = LongBenchV1(self.MODEL_NAME) + task.evaluate(llm, + extra_acc_spec=f"target_sparsity={target_sparsity}") + class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness): MODEL_NAME = "microsoft/Phi-4-mini-instruct" diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml index 809810c6d9..02616d7eda 100644 --- a/tests/integration/test_lists/test-db/l0_b200.yml +++ b/tests/integration/test_lists/test-db/l0_b200.yml @@ -55,7 +55,6 @@ l0_b200: - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM] - - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.0] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9] - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass] diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml index becc112325..91bf2542b7 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml @@ -48,6 +48,9 @@ l0_dgx_h100: - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-2] # llmapi - unittest/llmapi/test_mpi_session.py::test_llmapi_launch_multiple_tasks + # ------------- Skip softmax attention tests --------------- + - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.5] + - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.9] - condition: ranges: system_gpu_count: diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml index adae47f626..994c43a1fc 100644 --- a/tests/integration/test_lists/test-db/l0_h100.yml +++ b/tests/integration/test_lists/test-db/l0_h100.yml @@ -78,10 +78,6 @@ l0_h100: - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_dummy_load_format - # Waive known failures in https://nvbugs/5774869 - # - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.0] TIMEOUT (90) - # - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5] TIMEOUT (90) - # - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9] TIMEOUT (90) - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=False-eagle3_one_model=False] - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=True-eagle3_one_model=True] - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=False-eagle3_one_model=True] diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index edeb4af670..ab77f07e41 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -318,9 +318,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=False] SKIP (https://nvbugs/5794796) accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=False] SKIP (https://nvbugs/5794796) accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency] SKIP (https://nvbugs/5775544) -accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5] SKIP (https://nvbugs/5774869) -accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.0] SKIP (https://nvbugs/5774869) -accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9] SKIP (https://nvbugs/5774869) triton_server/test_triton.py::test_llava_onevision[llava_onevision] SKIP (https://nvbugs/5775205) triton_server/test_triton.py::test_gpt_ib_lad[gpt-ib-lad] SKIP (https://nvbugs/5775223) unittest/_torch/modules/test_fused_moe.py::test_fused_moe_fp8_blockwise_cute_dsl_multi_gpu[MoEWeightLoadingMode.FUSED_GATE_UP_PROJ-DefaultMoeRoutingMethod-1] SKIP (https://nvbugs/5775256)