[https://nvbugs/5774869][infra] Use 2 GPUs to test skip softmax attention on H100. (#10420)
Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com>
parent: babd5ecacc
commit: 582dec5bb5
Accuracy reference YAML (LongBenchV1, Qwen3-30B-A3B-Instruct-2507):

@@ -1,8 +1,8 @@
 Qwen3/Qwen3-30B-A3B-Instruct-2507:
   # Skip Softmax Attention ref accuracy
   - extra_acc_spec: "target_sparsity=0.0"
-    accuracy: 47.22
+    accuracy: 47.357
   - extra_acc_spec: "target_sparsity=0.5"
-    accuracy: 47.22
+    accuracy: 47.102
   - extra_acc_spec: "target_sparsity=0.9"
-    accuracy: 45.90
+    accuracy: 46.169
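The three reference scores above are keyed by the extra_acc_spec string that the tests pass to task.evaluate. A hypothetical sketch of that lookup follows; the helper name and the tolerance are illustrative assumptions, not the repo's actual accuracy-harness code:

    # Hypothetical sketch: consulting a per-spec reference table like the
    # YAML above. REFERENCES mirrors the values updated in this commit.
    REFERENCES = {
        "target_sparsity=0.0": 47.357,
        "target_sparsity=0.5": 47.102,
        "target_sparsity=0.9": 46.169,
    }

    def check_accuracy(measured: float, extra_acc_spec: str,
                       atol: float = 1.0) -> None:
        # atol is an assumed tolerance, not the harness's real threshold.
        ref = REFERENCES[extra_acc_spec]
        assert abs(measured - ref) <= atol, (
            f"{extra_acc_spec}: measured {measured:.3f}, reference {ref:.3f}")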
accuracy/test_llm_api_pytorch.py:

@@ -3884,7 +3884,6 @@ class TestQwen3_30B_A3B_Instruct_2507(LlmapiAccuracyTestHarness):
     MODEL_PATH = f"{llm_models_root()}/{MODEL_NAME}"

     @skip_pre_hopper
-    # @pytest.mark.skip_less_device_memory(140000) # Only test for H200, B200
     @pytest.mark.parametrize(
         "target_sparsity,thr_prefill,thr_decode",
         [
@@ -3903,10 +3902,11 @@ class TestQwen3_30B_A3B_Instruct_2507(LlmapiAccuracyTestHarness):
                 "prefill": thr_prefill,
                 "decode": thr_decode,
             })
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.85)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75,
+                                        enable_block_reuse=False)

         if get_sm_version() >= 100:
-            pytest.skip("Bug to be fixed on Blackwell")
+            pytest.skip("https://nvbugs/5783509: Bug to be fixed on Blackwell")

         with LLM(self.MODEL_PATH,
                  attn_backend="TRTLLM",
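Two behavioral changes land in this hunk: the KV cache now gets 75% of free GPU memory instead of 85% and block reuse is disabled, and the Blackwell skip message now cites its tracking bug. A minimal standalone sketch of the new cache config, assuming the usual tensorrt_llm.llmapi import path:

    from tensorrt_llm.llmapi import KvCacheConfig

    # Allocate 75% of currently free GPU memory to the KV cache, and disable
    # block reuse so requests with shared prefixes cannot serve attention from
    # previously cached KV blocks during these accuracy runs.
    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75,
                                    enable_block_reuse=False)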
@@ -3918,6 +3918,41 @@ class TestQwen3_30B_A3B_Instruct_2507(LlmapiAccuracyTestHarness):
             task.evaluate(llm,
                           extra_acc_spec=f"target_sparsity={target_sparsity}")

+    @pytest.mark.parametrize(
+        "target_sparsity,thr_prefill,thr_decode",
+        [
+            (0.0, 0.0, 0.0),
+            (0.5, 85.97384174442398, 55.48258322852407),
+            (0.9, 1418.142868970396, 863.147841750025),
+        ],
+        ids=[
+            "target_sparsity_0.0", "target_sparsity_0.5", "target_sparsity_0.9"
+        ],
+    )
+    def test_skip_softmax_attention_2gpus(self, target_sparsity: float,
+                                          thr_prefill: float,
+                                          thr_decode: float):
+        sparse_attention_config = SkipSoftmaxAttentionConfig(
+            threshold_scale_factor={
+                "prefill": thr_prefill,
+                "decode": thr_decode,
+            })
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75,
+                                        enable_block_reuse=False)
+
+        with LLM(self.MODEL_PATH,
+                 attn_backend="TRTLLM",
+                 max_batch_size=256,
+                 max_num_tokens=100000,
+                 tensor_parallel_size=2,
+                 moe_expert_parallel_size=2,
+                 enable_attention_dp=True,
+                 kv_cache_config=kv_cache_config,
+                 sparse_attention_config=sparse_attention_config) as llm:
+            task = LongBenchV1(self.MODEL_NAME)
+            task.evaluate(llm,
+                          extra_acc_spec=f"target_sparsity={target_sparsity}")
+

 class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-4-mini-instruct"
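The new test_skip_softmax_attention_2gpus variant shards the MoE model across two GPUs (tensor_parallel_size=2, moe_expert_parallel_size=2, enable_attention_dp=True); this is the commit's answer to the single-H100 runs that had been waived as timeouts (TIMEOUT (90)) under https://nvbugs/5774869. Once merged, a parametrization can be selected directly by its id, for example:

    pytest "accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.5]"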
l0_b200 test list:

@@ -55,7 +55,6 @@ l0_b200:
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM]
-  - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.0]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9]
   - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass]
l0_dgx_h100 test list:

@@ -48,6 +48,9 @@ l0_dgx_h100:
   - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-2]
   # llmapi
   - unittest/llmapi/test_mpi_session.py::test_llmapi_launch_multiple_tasks
+  # ------------- Skip softmax attention tests ---------------
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.5]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.9]
 - condition:
     ranges:
       system_gpu_count:
l0_h100 test list:

@@ -78,10 +78,6 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_dummy_load_format
-  # Waive known failures in https://nvbugs/5774869
-  # - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.0] TIMEOUT (90)
-  # - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5] TIMEOUT (90)
-  # - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9] TIMEOUT (90)
   - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=False-eagle3_one_model=False]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=True-eagle3_one_model=True]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=False-eagle3_one_model=True]
Test waives list:

@@ -318,9 +318,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=False] SKIP (https://nvbugs/5794796)
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=False] SKIP (https://nvbugs/5794796)
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency] SKIP (https://nvbugs/5775544)
-accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5] SKIP (https://nvbugs/5774869)
-accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.0] SKIP (https://nvbugs/5774869)
-accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9] SKIP (https://nvbugs/5774869)
 triton_server/test_triton.py::test_llava_onevision[llava_onevision] SKIP (https://nvbugs/5775205)
 triton_server/test_triton.py::test_gpt_ib_lad[gpt-ib-lad] SKIP (https://nvbugs/5775223)
 unittest/_torch/modules/test_fused_moe.py::test_fused_moe_fp8_blockwise_cute_dsl_multi_gpu[MoEWeightLoadingMode.FUSED_GATE_UP_PROJ-DefaultMoeRoutingMethod-1] SKIP (https://nvbugs/5775256)