Turn off KV block reuse.

Remove the target_sparsity_0.0 test variant from CI.

Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com>
This commit is contained in:
Bo Li 2026-01-10 22:00:09 +08:00
parent b07c5668f5
commit b97044612f
3 changed files with 4 additions and 4 deletions

View File

@ -3902,7 +3902,8 @@ class TestQwen3_30B_A3B_Instruct_2507(LlmapiAccuracyTestHarness):
"prefill": thr_prefill,
"decode": thr_decode,
})
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75,
enable_block_reuse=False)
if get_sm_version() >= 100:
pytest.skip("https://nvbugs/5783509: Bug to be fixed on Blackwell")
@ -3936,7 +3937,8 @@ class TestQwen3_30B_A3B_Instruct_2507(LlmapiAccuracyTestHarness):
"prefill": thr_prefill,
"decode": thr_decode,
})
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75,
enable_block_reuse=False)
with LLM(self.MODEL_PATH,
attn_backend="TRTLLM",

View File

@ -55,7 +55,6 @@ l0_b200:
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.0]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9]
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass]

View File

@ -48,7 +48,6 @@ l0_dgx_h100:
# llmapi
- unittest/llmapi/test_mpi_session.py::test_llmapi_launch_multiple_tasks
# ------------- Skip softmax attention tests ---------------
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.0]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.5]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.9]
- condition: