mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
Turn off KV cache block reuse.
Remove the target_sparsity_0.0 test cases from CI. Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com>
This commit is contained in:
parent
b07c5668f5
commit
b97044612f
@@ -3902,7 +3902,8 @@ class TestQwen3_30B_A3B_Instruct_2507(LlmapiAccuracyTestHarness):
|
||||
"prefill": thr_prefill,
|
||||
"decode": thr_decode,
|
||||
})
|
||||
- kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
|
||||
+ kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75,
|
||||
+ enable_block_reuse=False)
|
||||
|
||||
if get_sm_version() >= 100:
|
||||
pytest.skip("https://nvbugs/5783509: Bug to be fixed on Blackwell")
|
||||
@@ -3936,7 +3937,8 @@ class TestQwen3_30B_A3B_Instruct_2507(LlmapiAccuracyTestHarness):
|
||||
"prefill": thr_prefill,
|
||||
"decode": thr_decode,
|
||||
})
|
||||
- kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
|
||||
+ kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75,
|
||||
+ enable_block_reuse=False)
|
||||
|
||||
with LLM(self.MODEL_PATH,
|
||||
attn_backend="TRTLLM",
|
||||
|
||||
@@ -55,7 +55,6 @@ l0_b200:
|
||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM]
|
||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS]
|
||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM]
|
||||
-  - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.0]
|
||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5]
|
||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9]
|
||||
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass]
|
||||
|
||||
@@ -48,7 +48,6 @@ l0_dgx_h100:
|
||||
# llmapi
|
||||
- unittest/llmapi/test_mpi_session.py::test_llmapi_launch_multiple_tasks
|
||||
# ------------- Skip softmax attention tests ---------------
|
||||
-  - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.0]
|
||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.5]
|
||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.9]
|
||||
- condition:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user