mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
Turn off KV block reuse.
Remove the target_sparsity_0.0 test cases from CI.

Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com>
This commit is contained in:
parent
b07c5668f5
commit
b97044612f
@ -3902,7 +3902,8 @@ class TestQwen3_30B_A3B_Instruct_2507(LlmapiAccuracyTestHarness):
|
|||||||
"prefill": thr_prefill,
|
"prefill": thr_prefill,
|
||||||
"decode": thr_decode,
|
"decode": thr_decode,
|
||||||
})
|
})
|
||||||
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
|
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75,
|
||||||
|
enable_block_reuse=False)
|
||||||
|
|
||||||
if get_sm_version() >= 100:
|
if get_sm_version() >= 100:
|
||||||
pytest.skip("https://nvbugs/5783509: Bug to be fixed on Blackwell")
|
pytest.skip("https://nvbugs/5783509: Bug to be fixed on Blackwell")
|
||||||
@ -3936,7 +3937,8 @@ class TestQwen3_30B_A3B_Instruct_2507(LlmapiAccuracyTestHarness):
|
|||||||
"prefill": thr_prefill,
|
"prefill": thr_prefill,
|
||||||
"decode": thr_decode,
|
"decode": thr_decode,
|
||||||
})
|
})
|
||||||
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
|
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75,
|
||||||
|
enable_block_reuse=False)
|
||||||
|
|
||||||
with LLM(self.MODEL_PATH,
|
with LLM(self.MODEL_PATH,
|
||||||
attn_backend="TRTLLM",
|
attn_backend="TRTLLM",
|
||||||
|
|||||||
@ -55,7 +55,6 @@ l0_b200:
|
|||||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM]
|
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM]
|
||||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS]
|
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS]
|
||||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM]
|
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM]
|
||||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.0]
|
|
||||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5]
|
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5]
|
||||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9]
|
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9]
|
||||||
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass]
|
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass]
|
||||||
|
|||||||
@ -48,7 +48,6 @@ l0_dgx_h100:
|
|||||||
# llmapi
|
# llmapi
|
||||||
- unittest/llmapi/test_mpi_session.py::test_llmapi_launch_multiple_tasks
|
- unittest/llmapi/test_mpi_session.py::test_llmapi_launch_multiple_tasks
|
||||||
# ------------- Skip softmax attention tests ---------------
|
# ------------- Skip softmax attention tests ---------------
|
||||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.0]
|
|
||||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.5]
|
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.5]
|
||||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.9]
|
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.9]
|
||||||
- condition:
|
- condition:
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user