mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-02-04 02:02:01 +08:00
[https://nvbugs/5598674][fix] enable partial reuse in gemma and gpt oss test (#10559)
Signed-off-by: Chuang Zhu <111838961+chuangz0@users.noreply.github.com>
This commit is contained in:
parent
e3f27e06c7
commit
7e2cbc0756
@@ -1037,12 +1037,12 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
|
||||
ctx_server_config["kv_cache_config"] = {
|
||||
"max_attention_window": [512, 512, 512, 512, 512, 32768],
|
||||
"enable_block_reuse": block_reuse,
|
||||
-"enable_partial_reuse": False,
|
||||
+"enable_partial_reuse": block_reuse,
|
||||
}
|
||||
gen_server_config["kv_cache_config"] = {
|
||||
"max_attention_window": [512, 512, 512, 512, 512, 32768],
|
||||
"enable_block_reuse": block_reuse,
|
||||
-"enable_partial_reuse": False,
|
||||
+"enable_partial_reuse": block_reuse,
|
||||
}
|
||||
disaggregated_server_config = {
|
||||
"hostname": "localhost",
|
||||
@@ -1066,7 +1066,7 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
|
||||
task.evaluate(llm)
|
||||
|
||||
|
||||
-@skip_pre_hopper
|
||||
+@skip_pre_blackwell
|
||||
@pytest.mark.skip_less_device_memory(80000)
|
||||
class TestGPTOSS(LlmapiAccuracyTestHarness):
|
||||
extra_evaluator_kwargs = {
|
||||
@@ -1099,13 +1099,13 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
|
||||
ctx_server_config["kv_cache_config"] = {
|
||||
"max_attention_window": [128, 32768],
|
||||
"enable_block_reuse": block_reuse,
|
||||
-"enable_partial_reuse": False,
|
||||
+"enable_partial_reuse": block_reuse,
|
||||
"free_gpu_memory_fraction": 0.5,
|
||||
}
|
||||
gen_server_config["kv_cache_config"] = {
|
||||
"max_attention_window": [128, 32768],
|
||||
"enable_block_reuse": block_reuse,
|
||||
-"enable_partial_reuse": False,
|
||||
+"enable_partial_reuse": block_reuse,
|
||||
"free_gpu_memory_fraction": 0.5,
|
||||
}
|
||||
disaggregated_server_config = {
|
||||
|
||||
@@ -308,8 +308,6 @@ full:sm89/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_
|
||||
unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_tp_sharding.py::test_sharding[GQA_Block-torch_dist_all_reduce-True-False-2] SKIP (https://nvbugs/5766982)
|
||||
accuracy/test_cli_flow.py::TestVicuna7B::test_eagle_2[cuda_graph=True-chunked_context=True] SKIP (https://nvbugs/5773185)
|
||||
accuracy/test_cli_flow.py::TestVicuna7B::test_eagle_2[cuda_graph=True-chunked_context=False] SKIP (https://nvbugs/5773185)
|
||||
-accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True] SKIP (https://nvbugs/5596343)
|
||||
-accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False] SKIP (https://nvbugs/5596343)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=True] SKIP (https://nvbugs/5775326)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=False] SKIP (https://nvbugs/5794796)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=False] SKIP (https://nvbugs/5794796)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user