Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)
fix: [nvbugs/5351130] Adjust DSV3-Lite tests free_gpu_memory_fraction to 0.75 to prevent OOM on CI. (#5896)
Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com>
parent db77d83a2a
commit 537757e669
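
For context, KvCacheConfig.free_gpu_memory_fraction caps how much of the GPU memory still free after model load the KV cache manager may claim; lowering it from 0.9 to 0.75 leaves extra headroom so the DSV3-Lite tests do not OOM on CI. A minimal sketch of how such a config is passed through the LLM API (the model path below is an illustrative placeholder, not taken from this change):

    from tensorrt_llm import LLM
    from tensorrt_llm.llmapi import KvCacheConfig

    # Cap the KV cache at 75% of the remaining GPU memory instead of 90%,
    # leaving headroom for activations and other runtime allocations on CI.
    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)

    # "DeepSeek-V3-Lite-fp8" is a placeholder model path for illustration.
    llm = LLM(model="DeepSeek-V3-Lite-fp8", kv_cache_config=kv_cache_config)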
@@ -647,7 +647,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
         if torch_compile and mtp_nextn > 0:
             pytest.skip("https://nvbugs/5252313")
 
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
             enable_piecewise_cuda_graph=cuda_graph,
@@ -687,7 +687,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
             pytest.skip("https://nvbugs/5252313")
         if torch_compile and pp_size > 1:
             pytest.skip("PP with torch.compile is not supported yet.")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
             enable_piecewise_cuda_graph=cuda_graph and not attention_dp,
@@ -725,7 +725,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
                            overlap_scheduler, torch_compile):
         if torch_compile and mtp != "disable":
             pytest.skip("https://nvbugs/5252313")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
             enable_piecewise_cuda_graph=cuda_graph,
@@ -813,7 +813,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     @pytest.mark.skip_device_not_contain(["H100"])
     @parametrize_with_ids("mtp_nextn", [0, 2])
     def test_fp8_block_scales_cuda_graph_padding(self, mtp_nextn):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         mtp_config = None
         if mtp_nextn > 0:
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
@@ -838,7 +838,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     @parametrize_with_ids("attention_dp", [False, True])
     def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn,
                                                        attention_dp):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         mtp_config = None
         if mtp_nextn > 0:
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
@@ -879,7 +879,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
             pytest.skip("https://nvbugs/5252313")
         if torch_compile and pp_size > 1:
             pytest.skip("PP with torch.compile is not supported yet.")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
             enable_piecewise_cuda_graph=cuda_graph and not attention_dp,
@@ -979,7 +979,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     @pytest.mark.skip_less_device(4)
     @pytest.mark.skip_device_not_contain(["H100", "H200"])
     def test_fp8_block_scales_4gpus_static_eplb(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
 
         num_experts = 72
         num_slots = 80
@@ -1070,7 +1070,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
                          torch_compile, mtp_nextn, moe_backend):
         if torch_compile and mtp_nextn > 0:
             pytest.skip("https://nvbugs/5252313")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
             enable_piecewise_cuda_graph=cuda_graph,
@@ -1121,7 +1121,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
             pytest.skip("PP with torch.compile is not supported yet.")
         if moe_backend == "TRTLLM" and get_sm_version() == 120:
             pytest.skip("MOE TRTLLM backend does not support SM version 120")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         # Picewise Cuda Graph cannot be enabled for nvfp4 attention dp.
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
@@ -1178,7 +1178,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
         elif quant_dtype == "nvfp4":
             model_path = f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only"
 
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75,
                                         enable_block_reuse=False)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,