fix: [nvbugs/5351130] Adjust DSV3-Lite tests free_gpu_memory_fraction to 0.75 to prevent OOM on CI. (#5896)

Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com>
Authored by Bo Li on 2025-07-10 19:16:38 +08:00; committed by Zhenhuan Chen
parent db77d83a2a
commit 537757e669

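For context: free_gpu_memory_fraction tells KvCacheConfig what share of the GPU memory still free after model weights are loaded may be claimed by the KV-cache pool. Lowering it from 0.9 to 0.75 leaves more headroom for CUDA-graph capture, torch.compile workspaces, and MTP buffers on shared CI GPUs. A minimal sketch of how the tests wire this knob into an LLM instance (the model path below is illustrative; the tests resolve theirs via llm_models_root()):

    from tensorrt_llm import LLM
    from tensorrt_llm.llmapi import KvCacheConfig

    # Cap the KV-cache pool at 75% of the GPU memory left after weight loading.
    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)

    # Hypothetical model path, for illustration only.
    llm = LLM(model="DeepSeek-V3-Lite/bf16", kv_cache_config=kv_cache_config)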

@@ -647,7 +647,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
         if torch_compile and mtp_nextn > 0:
             pytest.skip("https://nvbugs/5252313")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
             enable_piecewise_cuda_graph=cuda_graph,
@@ -687,7 +687,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
             pytest.skip("https://nvbugs/5252313")
         if torch_compile and pp_size > 1:
             pytest.skip("PP with torch.compile is not supported yet.")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
             enable_piecewise_cuda_graph=cuda_graph and not attention_dp,
@@ -725,7 +725,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
                              overlap_scheduler, torch_compile):
         if torch_compile and mtp != "disable":
             pytest.skip("https://nvbugs/5252313")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
             enable_piecewise_cuda_graph=cuda_graph,
@@ -813,7 +813,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     @pytest.mark.skip_device_not_contain(["H100"])
     @parametrize_with_ids("mtp_nextn", [0, 2])
     def test_fp8_block_scales_cuda_graph_padding(self, mtp_nextn):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         mtp_config = None
         if mtp_nextn > 0:
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
@@ -838,7 +838,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     @parametrize_with_ids("attention_dp", [False, True])
     def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn,
                                                        attention_dp):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         mtp_config = None
         if mtp_nextn > 0:
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
@@ -879,7 +879,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
             pytest.skip("https://nvbugs/5252313")
         if torch_compile and pp_size > 1:
             pytest.skip("PP with torch.compile is not supported yet.")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
             enable_piecewise_cuda_graph=cuda_graph and not attention_dp,
@@ -979,7 +979,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     @pytest.mark.skip_less_device(4)
     @pytest.mark.skip_device_not_contain(["H100", "H200"])
     def test_fp8_block_scales_4gpus_static_eplb(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         num_experts = 72
         num_slots = 80
@@ -1070,7 +1070,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
                    torch_compile, mtp_nextn, moe_backend):
         if torch_compile and mtp_nextn > 0:
             pytest.skip("https://nvbugs/5252313")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
             enable_piecewise_cuda_graph=cuda_graph,
@@ -1121,7 +1121,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
             pytest.skip("PP with torch.compile is not supported yet.")
         if moe_backend == "TRTLLM" and get_sm_version() == 120:
             pytest.skip("MOE TRTLLM backend does not support SM version 120")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         # Piecewise CUDA Graph cannot be enabled for nvfp4 attention dp.
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
@@ -1178,7 +1178,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
         elif quant_dtype == "nvfp4":
             model_path = f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only"
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75,
                                         enable_block_reuse=False)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
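
As a rough sanity check on why 0.9 could OOM: the pool is sized once from the post-weight-load free memory, so everything allocated afterwards (CUDA-graph capture, torch.compile workspaces, MTP buffers) must fit in the remainder. A back-of-the-envelope sketch (the 60 GiB figure is assumed, not measured on CI):

    # Compare the headroom each fraction leaves for later allocations.
    free_gib = 60.0  # hypothetical free GPU memory after weights load

    for fraction in (0.9, 0.75):
        kv_pool_gib = free_gib * fraction       # claimed by the KV cache
        headroom_gib = free_gib - kv_pool_gib   # left for graphs, workspaces
        print(f"fraction={fraction}: pool={kv_pool_gib:.1f} GiB, "
              f"headroom={headroom_gib:.1f} GiB")

Under these assumptions, 0.9 leaves about 6 GiB of headroom while 0.75 leaves about 15 GiB, at the cost of a smaller KV cache (fewer concurrently cached tokens).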