Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)
fix: [nvbugs/5351130] Adjust DSV3-Lite tests free_gpu_memory_fraction to 0.75 to prevent OOM on CI. (#5896)
Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com>
parent db77d83a2a
commit 537757e669
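
For context, KvCacheConfig.free_gpu_memory_fraction caps how much of the GPU memory still free after model load the KV cache manager may claim; lowering it from 0.9 to 0.75 leaves extra headroom so the DSV3-Lite tests do not OOM on CI. A minimal sketch of how such a config is passed through the LLM API (the model path below is an illustrative placeholder, not taken from this change):

    from tensorrt_llm import LLM
    from tensorrt_llm.llmapi import KvCacheConfig

    # Cap the KV cache at 75% of the remaining GPU memory instead of 90%,
    # leaving headroom for activations and other runtime allocations on CI.
    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)

    # "DeepSeek-V3-Lite-fp8" is a placeholder model path for illustration.
    llm = LLM(model="DeepSeek-V3-Lite-fp8", kv_cache_config=kv_cache_config)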
@@ -647,7 +647,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
         if torch_compile and mtp_nextn > 0:
             pytest.skip("https://nvbugs/5252313")
 
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
             enable_piecewise_cuda_graph=cuda_graph,
@@ -687,7 +687,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
             pytest.skip("https://nvbugs/5252313")
         if torch_compile and pp_size > 1:
             pytest.skip("PP with torch.compile is not supported yet.")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
             enable_piecewise_cuda_graph=cuda_graph and not attention_dp,
@@ -725,7 +725,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
                            overlap_scheduler, torch_compile):
         if torch_compile and mtp != "disable":
             pytest.skip("https://nvbugs/5252313")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
             enable_piecewise_cuda_graph=cuda_graph,
@@ -813,7 +813,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     @pytest.mark.skip_device_not_contain(["H100"])
     @parametrize_with_ids("mtp_nextn", [0, 2])
     def test_fp8_block_scales_cuda_graph_padding(self, mtp_nextn):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         mtp_config = None
         if mtp_nextn > 0:
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
@@ -838,7 +838,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     @parametrize_with_ids("attention_dp", [False, True])
     def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn,
                                                        attention_dp):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         mtp_config = None
         if mtp_nextn > 0:
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
@@ -879,7 +879,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
             pytest.skip("https://nvbugs/5252313")
         if torch_compile and pp_size > 1:
             pytest.skip("PP with torch.compile is not supported yet.")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
             enable_piecewise_cuda_graph=cuda_graph and not attention_dp,
@@ -979,7 +979,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     @pytest.mark.skip_less_device(4)
     @pytest.mark.skip_device_not_contain(["H100", "H200"])
     def test_fp8_block_scales_4gpus_static_eplb(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
 
         num_experts = 72
         num_slots = 80
@@ -1070,7 +1070,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
                          torch_compile, mtp_nextn, moe_backend):
         if torch_compile and mtp_nextn > 0:
             pytest.skip("https://nvbugs/5252313")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
             enable_piecewise_cuda_graph=cuda_graph,
@@ -1121,7 +1121,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
             pytest.skip("PP with torch.compile is not supported yet.")
         if moe_backend == "TRTLLM" and get_sm_version() == 120:
             pytest.skip("MOE TRTLLM backend does not support SM version 120")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         # Picewise Cuda Graph cannot be enabled for nvfp4 attention dp.
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
@@ -1178,7 +1178,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
         elif quant_dtype == "nvfp4":
             model_path = f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only"
 
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75,
                                         enable_block_reuse=False)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,