[https://nvbugs/5823465][fix] Add CUTEDSL moe backend for deepseek r1 nvfp4 checkpoint in stress test (#10920)

Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com>
Author: dominicshanshan
Date: 2026-02-02 14:47:41 +08:00
Committer: Yanchao Lu
Commit: d8e7c61ea9 (parent 80235e53cf)
2 changed files with 67 additions and 9 deletions

File: stress_test/stress_test.py

@@ -378,7 +378,7 @@ def is_port_available(port: int,
                          ["GUARANTEED_NO_EVICT", "MAX_UTILIZATION"],
                          ids=lambda x: x)
 @pytest.mark.parametrize("stress_time_timeout", [(180, 300), (300, 450),
-                                                 (600, 900), (3600, 5400)],
+                                                 (600, 900), (3600, 10800)],
                          ids=lambda x: f"stress_time_{x[0]}s_timeout_{x[1]}s")
 @pytest.mark.parametrize(
     "config",
@@ -393,10 +393,22 @@ def is_port_available(port: int,
                     memory_requirement=12),
         # Configuration for DeepSeek-V3 model
         ModelConfig(model_dir="DeepSeek-V3", tp_size=8, memory_requirement=96),
-        # Configuration for DeepSeek-R1 model
+        # Configuration for DeepSeek-R1 model with FP8 checkpoints (8 GPU setup)
         ModelConfig(model_dir="DeepSeek-R1/DeepSeek-R1",
                     tp_size=8,
                     memory_requirement=96),
+        # Configuration for DeepSeek-R1 model with FP8 checkpoints (4 GPU setup, requires GB300 288GB)
+        ModelConfig(model_dir="DeepSeek-R1/DeepSeek-R1",
+                    tp_size=4,
+                    memory_requirement=256),
+        # Configuration for DeepSeek-R1 model with NVFP4 checkpoints (8 GPU setup)
+        ModelConfig(model_dir="DeepSeek-R1/DeepSeek-R1-0528-FP4",
+                    tp_size=8,
+                    memory_requirement=96),
+        # Configuration for DeepSeek-R1 model with NVFP4 checkpoints (4 GPU setup)
+        ModelConfig(model_dir="DeepSeek-R1/DeepSeek-R1-0528-FP4",
+                    tp_size=4,
+                    memory_requirement=168),
     ],
     ids=lambda x: f"{os.path.basename(x.model_dir)}_tp{x.tp_size}")
 def test_run_stress_test(config, stress_time_timeout, backend,
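For reference, the two ids lambdas above are what generate the bracketed test names that appear in the updated test list at the end of this commit. A minimal standalone sketch of that derivation (the reduced ModelConfig here is a stand-in for illustration; the test's real class carries more fields):

# Sketch only: reproduces the pytest id derivation outside pytest.
import os
from dataclasses import dataclass

@dataclass
class ModelConfig:  # reduced stand-in; not the test's actual class
    model_dir: str
    tp_size: int
    memory_requirement: int

cfg = ModelConfig("DeepSeek-R1/DeepSeek-R1-0528-FP4", tp_size=4,
                  memory_requirement=168)
model_id = f"{os.path.basename(cfg.model_dir)}_tp{cfg.tp_size}"
time_id = f"stress_time_{3600}s_timeout_{10800}s"
print(f"{model_id}-{time_id}")
# -> DeepSeek-R1-0528-FP4_tp4-stress_time_3600s_timeout_10800s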
@@ -506,19 +518,20 @@ def stress_test(config,
         36000  # 10 hours for DeepSeek-V3 or DeepSeek-R1, change this value if needed
     )
-    # For DeepSeek-V3 specific server parameters
+    # For DeepSeek-V3 or DeepSeek-R1 specific server parameters
     if "DeepSeek-V3" in config.model_dir or "DeepSeek-R1" in config.model_dir:
         test_server_config = ServerConfig(
             port=test_server_config.port,
             host=test_server_config.host,
             pp_size=test_server_config.pp_size,
-            ep_size=8,  # DeepSeek-V3 or DeepSeek-R1 specific ep_size
+            ep_size=config.
+            tp_size,  # ep_size matches tp_size for DeepSeek models
             max_batch_size=
             2048,  # DeepSeek-V3 or DeepSeek-R1 specific max_batch_size
             max_num_tokens=
-            2048,  # DeepSeek-V3 or DeepSeek-R1 specific max_num_tokens
+            8192,  # DeepSeek-V3 or DeepSeek-R1 specific max_num_tokens
             kv_cache_free_gpu_memory_fraction=
-            0.7,  # DeepSeek-V3 or DeepSeek-R1 specific kv_cache fraction
+            0.85,  # DeepSeek-V3 or DeepSeek-R1 specific kv_cache fraction
             capacity_scheduler_policy=test_server_config.
             capacity_scheduler_policy,
             wait_interval=test_server_config.wait_interval,
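The main behavioral change in this hunk is that expert parallelism now follows the tensor-parallel degree instead of being pinned to 8, which the new 4-GPU configurations need. A hypothetical illustration using the values from this diff:

# Hypothetical values for the new 4-GPU DeepSeek-R1 configurations above.
tp_size = 4
ep_size = tp_size                         # was hardcoded to 8 before this commit
max_num_tokens = 8192                     # raised from 2048
kv_cache_free_gpu_memory_fraction = 0.85  # raised from 0.7
print(ep_size, max_num_tokens, kv_cache_free_gpu_memory_fraction)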
@@ -583,6 +596,35 @@ def stress_test(config,
             extra_llm_options["enable_attention_dp"] = True
+        # Set MOE backend based on GPU architecture and checkpoint type
+        # B200/GB200 (Blackwell, SM100+) with FP8 checkpoints: use DEEPGEMM backend
+        # B200/GB200 (Blackwell, SM100+) with NVFP4 checkpoints: use CUTEDSL backend
+        # H100/H200 (Hopper, SM90) with FP8 checkpoints: use CUTLASS backend (default)
+        try:
+            import torch
+            if torch.cuda.is_available():
+                device_capability = torch.cuda.get_device_capability(0)
+                is_blackwell = device_capability[0] >= 10
+                is_nvfp4 = "FP4" in config.model_dir.upper()
+                if is_blackwell:
+                    if is_nvfp4:
+                        moe_backend = "CUTEDSL"
+                    else:
+                        moe_backend = "DEEPGEMM"
+                    extra_llm_options["moe_config"] = {
+                        "backend": moe_backend,
+                    }
+                    checkpoint_type = "NVFP4" if is_nvfp4 else "FP8"
+                    print_info(
+                        f"Detected GPU architecture is SM{device_capability[0]}{device_capability[1]} (Blackwell), "
+                        f"using {moe_backend} MOE backend for DeepSeek-R1/DeepSeek-V3 with {checkpoint_type} checkpoints"
+                    )
+        except Exception as e:
+            print_warning(f"Failed to detect GPU architecture: {e}. "
+                          "Using default MOE backend (CUTLASS).")
     if config.backend == "pytorch":
         extra_llm_options.update({
             "cuda_graph_config": {
@@ -1047,6 +1089,18 @@ def stress_stage(model_name,
     request_count = int(stress_request_rate * stress_time)
     test_timeout = stress_config.stress_timeout
+    # Cap request count for large MoE models (DeepSeek-V3/R1) to prevent timeout
+    if "DeepSeek-V3" in model_path or "DeepSeek-R1" in model_path:
+        max_sustainable_rate = 3.0  # req/s - conservative estimate
+        max_request_count = int(max_sustainable_rate *
+                                stress_config.stress_time)
+        if request_count > max_request_count:
+            print_info(
+                f"Capping request_count from {request_count} to {max_request_count} "
+                f"for DeepSeek V3/R1 model (sustainable rate: {max_sustainable_rate} req/s)"
+            )
+            request_count = max_request_count
     print_info(
         f"Running stress test with concurrency={stress_concurrency}, request_count={request_count}"
     )
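For the 3600 s runs enabled in the test list below, the cap works out as follows (a worked example; the uncapped stress_request_rate value is hypothetical, since it is computed elsewhere in the file):

# Worked example of the request-count cap (stress_request_rate assumed).
stress_time = 3600               # seconds, from the 3600s parametrization
stress_request_rate = 5.0        # hypothetical req/s computed elsewhere
request_count = int(stress_request_rate * stress_time)  # 18000
max_request_count = int(3.0 * stress_time)              # 10800 at 3 req/s
request_count = min(request_count, max_request_count)
print(request_count)  # 10800

Because the cap is rate times the full stress window, a capped run still issues requests for the entire stress_time, just at the 3 req/s sustainable rate.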

File: stress test list

@@ -1,6 +1,10 @@
-stress_test/stress_test.py::test_run_stress_test[DeepSeek-V3_tp8-stress_time_3600s_timeout_5400s-GUARANTEED_NO_EVICT-pytorch-stress-test-with-accuracy]
-stress_test/stress_test.py::test_run_stress_test[DeepSeek-V3_tp8-stress_time_3600s_timeout_5400s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
-stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1_tp8-stress_time_3600s_timeout_5400s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-V3_tp8-stress_time_3600s_timeout_10800s-GUARANTEED_NO_EVICT-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-V3_tp8-stress_time_3600s_timeout_10800s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1_tp8-stress_time_3600s_timeout_10800s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1_tp4-stress_time_3600s_timeout_10800s-GUARANTEED_NO_EVICT-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1_tp4-stress_time_3600s_timeout_10800s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1-0528-FP4_tp4-stress_time_3600s_timeout_10800s-GUARANTEED_NO_EVICT-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1-0528-FP4_tp4-stress_time_3600s_timeout_10800s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
 disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-deepseek_r1_v2_fp4_stress]
 disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-gpt_oss_120b_stress]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1LongBenchV2::test_fp8_8gpus