From d8e7c61ea9ee9da88a2cd9ebfbdbae66177e9885 Mon Sep 17 00:00:00 2001
From: dominicshanshan <30051912+dominicshanshan@users.noreply.github.com>
Date: Mon, 2 Feb 2026 14:47:41 +0800
Subject: [PATCH] [https://nvbugs/5823465][fix] Add CUTEDSL moe backend for
 deepseek r1 nvfp4 checkpoint in stress test (#10920)

Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com>
---
 .../defs/stress_test/stress_test.py       | 66 +++++++++++++++++--
 .../test_lists/qa/llm_function_stress.txt | 10 ++-
 2 files changed, 67 insertions(+), 9 deletions(-)

diff --git a/tests/integration/defs/stress_test/stress_test.py b/tests/integration/defs/stress_test/stress_test.py
index f5fed7b0eb..2de8a38801 100644
--- a/tests/integration/defs/stress_test/stress_test.py
+++ b/tests/integration/defs/stress_test/stress_test.py
@@ -378,7 +378,7 @@ def is_port_available(port: int,
                          ["GUARANTEED_NO_EVICT", "MAX_UTILIZATION"],
                          ids=lambda x: x)
 @pytest.mark.parametrize("stress_time_timeout", [(180, 300), (300, 450),
-                                                 (600, 900), (3600, 5400)],
+                                                 (600, 900), (3600, 10800)],
                          ids=lambda x: f"stress_time_{x[0]}s_timeout_{x[1]}s")
 @pytest.mark.parametrize(
     "config",
@@ -393,10 +393,22 @@ def is_port_available(port: int,
                     memory_requirement=12),
         # Configuration for DeepSeek-V3 model
         ModelConfig(model_dir="DeepSeek-V3", tp_size=8, memory_requirement=96),
-        # Configuration for DeepSeek-R1 model
+        # Configuration for DeepSeek-R1 model with FP8 checkpoints (8 GPU setup)
         ModelConfig(model_dir="DeepSeek-R1/DeepSeek-R1",
                     tp_size=8,
                     memory_requirement=96),
+        # Configuration for DeepSeek-R1 model with FP8 checkpoints (4 GPU setup, requires GB300 288GB)
+        ModelConfig(model_dir="DeepSeek-R1/DeepSeek-R1",
+                    tp_size=4,
+                    memory_requirement=256),
+        # Configuration for DeepSeek-R1 model with NVFP4 checkpoints (8 GPU setup)
+        ModelConfig(model_dir="DeepSeek-R1/DeepSeek-R1-0528-FP4",
+                    tp_size=8,
+                    memory_requirement=96),
+        # Configuration for DeepSeek-R1 model with NVFP4 checkpoints (4 GPU setup)
+        ModelConfig(model_dir="DeepSeek-R1/DeepSeek-R1-0528-FP4",
+                    tp_size=4,
+                    memory_requirement=168),
     ],
     ids=lambda x: f"{os.path.basename(x.model_dir)}_tp{x.tp_size}")
 def test_run_stress_test(config, stress_time_timeout, backend,
@@ -506,19 +518,20 @@ def stress_test(config,
         36000  # 10 hours for DeepSeek-V3 or DeepSeek-R1, change this value if needed
     )
 
-    # For DeepSeek-V3 specific server parameters
+    # For DeepSeek-V3 or DeepSeek-R1 specific server parameters
     if "DeepSeek-V3" in config.model_dir or "DeepSeek-R1" in config.model_dir:
         test_server_config = ServerConfig(
             port=test_server_config.port,
             host=test_server_config.host,
             pp_size=test_server_config.pp_size,
-            ep_size=8,  # DeepSeek-V3 or DeepSeek-R1 specific ep_size
+            ep_size=config.
+            tp_size,  # ep_size matches tp_size for DeepSeek models
             max_batch_size=
             2048,  # DeepSeek-V3 or DeepSeek-R1 specific max_batch_size
             max_num_tokens=
-            2048,  # DeepSeek-V3 or DeepSeek-R1 specific max_num_tokens
+            8192,  # DeepSeek-V3 or DeepSeek-R1 specific max_num_tokens
             kv_cache_free_gpu_memory_fraction=
-            0.7,  # DeepSeek-V3 or DeepSeek-R1 specific kv_cache fraction
+            0.85,  # DeepSeek-V3 or DeepSeek-R1 specific kv_cache fraction
             capacity_scheduler_policy=test_server_config.
             capacity_scheduler_policy,
             wait_interval=test_server_config.wait_interval,
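The next hunk picks the MOE backend from the GPU architecture and the checkpoint
type. As a standalone illustration, the rule reduces to a small pure function;
this is a minimal sketch, `select_moe_backend` is a hypothetical helper (not
part of the patch), and only `torch.cuda.get_device_capability` /
`torch.cuda.is_available` are assumed from PyTorch:

import torch

def select_moe_backend(model_dir: str) -> str:
    """Map GPU architecture x checkpoint type to a MOE backend name."""
    if not torch.cuda.is_available():
        return "CUTLASS"  # default backend when no GPU is visible
    major, _minor = torch.cuda.get_device_capability(0)
    if major >= 10:  # Blackwell (SM100+), e.g. B200/GB200
        # NVFP4 checkpoints use CUTEDSL; FP8 checkpoints use DEEPGEMM
        return "CUTEDSL" if "FP4" in model_dir.upper() else "DEEPGEMM"
    return "CUTLASS"  # Hopper (SM90) and older keep the default

On an SM100 machine, select_moe_backend("DeepSeek-R1/DeepSeek-R1-0528-FP4")
returns "CUTEDSL", matching the moe_config the hunk below writes into
extra_llm_options.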
@@ -583,6 +596,35 @@ def stress_test(config,
 
         extra_llm_options["enable_attention_dp"] = True
 
+        # Set MOE backend based on GPU architecture and checkpoint type
+        # B200/GB200 (Blackwell, SM100+) with FP8 checkpoints: use DEEPGEMM backend
+        # B200/GB200 (Blackwell, SM100+) with NVFP4 checkpoints: use CUTEDSL backend
+        # H100/H200 (Hopper, SM90) with FP8 checkpoints: use CUTLASS backend (default)
+        try:
+            import torch
+            if torch.cuda.is_available():
+                device_capability = torch.cuda.get_device_capability(0)
+                is_blackwell = device_capability[0] >= 10
+                is_nvfp4 = "FP4" in config.model_dir.upper()
+
+                if is_blackwell:
+                    if is_nvfp4:
+                        moe_backend = "CUTEDSL"
+                    else:
+                        moe_backend = "DEEPGEMM"
+
+                    extra_llm_options["moe_config"] = {
+                        "backend": moe_backend,
+                    }
+                    checkpoint_type = "NVFP4" if is_nvfp4 else "FP8"
+                    print_info(
+                        f"Detected GPU architecture is SM{device_capability[0]}{device_capability[1]} (Blackwell), "
+                        f"using {moe_backend} MOE backend for DeepSeek-R1/DeepSeek-V3 with {checkpoint_type} checkpoints"
+                    )
+        except Exception as e:
+            print_warning(f"Failed to detect GPU architecture: {e}. "
+                          "Using default MOE backend (CUTLASS).")
+
     if config.backend == "pytorch":
         extra_llm_options.update({
             "cuda_graph_config": {
@@ -1047,6 +1089,18 @@ def stress_stage(model_name,
     request_count = int(stress_request_rate * stress_time)
     test_timeout = stress_config.stress_timeout
 
+    # Cap request count for large MoE models (DeepSeek-V3/R1) to prevent timeout
+    if "DeepSeek-V3" in model_path or "DeepSeek-R1" in model_path:
+        max_sustainable_rate = 3.0  # req/s - conservative estimate
+        max_request_count = int(max_sustainable_rate *
+                                stress_config.stress_time)
+        if request_count > max_request_count:
+            print_info(
+                f"Capping request_count from {request_count} to {max_request_count} "
+                f"for DeepSeek V3/R1 model (sustainable rate: {max_sustainable_rate} req/s)"
+            )
+            request_count = max_request_count
+
     print_info(
         f"Running stress test with concurrency={stress_concurrency}, request_count={request_count}"
     )
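The request-count cap in the hunk above is effectively a min() over two rates.
A minimal sketch under the patch's numbers (the helper name
`capped_request_count` is hypothetical):

def capped_request_count(stress_request_rate: float, stress_time: float,
                         is_deepseek: bool) -> int:
    # Baseline: requests submitted at the configured rate for the whole window.
    request_count = int(stress_request_rate * stress_time)
    if is_deepseek:
        # Conservative sustainable rate for large MoE models, per the patch.
        max_sustainable_rate = 3.0  # req/s
        request_count = min(request_count,
                            int(max_sustainable_rate * stress_time))
    return request_count

print(capped_request_count(5.0, 3600, is_deepseek=True))  # -> 10800

For the 3600 s stress window this caps a run at 3.0 * 3600 = 10800 requests,
consistent with the pytest timeout growing from 5400 s to 10800 s in the
parametrization and in the test list below.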
diff --git a/tests/integration/test_lists/qa/llm_function_stress.txt b/tests/integration/test_lists/qa/llm_function_stress.txt
index cabba3def0..baee4172c8 100644
--- a/tests/integration/test_lists/qa/llm_function_stress.txt
+++ b/tests/integration/test_lists/qa/llm_function_stress.txt
@@ -1,6 +1,10 @@
-stress_test/stress_test.py::test_run_stress_test[DeepSeek-V3_tp8-stress_time_3600s_timeout_5400s-GUARANTEED_NO_EVICT-pytorch-stress-test-with-accuracy]
-stress_test/stress_test.py::test_run_stress_test[DeepSeek-V3_tp8-stress_time_3600s_timeout_5400s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
-stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1_tp8-stress_time_3600s_timeout_5400s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-V3_tp8-stress_time_3600s_timeout_10800s-GUARANTEED_NO_EVICT-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-V3_tp8-stress_time_3600s_timeout_10800s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1_tp8-stress_time_3600s_timeout_10800s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1_tp4-stress_time_3600s_timeout_10800s-GUARANTEED_NO_EVICT-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1_tp4-stress_time_3600s_timeout_10800s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1-0528-FP4_tp4-stress_time_3600s_timeout_10800s-GUARANTEED_NO_EVICT-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1-0528-FP4_tp4-stress_time_3600s_timeout_10800s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
 disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-deepseek_r1_v2_fp4_stress]
 disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-gpt_oss_120b_stress]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1LongBenchV2::test_fp8_8gpus
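The renamed entries in llm_function_stress.txt follow mechanically from the two
parametrize id lambdas in stress_test.py. A minimal sketch reproducing them
(ModelConfig here is a stand-in namedtuple, not the test suite's class):

import os
from collections import namedtuple

ModelConfig = namedtuple("ModelConfig", "model_dir tp_size")

config_id = lambda x: f"{os.path.basename(x.model_dir)}_tp{x.tp_size}"
time_id = lambda x: f"stress_time_{x[0]}s_timeout_{x[1]}s"

print(config_id(ModelConfig("DeepSeek-R1/DeepSeek-R1-0528-FP4", 4)))
# -> DeepSeek-R1-0528-FP4_tp4
print(time_id((3600, 10800)))
# -> stress_time_3600s_timeout_10800s

Note that os.path.basename strips the "DeepSeek-R1/" directory prefix, which is
why the new entries read DeepSeek-R1-0528-FP4_tp4 rather than carrying the full
checkpoint path.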