mirror of https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-02-18 16:55:08 +08:00

[https://nvbugs/5823465][fix] Add CUTEDSL moe backend for deepseek r1 nvfp4 checkpoint in stress test (#10920)
Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com>

parent 80235e53cf
commit d8e7c61ea9
@@ -378,7 +378,7 @@ def is_port_available(port: int,
                          ["GUARANTEED_NO_EVICT", "MAX_UTILIZATION"],
                          ids=lambda x: x)
 @pytest.mark.parametrize("stress_time_timeout", [(180, 300), (300, 450),
-                                                 (600, 900), (3600, 5400)],
+                                                 (600, 900), (3600, 10800)],
                          ids=lambda x: f"stress_time_{x[0]}s_timeout_{x[1]}s")
 @pytest.mark.parametrize(
     "config",
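
The ids lambda above is what produces test-id fragments such as stress_time_3600s_timeout_10800s from the (stress_time, timeout) pairs. A minimal standalone sketch of that mapping (plain Python, outside pytest):

# Sketch: reproduce the ids generated for the stress_time_timeout parameters.
stress_time_timeout = [(180, 300), (300, 450), (600, 900), (3600, 10800)]
make_id = lambda x: f"stress_time_{x[0]}s_timeout_{x[1]}s"
for pair in stress_time_timeout:
    print(make_id(pair))  # e.g. "stress_time_3600s_timeout_10800s"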
@@ -393,10 +393,22 @@ def is_port_available(port: int,
                 memory_requirement=12),
     # Configuration for DeepSeek-V3 model
     ModelConfig(model_dir="DeepSeek-V3", tp_size=8, memory_requirement=96),
-    # Configuration for DeepSeek-R1 model
+    # Configuration for DeepSeek-R1 model with FP8 checkpoints (8 GPU setup)
     ModelConfig(model_dir="DeepSeek-R1/DeepSeek-R1",
                 tp_size=8,
                 memory_requirement=96),
+    # Configuration for DeepSeek-R1 model with FP8 checkpoints (4 GPU setup, requires GB300 288GB)
+    ModelConfig(model_dir="DeepSeek-R1/DeepSeek-R1",
+                tp_size=4,
+                memory_requirement=256),
+    # Configuration for DeepSeek-R1 model with NVFP4 checkpoints (8 GPU setup)
+    ModelConfig(model_dir="DeepSeek-R1/DeepSeek-R1-0528-FP4",
+                tp_size=8,
+                memory_requirement=96),
+    # Configuration for DeepSeek-R1 model with NVFP4 checkpoints (4 GPU setup)
+    ModelConfig(model_dir="DeepSeek-R1/DeepSeek-R1-0528-FP4",
+                tp_size=4,
+                memory_requirement=168),
     ],
     ids=lambda x: f"{os.path.basename(x.model_dir)}_tp{x.tp_size}")
 def test_run_stress_test(config, stress_time_timeout, backend,
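
For context, a hedged sketch of the ModelConfig container used in the list above; the field names come straight from the diff, while the dataclass form and defaults are assumptions:

# Hypothetical ModelConfig shape; only model_dir, tp_size, and
# memory_requirement are confirmed by the diff.
import os
from dataclasses import dataclass

@dataclass
class ModelConfig:
    model_dir: str               # checkpoint path under the models root
    tp_size: int = 1             # tensor-parallel world size
    memory_requirement: int = 0  # approximate GPU memory needed, in GB

cfg = ModelConfig(model_dir="DeepSeek-R1/DeepSeek-R1-0528-FP4",
                  tp_size=4, memory_requirement=168)
print(f"{os.path.basename(cfg.model_dir)}_tp{cfg.tp_size}")  # DeepSeek-R1-0528-FP4_tp4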
@@ -506,19 +518,20 @@ def stress_test(config,
         36000  # 10 hours for DeepSeek-V3 or DeepSeek-R1, change this value if needed
     )

-    # For DeepSeek-V3 specific server parameters
+    # For DeepSeek-V3 or DeepSeek-R1 specific server parameters
     if "DeepSeek-V3" in config.model_dir or "DeepSeek-R1" in config.model_dir:
         test_server_config = ServerConfig(
             port=test_server_config.port,
             host=test_server_config.host,
             pp_size=test_server_config.pp_size,
-            ep_size=8,  # DeepSeek-V3 or DeepSeek-R1 specific ep_size
+            ep_size=config.
+            tp_size,  # ep_size matches tp_size for DeepSeek models
             max_batch_size=
             2048,  # DeepSeek-V3 or DeepSeek-R1 specific max_batch_size
             max_num_tokens=
-            2048,  # DeepSeek-V3 or DeepSeek-R1 specific max_num_tokens
+            8192,  # DeepSeek-V3 or DeepSeek-R1 specific max_num_tokens
             kv_cache_free_gpu_memory_fraction=
-            0.7,  # DeepSeek-V3 or DeepSeek-R1 specific kv_cache fraction
+            0.85,  # DeepSeek-V3 or DeepSeek-R1 specific kv_cache fraction
             capacity_scheduler_policy=test_server_config.
             capacity_scheduler_policy,
             wait_interval=test_server_config.wait_interval,
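
Two of these overrides are worth unpacking: ep_size now tracks config.tp_size instead of a hard-coded 8 (so the tp4 configurations get ep_size=4), and the KV-cache fraction rises from 0.7 to 0.85. In TensorRT-LLM, kv_cache_free_gpu_memory_fraction is the share of GPU memory still free after model load that is handed to the KV cache; a back-of-envelope sketch with assumed numbers (180 GB card, 96 GB resident weights):

# Assumed figures only; illustrates what raising the fraction buys.
total_gb, resident_gb = 180.0, 96.0
for fraction in (0.7, 0.85):
    kv_budget_gb = (total_gb - resident_gb) * fraction
    print(f"fraction={fraction}: ~{kv_budget_gb:.1f} GB for KV cache")
# fraction=0.7: ~58.8 GB; fraction=0.85: ~71.4 GB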
@@ -583,6 +596,35 @@ def stress_test(config,

         extra_llm_options["enable_attention_dp"] = True

+        # Set MOE backend based on GPU architecture and checkpoint type
+        # B200/GB200 (Blackwell, SM100+) with FP8 checkpoints: use DEEPGEMM backend
+        # B200/GB200 (Blackwell, SM100+) with NVFP4 checkpoints: use CUTEDSL backend
+        # H100/H200 (Hopper, SM90) with FP8 checkpoints: use CUTLASS backend (default)
+        try:
+            import torch
+            if torch.cuda.is_available():
+                device_capability = torch.cuda.get_device_capability(0)
+                is_blackwell = device_capability[0] >= 10
+                is_nvfp4 = "FP4" in config.model_dir.upper()
+
+                if is_blackwell:
+                    if is_nvfp4:
+                        moe_backend = "CUTEDSL"
+                    else:
+                        moe_backend = "DEEPGEMM"
+
+                    extra_llm_options["moe_config"] = {
+                        "backend": moe_backend,
+                    }
+                    checkpoint_type = "NVFP4" if is_nvfp4 else "FP8"
+                    print_info(
+                        f"Detected GPU architecture is SM{device_capability[0]}{device_capability[1]} (Blackwell), "
+                        f"using {moe_backend} MOE backend for DeepSeek-R1/DeepSeek-V3 with {checkpoint_type} checkpoints"
+                    )
+        except Exception as e:
+            print_warning(f"Failed to detect GPU architecture: {e}. "
+                          "Using default MOE backend (CUTLASS).")
+
     if config.backend == "pytorch":
         extra_llm_options.update({
             "cuda_graph_config": {
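
The new selection logic can be exercised on its own. A minimal sketch with the suite's print_info/print_warning helpers swapped for plain print; the architecture-to-backend mapping itself matches the diff:

import torch

def pick_moe_backend(model_dir: str) -> str:
    """Return the MOE backend the stress test would configure."""
    backend = "CUTLASS"  # default, e.g. Hopper (SM90) with FP8 checkpoints
    try:
        if torch.cuda.is_available():
            major, _minor = torch.cuda.get_device_capability(0)
            if major >= 10:  # Blackwell (SM100+)
                backend = "CUTEDSL" if "FP4" in model_dir.upper() else "DEEPGEMM"
    except Exception as e:
        print(f"Failed to detect GPU architecture: {e}. Falling back to CUTLASS.")
    return backend

print(pick_moe_backend("DeepSeek-R1/DeepSeek-R1-0528-FP4"))  # CUTEDSL on SM100+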
@@ -1047,6 +1089,18 @@ def stress_stage(model_name,
     request_count = int(stress_request_rate * stress_time)
     test_timeout = stress_config.stress_timeout

+    # Cap request count for large MoE models (DeepSeek-V3/R1) to prevent timeout
+    if "DeepSeek-V3" in model_path or "DeepSeek-R1" in model_path:
+        max_sustainable_rate = 3.0  # req/s - conservative estimate
+        max_request_count = int(max_sustainable_rate *
+                                stress_config.stress_time)
+        if request_count > max_request_count:
+            print_info(
+                f"Capping request_count from {request_count} to {max_request_count} "
+                f"for DeepSeek V3/R1 model (sustainable rate: {max_sustainable_rate} req/s)"
+            )
+            request_count = max_request_count
+
     print_info(
         f"Running stress test with concurrency={stress_concurrency}, request_count={request_count}"
     )
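
Worked numbers for the cap, assuming a nominal stress_request_rate of 5 req/s (the real rate comes from the stress config):

# Assumed rate of 5 req/s; stress_time=3600 matches the long parametrization.
stress_request_rate, stress_time = 5.0, 3600
request_count = int(stress_request_rate * stress_time)        # 18000

max_sustainable_rate = 3.0                                    # req/s, from the diff
max_request_count = int(max_sustainable_rate * stress_time)   # 10800
request_count = min(request_count, max_request_count)
print(request_count)  # 10800, which the raised 10800 s timeout can absorb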
@@ -1,6 +1,10 @@
-stress_test/stress_test.py::test_run_stress_test[DeepSeek-V3_tp8-stress_time_3600s_timeout_5400s-GUARANTEED_NO_EVICT-pytorch-stress-test-with-accuracy]
-stress_test/stress_test.py::test_run_stress_test[DeepSeek-V3_tp8-stress_time_3600s_timeout_5400s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
-stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1_tp8-stress_time_3600s_timeout_5400s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-V3_tp8-stress_time_3600s_timeout_10800s-GUARANTEED_NO_EVICT-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-V3_tp8-stress_time_3600s_timeout_10800s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1_tp8-stress_time_3600s_timeout_10800s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1_tp4-stress_time_3600s_timeout_10800s-GUARANTEED_NO_EVICT-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1_tp4-stress_time_3600s_timeout_10800s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1-0528-FP4_tp4-stress_time_3600s_timeout_10800s-GUARANTEED_NO_EVICT-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1-0528-FP4_tp4-stress_time_3600s_timeout_10800s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
 disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-deepseek_r1_v2_fp4_stress]
 disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-gpt_oss_120b_stress]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1LongBenchV2::test_fp8_8gpus