From d8e7c61ea9ee9da88a2cd9ebfbdbae66177e9885 Mon Sep 17 00:00:00 2001
From: dominicshanshan <30051912+dominicshanshan@users.noreply.github.com>
Date: Mon, 2 Feb 2026 14:47:41 +0800
Subject: [PATCH] [https://nvbugs/5823465][fix] Add CUTEDSL moe backend for
 deepseek r1 nvfp4 checkpoint in stress test (#10920)

Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com>
---
 .../defs/stress_test/stress_test.py       | 66 +++++++++++++++++--
 .../test_lists/qa/llm_function_stress.txt | 10 ++-
 2 files changed, 67 insertions(+), 9 deletions(-)

diff --git a/tests/integration/defs/stress_test/stress_test.py b/tests/integration/defs/stress_test/stress_test.py
index f5fed7b0eb..2de8a38801 100644
--- a/tests/integration/defs/stress_test/stress_test.py
+++ b/tests/integration/defs/stress_test/stress_test.py
@@ -378,7 +378,7 @@ def is_port_available(port: int,
                          ["GUARANTEED_NO_EVICT", "MAX_UTILIZATION"],
                          ids=lambda x: x)
 @pytest.mark.parametrize("stress_time_timeout", [(180, 300), (300, 450),
-                                                 (600, 900), (3600, 5400)],
+                                                 (600, 900), (3600, 10800)],
                          ids=lambda x: f"stress_time_{x[0]}s_timeout_{x[1]}s")
 @pytest.mark.parametrize(
     "config",
@@ -393,10 +393,22 @@ def is_port_available(port: int,
                     memory_requirement=12),
         # Configuration for DeepSeek-V3 model
         ModelConfig(model_dir="DeepSeek-V3", tp_size=8, memory_requirement=96),
-        # Configuration for DeepSeek-R1 model
+        # Configuration for DeepSeek-R1 model with FP8 checkpoints (8 GPU setup)
         ModelConfig(model_dir="DeepSeek-R1/DeepSeek-R1",
                     tp_size=8,
                     memory_requirement=96),
+        # Configuration for DeepSeek-R1 model with FP8 checkpoints (4 GPU setup, requires GB300 288GB)
+        ModelConfig(model_dir="DeepSeek-R1/DeepSeek-R1",
+                    tp_size=4,
+                    memory_requirement=256),
+        # Configuration for DeepSeek-R1 model with NVFP4 checkpoints (8 GPU setup)
+        ModelConfig(model_dir="DeepSeek-R1/DeepSeek-R1-0528-FP4",
+                    tp_size=8,
+                    memory_requirement=96),
+        # Configuration for DeepSeek-R1 model with NVFP4 checkpoints (4 GPU setup)
+        ModelConfig(model_dir="DeepSeek-R1/DeepSeek-R1-0528-FP4",
+                    tp_size=4,
+                    memory_requirement=168),
     ],
     ids=lambda x: f"{os.path.basename(x.model_dir)}_tp{x.tp_size}")
 def test_run_stress_test(config, stress_time_timeout, backend,
@@ -506,19 +518,20 @@ def stress_test(config,
         36000  # 10 hours for DeepSeek-V3 or DeepSeek-R1, change this value if needed
     )
 
-    # For DeepSeek-V3 specific server parameters
+    # For DeepSeek-V3 or DeepSeek-R1 specific server parameters
     if "DeepSeek-V3" in config.model_dir or "DeepSeek-R1" in config.model_dir:
         test_server_config = ServerConfig(
             port=test_server_config.port,
             host=test_server_config.host,
             pp_size=test_server_config.pp_size,
-            ep_size=8,  # DeepSeek-V3 or DeepSeek-R1 specific ep_size
+            ep_size=config.
+            tp_size,  # ep_size matches tp_size for DeepSeek models
             max_batch_size=
             2048,  # DeepSeek-V3 or DeepSeek-R1 specific max_batch_size
             max_num_tokens=
-            2048,  # DeepSeek-V3 or DeepSeek-R1 specific max_num_tokens
+            8192,  # DeepSeek-V3 or DeepSeek-R1 specific max_num_tokens
             kv_cache_free_gpu_memory_fraction=
-            0.7,  # DeepSeek-V3 or DeepSeek-R1 specific kv_cache fraction
+            0.85,  # DeepSeek-V3 or DeepSeek-R1 specific kv_cache fraction
             capacity_scheduler_policy=test_server_config.
             capacity_scheduler_policy,
             wait_interval=test_server_config.wait_interval,
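The next hunk picks the MOE backend from the GPU architecture and the checkpoint
type. As a standalone illustration, the rule reduces to a small pure function;
this is a minimal sketch, `select_moe_backend` is a hypothetical helper (not
part of the patch), and only `torch.cuda.get_device_capability` /
`torch.cuda.is_available` are assumed from PyTorch:

import torch

def select_moe_backend(model_dir: str) -> str:
    """Map GPU architecture x checkpoint type to a MOE backend name."""
    if not torch.cuda.is_available():
        return "CUTLASS"  # default backend when no GPU is visible
    major, _minor = torch.cuda.get_device_capability(0)
    if major >= 10:  # Blackwell (SM100+), e.g. B200/GB200
        # NVFP4 checkpoints use CUTEDSL; FP8 checkpoints use DEEPGEMM
        return "CUTEDSL" if "FP4" in model_dir.upper() else "DEEPGEMM"
    return "CUTLASS"  # Hopper (SM90) and older keep the default

On an SM100 machine, select_moe_backend("DeepSeek-R1/DeepSeek-R1-0528-FP4")
returns "CUTEDSL", matching the moe_config the hunk below writes into
extra_llm_options.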
@@ -583,6 +596,35 @@ def stress_test(config,
 
         extra_llm_options["enable_attention_dp"] = True
 
+        # Set MOE backend based on GPU architecture and checkpoint type
+        # B200/GB200 (Blackwell, SM100+) with FP8 checkpoints: use DEEPGEMM backend
+        # B200/GB200 (Blackwell, SM100+) with NVFP4 checkpoints: use CUTEDSL backend
+        # H100/H200 (Hopper, SM90) with FP8 checkpoints: use CUTLASS backend (default)
+        try:
+            import torch
+            if torch.cuda.is_available():
+                device_capability = torch.cuda.get_device_capability(0)
+                is_blackwell = device_capability[0] >= 10
+                is_nvfp4 = "FP4" in config.model_dir.upper()
+
+                if is_blackwell:
+                    if is_nvfp4:
+                        moe_backend = "CUTEDSL"
+                    else:
+                        moe_backend = "DEEPGEMM"
+
+                    extra_llm_options["moe_config"] = {
+                        "backend": moe_backend,
+                    }
+                    checkpoint_type = "NVFP4" if is_nvfp4 else "FP8"
+                    print_info(
+                        f"Detected GPU architecture is SM{device_capability[0]}{device_capability[1]} (Blackwell), "
+                        f"using {moe_backend} MOE backend for DeepSeek-R1/DeepSeek-V3 with {checkpoint_type} checkpoints"
+                    )
+        except Exception as e:
+            print_warning(f"Failed to detect GPU architecture: {e}. "
+                          "Using default MOE backend (CUTLASS).")
+
     if config.backend == "pytorch":
         extra_llm_options.update({
             "cuda_graph_config": {
@@ -1047,6 +1089,18 @@ def stress_stage(model_name,
     request_count = int(stress_request_rate * stress_time)
     test_timeout = stress_config.stress_timeout
 
+    # Cap request count for large MoE models (DeepSeek-V3/R1) to prevent timeout
+    if "DeepSeek-V3" in model_path or "DeepSeek-R1" in model_path:
+        max_sustainable_rate = 3.0  # req/s - conservative estimate
+        max_request_count = int(max_sustainable_rate *
+                                stress_config.stress_time)
+        if request_count > max_request_count:
+            print_info(
+                f"Capping request_count from {request_count} to {max_request_count} "
+                f"for DeepSeek V3/R1 model (sustainable rate: {max_sustainable_rate} req/s)"
+            )
+            request_count = max_request_count
+
     print_info(
         f"Running stress test with concurrency={stress_concurrency}, request_count={request_count}"
     )
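The request-count cap in the hunk above is effectively a min() over two rates.
A minimal sketch under the patch's numbers (the helper name
`capped_request_count` is hypothetical):

def capped_request_count(stress_request_rate: float, stress_time: float,
                         is_deepseek: bool) -> int:
    # Baseline: requests submitted at the configured rate for the whole window.
    request_count = int(stress_request_rate * stress_time)
    if is_deepseek:
        # Conservative sustainable rate for large MoE models, per the patch.
        max_sustainable_rate = 3.0  # req/s
        request_count = min(request_count,
                            int(max_sustainable_rate * stress_time))
    return request_count

print(capped_request_count(5.0, 3600, is_deepseek=True))  # -> 10800

For the 3600 s stress window this caps a run at 3.0 * 3600 = 10800 requests,
consistent with the pytest timeout growing from 5400 s to 10800 s in the
parametrization and in the test list below.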
diff --git a/tests/integration/test_lists/qa/llm_function_stress.txt b/tests/integration/test_lists/qa/llm_function_stress.txt
index cabba3def0..baee4172c8 100644
--- a/tests/integration/test_lists/qa/llm_function_stress.txt
+++ b/tests/integration/test_lists/qa/llm_function_stress.txt
@@ -1,6 +1,10 @@
-stress_test/stress_test.py::test_run_stress_test[DeepSeek-V3_tp8-stress_time_3600s_timeout_5400s-GUARANTEED_NO_EVICT-pytorch-stress-test-with-accuracy]
-stress_test/stress_test.py::test_run_stress_test[DeepSeek-V3_tp8-stress_time_3600s_timeout_5400s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
-stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1_tp8-stress_time_3600s_timeout_5400s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-V3_tp8-stress_time_3600s_timeout_10800s-GUARANTEED_NO_EVICT-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-V3_tp8-stress_time_3600s_timeout_10800s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1_tp8-stress_time_3600s_timeout_10800s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1_tp4-stress_time_3600s_timeout_10800s-GUARANTEED_NO_EVICT-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1_tp4-stress_time_3600s_timeout_10800s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1-0528-FP4_tp4-stress_time_3600s_timeout_10800s-GUARANTEED_NO_EVICT-pytorch-stress-test-with-accuracy]
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1-0528-FP4_tp4-stress_time_3600s_timeout_10800s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
 disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-deepseek_r1_v2_fp4_stress]
 disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-gpt_oss_120b_stress]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1LongBenchV2::test_fp8_8gpus
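The renamed entries in llm_function_stress.txt follow mechanically from the two
parametrize id lambdas in stress_test.py. A minimal sketch reproducing them
(ModelConfig here is a stand-in namedtuple, not the test suite's class):

import os
from collections import namedtuple

ModelConfig = namedtuple("ModelConfig", "model_dir tp_size")

config_id = lambda x: f"{os.path.basename(x.model_dir)}_tp{x.tp_size}"
time_id = lambda x: f"stress_time_{x[0]}s_timeout_{x[1]}s"

print(config_id(ModelConfig("DeepSeek-R1/DeepSeek-R1-0528-FP4", 4)))
# -> DeepSeek-R1-0528-FP4_tp4
print(time_id((3600, 10800)))
# -> stress_time_3600s_timeout_10800s

Note that os.path.basename strips the "DeepSeek-R1/" directory prefix, which is
why the new entries read DeepSeek-R1-0528-FP4_tp4 rather than carrying the full
checkpoint path.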