fix: disable cuda graph and MTP for overlap tests (#3155)

Signed-off-by: Iman Tabrizian <itabrizian@nvidia.com>
Author: Iman Tabrizian
Date: 2025-03-31 11:35:35 -07:00, committed by GitHub
parent f665f83256
commit e8731ba3b7
3 changed files with 10 additions and 14 deletions

File 1 of 3 (disaggregated serving test config, YAML):

@@ -14,8 +14,8 @@ context_servers:
     free_gpu_memory_fraction: 0.2
     enable_partial_reuse: False
   pytorch_backend_config:
-    use_cuda_graph: True
-    enable_overlap_scheduler: false
+    use_cuda_graph: False
+    enable_overlap_scheduler: False
   urls:
       - "localhost:8001"
 generation_servers:
@@ -29,7 +29,7 @@ generation_servers:
     free_gpu_memory_fraction: 0.2
     enable_partial_reuse: False
   pytorch_backend_config:
-    use_cuda_graph: True
-    enable_overlap_scheduler: true
+    use_cuda_graph: False
+    enable_overlap_scheduler: True
   urls:
       - "localhost:8002"

File 2 of 3 (disaggregated serving test config with attention DP and MTP, YAML):

@@ -8,13 +8,12 @@ context_servers:
   max_batch_size: 1
   max_num_tokens: 3000
   max_seq_len: 4096
-  enable_attention_dp: true
-  mtp_nextn: 1
+  enable_attention_dp: True
   tensor_parallel_size: 2
   pipeline_parallel_size: 1
   pytorch_backend_config:
-    use_cuda_graph: True
-    enable_overlap_scheduler: false
+    use_cuda_graph: False
+    enable_overlap_scheduler: False
   urls:
       - "localhost:8001"
 generation_servers:
@@ -23,11 +22,10 @@ generation_servers:
   pipeline_parallel_size: 1
   max_batch_size: 256
   max_num_tokens: 4096
-  enable_attention_dp: true
-  mtp_nextn: 1
+  enable_attention_dp: True
   max_seq_len: 4096
   pytorch_backend_config:
-    use_cuda_graph: True
-    enable_overlap_scheduler: true
+    use_cuda_graph: False
+    enable_overlap_scheduler: True
   urls:
       - "localhost:8002"

File 3 of 3 (test waive list):

@@ -450,8 +450,6 @@ full:B40/perf/test_perf.py::test_perf[t5_11b] SKIP (bert_attention_plugin does not support SM >= 100)
 full:B40/perf/test_perf.py::test_perf[t5_3b] SKIP (bert_attention_plugin does not support SM >= 100)
 full:B40/perf/test_perf.py::test_perf[t5_base] SKIP (bert_attention_plugin does not support SM >= 100)
 full:B40/perf/test_perf.py::test_perf[t5_large] SKIP (bert_attention_plugin does not support SM >= 100)
-disaggregated/test_disaggregated.py::test_disaggregated_overlap_dp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5166600)
-disaggregated/test_disaggregated.py::test_disaggregated_overlap[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5184661)
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5174573)
 examples/test_mistral.py::test_llm_mistral_nemo_fp8_quantization_1gpu[Mistral-Nemo-12b-Base-summarization] SKIP (https://nvbugspro.nvidia.com/bug/5181262)
 examples/test_qwen.py::test_llm_qwen_moe_single_gpu_summary[qwen1.5_moe_a2.7b_chat-enable_paged_kv_cache-enable_remove_input_padding-enable_weight_only-enable_fmha] SKIP (https://nvbugs/5180961)
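The two deleted lines un-waive the disaggregated overlap tests that the config changes above are meant to stabilize; removing a SKIP entry re-enables the test in CI. Each entry in this waive list is a single line of roughly the following shape (the placeholders are illustrative, not literal syntax):

    [stage:[platform/]]path/to/test_file.py::test_name[parametrization] SKIP (tracking-bug URL)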