fix: disable cuda graph and MTP for overlap tests (#3155)

Signed-off-by: Iman Tabrizian <itabrizian@nvidia.com>
Author: Iman Tabrizian
Date: 2025-03-31 11:35:35 -07:00, committed by GitHub
parent f665f83256
commit e8731ba3b7
3 changed files with 10 additions and 14 deletions

File 1 of 3 (disaggregated serving test config, YAML):

@@ -14,8 +14,8 @@ context_servers:
     free_gpu_memory_fraction: 0.2
     enable_partial_reuse: False
   pytorch_backend_config:
-    use_cuda_graph: True
-    enable_overlap_scheduler: false
+    use_cuda_graph: False
+    enable_overlap_scheduler: False
   urls:
       - "localhost:8001"
 generation_servers:
@@ -29,7 +29,7 @@ generation_servers:
     free_gpu_memory_fraction: 0.2
     enable_partial_reuse: False
   pytorch_backend_config:
-    use_cuda_graph: True
-    enable_overlap_scheduler: true
+    use_cuda_graph: False
+    enable_overlap_scheduler: True
   urls:
       - "localhost:8002"

File 2 of 3 (disaggregated serving test config with attention DP and MTP, YAML):

@@ -8,13 +8,12 @@ context_servers:
   max_batch_size: 1
   max_num_tokens: 3000
   max_seq_len: 4096
-  enable_attention_dp: true
-  mtp_nextn: 1
+  enable_attention_dp: True
   tensor_parallel_size: 2
   pipeline_parallel_size: 1
   pytorch_backend_config:
-    use_cuda_graph: True
-    enable_overlap_scheduler: false
+    use_cuda_graph: False
+    enable_overlap_scheduler: False
   urls:
       - "localhost:8001"
 generation_servers:
@@ -23,11 +22,10 @@ generation_servers:
   pipeline_parallel_size: 1
   max_batch_size: 256
   max_num_tokens: 4096
-  enable_attention_dp: true
-  mtp_nextn: 1
+  enable_attention_dp: True
   max_seq_len: 4096
   pytorch_backend_config:
-    use_cuda_graph: True
-    enable_overlap_scheduler: true
+    use_cuda_graph: False
+    enable_overlap_scheduler: True
   urls:
       - "localhost:8002"

File 3 of 3 (test waive list):

@@ -450,8 +450,6 @@ full:B40/perf/test_perf.py::test_perf[t5_11b] SKIP (bert_attention_plugin does not support SM >= 100)
 full:B40/perf/test_perf.py::test_perf[t5_3b] SKIP (bert_attention_plugin does not support SM >= 100)
 full:B40/perf/test_perf.py::test_perf[t5_base] SKIP (bert_attention_plugin does not support SM >= 100)
 full:B40/perf/test_perf.py::test_perf[t5_large] SKIP (bert_attention_plugin does not support SM >= 100)
-disaggregated/test_disaggregated.py::test_disaggregated_overlap_dp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5166600)
-disaggregated/test_disaggregated.py::test_disaggregated_overlap[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5184661)
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5174573)
 examples/test_mistral.py::test_llm_mistral_nemo_fp8_quantization_1gpu[Mistral-Nemo-12b-Base-summarization] SKIP (https://nvbugspro.nvidia.com/bug/5181262)
 examples/test_qwen.py::test_llm_qwen_moe_single_gpu_summary[qwen1.5_moe_a2.7b_chat-enable_paged_kv_cache-enable_remove_input_padding-enable_weight_only-enable_fmha] SKIP (https://nvbugs/5180961)
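The two deleted lines un-waive the disaggregated overlap tests that the config changes above are meant to stabilize; removing a SKIP entry re-enables the test in CI. Each entry in this waive list is a single line of roughly the following shape (the placeholders are illustrative, not literal syntax):

    [stage:[platform/]]path/to/test_file.py::test_name[parametrization] SKIP (tracking-bug URL)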