diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen13_tep4_bs1_eplb0_mtp0-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen13_tep4_bs1_eplb0_mtp0-Default.yaml index 2fdb13e410..a4ad607842 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen13_tep4_bs1_eplb0_mtp0-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen13_tep4_bs1_eplb0_mtp0-Default.yaml @@ -83,7 +83,7 @@ worker_config: max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 4 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen5_tep4_bs4_eplb0_mtp0-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen5_tep4_bs4_eplb0_mtp0-Default.yaml index c894c1e193..f2b1074ea4 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen5_tep4_bs4_eplb0_mtp0-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen5_tep4_bs4_eplb0_mtp0-Default.yaml @@ -78,12 +78,12 @@ worker_config: num_postprocess_workers: 4 allreduce_strategy: MNNVL ctx: - max_batch_size: 4 + max_batch_size: 1 max_num_tokens: 131104 max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 4 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3-Default.yaml index 8010f1aaaa..238aa8b6ef 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3-Default.yaml @@ -87,7 +87,7 @@ worker_config: max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 4 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen7_tep8_bs1_eplb0_mtp0-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen7_tep8_bs1_eplb0_mtp0-Default.yaml index 66f080aa4e..ca20d69038 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen7_tep8_bs1_eplb0_mtp0-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen7_tep8_bs1_eplb0_mtp0-Default.yaml @@ -83,7 +83,7 @@ worker_config: max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 4 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0-Default.yaml index 9462ae86bf..a22245dee9 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0-Default.yaml @@ -78,12 +78,12 @@ worker_config: num_postprocess_workers: 4 allreduce_strategy: MNNVL ctx: - max_batch_size: 2 + max_batch_size: 1 max_num_tokens: 131104 max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 4 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep8_bs1_eplb0_mtp0-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep8_bs1_eplb0_mtp0-Default.yaml index 0087e871ca..59d835780b 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep8_bs1_eplb0_mtp0-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep8_bs1_eplb0_mtp0-Default.yaml @@ -83,7 +83,7 @@ worker_config: max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 4 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen11_tep4_bs2_eplb0_mtp0-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen11_tep4_bs2_eplb0_mtp0-Default.yaml index 826b01a6bb..5be853b4ac 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen11_tep4_bs2_eplb0_mtp0-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen11_tep4_bs2_eplb0_mtp0-Default.yaml @@ -78,12 +78,12 @@ worker_config: num_postprocess_workers: 4 allreduce_strategy: MNNVL ctx: - max_batch_size: 2 + max_batch_size: 1 max_num_tokens: 131104 max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 8 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen14_tep4_bs1_eplb0_mtp0-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen14_tep4_bs1_eplb0_mtp0-Default.yaml index ada7c29e99..dfbf6222ed 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen14_tep4_bs1_eplb0_mtp0-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen14_tep4_bs1_eplb0_mtp0-Default.yaml @@ -83,7 +83,7 @@ worker_config: max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 8 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3-Default.yaml index ba0b4d003e..55972145a5 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3-Default.yaml @@ -86,7 +86,7 @@ worker_config: max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 8 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2-Default.yaml index 81fbcf581e..02bc0863a9 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2-Default.yaml @@ -81,12 +81,12 @@ worker_config: decoding_type: MTP num_nextn_predict_layers: 2 ctx: - max_batch_size: 4 + max_batch_size: 1 max_num_tokens: 131104 max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 8 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp0-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp0-Default.yaml index 675e3e66f2..fd3ad8c0f1 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp0-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp0-Default.yaml @@ -83,7 +83,7 @@ worker_config: max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 8 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3-Default.yaml index cabf7bcf5c..1cd58fc393 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3-Default.yaml @@ -87,7 +87,7 @@ worker_config: max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 8 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3-Default.yaml index 0aa4795d7c..1565c88347 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3-Default.yaml @@ -82,12 +82,12 @@ worker_config: decoding_type: MTP num_nextn_predict_layers: 3 ctx: - max_batch_size: 2 + max_batch_size: 1 max_num_tokens: 131104 max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 8 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3-Default.yaml index 278d4afa78..281ab82151 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3-Default.yaml @@ -81,12 +81,12 @@ worker_config: decoding_type: MTP num_nextn_predict_layers: 3 ctx: - max_batch_size: 2 + max_batch_size: 1 max_num_tokens: 131104 max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 8 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2-Default.yaml index 144a05c120..77e113fec2 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2-Default.yaml @@ -81,12 +81,12 @@ worker_config: decoding_type: MTP num_nextn_predict_layers: 2 ctx: - max_batch_size: 2 + max_batch_size: 1 max_num_tokens: 131104 max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 8 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep8_bs1_eplb0_mtp0-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep8_bs1_eplb0_mtp0-Default.yaml index 065c280cea..517b5c61e7 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep8_bs1_eplb0_mtp0-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep8_bs1_eplb0_mtp0-Default.yaml @@ -83,7 +83,7 @@ worker_config: max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 8 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen8_tep4_bs4_eplb0_mtp0-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen8_tep4_bs4_eplb0_mtp0-Default.yaml index 69f5763bb4..449fd368a3 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen8_tep4_bs4_eplb0_mtp0-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen8_tep4_bs4_eplb0_mtp0-Default.yaml @@ -78,12 +78,12 @@ worker_config: num_postprocess_workers: 4 allreduce_strategy: MNNVL ctx: - max_batch_size: 4 + max_batch_size: 1 max_num_tokens: 131104 max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 8 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3-Default.yaml index c59bbe4ac9..d794643060 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3-Default.yaml @@ -81,12 +81,12 @@ worker_config: num_nextn_predict_layers: 3 allreduce_strategy: MNNVL ctx: - max_batch_size: 2 + max_batch_size: 1 max_num_tokens: 131104 max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 4 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep32_bs2_eplb0_mtp0-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep32_bs2_eplb0_mtp0-Default.yaml index 63b81d5c7a..d547dae706 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep32_bs2_eplb0_mtp0-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep32_bs2_eplb0_mtp0-Default.yaml @@ -77,12 +77,12 @@ worker_config: stream_interval: 20 num_postprocess_workers: 4 ctx: - max_batch_size: 2 + max_batch_size: 1 max_num_tokens: 131104 max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 8 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1-Default.yaml index 6dee7193fc..90d2770057 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1-Default.yaml @@ -80,12 +80,12 @@ worker_config: decoding_type: MTP num_nextn_predict_layers: 1 ctx: - max_batch_size: 16 + max_batch_size: 1 max_num_tokens: 131104 max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 4 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs16_eplb0_mtp0-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs16_eplb0_mtp0-Default.yaml index 5578abce0b..1c935ff7c4 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs16_eplb0_mtp0-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs16_eplb0_mtp0-Default.yaml @@ -77,12 +77,12 @@ worker_config: stream_interval: 20 num_postprocess_workers: 4 ctx: - max_batch_size: 16 + max_batch_size: 1 max_num_tokens: 131104 max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 8 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2-Default.yaml index 1d5531383d..ee9d98cdaa 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2-Default.yaml @@ -81,12 +81,12 @@ worker_config: decoding_type: MTP num_nextn_predict_layers: 2 ctx: - max_batch_size: 8 + max_batch_size: 1 max_num_tokens: 131104 max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 8 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3-Default.yaml index 12246924da..d69db0a1ca 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3-Default.yaml @@ -81,12 +81,12 @@ worker_config: decoding_type: MTP num_nextn_predict_layers: 3 ctx: - max_batch_size: 2 + max_batch_size: 1 max_num_tokens: 131104 max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 8 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0-Default.yaml index 21735da398..a51d1073e3 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0-Default.yaml @@ -77,12 +77,12 @@ worker_config: stream_interval: 20 num_postprocess_workers: 4 ctx: - max_batch_size: 4 + max_batch_size: 1 max_num_tokens: 131104 max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 8 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs16_eplb0_mtp0-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs16_eplb0_mtp0-Default.yaml index 74477f81a2..05d6a10d32 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs16_eplb0_mtp0-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs16_eplb0_mtp0-Default.yaml @@ -77,12 +77,12 @@ worker_config: stream_interval: 20 num_postprocess_workers: 4 ctx: - max_batch_size: 16 + max_batch_size: 1 max_num_tokens: 131104 max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 4 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3-Default.yaml index bf927bc583..5befdee833 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3-Default.yaml @@ -81,12 +81,12 @@ worker_config: decoding_type: MTP num_nextn_predict_layers: 3 ctx: - max_batch_size: 8 + max_batch_size: 1 max_num_tokens: 131104 max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 4 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3-Default.yaml index 693a9c995f..e2bcac6224 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3-Default.yaml @@ -81,12 +81,12 @@ worker_config: decoding_type: MTP num_nextn_predict_layers: 3 ctx: - max_batch_size: 2 + max_batch_size: 1 max_num_tokens: 131104 max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 4 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs4_eplb0_mtp0-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs4_eplb0_mtp0-Default.yaml index 83967207ee..d449c173c7 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs4_eplb0_mtp0-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs4_eplb0_mtp0-Default.yaml @@ -77,12 +77,12 @@ worker_config: stream_interval: 20 num_postprocess_workers: 4 ctx: - max_batch_size: 4 + max_batch_size: 1 max_num_tokens: 131104 max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 4 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1-Default.yaml index 90a2879c52..90ed3bd0d3 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1-Default.yaml @@ -81,12 +81,12 @@ worker_config: decoding_type: MTP num_nextn_predict_layers: 1 ctx: - max_batch_size: 16 + max_batch_size: 1 max_num_tokens: 131104 max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 4 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs32_eplb0_mtp0-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs32_eplb0_mtp0-Default.yaml index 80da2262de..2eed9c9959 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs32_eplb0_mtp0-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs32_eplb0_mtp0-Default.yaml @@ -77,12 +77,12 @@ worker_config: stream_interval: 20 num_postprocess_workers: 4 ctx: - max_batch_size: 32 + max_batch_size: 1 max_num_tokens: 131104 max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 4 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1-Default.yaml index cb0ab7c034..c9226167aa 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1-Default.yaml @@ -81,12 +81,12 @@ worker_config: decoding_type: MTP num_nextn_predict_layers: 1 ctx: - max_batch_size: 32 + max_batch_size: 1 max_num_tokens: 131104 max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 4 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3-Default.yaml index 9e94dd864a..e92e50d77f 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3-Default.yaml @@ -81,12 +81,12 @@ worker_config: decoding_type: MTP num_nextn_predict_layers: 3 ctx: - max_batch_size: 4 + max_batch_size: 1 max_num_tokens: 131104 max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 4 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp0-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp0-Default.yaml index 984fb5bb6d..fb0c3d5483 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp0-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp0-Default.yaml @@ -77,12 +77,12 @@ worker_config: stream_interval: 20 num_postprocess_workers: 4 ctx: - max_batch_size: 8 + max_batch_size: 1 max_num_tokens: 131104 max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 4 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3-Default.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3-Default.yaml index 0b1402b3cc..8740d2861c 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3-Default.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3-Default.yaml @@ -81,12 +81,12 @@ worker_config: decoding_type: MTP num_nextn_predict_layers: 3 ctx: - max_batch_size: 8 + max_batch_size: 1 max_num_tokens: 131104 max_seq_len: 131104 tensor_parallel_size: 1 moe_expert_parallel_size: 1 - enable_attention_dp: true + enable_attention_dp: false pipeline_parallel_size: 4 print_iter_log: true cuda_graph_config: null diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index d70d1c9f40..62a671c099 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -238,6 +238,7 @@ TRUST_REMOTE_CODE_MODELS = { # these models require explicit trust_remote_code= "llama_v3.3_nemotron_super_49b_fp8", "llama_v3.1_nemotron_ultra_253b", "llama_v3.1_nemotron_ultra_253b_fp8", + "kimi_k2_nvfp4", } # Autodeploy model configs - maps model name to config file path (relative to TRT-LLM root)