diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml
index 4dd4d7fb46..460a48e8e2 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml
@@ -43,13 +43,12 @@ environment:
 profiling:
   nsys_on: false
 accuracy:
-  enable_accuracy_test: false
+  enable_accuracy_test: false # Set to true to enable accuracy evaluation
   model: local-completions
   tasks: gsm8k
   model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
-    enable_layerwise_nvtx_marker: true
     tensor_parallel_size: 32
     moe_expert_parallel_size: 32
     enable_attention_dp: true
@@ -80,17 +79,20 @@ worker_config:
       free_gpu_memory_fraction: 0.9
       dtype: fp8
     moe_config:
-      backend: WIDEEP
-      load_balancer:
-        num_slots: 288
-        layer_updates_per_iter: 1
+      backend: CUTEDSL
+      use_low_precision_moe_combine: true
+      nvfp4_gemm_config:
+        allowed_backends:
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
     cache_transceiver_config:
       max_tokens_in_buffer: 4608
       backend: NIXL
     stream_interval: 20
     num_postprocess_workers: 4
   ctx:
-    enable_layerwise_nvtx_marker: true
     max_batch_size: 4
     max_num_tokens: 4608
     max_seq_len: 2251
@@ -101,6 +103,8 @@ worker_config:
     print_iter_log: true
     cuda_graph_config: null
     disable_overlap_scheduler: true
+    moe_config:
+      backend: TRTLLM
     kv_cache_config:
       enable_block_reuse: false
       free_gpu_memory_fraction: 0.85
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml
index ca80042c69..dff8eec4d9 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml
@@ -49,7 +49,6 @@ accuracy:
   model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
-    enable_layerwise_nvtx_marker: true
     tensor_parallel_size: 16
     moe_expert_parallel_size: 16
     enable_attention_dp: true
@@ -80,10 +79,14 @@ worker_config:
       free_gpu_memory_fraction: 0.9
       dtype: fp8
     moe_config:
-      backend: WIDEEP
-      load_balancer:
-        num_slots: 288
-        layer_updates_per_iter: 1
+      backend: CUTEDSL
+      use_low_precision_moe_combine: true
+      nvfp4_gemm_config:
+        allowed_backends:
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
     cache_transceiver_config:
       max_tokens_in_buffer: 4608
       backend: NIXL
@@ -93,7 +96,6 @@ worker_config:
       decoding_type: MTP
       num_nextn_predict_layers: 3
   ctx:
-    enable_layerwise_nvtx_marker: true
     max_batch_size: 4
     max_num_tokens: 4608
     max_seq_len: 2251
@@ -104,6 +106,8 @@ worker_config:
     print_iter_log: true
     cuda_graph_config: null
     disable_overlap_scheduler: true
+    moe_config:
+      backend: TRTLLM
     kv_cache_config:
       enable_block_reuse: false
       free_gpu_memory_fraction: 0.85
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml
index 76f4f78276..f3fe861df0 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml
@@ -50,7 +50,6 @@ accuracy:
   model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
-    enable_layerwise_nvtx_marker: true
     tensor_parallel_size: 48
     moe_expert_parallel_size: 48
     enable_attention_dp: true
@@ -81,16 +80,19 @@ worker_config:
       free_gpu_memory_fraction: 0.7
       dtype: fp8
     moe_config:
-      backend: WIDEEP
-      load_balancer:
-        num_slots: 288
-        layer_updates_per_iter: 1
+      backend: CUTEDSL
+      use_low_precision_moe_combine: true
+      nvfp4_gemm_config:
+        allowed_backends:
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
     cache_transceiver_config:
       max_tokens_in_buffer: 8320
       backend: DEFAULT
     stream_interval: 20
   ctx:
-    enable_layerwise_nvtx_marker: true
     max_batch_size: 4
     max_num_tokens: 4480
     max_seq_len: 2176
@@ -101,6 +103,8 @@ worker_config:
     print_iter_log: true
     cuda_graph_config: null
     disable_overlap_scheduler: true
+    moe_config:
+      backend: TRTLLM
     kv_cache_config:
       enable_block_reuse: false
       free_gpu_memory_fraction: 0.85
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml
index 4a91160a99..f24b1a5e9e 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml
@@ -49,7 +49,6 @@ accuracy:
   model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
-    enable_layerwise_nvtx_marker: true
     tensor_parallel_size: 32
     moe_expert_parallel_size: 32
     enable_attention_dp: true
@@ -81,10 +80,14 @@ worker_config:
       free_gpu_memory_fraction: 0.6
       dtype: fp8
     moe_config:
-      backend: WIDEEP
-      load_balancer:
-        num_slots: 288
-        layer_updates_per_iter: 1
+      backend: CUTEDSL
+      use_low_precision_moe_combine: true
+      nvfp4_gemm_config:
+        allowed_backends:
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
     cache_transceiver_config:
       max_tokens_in_buffer: 8448
       backend: DEFAULT
@@ -94,7 +97,6 @@ worker_config:
       decoding_type: MTP
       num_nextn_predict_layers: 3
   ctx:
-    enable_layerwise_nvtx_marker: true
     max_batch_size: 1
     max_num_tokens: 8448
     max_seq_len: 9423
@@ -109,6 +111,8 @@ worker_config:
       enable_block_reuse: false
       free_gpu_memory_fraction: 0.75
       dtype: fp8
+    moe_config:
+      backend: TRTLLM
     cache_transceiver_config:
       max_tokens_in_buffer: 8448
       backend: DEFAULT
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml
index c262e3f661..2516e69a98 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml
@@ -49,7 +49,6 @@ accuracy:
   model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
-    enable_layerwise_nvtx_marker: true
     tensor_parallel_size: 16
     moe_expert_parallel_size: 16
     enable_attention_dp: true
@@ -80,17 +79,20 @@ worker_config:
       free_gpu_memory_fraction: 0.7
       dtype: fp8
     moe_config:
-      backend: WIDEEP
-      load_balancer:
-        num_slots: 288
-        layer_updates_per_iter: 1
+      backend: CUTEDSL
+      use_low_precision_moe_combine: true
+      nvfp4_gemm_config:
+        allowed_backends:
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
     cache_transceiver_config:
       max_tokens_in_buffer: 8448
       backend: NIXL
     stream_interval: 20
     num_postprocess_workers: 4
   ctx:
-    enable_layerwise_nvtx_marker: true
     max_batch_size: 1
     max_num_tokens: 8448
     max_seq_len: 9419
@@ -105,6 +107,8 @@ worker_config:
       enable_block_reuse: false
       free_gpu_memory_fraction: 0.75
       dtype: fp8
+    moe_config:
+      backend: TRTLLM
     cache_transceiver_config:
       max_tokens_in_buffer: 8448
       backend: NIXL
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml
index 0b7bc63e3f..705cc33ea7 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml
@@ -49,7 +49,6 @@ accuracy:
   model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
-    enable_layerwise_nvtx_marker: true
     tensor_parallel_size: 32
     moe_expert_parallel_size: 32
     enable_attention_dp: true
@@ -80,10 +79,14 @@ worker_config:
       free_gpu_memory_fraction: 0.7
       dtype: fp8
     moe_config:
-      backend: WIDEEP
-      load_balancer:
-        num_slots: 288
-        layer_updates_per_iter: 1
+      backend: CUTEDSL
+      use_low_precision_moe_combine: true
+      nvfp4_gemm_config:
+        allowed_backends:
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
     cache_transceiver_config:
       max_tokens_in_buffer: 8448
       backend: NIXL
@@ -93,7 +96,6 @@ worker_config:
       decoding_type: MTP
       num_nextn_predict_layers: 3
   ctx:
-    enable_layerwise_nvtx_marker: true
     max_batch_size: 1
     max_num_tokens: 8448
     max_seq_len: 9419
@@ -108,6 +110,8 @@ worker_config:
      enable_block_reuse: false
       free_gpu_memory_fraction: 0.75
       dtype: fp8
+    moe_config:
+      backend: TRTLLM
     cache_transceiver_config:
       max_tokens_in_buffer: 8448
       backend: NIXL
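Across all six configs the core change is the same: the gen workers drop the WIDEEP MoE backend together with its EPLB load_balancer (num_slots: 288, layer_updates_per_iter: 1) and switch to the CUTEDSL backend with use_low_precision_moe_combine and an explicit allow-list of NVFP4 GEMM backends, the ctx workers gain an explicit moe_config pinned to TRTLLM, and the enable_layerwise_nvtx_marker flag is removed from both roles. For reference, the sketch below shows how the affected moe_config blocks are expected to look after the change; the key names and values are taken from the diff hunks, while the surrounding structure and indentation are illustrative rather than copied verbatim from the full files.

```yaml
worker_config:
  gen:
    moe_config:
      backend: CUTEDSL                     # replaces WIDEEP + EPLB load_balancer
      use_low_precision_moe_combine: true
      nvfp4_gemm_config:
        allowed_backends:                  # NVFP4 GEMM backends allowed for MoE, as listed in the diff
        - cutlass
        - cublaslt
        - cutedsl
        - cuda_core
  ctx:
    moe_config:
      backend: TRTLLM                      # ctx workers now pin the MoE backend explicitly
```

Note that after this change the two roles run different MoE backends: CUTEDSL on the generation side and TRTLLM on the context side.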