fix (NvBug 5354925): Fix static EPLB (#5411)
Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
Parent commit: da98e03747
This commit:   76da7fed86
@@ -27,6 +27,7 @@ Run 32-way expert parallelism inference on the prepared dataset. Please refer to
 cat > ./extra_llm_api_options.yaml <<EOF
 enable_attention_dp: true
 use_cuda_graph: true
 moe_backend: WideEP
+moe_max_num_tokens: 8192
 EOF
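The file written above is a set of LLM API overrides. As a rough, hedged sketch of how such a file is consumed (not part of this commit): the YAML keys are loaded and merged into the LLM API arguments, which is effectively what the serve/benchmark entry points do when given an extra-options file. The model name and the assumption that every key maps one-to-one onto an LLM constructor keyword are illustrative only and may differ across TensorRT-LLM versions.

# Minimal sketch (assumption-laden, not from this commit) of how an overrides file
# like extra_llm_api_options.yaml is typically consumed.
import yaml

from tensorrt_llm import LLM

with open("./extra_llm_api_options.yaml") as f:
    extra_options = yaml.safe_load(f)
# e.g. {'enable_attention_dp': True, 'use_cuda_graph': True,
#       'moe_backend': 'WideEP', 'moe_max_num_tokens': 8192}

# Hypothetical: assumes each key is forwarded as an LLM keyword argument and that
# this model identifier is the one used in the surrounding guide.
llm = LLM(model="deepseek-ai/DeepSeek-R1", **extra_options)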
@@ -116,6 +117,7 @@ Run 36-way expert parallelism inference with the EPLB configuration incorporated
 cat > ./extra_llm_api_options_eplb.yaml <<EOF
 enable_attention_dp: true
 use_cuda_graph: true
 moe_backend: WideEP
+moe_max_num_tokens: 9216
 moe_load_balancer: ./moe_load_balancer.yaml
 EOF
@@ -181,6 +183,7 @@ Run 36-way expert parallelism inference with the EPLB configuration incorporated
 cat > ./extra_llm_api_options_eplb.yaml <<EOF
 enable_attention_dp: true
 use_cuda_graph: true
 moe_backend: WideEP
+moe_max_num_tokens: 9216
 moe_load_balancer: ./moe_load_balancer.yaml
 EOF
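The moe_load_balancer entry in these hunks points to a separate EPLB configuration file whose contents depend on whether online or static (offline) load balancing is configured. A hypothetical pre-flight check, not part of this commit, is to confirm the referenced file exists and parses before launching:

# Hypothetical sanity check (not from this commit): the referenced EPLB config
# must be present and be valid YAML; its keys are not assumed here.
import os

import yaml

with open("./extra_llm_api_options_eplb.yaml") as f:
    opts = yaml.safe_load(f)

lb_path = opts["moe_load_balancer"]
assert os.path.exists(lb_path), f"missing EPLB config: {lb_path}"

with open(lb_path) as f:
    # Contents depend on the EPLB mode (e.g. slot count, static expert assignments).
    print(yaml.safe_load(f))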
@@ -196,7 +196,7 @@ class WideEPMoE(MoE):
                 hidden_size, self.num_slots)
         else:
             raise NotImplementedError(
-                f"Not available alltoall method type: {alltoall_method_type!r}"
+                f"Not available alltoall method type: {self.alltoall_method_type!r}"
             )

         # If True, the router weight will be multiplied on the input rather than at the end of FC2
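All four code hunks apply the same one-line fix. As a standalone illustration of the bug class (a toy class, not the TensorRT-LLM code itself): the f-string in the error path referenced a name that is not defined in the method scope, so if no local of that name exists, reaching that branch raises NameError instead of the intended NotImplementedError; referencing the instance attribute restores the intended error.

# Toy illustration of the bug class fixed in these hunks (not WideEPMoE).
class Example:
    def __init__(self, alltoall_method_type):
        self.alltoall_method_type = alltoall_method_type

    def dispatch_buggy(self):
        # `alltoall_method_type` is not defined in this scope, so evaluating the
        # f-string raises NameError before NotImplementedError can be raised.
        raise NotImplementedError(
            f"Not available alltoall method type: {alltoall_method_type!r}")

    def dispatch_fixed(self):
        # Referencing the instance attribute produces the intended error message.
        raise NotImplementedError(
            f"Not available alltoall method type: {self.alltoall_method_type!r}")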
@@ -433,7 +433,7 @@ class WideEPMoE(MoE):
                 token_selected_slots, dtype=token_final_scales.dtype)
         else:
             raise NotImplementedError(
-                f"Not available alltoall method type: {alltoall_method_type!r}"
+                f"Not available alltoall method type: {self.alltoall_method_type!r}"
             )

         x_sf = None
@@ -552,7 +552,7 @@ class WideEPMoE(MoE):
             )
         else:
             raise NotImplementedError(
-                f"Not available alltoall method type: {alltoall_method_type!r}"
+                f"Not available alltoall method type: {self.alltoall_method_type!r}"
             )

         if self.enable_alltoall:
if self.enable_alltoall:
|
||||
@ -631,7 +631,7 @@ class WideEPMoE(MoE):
|
||||
deep_ep_topk_weights, deep_ep_handle)
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
f"Not available alltoall method type: {alltoall_method_type!r}"
|
||||
f"Not available alltoall method type: {self.alltoall_method_type!r}"
|
||||
)
|
||||
|
||||
if self.layer_load_balancer and not self.layer_load_balancer.is_static_routing(
|
||||
|
||||
@@ -441,4 +441,3 @@ accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5354946)
 examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5354936)
 examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5354936)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus_static_eplb SKIP (https://nvbugs/5354925)