fix (NvBug 5354925): Fix static EPLB (#5411)

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
This commit is contained in:
Enwei Zhu 2025-06-25 13:14:40 +08:00 committed by GitHub
parent da98e03747
commit 76da7fed86
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 7 additions and 5 deletions

View File

@@ -27,6 +27,7 @@ Run 32-way expert parallelism inference on the prepared dataset. Please refer to
cat > ./extra_llm_api_options.yaml <<EOF
enable_attention_dp: true
use_cuda_graph: true
+moe_backend: WideEP
moe_max_num_tokens: 8192
EOF
@@ -116,6 +117,7 @@ Run 36-way expert parallelism inference with the EPLB configuration incorporated
cat > ./extra_llm_api_options_eplb.yaml <<EOF
enable_attention_dp: true
use_cuda_graph: true
+moe_backend: WideEP
moe_max_num_tokens: 9216
moe_load_balancer: ./moe_load_balancer.yaml
EOF
@@ -181,6 +183,7 @@ Run 36-way expert parallelism inference with the EPLB configuration incorporated
cat > ./extra_llm_api_options_eplb.yaml <<EOF
enable_attention_dp: true
use_cuda_graph: true
+moe_backend: WideEP
moe_max_num_tokens: 9216
moe_load_balancer: ./moe_load_balancer.yaml
EOF

View File

@@ -196,7 +196,7 @@ class WideEPMoE(MoE):
hidden_size, self.num_slots)
else:
raise NotImplementedError(
-f"Not available alltoall method type: {alltoall_method_type!r}"
+f"Not available alltoall method type: {self.alltoall_method_type!r}"
)
# If True, the router weight will be multiplied on the input rather than at the end of FC2
@@ -433,7 +433,7 @@ class WideEPMoE(MoE):
token_selected_slots, dtype=token_final_scales.dtype)
else:
raise NotImplementedError(
-f"Not available alltoall method type: {alltoall_method_type!r}"
+f"Not available alltoall method type: {self.alltoall_method_type!r}"
)
x_sf = None
@@ -552,7 +552,7 @@ class WideEPMoE(MoE):
)
else:
raise NotImplementedError(
-f"Not available alltoall method type: {alltoall_method_type!r}"
+f"Not available alltoall method type: {self.alltoall_method_type!r}"
)
if self.enable_alltoall:
@@ -631,7 +631,7 @@ class WideEPMoE(MoE):
deep_ep_topk_weights, deep_ep_handle)
else:
raise NotImplementedError(
-f"Not available alltoall method type: {alltoall_method_type!r}"
+f"Not available alltoall method type: {self.alltoall_method_type!r}"
)
if self.layer_load_balancer and not self.layer_load_balancer.is_static_routing(

View File

@@ -441,4 +441,3 @@ accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5354946)
examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5354936)
examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5354936)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus_static_eplb SKIP (https://nvbugs/5354925)