fix (NvBug 5354925): Fix static EPLB (#5411)
Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
Parent commit: da98e03747
This commit:   76da7fed86
@@ -27,6 +27,7 @@ Run 32-way expert parallelism inference on the prepared dataset. Please refer to
 cat > ./extra_llm_api_options.yaml <<EOF
 enable_attention_dp: true
 use_cuda_graph: true
 moe_backend: WideEP
+moe_max_num_tokens: 8192
 EOF
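The file written above is a set of LLM API overrides. As a rough, hedged sketch of how such a file is consumed (not part of this commit): the YAML keys are loaded and merged into the LLM API arguments, which is effectively what the serve/benchmark entry points do when given an extra-options file. The model name and the assumption that every key maps one-to-one onto an LLM constructor keyword are illustrative only and may differ across TensorRT-LLM versions.

# Minimal sketch (assumption-laden, not from this commit) of how an overrides file
# like extra_llm_api_options.yaml is typically consumed.
import yaml

from tensorrt_llm import LLM

with open("./extra_llm_api_options.yaml") as f:
    extra_options = yaml.safe_load(f)
# e.g. {'enable_attention_dp': True, 'use_cuda_graph': True,
#       'moe_backend': 'WideEP', 'moe_max_num_tokens': 8192}

# Hypothetical: assumes each key is forwarded as an LLM keyword argument and that
# this model identifier is the one used in the surrounding guide.
llm = LLM(model="deepseek-ai/DeepSeek-R1", **extra_options)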
@@ -116,6 +117,7 @@ Run 36-way expert parallelism inference with the EPLB configuration incorporated
 cat > ./extra_llm_api_options_eplb.yaml <<EOF
 enable_attention_dp: true
 use_cuda_graph: true
 moe_backend: WideEP
+moe_max_num_tokens: 9216
 moe_load_balancer: ./moe_load_balancer.yaml
 EOF
@@ -181,6 +183,7 @@ Run 36-way expert parallelism inference with the EPLB configuration incorporated
 cat > ./extra_llm_api_options_eplb.yaml <<EOF
 enable_attention_dp: true
 use_cuda_graph: true
 moe_backend: WideEP
+moe_max_num_tokens: 9216
 moe_load_balancer: ./moe_load_balancer.yaml
 EOF
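The moe_load_balancer entry in these hunks points to a separate EPLB configuration file whose contents depend on whether online or static (offline) load balancing is configured. A hypothetical pre-flight check, not part of this commit, is to confirm the referenced file exists and parses before launching:

# Hypothetical sanity check (not from this commit): the referenced EPLB config
# must be present and be valid YAML; its keys are not assumed here.
import os

import yaml

with open("./extra_llm_api_options_eplb.yaml") as f:
    opts = yaml.safe_load(f)

lb_path = opts["moe_load_balancer"]
assert os.path.exists(lb_path), f"missing EPLB config: {lb_path}"

with open(lb_path) as f:
    # Contents depend on the EPLB mode (e.g. slot count, static expert assignments).
    print(yaml.safe_load(f))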
@@ -196,7 +196,7 @@ class WideEPMoE(MoE):
                 hidden_size, self.num_slots)
         else:
             raise NotImplementedError(
-                f"Not available alltoall method type: {alltoall_method_type!r}"
+                f"Not available alltoall method type: {self.alltoall_method_type!r}"
             )

         # If True, the router weight will be multiplied on the input rather than at the end of FC2
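All four code hunks apply the same one-line fix. As a standalone illustration of the bug class (a toy class, not the TensorRT-LLM code itself): the f-string in the error path referenced a name that is not defined in the method scope, so if no local of that name exists, reaching that branch raises NameError instead of the intended NotImplementedError; referencing the instance attribute restores the intended error.

# Toy illustration of the bug class fixed in these hunks (not WideEPMoE).
class Example:
    def __init__(self, alltoall_method_type):
        self.alltoall_method_type = alltoall_method_type

    def dispatch_buggy(self):
        # `alltoall_method_type` is not defined in this scope, so evaluating the
        # f-string raises NameError before NotImplementedError can be raised.
        raise NotImplementedError(
            f"Not available alltoall method type: {alltoall_method_type!r}")

    def dispatch_fixed(self):
        # Referencing the instance attribute produces the intended error message.
        raise NotImplementedError(
            f"Not available alltoall method type: {self.alltoall_method_type!r}")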
@@ -433,7 +433,7 @@ class WideEPMoE(MoE):
                 token_selected_slots, dtype=token_final_scales.dtype)
         else:
             raise NotImplementedError(
-                f"Not available alltoall method type: {alltoall_method_type!r}"
+                f"Not available alltoall method type: {self.alltoall_method_type!r}"
             )

         x_sf = None
@@ -552,7 +552,7 @@ class WideEPMoE(MoE):
             )
         else:
             raise NotImplementedError(
-                f"Not available alltoall method type: {alltoall_method_type!r}"
+                f"Not available alltoall method type: {self.alltoall_method_type!r}"
             )

         if self.enable_alltoall:
if self.enable_alltoall:
|
||||
@ -631,7 +631,7 @@ class WideEPMoE(MoE):
|
||||
deep_ep_topk_weights, deep_ep_handle)
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
f"Not available alltoall method type: {alltoall_method_type!r}"
|
||||
f"Not available alltoall method type: {self.alltoall_method_type!r}"
|
||||
)
|
||||
|
||||
if self.layer_load_balancer and not self.layer_load_balancer.is_static_routing(
|
||||
|
||||
@@ -441,4 +441,3 @@ accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5354946)
 examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5354936)
 examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5354936)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus_static_eplb SKIP (https://nvbugs/5354925)