[https://nvbugs/5848377][fix] fix deepeplowlatency with trtllm moe backend running fp8 DS_R1 (#11266)

Signed-off-by: leslie-fang25 <leslief@nvidia.com>
Signed-off-by: Leslie Fang <leslief@nvidia.com>
Co-authored-by: Tailing Yuan <yuantailing@gmail.com>
Leslie Fang 2026-02-10 20:09:00 +08:00 committed by GitHub
parent cf02456613
commit d6e49542bd
3 changed files with 12 additions and 1 deletion


@@ -283,6 +283,10 @@ class DeepEPLowLatency(Communication):
self.expert_size_per_partition, num_tokens_per_expert, self.hidden_size
)
if deep_ep_topk_weights.dtype != torch.float32:
# DeepEP low latency combine requires fp32 weights
deep_ep_topk_weights = deep_ep_topk_weights.to(torch.float32)
if self.use_low_precision_combine:
if self._has_nvfp4():
precision = "nvfp4"
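
A minimal standalone sketch of the dtype guard added in this hunk, assuming the routing weights can arrive as bfloat16 when the TRTLLM MoE backend runs an fp8 model; ensure_fp32_topk_weights is a hypothetical helper name, not part of the patched DeepEPLowLatency class:

    import torch

    def ensure_fp32_topk_weights(topk_weights: torch.Tensor) -> torch.Tensor:
        # The DeepEP low-latency combine expects float32 top-k weights,
        # so coerce anything else (e.g. bfloat16) up front.
        if topk_weights.dtype != torch.float32:
            topk_weights = topk_weights.to(torch.float32)
        return topk_weights

    # Example: bfloat16 weights become float32 before the combine call.
    w = torch.rand(4, 8, dtype=torch.bfloat16)
    assert ensure_fp32_topk_weights(w).dtype == torch.float32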


@@ -756,7 +756,10 @@ class ConfigurableMoE(MoE):
if self.enable_dummy_allreduce:
self.dummy_allreduce()
# Use unified combine interface (reads dispatch state from strategy)
final_hidden_states = self.comm.combine(final_hidden_states)
all_rank_max_num_tokens = max(all_rank_num_tokens)
final_hidden_states = self.comm.combine(
final_hidden_states, all_rank_max_num_tokens=all_rank_max_num_tokens
)
else:
# For the non-comm case, it should be attention TP or a single rank.
# only check if allreduce is needed
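
A sketch of the call pattern this hunk introduces: compute the maximum per-rank token count and pass it to the communication backend's combine(). Here comm stands in for the DeepEP low-latency communication object; this is an illustrative wrapper, not the ConfigurableMoE code itself:

    from typing import List
    import torch

    def combine_with_max_tokens(comm, final_hidden_states: torch.Tensor,
                                all_rank_num_tokens: List[int]) -> torch.Tensor:
        # Derive the worst-case token count across ranks and forward it to combine().
        all_rank_max_num_tokens = max(all_rank_num_tokens)
        return comm.combine(final_hidden_states,
                            all_rank_max_num_tokens=all_rank_max_num_tokens)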


@@ -531,6 +531,10 @@ class TRTLLMGenFusedMoE(MoE):
routing_bias = routing_bias if router_logits is not None else None
if token_selected_experts is not None:
# For cases like DeepEP low latency where a fake top_k=1 might be used
top_k = token_selected_experts.shape[-1]
# Ensure x_sf is 2D before flattening
if x_sf is not None:
assert len(
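
A small sketch of the top_k override shown in this hunk: when the dispatcher hands back token_selected_experts (e.g. DeepEP low latency using a fake top_k=1), the effective top_k is taken from that tensor's last dimension instead of the configured router value. effective_top_k is a hypothetical helper for illustration, not the TRTLLMGenFusedMoE code:

    import torch

    def effective_top_k(token_selected_experts, default_top_k: int) -> int:
        # Trust the dispatcher-provided expert indices when present; their last
        # dimension is the number of experts actually selected per token.
        if token_selected_experts is not None:
            return token_selected_experts.shape[-1]
        return default_top_k

    # Example: a [num_tokens, 1] expert-index tensor implies an effective top_k of 1.
    ids = torch.zeros(16, 1, dtype=torch.int32)
    assert effective_top_k(ids, default_top_k=8) == 1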