Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-02-16 15:55:08 +08:00)
[https://nvbugs/5848377][fix] fix deepeplowlatency with trtllm moe backend running fp8 DS_R1 (#11266)
Signed-off-by: leslie-fang25 <leslief@nvidia.com>
Signed-off-by: Leslie Fang <leslief@nvidia.com>
Co-authored-by: Tailing Yuan <yuantailing@gmail.com>
This commit is contained in:
parent cf02456613
commit d6e49542bd
@@ -283,6 +283,10 @@ class DeepEPLowLatency(Communication):
             self.expert_size_per_partition, num_tokens_per_expert, self.hidden_size
         )
 
+        if deep_ep_topk_weights.dtype != torch.float32:
+            # Deep ep low latency combine requires for fp32 weights
+            deep_ep_topk_weights = deep_ep_topk_weights.to(torch.float32)
+
         if self.use_low_precision_combine:
             if self._has_nvfp4():
                 precision = "nvfp4"
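The hunk above casts the DeepEP top-k routing weights to fp32 before the low-latency combine. A minimal standalone sketch of the same guard, outside the DeepEPLowLatency class (the helper name and tensor shapes here are illustrative, not from the repository):

import torch

def ensure_fp32_topk_weights(topk_weights: torch.Tensor) -> torch.Tensor:
    # The low-latency combine path expects fp32 routing weights, so upcast when needed.
    if topk_weights.dtype != torch.float32:
        topk_weights = topk_weights.to(torch.float32)
    return topk_weights

# Example: bfloat16 weights produced by the router get upcast before combine.
weights = torch.rand(8, 4, dtype=torch.bfloat16)
weights = ensure_fp32_topk_weights(weights)
assert weights.dtype == torch.float32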
@@ -756,7 +756,10 @@ class ConfigurableMoE(MoE):
             if self.enable_dummy_allreduce:
                 self.dummy_allreduce()
             # Use unified combine interface (reads dispatch state from strategy)
-            final_hidden_states = self.comm.combine(final_hidden_states)
+            all_rank_max_num_tokens = max(all_rank_num_tokens)
+            final_hidden_states = self.comm.combine(
+                final_hidden_states, all_rank_max_num_tokens=all_rank_max_num_tokens
+            )
         else:
             # For non-comm case, It should be attention TP or single rank.
             # only check if allreduce is needed
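The ConfigurableMoE hunk passes the maximum per-rank token count into the combine call instead of calling it with the hidden states alone. A minimal sketch of that call pattern, assuming a comm object whose combine accepts an all_rank_max_num_tokens keyword as shown above; the stub class and shapes are illustrative only:

import torch

class CombineStub:
    # Illustrative stand-in for the communication strategy; a real backend would
    # exchange tokens across ranks, padded to the maximum token count.
    def combine(self, hidden_states: torch.Tensor, all_rank_max_num_tokens: int) -> torch.Tensor:
        return hidden_states[:all_rank_max_num_tokens]

all_rank_num_tokens = [3, 7, 5]           # tokens handled by each rank
final_hidden_states = torch.zeros(7, 16)  # padded local buffer

# Mirrors the updated call site: derive the padding bound from all ranks first.
all_rank_max_num_tokens = max(all_rank_num_tokens)
final_hidden_states = CombineStub().combine(
    final_hidden_states, all_rank_max_num_tokens=all_rank_max_num_tokens
)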
@@ -531,6 +531,10 @@ class TRTLLMGenFusedMoE(MoE):
 
         routing_bias = routing_bias if router_logits is not None else None
 
+        if token_selected_experts is not None:
+            # for cases like deepep low latency where fake top_k=1 might be used
+            top_k = token_selected_experts.shape[-1]
+
         # Ensure x_sf is 2D before flattening
         if x_sf is not None:
             assert len(
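The TRTLLMGenFusedMoE hunk re-derives top_k from the last dimension of token_selected_experts rather than relying on the configured value, which matters when DeepEP low latency dispatches with a fake top_k of 1. A minimal sketch of the same derivation (the tensor contents and the configured value are made up for illustration):

import torch

configured_top_k = 8

# [num_tokens, top_k] expert indices as seen by the fused-MoE kernel; under
# DeepEP low latency the dispatcher may hand over a tensor with top_k == 1.
token_selected_experts = torch.randint(0, 256, (16, 1), dtype=torch.int32)

top_k = configured_top_k
if token_selected_experts is not None:
    # Trust the tensor shape over the configured value.
    top_k = token_selected_experts.shape[-1]

assert top_k == 1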