From f7de285a825aca801bb23df96174c7c64821c4af Mon Sep 17 00:00:00 2001
From: Void <18275976+yilin-void@users.noreply.github.com>
Date: Thu, 15 Jan 2026 11:15:29 +0800
Subject: [PATCH] [None][fix] add quantization check for DeepEP LL low precision combine in new moe comm api (#10072)

Signed-off-by: Yilin Zhang <18275976+yilin-void@users.noreply.github.com>
---
 .../fused_moe/communication/deep_ep_low_latency.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tensorrt_llm/_torch/modules/fused_moe/communication/deep_ep_low_latency.py b/tensorrt_llm/_torch/modules/fused_moe/communication/deep_ep_low_latency.py
index d2c6a8164c..d7e96a656e 100644
--- a/tensorrt_llm/_torch/modules/fused_moe/communication/deep_ep_low_latency.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/communication/deep_ep_low_latency.py
@@ -59,7 +59,9 @@ class DeepEPLowLatency(Communication):
         self.moe_max_num_tokens = moe_max_num_tokens
         self.expert_size_per_partition = expert_size_per_partition
-        self.use_low_precision_combine = use_low_precision_combine
+        self.use_low_precision_combine = (
+            use_low_precision_combine and self.supports_low_precision_combine()
+        )
         # Read from environment variable, same as wideEP
         self.enable_postquant_alltoall = (
             os.environ.get("TRTLLM_MOE_POST_QUANT_ALLTOALLV", "1") == "1"
         )
@@ -96,7 +98,12 @@ class DeepEPLowLatency(Communication):
         """
         if not self.enable_postquant_alltoall:
             return False
+        return self._has_nvfp4() or self._has_fp8_qdq() or self._has_w4afp8()
 
+    def supports_low_precision_combine(self) -> bool:
+        """
+        DeepEP Low Latency supports low-precision combine for: fp8_qdq, nvfp4, w4afp8
+        """
         return self._has_nvfp4() or self._has_fp8_qdq() or self._has_w4afp8()
 
     def is_workload_feasible(self, all_rank_num_tokens: List[int], num_chunks: int) -> bool:
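
Note (not part of the applied patch): a minimal standalone sketch of the behavior change, for review context. The DeepEPLowLatencySketch class and the string-valued quant_mode below are illustrative stand-ins, not the real constructor signature; only the _has_nvfp4/_has_fp8_qdq/_has_w4afp8 names and the gating expression are taken from the diff.

    # Sketch: after this patch, the user-requested low-precision combine flag
    # is honored only when the active quantization scheme supports it.
    class DeepEPLowLatencySketch:

        def __init__(self, quant_mode: str, use_low_precision_combine: bool):
            # Hypothetical stand-in for the module's quantization config.
            self._quant_mode = quant_mode
            # Gating expression as introduced by the patch.
            self.use_low_precision_combine = (
                use_low_precision_combine and self.supports_low_precision_combine()
            )

        def _has_nvfp4(self) -> bool:
            return self._quant_mode == "nvfp4"

        def _has_fp8_qdq(self) -> bool:
            return self._quant_mode == "fp8_qdq"

        def _has_w4afp8(self) -> bool:
            return self._quant_mode == "w4afp8"

        def supports_low_precision_combine(self) -> bool:
            # Same predicate the patch adds: only these three schemes qualify.
            return self._has_nvfp4() or self._has_fp8_qdq() or self._has_w4afp8()

    # An unquantized (e.g. bf16) module now falls back to full-precision
    # combine even when the caller requested low precision; supported
    # quantization modes keep the requested behavior.
    assert not DeepEPLowLatencySketch("bf16", True).use_low_precision_combine
    assert DeepEPLowLatencySketch("nvfp4", True).use_low_precision_combine

Since the check runs in the constructor rather than on the combine path, the fallback is decided once at initialization and the hot path needs no per-call quantization-support branch.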