From fc2347eaf57bd8f9eace9392be9811dc82e34f13 Mon Sep 17 00:00:00 2001
From: Bo Li <22713281+bobboli@users.noreply.github.com>
Date: Wed, 16 Jul 2025 17:54:36 +0800
Subject: [PATCH] chore: Cleanup disable_fp4_allgather. (#6006)

Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com>
---
 tensorrt_llm/_torch/models/modeling_deepseekv3.py | 2 --
 tensorrt_llm/_torch/models/modeling_qwen3_moe.py  | 7 +------
 .../_torch/modules/fused_moe/fused_moe_cutlass.py | 6 ++----
 tensorrt_llm/_torch/utils.py                      | 8 --------
 4 files changed, 3 insertions(+), 20 deletions(-)

diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
index b92cef4dc5..ac9b85f016 100644
--- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py
+++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
@@ -511,8 +511,6 @@ class Deepseekv3MoE(nn.Module):
         # max-throughput
         use_dp_padding = False
         if self.use_dp and self.mapping.tp_size > 1:
-            # FP4 all_gather moves this bf16 allgather in to after topk and fp4 quantization
-            # to reduce allreduce BW
             if isinstance(self.experts, TRTLLMGenFusedMoE):
                 hidden_states = allgather(hidden_states,
                                           self.mapping,
diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py
index d01bce4ded..5877f3daf5 100644
--- a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py
+++ b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py
@@ -20,7 +20,6 @@ from ..modules.fused_moe import (BaseMoeRoutingMethod, CutlassFusedMoE, MoE,
 from ..modules.linear import TensorParallelMode
 from ..modules.rms_norm import RMSNorm
 from ..speculative import SpecMetadata
-from ..utils import disable_fp4_allgather
 from .modeling_qwen3 import Qwen3Attention
 from .modeling_speculative import SpecDecOneEngineForCausalLM
 from .modeling_utils import (DecoderModel, EagerFusionConfig,
@@ -133,11 +132,7 @@ class Qwen3MoE(nn.Module):
             assert not self.enable_attention_dp
 
         if self.enable_attention_dp and self.mapping.tp_size > 1:
-            # FP4 all_gather moves this bf16 allgather in to after topk and fp4 quantization
-            # to reduce allreduce BW
-            if (disable_fp4_allgather()
-                    and not self.experts.enable_alltoall) or isinstance(
-                        self.experts, TRTLLMGenFusedMoE):
+            if isinstance(self.experts, TRTLLMGenFusedMoE):
                 hidden_states = allgather(hidden_states,
                                           self.mapping,
                                           dim=0,
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py
index c0a2c4fbec..c42d6da267 100755
--- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py
@@ -4,8 +4,7 @@ import torch
 
 from ...distributed import allgather, reducescatter
 from ...model_config import ModelConfig
-from ...utils import (EventType, Fp4QuantizedTensor, ceil_div,
-                      disable_fp4_allgather, swizzle_sf)
+from ...utils import EventType, Fp4QuantizedTensor, ceil_div, swizzle_sf
 from .interface import MoE
 from .quantization import (DeepSeekFP8BlockScalesFusedMoEMethod,
                            FP8QDQFusedMoEMethod, MoEWeightLoadingMode,
@@ -220,8 +219,7 @@ class CutlassFusedMoE(MoE):
         # TODO: remove this once we have correct fusedmoe kernel ready
         token_final_scales = None
 
-        use_allgather = self.use_dp and self.parallel_size > 1 and not disable_fp4_allgather(
-        )
+        use_allgather = self.use_dp and self.parallel_size > 1
 
         # quantize inputs
         use_deepseek_fp8_block_scale = False
diff --git a/tensorrt_llm/_torch/utils.py b/tensorrt_llm/_torch/utils.py
index f687e9d9f5..59cbb214f8 100644
--- a/tensorrt_llm/_torch/utils.py
+++ b/tensorrt_llm/_torch/utils.py
@@ -1,5 +1,4 @@
 import contextlib
-import os
 import threading
 from dataclasses import dataclass
 from enum import Enum
@@ -100,13 +99,6 @@ class Fp4QuantizedTensor:
         return self.fp4_tensor.shape
 
 
-_disable_fp4_allgather = os.getenv("TLLM_DISABLE_FP4_ALLGATHER", "0") == "1"
-
-
-def disable_fp4_allgather():
-    return _disable_fp4_allgather
-
-
 def compute_swizzled_sf_shape(row: int, col: int):
     padded_row = pad_up(row, 128)
     padded_col = pad_up(col, 4)