From 44d7c3b245cfbd7de0fa056dfad40a0911cd7a73 Mon Sep 17 00:00:00 2001 From: "Tao Li @ NVIDIA" Date: Thu, 18 Sep 2025 03:34:05 +0800 Subject: [PATCH] [https://nvbugs/1234567][fix] Revert https://github.com/NVIDIA/TensorRT-LLM/pull/7768/files (#7813) Signed-off-by: Tao Li Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com> --- tensorrt_llm/_torch/models/modeling_llama.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py index 2b98800037..64246736a8 100644 --- a/tensorrt_llm/_torch/models/modeling_llama.py +++ b/tensorrt_llm/_torch/models/modeling_llama.py @@ -11,8 +11,7 @@ from transformers.modeling_utils import load_sharded_checkpoint from transformers.models.llama4.modeling_llama4 import Llama4MultiModalProjector from tensorrt_llm._torch.distributed import (AllReduce, AllReduceFusionOp, - AllReduceParams, AllReduceStrategy, - MoEAllReduce) + AllReduceParams, MoEAllReduce) from tensorrt_llm._torch.models.checkpoints.base_weight_mapper import \ BaseWeightMapper from tensorrt_llm._utils import get_sm_version @@ -650,12 +649,7 @@ class LlamaDecoderLayer(DecoderLayer): eps=config.rms_norm_eps, dtype=config.torch_dtype) - # TODO: This is a temporary fix to disable oneshot kernel for pre-Blackwell arch to avoid perf regressions - self.all_reduce = AllReduce( - strategy=model_config.allreduce_strategy - if get_sm_version() >= 100 else AllReduceStrategy.NCCL, - mapping=model_config.mapping, - ) + self.all_reduce = AllReduce(mapping=model_config.mapping) self.next_layer_layernorm: RMSNorm = None self.next_attn: LlamaAttention = None