diff --git a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py
index 2ee8d29ccc..74dd97b3fe 100644
--- a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py
+++ b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py
@@ -1690,8 +1690,7 @@ class AllReduceRunner(TunableRunner):
         **kwargs,
     ) -> List[int]:
         valid_strategies = [
-            # TODO: NCCL_SYMMETRIC will cause hang during tuning process
-            # AllReduceStrategy.NCCL_SYMMETRIC.value,
+            AllReduceStrategy.NCCL_SYMMETRIC.value,
             AllReduceStrategy.NCCL.value,
         ]
         # Fallback in allreduceOp is set to NCCL_SYMMETRIC as default
@@ -1720,7 +1719,7 @@ class AllReduceRunner(TunableRunner):
         input, residual, norm_weight, scale, bias, workspace = inputs
         if tactic == -1:
-            # TODO: Use NCCL instead of NCCL_SYMMETRIC to avoid hanging during tuning process
-            tactic = AllReduceStrategy.NCCL.value
+            # NCCL_SYMMETRIC matches the default fallback set in allreduceOp
+            tactic = AllReduceStrategy.NCCL_SYMMETRIC.value
         return torch.ops.trtllm.allreduce(
             input,