Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-02-05 02:31:33 +08:00
[https://nvbugs/5814309][fix] Use NCCL as fallback to avoid crash due to insufficient memory (#10928)
Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com>
Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com>
parent d31482686c
commit de465efc5f
@@ -115,7 +115,7 @@ inline AllReduceStrategyType selectStrategyLookUpTable(
         || num_token_index
             >= AllReduceBestStrategyTable.at(sm_version).at(tp_index).at(fusion_op_index).at(hidden_size_index).size())
     {
-        return AllReduceStrategyType::NCCL_SYMMETRIC;
+        return AllReduceStrategyType::NCCL;
     }
 
     return static_cast<AllReduceStrategyType>(
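For readers skimming the hunk above: the lookup table only covers tuned buckets, and any token index outside it now falls back to plain NCCL, which needs no pre-allocated symmetric workspace, instead of NCCL_SYMMETRIC. A minimal, self-contained C++ sketch of that bounds-check-then-fallback pattern, using hypothetical names (Strategy, a flat table) rather than the real nested AllReduceBestStrategyTable:

// Sketch only: a single-dimension table standing in for the real
// SM/TP/fusion/hidden-size-indexed strategy table.
#include <cstddef>
#include <iostream>
#include <vector>

enum class Strategy { NCCL, NCCL_SYMMETRIC, MIN_LATENCY };

Strategy selectFromTable(std::vector<Strategy> const& table, std::size_t tokenIndex)
{
    // Out-of-range buckets are not tuned: fall back to plain NCCL.
    if (tokenIndex >= table.size())
    {
        return Strategy::NCCL;
    }
    return table.at(tokenIndex);
}

int main()
{
    std::vector<Strategy> table{Strategy::MIN_LATENCY, Strategy::NCCL_SYMMETRIC};
    std::cout << (selectFromTable(table, 5) == Strategy::NCCL) << "\n"; // prints 1: fallback taken
}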
@@ -282,7 +282,7 @@ public:
     std::vector<torch::Tensor> run(torch::Tensor const& input, torch::optional<torch::Tensor> const& residual,
         torch::optional<torch::Tensor> const& norm_weight, torch::optional<torch::Tensor> const& scale,
         torch::optional<torch::Tensor> const& bias, bool trigger_completion_at_end,
-        torch::optional<torch::Tensor> workspace) noexcept
+        torch::optional<torch::Tensor> workspace)
     {
         size_t size = input.numel();
         size_t seq_len = input.size(0);
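This hunk (and the two that follow) drops noexcept from the op entry points. One plausible reading, not stated in the commit itself, is that allocation or CUDA failures should be allowed to surface as C++ exceptions: a throwing call inside a noexcept function ends in std::terminate rather than reaching the caller. A small standalone illustration of that difference, not taken from the TensorRT-LLM sources:

// Standalone illustration: exceptions escape a plain function but abort a noexcept one.
#include <iostream>
#include <stdexcept>

void mayThrow() { throw std::runtime_error("workspace allocation failed"); }

void runNoexcept() noexcept { mayThrow(); } // a throw here calls std::terminate
void runThrowing() { mayThrow(); }          // the exception propagates to the caller

int main()
{
    try
    {
        runThrowing(); // catchable: prints the message below
    }
    catch (std::exception const& e)
    {
        std::cout << "caught: " << e.what() << "\n";
    }
    // runNoexcept(); // would abort the process instead of being caught
}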
@@ -582,7 +582,7 @@ private:
 
     std::vector<torch::Tensor> runLowPrecisionAllReduce(torch::Tensor const& input,
         torch::optional<torch::Tensor> const& residual, torch::optional<torch::Tensor> const& norm_weight,
-        torch::optional<torch::Tensor> const& scale, torch::optional<torch::Tensor> const& bias) noexcept
+        torch::optional<torch::Tensor> const& scale, torch::optional<torch::Tensor> const& bias)
     {
 #ifdef ENABLE_FP8
         auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
@@ -650,8 +650,7 @@ private:
     std::vector<torch::Tensor> runFusionAllReduce(torch::Tensor const& input,
         torch::optional<torch::Tensor> const& residual, torch::optional<torch::Tensor> const& norm_weight,
         torch::optional<torch::Tensor> const& scale, torch::optional<torch::Tensor> const& bias,
-        bool trigger_completion_at_end, torch::optional<torch::Tensor> workspace,
-        AllReduceStrategyType strategy) noexcept
+        bool trigger_completion_at_end, torch::optional<torch::Tensor> workspace, AllReduceStrategyType strategy)
     {
         // Should handle only Lamport implementation
         auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
@@ -1224,7 +1223,7 @@ private:
 
         if (ifFallbackToNCCL(seq_len, message_size_bytes, max_workspace_size))
        {
-            return AllReduceStrategyType::NCCL_SYMMETRIC;
+            return AllReduceStrategyType::NCCL;
         }
 
         // This rule based heuristic only chooses between NCCL_SYMMETRIC and MIN_LATENCY strategies.
@@ -1250,7 +1249,8 @@ private:
 
     bool ifFallbackToNCCL(size_t seq_len, size_t message_size_bytes, size_t max_workspace_size)
     {
-        // If messageSize is greater than maxWorkspaceSize or topology is unsuitable, use NCCL_SYMMETRIC fallback.
+        // If messageSize is greater than maxWorkspaceSize or topology is unsuitable, use NCCL fallback.
+        // TODO: Use NCCL_SYMMETRIC once the memory allocation issue is resolved.
         if (message_size_bytes > max_workspace_size || !mIsP2PSupported || !mIsNVLINKSupported)
         {
             return true;
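Taken together, the last two hunks leave the fallback condition itself unchanged and only reroute its result to plain NCCL. A self-contained sketch of that predicate under stated assumptions, with the class members mIsP2PSupported / mIsNVLINKSupported replaced by hypothetical plain parameters and an assumed 16 MiB workspace:

// Sketch of the fallback decision; the real check lives in a torch-op class
// and reads member flags instead of parameters.
#include <cstddef>
#include <iostream>

bool shouldFallBackToNccl(std::size_t messageSizeBytes, std::size_t maxWorkspaceSizeBytes,
    bool p2pSupported, bool nvlinkSupported)
{
    // Oversized messages do not fit the pre-allocated workspace, and without
    // P2P or NVLink the custom all-reduce kernels are not applicable either.
    return messageSizeBytes > maxWorkspaceSizeBytes || !p2pSupported || !nvlinkSupported;
}

int main()
{
    std::size_t const maxWorkspace = 16ull << 20; // assumed 16 MiB workspace
    std::cout << shouldFallBackToNccl(32ull << 20, maxWorkspace, true, true) << "\n"; // 1: message too large
    std::cout << shouldFallBackToNccl(1ull << 20, maxWorkspace, true, true) << "\n";  // 0: custom path usable
}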