[https://nvbugs/5814309][fix] Use NCCL as fallback to avoid crash due to insufficient memory (#10928)

Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com>
Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com>
Authored by Yukun He on 2026-01-23 15:15:24 +08:00, committed by Yanchao Lu
parent d31482686c
commit de465efc5f
2 changed files with 7 additions and 7 deletions

Changed file 1 of 2:

@@ -115,7 +115,7 @@ inline AllReduceStrategyType selectStrategyLookUpTable(
         || num_token_index
             >= AllReduceBestStrategyTable.at(sm_version).at(tp_index).at(fusion_op_index).at(hidden_size_index).size())
     {
-        return AllReduceStrategyType::NCCL_SYMMETRIC;
+        return AllReduceStrategyType::NCCL;
     }
     return static_cast<AllReduceStrategyType>(
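
The lookup-table change above is the table half of the fix: any out-of-bounds index now falls back to plain NCCL, which needs no pre-registered symmetric-memory workspace. A minimal sketch of that bounds-checked selection, using a hypothetical flat table and enum rather than the real AllReduceBestStrategyTable layout:

    #include <cstddef>
    #include <vector>

    // Hypothetical stand-ins for the real strategy enum and tuned table.
    enum class AllReduceStrategyType { NCCL, NCCL_SYMMETRIC, MIN_LATENCY };

    AllReduceStrategyType selectFromTable(std::vector<int> const& best_strategy_row, std::size_t num_token_index)
    {
        // Out-of-range lookups drop to plain NCCL instead of a strategy that
        // would require allocating additional symmetric memory.
        if (num_token_index >= best_strategy_row.size())
        {
            return AllReduceStrategyType::NCCL;
        }
        return static_cast<AllReduceStrategyType>(best_strategy_row[num_token_index]);
    }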

Changed file 2 of 2:

@@ -282,7 +282,7 @@ public:
     std::vector<torch::Tensor> run(torch::Tensor const& input, torch::optional<torch::Tensor> const& residual,
         torch::optional<torch::Tensor> const& norm_weight, torch::optional<torch::Tensor> const& scale,
         torch::optional<torch::Tensor> const& bias, bool trigger_completion_at_end,
-        torch::optional<torch::Tensor> workspace) noexcept
+        torch::optional<torch::Tensor> workspace)
     {
         size_t size = input.numel();
         size_t seq_len = input.size(0);
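
The second file also drops noexcept from the op entry points (here and in the two hunks below). The diff itself does not state the rationale, but the commit title suggests it: if an allocation inside the op throws, for example a CUDA out-of-memory error surfaced through ATen, an exception escaping a noexcept function calls std::terminate and crashes the process, whereas without noexcept the exception can propagate to the caller and be handled. A self-contained illustration of that standard C++ behavior:

    #include <iostream>
    #include <stdexcept>

    void mayThrow()
    {
        throw std::runtime_error("insufficient memory");
    }

    void runNoexcept() noexcept
    {
        mayThrow(); // an exception escaping a noexcept function calls std::terminate
    }

    void runThrowing()
    {
        mayThrow(); // the exception propagates to the caller
    }

    int main()
    {
        try
        {
            runThrowing(); // recoverable: the error is reported instead of aborting
        }
        catch (std::exception const& e)
        {
            std::cerr << "caught: " << e.what() << '\n';
        }
        // runNoexcept(); // would terminate the whole process instead of throwing
        return 0;
    }
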
@@ -582,7 +582,7 @@ private:
     std::vector<torch::Tensor> runLowPrecisionAllReduce(torch::Tensor const& input,
         torch::optional<torch::Tensor> const& residual, torch::optional<torch::Tensor> const& norm_weight,
-        torch::optional<torch::Tensor> const& scale, torch::optional<torch::Tensor> const& bias) noexcept
+        torch::optional<torch::Tensor> const& scale, torch::optional<torch::Tensor> const& bias)
     {
 #ifdef ENABLE_FP8
         auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
@@ -650,8 +650,7 @@ private:
     std::vector<torch::Tensor> runFusionAllReduce(torch::Tensor const& input,
         torch::optional<torch::Tensor> const& residual, torch::optional<torch::Tensor> const& norm_weight,
         torch::optional<torch::Tensor> const& scale, torch::optional<torch::Tensor> const& bias,
-        bool trigger_completion_at_end, torch::optional<torch::Tensor> workspace,
-        AllReduceStrategyType strategy) noexcept
+        bool trigger_completion_at_end, torch::optional<torch::Tensor> workspace, AllReduceStrategyType strategy)
     {
         // Should handle only Lamport implementation
         auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
@@ -1224,7 +1223,7 @@ private:
         if (ifFallbackToNCCL(seq_len, message_size_bytes, max_workspace_size))
         {
-            return AllReduceStrategyType::NCCL_SYMMETRIC;
+            return AllReduceStrategyType::NCCL;
         }
         // This rule based heuristic only chooses between NCCL_SYMMETRIC and MIN_LATENCY strategies.
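
For the non-fallback path mentioned in the comment above, the rule-based heuristic still picks between NCCL_SYMMETRIC and MIN_LATENCY. The sketch below only shows the shape of such a choice; the threshold and inputs are placeholders, not the actual rule, which (judging by the lookup table in the first file) also depends on SM version, TP size, fusion op, and hidden size:

    #include <cstddef>

    enum class AllReduceStrategyType { NCCL, NCCL_SYMMETRIC, MIN_LATENCY };

    // Placeholder heuristic: a purely illustrative 1 MiB cutoff between the
    // low-latency fused kernel and the NCCL symmetric-memory path.
    AllReduceStrategyType chooseNonFallbackStrategy(std::size_t message_size_bytes)
    {
        constexpr std::size_t kSmallMessageBytes = 1u << 20;
        return message_size_bytes <= kSmallMessageBytes ? AllReduceStrategyType::MIN_LATENCY
                                                        : AllReduceStrategyType::NCCL_SYMMETRIC;
    }
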
@@ -1250,7 +1249,8 @@ private:
     bool ifFallbackToNCCL(size_t seq_len, size_t message_size_bytes, size_t max_workspace_size)
     {
-        // If messageSize is greater than maxWorkspaceSize or topology is unsuitable, use NCCL_SYMMETRIC fallback.
+        // If messageSize is greater than maxWorkspaceSize or topology is unsuitable, use NCCL fallback.
+        // TODO: Use NCCL_SYMMETRIC once the memory allocation issue is resolved.
         if (message_size_bytes > max_workspace_size || !mIsP2PSupported || !mIsNVLINKSupported)
         {
             return true;
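
Finally, the fallback predicate itself, restated as a standalone sketch with the member flags passed as parameters (the real method also receives seq_len): when the message exceeds the pre-allocated workspace, or P2P/NVLink is unavailable, the op now falls back to plain NCCL, which per the commit title avoids the crash from insufficient memory.

    #include <cstddef>

    // Same condition as ifFallbackToNCCL above, with the object state
    // (mIsP2PSupported, mIsNVLINKSupported) turned into plain parameters.
    bool shouldFallbackToNCCL(std::size_t message_size_bytes, std::size_t max_workspace_size,
        bool p2p_supported, bool nvlink_supported)
    {
        return message_size_bytes > max_workspace_size || !p2p_supported || !nvlink_supported;
    }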