Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-02-05 02:31:33 +08:00
[https://nvbugs/5814309][fix] Use NCCL as fallback to avoid crash due to insufficient memory (#10928)
Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com>
Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com>
parent d31482686c
commit de465efc5f
@@ -115,7 +115,7 @@ inline AllReduceStrategyType selectStrategyLookUpTable(
         || num_token_index
             >= AllReduceBestStrategyTable.at(sm_version).at(tp_index).at(fusion_op_index).at(hidden_size_index).size())
     {
-        return AllReduceStrategyType::NCCL_SYMMETRIC;
+        return AllReduceStrategyType::NCCL;
     }
 
     return static_cast<AllReduceStrategyType>(
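For readers skimming the hunk above: the lookup table only covers tuned buckets, and any token index outside it now falls back to plain NCCL, which needs no pre-allocated symmetric workspace, instead of NCCL_SYMMETRIC. A minimal, self-contained C++ sketch of that bounds-check-then-fallback pattern, using hypothetical names (Strategy, a flat table) rather than the real nested AllReduceBestStrategyTable:

// Sketch only: a single-dimension table standing in for the real
// SM/TP/fusion/hidden-size-indexed strategy table.
#include <cstddef>
#include <iostream>
#include <vector>

enum class Strategy { NCCL, NCCL_SYMMETRIC, MIN_LATENCY };

Strategy selectFromTable(std::vector<Strategy> const& table, std::size_t tokenIndex)
{
    // Out-of-range buckets are not tuned: fall back to plain NCCL.
    if (tokenIndex >= table.size())
    {
        return Strategy::NCCL;
    }
    return table.at(tokenIndex);
}

int main()
{
    std::vector<Strategy> table{Strategy::MIN_LATENCY, Strategy::NCCL_SYMMETRIC};
    std::cout << (selectFromTable(table, 5) == Strategy::NCCL) << "\n"; // prints 1: fallback taken
}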
@@ -282,7 +282,7 @@ public:
     std::vector<torch::Tensor> run(torch::Tensor const& input, torch::optional<torch::Tensor> const& residual,
         torch::optional<torch::Tensor> const& norm_weight, torch::optional<torch::Tensor> const& scale,
         torch::optional<torch::Tensor> const& bias, bool trigger_completion_at_end,
-        torch::optional<torch::Tensor> workspace) noexcept
+        torch::optional<torch::Tensor> workspace)
     {
         size_t size = input.numel();
         size_t seq_len = input.size(0);
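This hunk (and the two that follow) drops noexcept from the op entry points. One plausible reading, not stated in the commit itself, is that allocation or CUDA failures should be allowed to surface as C++ exceptions: a throwing call inside a noexcept function ends in std::terminate rather than reaching the caller. A small standalone illustration of that difference, not taken from the TensorRT-LLM sources:

// Standalone illustration: exceptions escape a plain function but abort a noexcept one.
#include <iostream>
#include <stdexcept>

void mayThrow() { throw std::runtime_error("workspace allocation failed"); }

void runNoexcept() noexcept { mayThrow(); } // a throw here calls std::terminate
void runThrowing() { mayThrow(); }          // the exception propagates to the caller

int main()
{
    try
    {
        runThrowing(); // catchable: prints the message below
    }
    catch (std::exception const& e)
    {
        std::cout << "caught: " << e.what() << "\n";
    }
    // runNoexcept(); // would abort the process instead of being caught
}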
@@ -582,7 +582,7 @@ private:
 
     std::vector<torch::Tensor> runLowPrecisionAllReduce(torch::Tensor const& input,
         torch::optional<torch::Tensor> const& residual, torch::optional<torch::Tensor> const& norm_weight,
-        torch::optional<torch::Tensor> const& scale, torch::optional<torch::Tensor> const& bias) noexcept
+        torch::optional<torch::Tensor> const& scale, torch::optional<torch::Tensor> const& bias)
     {
 #ifdef ENABLE_FP8
         auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
@@ -650,8 +650,7 @@ private:
     std::vector<torch::Tensor> runFusionAllReduce(torch::Tensor const& input,
         torch::optional<torch::Tensor> const& residual, torch::optional<torch::Tensor> const& norm_weight,
         torch::optional<torch::Tensor> const& scale, torch::optional<torch::Tensor> const& bias,
-        bool trigger_completion_at_end, torch::optional<torch::Tensor> workspace,
-        AllReduceStrategyType strategy) noexcept
+        bool trigger_completion_at_end, torch::optional<torch::Tensor> workspace, AllReduceStrategyType strategy)
     {
         // Should handle only Lamport implementation
         auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
@@ -1224,7 +1223,7 @@ private:
 
         if (ifFallbackToNCCL(seq_len, message_size_bytes, max_workspace_size))
        {
-            return AllReduceStrategyType::NCCL_SYMMETRIC;
+            return AllReduceStrategyType::NCCL;
         }
 
         // This rule based heuristic only chooses between NCCL_SYMMETRIC and MIN_LATENCY strategies.
@@ -1250,7 +1249,8 @@ private:
 
     bool ifFallbackToNCCL(size_t seq_len, size_t message_size_bytes, size_t max_workspace_size)
     {
-        // If messageSize is greater than maxWorkspaceSize or topology is unsuitable, use NCCL_SYMMETRIC fallback.
+        // If messageSize is greater than maxWorkspaceSize or topology is unsuitable, use NCCL fallback.
+        // TODO: Use NCCL_SYMMETRIC once the memory allocation issue is resolved.
         if (message_size_bytes > max_workspace_size || !mIsP2PSupported || !mIsNVLINKSupported)
         {
             return true;
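Taken together, the last two hunks leave the fallback condition itself unchanged and only reroute its result to plain NCCL. A self-contained sketch of that predicate under stated assumptions, with the class members mIsP2PSupported / mIsNVLINKSupported replaced by hypothetical plain parameters and an assumed 16 MiB workspace:

// Sketch of the fallback decision; the real check lives in a torch-op class
// and reads member flags instead of parameters.
#include <cstddef>
#include <iostream>

bool shouldFallBackToNccl(std::size_t messageSizeBytes, std::size_t maxWorkspaceSizeBytes,
    bool p2pSupported, bool nvlinkSupported)
{
    // Oversized messages do not fit the pre-allocated workspace, and without
    // P2P or NVLink the custom all-reduce kernels are not applicable either.
    return messageSizeBytes > maxWorkspaceSizeBytes || !p2pSupported || !nvlinkSupported;
}

int main()
{
    std::size_t const maxWorkspace = 16ull << 20; // assumed 16 MiB workspace
    std::cout << shouldFallBackToNccl(32ull << 20, maxWorkspace, true, true) << "\n"; // 1: message too large
    std::cout << shouldFallBackToNccl(1ull << 20, maxWorkspace, true, true) << "\n";  // 0: custom path usable
}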