From ec1b5e22e618d342698fda659efdd5918da6bd9f Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Fri, 19 Jun 2020 10:40:33 -0700 Subject: [PATCH] Change all_gather/reduce_scatter algbw to match the documentation. Fix #45 : All_gather and reduce_scatter algorithm bandwidth was computed as count*typesize*(nranks-1)/time which is not consistent with the way we compute it for other collectives. This change makes algbw higher; busbw is unchanged. --- src/all_gather.cu | 4 ++-- src/reduce_scatter.cu | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/all_gather.cu b/src/all_gather.cu index cfb2ec3..f5bc44c 100644 --- a/src/all_gather.cu +++ b/src/all_gather.cu @@ -48,10 +48,10 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc } void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { - double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec; + double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec; *algBw = baseBw; - double factor = 1; + double factor = ((double)(nranks - 1))/((double)nranks); *busBw = baseBw * factor; } diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu index 0b1d986..86e789c 100644 --- a/src/reduce_scatter.cu +++ b/src/reduce_scatter.cu @@ -47,10 +47,10 @@ testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, } void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { - double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec; + double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec; *algBw = baseBw; - double factor = 1; + double factor = ((double)(nranks - 1))/((double)nranks); *busBw = baseBw * factor; }