From e37545e4911c210558baba789941ea7bf59db00d Mon Sep 17 00:00:00 2001
From: David Addison
Date: Mon, 15 Mar 2021 14:44:06 -0700
Subject: [PATCH 1/7] Add support for new datatype: bfloat16

---
 src/common.cu | 41 +++++++++++++++++++++++++++++++++++++----
 src/common.h  |  3 +++
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index ff4e1fd..4589593 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -12,8 +12,16 @@
 #include "cuda.h"
 
 #if NCCL_MAJOR >= 2
-ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble};
-const char *test_typenames[ncclNumTypes] = {"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"};
+ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble,
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+  ncclBfloat16
+#endif
+};
+const char *test_typenames[ncclNumTypes] = {"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double",
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+  "bfloat16"
+#endif
+};
 #else
 ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64};
 const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"};
@@ -61,6 +69,9 @@ double parsesize(char *value) {
 double DeltaMaxValue(ncclDataType_t type) {
   switch(type) {
     case ncclHalf: return 1e-2;
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+    case ncclBfloat16: return 1e-2;
+#endif
     case ncclFloat: return 1e-5;
     case ncclDouble: return 1e-12;
     case ncclInt:
@@ -95,6 +106,12 @@ template<> __device__
 float toFloat(half a) {
   return __half2float(a);
 }
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+template<> __device__
+float toFloat(__nv_bfloat16 a) {
+  return __bfloat162float(a);
+}
+#endif
 
 template<typename T, int BSIZE> __global__
 void deltaKern(void* A_, void* B_, size_t count, double* max) {
@@ -128,6 +145,10 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
 
 testResult_t CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax) {
   switch (type) {
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+    case ncclBfloat16:
+      deltaKern<__nv_bfloat16, 512><<<1, 512>>>(results, expected, count, devmax); break;
+#endif
     case ncclHalf:
       deltaKern<half, 512><<<1, 512>>>(results, expected, count, devmax); break;
     case ncclFloat:
       deltaKern<float, 512><<<1, 512>>>(results, expected, count, devmax); break;
@@ -174,6 +195,12 @@ template<>
 __device__ half testValue<half>(const size_t offset, const int rep, const int rank) {
   return __float2half(testValue<float>(offset, rep, rank));
 }
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+template<>
+__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, const int rep, const int rank) {
+  return __float2bfloat16(testValue<float>(offset, rep, rank));
+}
+#endif
 
 // Operations
 template<typename T>
@@ -210,7 +237,10 @@ __global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offse
 #define OPS(type) KERN(type, ncclOpSum), KERN(type, ncclOpProd), KERN(type, ncclOpMax), KERN(type, ncclOpMin)
 
 static void* const redInitDataKerns[ncclNumOps*ncclNumTypes] = {
-  OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double)
+  OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double),
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+  OPS(__nv_bfloat16)
+#endif
 };
 
 testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) {
@@ -236,7 +266,10 @@ static void* const initDataKerns[ncclNumTypes] = {
   (void*)InitDataKernel<uint64_t>,
   (void*)InitDataKernel<  half>,
   (void*)InitDataKernel< float>,
-  (void*)InitDataKernel< double>
+  (void*)InitDataKernel< double>,
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+  (void*)InitDataKernel<__nv_bfloat16>,
+#endif
 };
 template<typename T>
diff --git a/src/common.h b/src/common.h
index 865ee25..c869254 100644
--- a/src/common.h
+++ b/src/common.h
@@ -213,6 +213,9 @@ static size_t wordSize(ncclDataType_t type) {
 #endif
       return 1;
     case ncclHalf:
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+    case ncclBfloat16:
+#endif
     //case ncclFloat16:
       return 2;
     case ncclInt:
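
Why the bfloat16 tolerance in DeltaMaxValue matches half's 1e-2: __nv_bfloat16 keeps float's 8-bit exponent but carries only 7 mantissa bits, so one unit in the last place is roughly 2^-8 of the value. The standalone sketch below is not part of the patch — it assumes a CUDA toolkit whose cuda_bf16.h defines __CUDA_BF16_TYPES_EXIST__ and host-callable conversion intrinsics — and shows the round-trip error the test must tolerate:

    #include <cstdio>
    #include <cuda_bf16.h>

    int main() {
    #if defined(__CUDA_BF16_TYPES_EXIST__)
      float x = 3.14159265f;
      // Round-trip through bfloat16: only 7 mantissa bits survive.
      float y = __bfloat162float(__float2bfloat16(x));
      printf("x=%.8f y=%.8f rel.err=%.2e\n", x, y, (x - y) / x);
    #endif
      return 0;
    }

A relative error around a few 1e-3 is expected, which is why the per-element delta bound is 1e-2 rather than float's 1e-5.
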
From cde7e769c1879a77daddebe9da164513e030105b Mon Sep 17 00:00:00 2001
From: David Addison
Date: Thu, 17 Jun 2021 14:08:43 -0700
Subject: [PATCH 2/7] Add support for ncclAvg operation

---
 src/all_gather.cu     |  2 +-
 src/all_reduce.cu     |  4 +-
 src/alltoall.cu       |  2 +-
 src/broadcast.cu      |  2 +-
 src/common.cu         | 92 ++++++++++++++++++++++++++++++++++++-------
 src/common.h          |  5 ++-
 src/reduce.cu         |  4 +-
 src/reduce_scatter.cu |  4 +-
 8 files changed, 90 insertions(+), 25 deletions(-)

diff --git a/src/all_gather.cu b/src/all_gather.cu
index f5bc44c..ee1d0ea 100644
--- a/src/all_gather.cu
+++ b/src/all_gather.cu
@@ -84,7 +84,7 @@ testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t
     run_types = &type;
     run_typenames = &typeName;
   } else {
-    type_count = ncclNumTypes;
+    type_count = test_typenum;
     run_types = test_types;
     run_typenames = test_typenames;
   }
diff --git a/src/all_reduce.cu b/src/all_reduce.cu
index bd8daaf..52dce89 100644
--- a/src/all_reduce.cu
+++ b/src/all_reduce.cu
@@ -83,7 +83,7 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t
     run_types = &type;
     run_typenames = &typeName;
   } else {
-    type_count = ncclNumTypes;
+    type_count = test_typenum;
     run_types = test_types;
     run_typenames = test_typenames;
   }
@@ -93,7 +93,7 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t
     run_ops = &op;
     run_opnames = &opName;
   } else {
-    op_count = ncclNumOps;
+    op_count = test_opnum;
     run_ops = test_ops;
     run_opnames = test_opnames;
   }
diff --git a/src/alltoall.cu b/src/alltoall.cu
index 31cfca0..4afd3eb 100644
--- a/src/alltoall.cu
+++ b/src/alltoall.cu
@@ -102,7 +102,7 @@ testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t t
     run_types = &type;
     run_typenames = &typeName;
   } else {
-    type_count = ncclNumTypes;
+    type_count = test_typenum;
     run_types = test_types;
     run_typenames = test_typenames;
   }
diff --git a/src/broadcast.cu b/src/broadcast.cu
index c62a99f..f7c0094 100644
--- a/src/broadcast.cu
+++ b/src/broadcast.cu
@@ -92,7 +92,7 @@ testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t
     run_types = &type;
     run_typenames = &typeName;
   } else {
-    type_count = ncclNumTypes;
+    type_count = test_typenum;
     run_types = test_types;
     run_typenames = test_typenames;
   }
diff --git a/src/common.cu b/src/common.cu
index 4589593..1313079 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -11,23 +11,41 @@
 #include <libgen.h>
 #include "cuda.h"
 
+int test_ncclVersion = 0; // init'd with ncclGetVersion()
+
 #if NCCL_MAJOR >= 2
 ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble,
-#if defined(__CUDA_BF16_TYPES_EXIST__)
+#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
   ncclBfloat16
 #endif
 };
 const char *test_typenames[ncclNumTypes] = {"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double",
-#if defined(__CUDA_BF16_TYPES_EXIST__)
+#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
   "bfloat16"
 #endif
 };
+
+#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+int test_typenum = 10;
+#else
+int test_typenum = 9;
+#endif
+
 #else
 ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64};
 const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"};
+int test_typenum = 7;
 #endif
 
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+ncclRedOp_t test_ops[ncclNumOps] = {ncclSum, ncclProd, ncclMax, ncclMin, ncclAvg};
+const char *test_opnames[ncclNumOps] = {"sum", "prod", "max", "min", "avg"};
+int test_opnum = 5;
+#else
 ncclRedOp_t test_ops[ncclNumOps] = {ncclSum, ncclProd, ncclMax, ncclMin};
 const char *test_opnames[ncclNumOps] = {"sum", "prod", "max", "min"};
+int test_opnum = 4;
+#endif
 
 thread_local int is_main_thread = 0;
@@ -126,7 +144,7 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
       if( delta > locmax ) {
         locmax = delta;
 #ifdef DEBUG_PRINT
-        if (delta > .1) printf("Error at %d/%ld : %f != %f\n", i, count, toFloat(A[i]), toFloat(B[i]));
+        if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i]));
 #endif
       }
     }
@@ -222,23 +240,48 @@ __device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(
 template<>
 __device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? a : b; }
 
-template<typename T, T (*Op)(T, T)>
+template<typename T>
+__device__ T ncclPostOpIdent(T x, int n) { return x; }
+
+template<typename T>
+__device__ T ncclPostOpDiv(T x, int n) { return x/n; }
+template<>
+__device__ half ncclPostOpDiv(half x, int n) { return __float2half(__half2float(x)/n); }
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+template<>
+__device__ __nv_bfloat16 ncclPostOpDiv<__nv_bfloat16>(__nv_bfloat16 x, int n) { return __float2bfloat16(__bfloat162float(x)/n); }
+#endif
+
+template<typename T, T (*Op)(T, T), T (*PostOp)(T, int)>
 __global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) {
   for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o<N; o+=gridDim.x*blockDim.x) {
     T val = testValue<T>(o+offset, rep, 0);
     for (int i=1; i<nranks; i++) {
       val = Op(val, testValue<T>(o+offset, rep, i));
     }
-    data[o] = val;
+    data[o] = PostOp(val, nranks);
   }
 }
 
-#define KERN(type, op) (void*)InitDataReduceKernel<type, op<type> >
+#define KERN(type, op, postop) (void*)InitDataReduceKernel<type, op<type>, postop<type> >
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+  #define OPS(type) \
+    KERN(type, ncclOpSum, ncclPostOpIdent), \
+    KERN(type, ncclOpProd, ncclPostOpIdent), \
+    KERN(type, ncclOpMax, ncclPostOpIdent), \
+    KERN(type, ncclOpMin, ncclPostOpIdent), \
+    KERN(type, ncclOpSum/*Avg*/, ncclPostOpDiv)
+#else
+  #define OPS(type) \
+    KERN(type, ncclOpSum, ncclPostOpIdent), \
+    KERN(type, ncclOpProd, ncclPostOpIdent), \
+    KERN(type, ncclOpMax, ncclPostOpIdent), \
+    KERN(type, ncclOpMin, ncclPostOpIdent)
+#endif
 
 static void* const redInitDataKerns[ncclNumOps*ncclNumTypes] = {
   OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double),
-#if defined(__CUDA_BF16_TYPES_EXIST__)
+#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
   OPS(__nv_bfloat16)
 #endif
 };
@@ -267,7 +310,7 @@ static void* const initDataKerns[ncclNumTypes] = {
   (void*)InitDataKernel<uint64_t>,
   (void*)InitDataKernel<  half>,
   (void*)InitDataKernel< float>,
   (void*)InitDataKernel< double>,
-#if defined(__CUDA_BF16_TYPES_EXIST__)
+#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
   (void*)InitDataKernel<__nv_bfloat16>,
 #endif
 };
@@ -367,7 +410,7 @@ testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t*
     if (cudaErr != cudaErrorNotReady) CUDACHECK(cudaErr);
 
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
-    if (comms) {
+    if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) {
       ncclResult_t ncclAsyncErr;
       NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr));
       if (ncclAsyncErr != ncclSuccess) {
@@ -602,6 +645,17 @@ int main(int argc, char* argv[]) {
   // Make sure every line is flushed so that we see the progress of the test
   setlinebuf(stdout);
 
+  #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
+  ncclGetVersion(&test_ncclVersion);
+  #else
+  test_ncclVersion = NCCL_VERSION_CODE;
+  #endif
+  //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion);
+  if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion < NCCL_VERSION(2,10,0)) {
+    test_opnum -= 1;  // exclude ncclAvg
+    test_typenum -= 1; // exclude bfloat16
+  }
+
   // Parse args
   int longindex;
   static struct option longopts[] = {
@@ -653,7 +707,7 @@ int main(int argc, char* argv[]) {
         iters = (int)strtol(optarg, NULL, 0);
         break;
       case 'm':
-#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2
+#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2)
         agg_iters = (int)strtol(optarg, NULL, 0);
 #else
         printf("Option -m not supported before NCCL 2.2. Ignoring\n");
@@ -693,7 +747,11 @@ int main(int argc, char* argv[]) {
             "[-w,--warmup_iters <warmup iteration count>] \n\t"
             "[-p,--parallel_init <0/1>] \n\t"
            "[-c,--check <0/1>] \n\t"
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+            "[-o,--op <sum/prod/min/max/avg/all>] \n\t"
+#else
            "[-o,--op <sum/prod/min/max/all>] \n\t"
+#endif
            "[-d,--datatype <nccltype/all>] \n\t"
            "[-r,--root <root>] \n\t"
            "[-z,--blocking <0/1>] \n\t"
            "[-h,--help]\n",
            basename(argv[0]));
        return 0;
@@ -701,8 +759,8 @@ int main(int argc, char* argv[]) {
       default:
-        printf("invalid option \n");
-        printf("USAGE: %s \n\t"
+        if (c != 'h') printf("invalid option '%c'\n", c);
+        printf("USAGE: %s \n\t"
            "[-t,--nthreads <num threads>] \n\t"
            "[-g,--ngpus <gpus per thread>] \n\t"
            "[-b,--minbytes <min size in bytes>] \n\t"
            "[-e,--maxbytes <max size in bytes>] \n\t"
            "[-i,--stepbytes <increment size>] \n\t"
            "[-f,--stepfactor <increment factor>] \n\t"
            "[-n,--iters <iteration count>] \n\t"
            "[-m,--agg_iters <aggregated iteration count>] \n\t"
            "[-w,--warmup_iters <warmup iteration count>] \n\t"
            "[-p,--parallel_init <0/1>] \n\t"
            "[-c,--check <0/1>] \n\t"
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+            "[-o,--op <sum/prod/min/max/avg/all>] \n\t"
+#else
            "[-o,--op <sum/prod/min/max/all>] \n\t"
+#endif
            "[-d,--datatype <nccltype/all>] \n\t"
            "[-r,--root <root>] \n\t"
            "[-z,--blocking <0/1>] \n\t"
            "[-h,--help]\n",
            basename(argv[0]));
        return 0;
@@ -899,8 +961,8 @@ testResult_t run() {
   // Free off CUDA allocated memory
   for (int i=0; i
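
The ncclAvg verification above deliberately avoids a dedicated reduction kernel: the expected buffer is computed with the existing ncclOpSum accumulator and a post-op divide by nranks (KERN(type, ncclOpSum/*Avg*/, ncclPostOpDiv)), since an average is just a sum followed by a uniform scale. A host-side restatement of that decomposition, with hypothetical helper names (the test's real kernels are the InitDataReduceKernel family), runnable as plain C++:

    #include <cstdio>

    template <typename T> T opSum(T a, T b) { return a + b; }
    template <typename T> T postIdent(T x, int) { return x; }
    template <typename T> T postDiv(T x, int n) { return x / n; }

    // Mirror of InitDataReduceKernel's shape: fold Op across ranks,
    // then apply PostOp exactly once with nranks.
    template <typename T, T (*Op)(T, T), T (*PostOp)(T, int)>
    T reduceRanks(const T* v, int nranks) {
      T acc = v[0];
      for (int i = 1; i < nranks; i++) acc = Op(acc, v[i]);
      return PostOp(acc, nranks);
    }

    int main() {
      float v[4] = {1.f, 2.f, 3.f, 4.f};
      printf("sum=%g avg=%g\n",
             reduceRanks<float, opSum<float>, postIdent<float>>(v, 4),  // 10
             reduceRanks<float, opSum<float>, postDiv<float>>(v, 4));   // 2.5
      return 0;
    }

The half and bfloat16 specializations of ncclPostOpDiv route through float because those types have no native division operator on all toolchains.
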
"error"); - PRINT("# %10s %12s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", + PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); } void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %6s", size, count, typeName); + PRINT("%12li %12li %8s", size, count, typeName); } void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { diff --git a/src/all_reduce.cu b/src/all_reduce.cu index 52dce89..9b6b7f0 100644 --- a/src/all_reduce.cu +++ b/src/all_reduce.cu @@ -8,15 +8,15 @@ #include "common.h" void print_header() { - PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); } void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %6s %6s", size, count, typeName, opName); + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); } void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { diff --git a/src/alltoall.cu b/src/alltoall.cu index 4afd3eb..8650997 100644 --- a/src/alltoall.cu +++ b/src/alltoall.cu @@ -8,15 +8,15 @@ #include "common.h" void print_header() { - PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); } void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %6s %6s", size, count, typeName, opName); + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); } void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { diff --git a/src/broadcast.cu b/src/broadcast.cu index f7c0094..e2b4421 100644 --- a/src/broadcast.cu +++ b/src/broadcast.cu @@ -8,15 +8,15 @@ #include "common.h" void print_header() { - PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root", + PRINT("# %10s %12s %8s %6s out-of-place in-place 
\n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root", "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); } void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %6s %6i", size, count, typeName, root); + PRINT("%12li %12li %8s %6i", size, count, typeName, root); } void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { diff --git a/src/reduce.cu b/src/reduce.cu index e40b501..2787688 100644 --- a/src/reduce.cu +++ b/src/reduce.cu @@ -8,15 +8,15 @@ #include "common.h" void print_header() { - PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %6s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "root", + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "root", "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %6s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "", + PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "", "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); } void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %6s %6s %6i", size, count, typeName, opName, root); + PRINT("%12li %12li %8s %6s %6i", size, count, typeName, opName, root); } void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu index c6de434..b0c4fab 100644 --- a/src/reduce_scatter.cu +++ b/src/reduce_scatter.cu @@ -8,15 +8,15 @@ #include "common.h" void print_header() { - PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); } void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %6s %6s", size, count, typeName, opName); + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); } void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { From e55ad3796d710adcf72778dca02559dc6c9706bb Mon Sep 17 00:00:00 2001 From: David Addison Date: Mon, 28 Jun 2021 
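
The width bump from %6s to %8s exists because printf field widths are minimums, not limits: "bfloat16" is eight characters, so it overflowed the old six-character type column and pushed every later column out of alignment. A two-line demonstration (illustrative only, not from the patch):

    #include <cstdio>

    int main() {
      // %6s cannot truncate: the 8-char "bfloat16" overflows the column.
      printf("[%6s][%6s]\n", "half", "bfloat16");   // columns shift
      printf("[%8s][%8s]\n", "half", "bfloat16");   // columns stay aligned
      return 0;
    }
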
From e55ad3796d710adcf72778dca02559dc6c9706bb Mon Sep 17 00:00:00 2001
From: David Addison
Date: Mon, 28 Jun 2021 14:19:45 -0700
Subject: [PATCH 4/7] Added support for CUDA graph capture/replay (-G)

---
 src/common.cu | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 81 insertions(+), 1 deletion(-)

diff --git a/src/common.cu b/src/common.cu
index 1313079..c180294 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -65,6 +65,7 @@ static int nccltype = ncclFloat;
 static int ncclroot = 0;
 static int parallel_init = 0;
 static int blocking_coll = 0;
+static int cudaGraphLaunches = 0;
 
 double parsesize(char *value) {
   long long int units;
@@ -481,6 +482,15 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 
   Barrier(args);
 
+  cudaGraph_t graphs[args->nGpus];
+  cudaGraphExec_t graphExec[args->nGpus];
+  if (cudaGraphLaunches >= 1) {
+    // Begin cuda graph capture
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal));
+    }
+  }
+
   // Performance Benchmark
   auto start = std::chrono::high_resolution_clock::now();
   for (int iter = 0; iter < iters; iter++) {
@@ -490,11 +500,40 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     }
     if (agg_iters>1) NCCLCHECK(ncclGroupEnd());
   }
+
+  if (cudaGraphLaunches >= 1) {
+    // End cuda graph capture
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
+    }
+    // Instantiate cuda graph
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
+    }
+    // Resync CPU, restart timing, launch cuda graph
+    Barrier(args);
+    start = std::chrono::high_resolution_clock::now();
+    for (int l=0; l<cudaGraphLaunches; l++) {
+      for (int i=0; i<args->nGpus; i++) {
+        CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
+      }
+    }
+  }
+
   TESTCHECK(completeColl(args));
 
   auto delta = std::chrono::high_resolution_clock::now() - start;
   double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
   deltaSec = deltaSec/(iters*agg_iters);
+  if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches;
+
+  if (cudaGraphLaunches >= 1) {
+    //destroy cuda graph
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaGraphExecDestroy(graphExec[i]));
+      CUDACHECK(cudaGraphDestroy(graphs[i]));
+    }
+  }
 
   double algBw, busBw;
   args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus);
@@ -508,10 +547,41 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   // Initialize sendbuffs, recvbuffs and expected
   TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place));
 
+  if (cudaGraphLaunches >= 1) {
+    // Begin cuda graph capture for data check
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal));
+    }
+  }
+
   //test validation in single iteration, should ideally be included into the multi-iteration run
   TESTCHECK(startColl(args, type, op, root, in_place, 0));
+
+  if (cudaGraphLaunches >= 1) {
+    // End cuda graph capture
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
+    }
+    // Instantiate cuda graph
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
+    }
+    // Launch cuda graph
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
+    }
+  }
+
   TESTCHECK(completeColl(args));
 
+  if (cudaGraphLaunches >= 1) {
+    //destroy cuda graph
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaGraphExecDestroy(graphExec[i]));
+      CUDACHECK(cudaGraphDestroy(graphs[i]));
+    }
+  }
+
   TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta));
 
   //aggregate delta from all threads and procs
@@ -674,12 +744,13 @@ int main(int argc, char* argv[]) {
     {"datatype", required_argument, 0, 'd'},
     {"root", required_argument, 0, 'r'},
     {"blocking", required_argument, 0, 'z'},
+    {"cudagraph", required_argument, 0, 'G'},
     {"help", no_argument, 0, 'h'}
   };
 
   while(1) {
     int c;
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:h", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:", longopts, &longindex);
 
     if (c == -1)
       break;
@@ -734,6 +805,13 @@ int main(int argc, char* argv[]) {
       case 'z':
         blocking_coll = strtol(optarg, NULL, 0);
         break;
+      case 'G':
+#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030
+        cudaGraphLaunches = strtol(optarg, NULL, 0);
+#else
+        printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n");
+#endif
+        break;
       case 'h':
        printf("USAGE: %s \n\t"
            "[-t,--nthreads <num threads>] \n\t"
            "[-g,--ngpus <gpus per thread>] \n\t"
            "[-b,--minbytes <min size in bytes>] \n\t"
            "[-e,--maxbytes <max size in bytes>] \n\t"
            "[-i,--stepbytes <increment size>] \n\t"
            "[-f,--stepfactor <increment factor>] \n\t"
            "[-n,--iters <iteration count>] \n\t"
            "[-m,--agg_iters <aggregated iteration count>] \n\t"
            "[-w,--warmup_iters <warmup iteration count>] \n\t"
            "[-p,--parallel_init <0/1>] \n\t"
            "[-c,--check <0/1>] \n\t"
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
            "[-o,--op <sum/prod/min/max/avg/all>] \n\t"
#else
            "[-o,--op <sum/prod/min/max/all>] \n\t"
#endif
            "[-d,--datatype <nccltype/all>] \n\t"
            "[-r,--root <root>] \n\t"
            "[-z,--blocking <0/1>] \n\t"
+            "[-G,--cudagraph <num graph launches>] \n\t"
            "[-h,--help]\n",
            basename(argv[0]));
        return 0;
       default:
        if (c != 'h') printf("invalid option '%c'\n", c);
        printf("USAGE: %s \n\t"
            "[-t,--nthreads <num threads>] \n\t"
            "[-g,--ngpus <gpus per thread>] \n\t"
            "[-b,--minbytes <min size in bytes>] \n\t"
            "[-e,--maxbytes <max size in bytes>] \n\t"
            "[-i,--stepbytes <increment size>] \n\t"
            "[-f,--stepfactor <increment factor>] \n\t"
            "[-n,--iters <iteration count>] \n\t"
            "[-m,--agg_iters <aggregated iteration count>] \n\t"
            "[-w,--warmup_iters <warmup iteration count>] \n\t"
            "[-p,--parallel_init <0/1>] \n\t"
            "[-c,--check <0/1>] \n\t"
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
            "[-o,--op <sum/prod/min/max/avg/all>] \n\t"
#else
            "[-o,--op <sum/prod/min/max/all>] \n\t"
#endif
            "[-d,--datatype <nccltype/all>] \n\t"
            "[-r,--root <root>] \n\t"
            "[-z,--blocking <0/1>] \n\t"
+            "[-G,--cudagraph <num graph launches>] \n\t"
            "[-h,--help]\n",
            basename(argv[0]));
        return 0;
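
The -G path above follows CUDA's standard capture/replay sequence: record the work enqueued on a stream into a cudaGraph_t, instantiate it once into a cudaGraphExec_t, then relaunch the executable graph with far lower per-launch overhead. A minimal self-contained version of that sequence — a trivial kernel stands in for the NCCL collective, and error checking is trimmed for brevity:

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void inc(int* x) { ++*x; }

    int main() {
      int* x;
      cudaMallocManaged(&x, sizeof(int));
      *x = 0;
      cudaStream_t s;
      cudaStreamCreate(&s);

      cudaGraph_t graph;
      cudaGraphExec_t exec;
      cudaStreamBeginCapture(s, cudaStreamCaptureModeGlobal);
      inc<<<1, 1, 0, s>>>(x);                 // stand-in for the collective
      cudaStreamEndCapture(s, &graph);
      cudaGraphInstantiate(&exec, graph, NULL, NULL, 0);

      for (int l = 0; l < 10; l++)            // replay without re-recording
        cudaGraphLaunch(exec, s);
      cudaStreamSynchronize(s);
      printf("x=%d\n", *x);                   // prints x=10

      cudaGraphExecDestroy(exec);
      cudaGraphDestroy(graph);
      cudaStreamDestroy(s);
      cudaFree(x);
      return 0;
    }

Note why the patch also divides deltaSec by cudaGraphLaunches: each graph launch replays the entire captured iters*agg_iters loop, so the total timed work is iters*agg_iters*cudaGraphLaunches collectives.
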
From 9dae3d3a37a7505a9eb0622be4268e2d2a3cb5f9 Mon Sep 17 00:00:00 2001
From: David Addison
Date: Mon, 28 Jun 2021 16:49:10 -0700
Subject: [PATCH 5/7] Added new tests: scatter, sendrecv, hypercube

---
 src/Makefile     |   4 +-
 src/hypercube.cu | 124 +++++++++++++++++++++++++++++++++++++++++++++
 src/scatter.cu   | 125 ++++++++++++++++++++++++++++++++++++++++++++++
 src/sendrecv.cu  | 127 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 378 insertions(+), 2 deletions(-)
 create mode 100644 src/hypercube.cu
 create mode 100644 src/scatter.cu
 create mode 100644 src/sendrecv.cu

diff --git a/src/Makefile b/src/Makefile
index 52169bb..26e653e 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -70,7 +70,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%)
 DST_DIR := $(BUILDDIR)
 SRC_FILES := $(wildcard *.cu)
 OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
-BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall
+BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter sendrecv hypercube
 BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
 
 build: ${BIN_FILES}

diff --git a/src/hypercube.cu b/src/hypercube.cu
new file mode 100644
index 0000000..142f1a6
--- /dev/null
+++ b/src/hypercube.cu
@@ -0,0 +1,124 @@
+/*************************************************************************
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+
+#define ALIGN 4
+
+void print_header() {
+  PRINT("# %10s %12s %8s            out-of-place                       in-place          \n", "", "", "");
+  PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type",
+      "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
+  PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "",
+      "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
+}
+
+void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
+  PRINT("%12li %12li %8s", size, count, typeName);
+}
+
+void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+  size_t base = (count/(ALIGN*nranks))*ALIGN;
+  *sendcount = base;
+  *recvcount = base*nranks;
+  *sendInplaceOffset = base;
+  *recvInplaceOffset = 0;
+  *paramcount = base;
+}
+
+testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
+  int nranks = args->nProcs*args->nThreads*args->nGpus;
+
+  for (int i=0; i<args->nGpus; i++) {
+    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    CUDACHECK(cudaSetDevice(gpuid));
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
+    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    for (int j=0; j<nranks; j++) {
+      TESTCHECK(InitData(((char*)args->expected[i])+args->sendBytes*j, sendcount, type, rep, j));
+    }
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+  return testSuccess;
+}
+
+void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec;
+
+  *algBw = baseBw;
+  double factor = 1;
+  *busBw = baseBw * factor;
+}
+
+testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+  char* sbuff = (char*)sendbuff;
+  char* rbuff = (char*)recvbuff;
+  int nRanks;
+  NCCLCHECK(ncclCommCount(comm, &nRanks));
+  int rank;
+  NCCLCHECK(ncclCommUserRank(comm, &rank));
+  size_t rankSize = count * wordSize(type);
+
+  if (rbuff+rank*rankSize != sbuff) CUDACHECK(cudaMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, cudaMemcpyDeviceToDevice, stream));
+
+  // Hypercube AllGather
+  for (int mask=1; mask<nRanks; mask<<=1) {
+    int peer = rank ^ mask;
+    NCCLCHECK(ncclGroupStart());
+    NCCLCHECK(ncclSend(rbuff+((rank/mask)*mask)*rankSize, mask*count, type, peer, comm, stream));
+    NCCLCHECK(ncclRecv(rbuff+((peer/mask)*mask)*rankSize, mask*count, type, peer, comm, stream));
+    NCCLCHECK(ncclGroupEnd());
+  }
+  return testSuccess;
+}
+
+struct testColl hyperCubeTest = {
+  "HyperCube",
+  HyperCubeGetCollByteCount,
+  HyperCubeInitData,
+  HyperCubeGetBw,
+  HyperCubeRunColl
+};
+
+void HyperCubeGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
+  HyperCubeGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+}
+
+testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &hyperCubeTest;
+  ncclDataType_t *run_types;
+  const char **run_typenames;
+  int type_count;
+
+  if ((int)type != -1) {
+    type_count = 1;
+    run_types = &type;
+    run_typenames = &typeName;
+  } else {
+    type_count = test_typenum;
+    run_types = test_types;
+    run_typenames = test_typenames;
+  }
+
+  for (int i=0; i<type_count; i++) {
+    TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
+  }
+  return testSuccess;
+}
+
+struct testEngine hyperCubeEngine = {
+  HyperCubeGetBuffSize,
+  HyperCubeRunTest
+};
+
+#pragma weak ncclTestEngine=hyperCubeEngine

diff --git a/src/scatter.cu b/src/scatter.cu
new file mode 100644
--- /dev/null
+++ b/src/scatter.cu
@@ -0,0 +1,125 @@
+/*************************************************************************
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+
+void print_header() {
+  PRINT("# %10s %12s %8s %6s            out-of-place                       in-place          \n", "", "", "", "");
+  PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root",
+      "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
+  PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
+      "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
+}
+
+void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
+  PRINT("%12li %12li %8s %6i", size, count, typeName, root);
+}
+
+void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+  size_t base = count/nranks;
+  *sendcount = base*nranks;
+  *recvcount = base;
+  *sendInplaceOffset = 0;
+  *recvInplaceOffset = base;
+  *paramcount = base;
+}
+
+testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
+
+  for (int i=0; i<args->nGpus; i++) {
+    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    CUDACHECK(cudaSetDevice(gpuid));
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
+    if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    TESTCHECK(InitData(args->expected[i], recvcount, type, rep+rank*recvcount, root));
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+  return testSuccess;
+}
+
+void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec;
+
+  *algBw = baseBw;
+  double factor = ((double)(nranks-1))/((double)(nranks));
+  *busBw = baseBw * factor;
+}
+
+testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+  int nRanks;
+  NCCLCHECK(ncclCommCount(comm, &nRanks));
+  int rank;
+  NCCLCHECK(ncclCommUserRank(comm, &rank));
+  size_t rankOffset = count * wordSize(type);
+  if (count == 0) return testSuccess;
+
+  NCCLCHECK(ncclGroupStart());
+  if (rank == root) {
+    for (int r=0; r<nRanks; r++)
+      NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, count, type, r, comm, stream));
+  }
+  NCCLCHECK(ncclRecv(recvbuff, count, type, root, comm, stream));
+  NCCLCHECK(ncclGroupEnd());
+  return testSuccess;
+}
+
+struct testColl scatterTest = {
+  "Scatter",
+  ScatterGetCollByteCount,
+  ScatterInitData,
+  ScatterGetBw,
+  ScatterRunColl
+};
+
+void ScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
+  ScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+}
+
+testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &scatterTest;
+  ncclDataType_t *run_types;
+  const char **run_typenames;
+  int type_count;
+  int begin_root, end_root;
+
+  if ((int)type != -1) {
+    type_count = 1;
+    run_types = &type;
+    run_typenames = &typeName;
+  } else {
+    type_count = test_typenum;
+    run_types = test_types;
+    run_typenames = test_typenames;
+  }
+
+  if (root != -1) {
+    begin_root = end_root = root;
+  } else {
+    begin_root = 0;
+    end_root = args->nProcs*args->nThreads*args->nGpus-1;
+  }
+
+  for (int i=0; i<type_count; i++) {
+    for (int j=begin_root; j<=end_root; j++) {
+      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", j));
+    }
+  }
+  return testSuccess;
+}
+
+struct testEngine scatterEngine = {
+  ScatterGetBuffSize,
+  ScatterRunTest
+};
+
+#pragma weak ncclTestEngine=scatterEngine

diff --git a/src/sendrecv.cu b/src/sendrecv.cu
new file mode 100644
--- /dev/null
+++ b/src/sendrecv.cu
@@ -0,0 +1,127 @@
+/*************************************************************************
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+
+void print_header() {
+  PRINT("# %10s %12s %8s %6s            out-of-place                       in-place          \n", "", "", "", "");
+  PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
+      "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
+  PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
+      "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
+}
+
+void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
+  PRINT("%12li %12li %8s %6s", size, count, typeName, opName);
+}
+
+void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+  *sendcount = count;
+  *recvcount = count;
+  *sendInplaceOffset = 0;
+  *recvInplaceOffset = 0;
+  *paramcount = count;
+}
+
+testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
+  int nranks = args->nProcs*args->nThreads*args->nGpus;
+
+  for (int i=0; i<args->nGpus; i++) {
+    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    CUDACHECK(cudaSetDevice(gpuid));
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
+    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    int peer = (rank-1+nranks)%nranks;
+    TESTCHECK(InitData(args->expected[i], recvcount, type, rep, peer));
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+  // We don't support in-place sendrecv
+  args->reportErrors = in_place ? 0 : 1;
+  return testSuccess;
+}
+
+void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  double baseBw = (double)(count * typesize) / 1.0E9 / sec;
+
+  *algBw = baseBw;
+  double factor = 1;
+  *busBw = baseBw * factor;
+}
+
+testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+  int nRanks;
+  NCCLCHECK(ncclCommCount(comm, &nRanks));
+  int rank;
+  NCCLCHECK(ncclCommUserRank(comm, &rank));
+  int recvPeer = (rank-1+nRanks) % nRanks;
+  int sendPeer = (rank+1) % nRanks;
+
+  NCCLCHECK(ncclGroupStart());
+  NCCLCHECK(ncclSend(sendbuff, count, type, sendPeer, comm, stream));
+  NCCLCHECK(ncclRecv(recvbuff, count, type, recvPeer, comm, stream));
+  NCCLCHECK(ncclGroupEnd());
+  return testSuccess;
+}
+
+struct testColl sendRecvTest = {
+  "SendRecv",
+  SendRecvGetCollByteCount,
+  SendRecvInitData,
+  SendRecvGetBw,
+  SendRecvRunColl
+};
+
+void SendRecvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
+  SendRecvGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+}
+
+testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &sendRecvTest;
+  ncclDataType_t *run_types;
+  ncclRedOp_t *run_ops;
+  const char **run_typenames, **run_opnames;
+  int type_count, op_count;
+
+  if ((int)type != -1) {
+    type_count = 1;
+    run_types = &type;
+    run_typenames = &typeName;
+  } else {
+    type_count = test_typenum;
+    run_types = test_types;
+    run_typenames = test_typenames;
+  }
+
+  if ((int)op != -1) {
+    op_count = 1;
+    run_ops = &op;
+    run_opnames = &opName;
+  } else {
+    op_count = test_opnum;
+    run_ops = test_ops;
+    run_opnames = test_opnames;
+  }
+
+  for (int i=0; i<type_count; i++) {
+    for (int j=0; j<op_count; j++) {
+      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
+    }
+  }
+  return testSuccess;
+}
+
+struct testEngine sendRecvEngine = {
+  SendRecvGetBuffSize,
+  SendRecvRunTest
+};
+
+#pragma weak ncclTestEngine=sendRecvEngine
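
HyperCubeRunColl performs an allgather by recursive doubling: assuming the standard hypercube schedule, at step mask = 1, 2, 4, ... each rank exchanges its accumulated mask-sized block with partner rank ^ mask, so nRanks must be a power of two and the exchange completes in log2(nRanks) steps. This little table generator (illustrative, host-only) prints the peer and block schedule that such a loop walks:

    #include <cstdio>

    int main() {
      const int nranks = 8;  // hypercube pattern requires a power of two
      for (int mask = 1; mask < nranks; mask <<= 1)
        for (int rank = 0; rank < nranks; rank++)
          printf("step mask=%d: rank %d <-> rank %d (owns blocks [%d..%d])\n",
                 mask, rank, rank ^ mask,
                 (rank / mask) * mask, (rank / mask) * mask + mask - 1);
      return 0;
    }

After the final step every rank owns all nranks blocks, which is why HyperCubeGetBw counts count * typesize * (nranks - 1) bytes moved per rank, the same volume as a ring allgather.
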
From: David Addison
Date: Mon, 28 Jun 2021 18:23:12 -0700
Subject: [PATCH 6/7] Resync with changes in gitlab-master code

---
 src/common.cu | 81 +++++++++++++++++++++++----------------------------
 src/common.h  |  5 ++--
 2 files changed, 40 insertions(+), 46 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index c180294..7aad2c1 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -67,6 +67,8 @@ static int parallel_init = 0;
 static int blocking_coll = 0;
 static int cudaGraphLaunches = 0;
 
+#define NUM_BLOCKS 32
+
 double parsesize(char *value) {
   long long int units;
   double size;
@@ -137,9 +139,9 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
   const T* A = (const T*)A_;
   const T* B = (const T*)B_;
   __shared__ double temp[BSIZE];
-  int tid = threadIdx.x;
+  int tid = blockIdx.x*blockDim.x + threadIdx.x;
   double locmax = 0.0;
-  for(int i=tid; i<count; i+=BSIZE) {
+  for(size_t i=tid; i<count; i+=blockDim.x*gridDim.x) {
 
     double delta = fabs((double)(A[i] - B[i]));
     if( delta > locmax ) {
@@ -150,6 +152,7 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
     }
   }
 
+  tid = threadIdx.x;
   temp[tid] = locmax;
   for(int stride = BSIZE/2; stride > 1; stride>>=1) {
     __syncthreads();
@@ -158,38 +161,38 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
   }
   __syncthreads();
   if( threadIdx.x == 0)
-    *max = temp[0] > temp[1] ? temp[0] : temp[1];
+    max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1];
 }
-
-testResult_t CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax) {
+testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) {
   switch (type) {
 #if defined(__CUDA_BF16_TYPES_EXIST__)
     case ncclBfloat16:
-      deltaKern<__nv_bfloat16, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<__nv_bfloat16, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
 #endif
     case ncclHalf:
-      deltaKern<half, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<half, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
     case ncclFloat:
-      deltaKern<float, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<float, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
     case ncclDouble:
-      deltaKern<double, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<double, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
     case ncclChar:
 #if NCCL_MAJOR >= 2
     case ncclUint8:
 #endif
-      deltaKern<uint8_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<uint8_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
     case ncclInt:
 #if NCCL_MAJOR >= 2
     case ncclUint32:
 #endif
-      deltaKern<uint32_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<uint32_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
     case ncclInt64:
     case ncclUint64:
-      deltaKern<uint64_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<uint64_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
   }
   CUDACHECK(cudaDeviceSynchronize());
+  for (int i=1; i<NUM_BLOCKS; i++) if (devmax[i] > devmax[0]) devmax[0] = devmax[i];
   return testSuccess;
 }

   size_t totalnbytes = max(args->sendBytes, args->expectedBytes);
-  size_t shift = (totalnbytes * iter) % args->maxbytes;
-  if (shift + totalnbytes > args->maxbytes) shift = 0;
+  size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1;
+  size_t shift = totalnbytes * (iter % steps);
 
   if (args->nGpus > 1) NCCLCHECK(ncclGroupStart());
   for (int i = 0; i < args->nGpus; i++) {
@@ -475,6 +478,10 @@ testResult_t completeColl(struct threadArgs* args) {
 
 testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) {
   size_t count = args->nbytes / wordSize(type);
+  if (datacheck) {
+    // Initialize sendbuffs, recvbuffs and expected
+    TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place));
+  }
 
   // Sync
   TESTCHECK(startColl(args, type, op, root, in_place, 0));
@@ -598,10 +605,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   }
 
   double timeUsec = deltaSec*1.0E6;
-  char timeStr[10];
+  char timeStr[100];
   if (timeUsec > 10000.0) {
     sprintf(timeStr, "%7.0f", timeUsec);
-  } else if (timeUsec > 100.0) {
+  } else if (timeUsec >= 100.0) {
     sprintf(timeStr, "%7.1f", timeUsec);
   } else {
     sprintf(timeStr, "%7.2f", timeUsec);
@@ -812,31 +819,6 @@ int main(int argc, char* argv[]) {
         printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n");
 #endif
         break;
-      case 'h':
-        printf("USAGE: %s \n\t"
-        "[-t,--nthreads <num threads>] \n\t"
-        "[-g,--ngpus <gpus per thread>] \n\t"
-        "[-b,--minbytes <min size in bytes>] \n\t"
-        "[-e,--maxbytes <max size in bytes>] \n\t"
-        "[-i,--stepbytes <increment size>] \n\t"
-        "[-f,--stepfactor <increment factor>] \n\t"
-        "[-n,--iters <iteration count>] \n\t"
-        "[-m,--agg_iters <aggregated iteration count>] \n\t"
-        "[-w,--warmup_iters <warmup iteration count>] \n\t"
-        "[-p,--parallel_init <0/1>] \n\t"
-        "[-c,--check <0/1>] \n\t"
-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
-        "[-o,--op <sum/prod/min/max/avg/all>] \n\t"
-#else
-        "[-o,--op <sum/prod/min/max/all>] \n\t"
-#endif
-        "[-d,--datatype <nccltype/all>] \n\t"
-        "[-r,--root <root>] \n\t"
-        "[-z,--blocking <0/1>] \n\t"
-        "[-G,--cudagraph <num graph launches>] \n\t"
-        "[-h,--help]\n",
-        basename(argv[0]));
-        return 0;
       default:
         if (c != 'h') printf("invalid option '%c'\n", c);
         printf("USAGE: %s \n\t"
@@ -868,7 +850,8 @@ int main(int argc, char* argv[]) {
 #ifdef MPI_SUPPORT
   MPI_Init(&argc, &argv);
 #endif
-  return run();
+  TESTCHECK(run());
+  return 0;
 }
 
 testResult_t run() {
@@ -900,6 +883,7 @@ testResult_t run() {
 #define MAX_LINE 2048
   char line[MAX_LINE];
   int len = 0;
+  size_t maxMem = ~0;
   for (int i=0; i<nThreads*nGpus; i++) {
     int cudaDev = localRank*nThreads*nGpus+i;
     cudaDeviceProp prop;
     CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
+    maxMem = std::min(maxMem, prop.totalGlobalMem);
   }
 
+  // Cap maxBytes at what the smallest GPU can actually hold
+  size_t memMaxBytes = maxMem / (datacheck ? 3 : 2);
+  if (maxBytes > memMaxBytes) {
+    maxBytes = memMaxBytes;
+    if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes);
+  }
 
   ncclUniqueId ncclId;
   if (proc == 0) {
     NCCLCHECK(ncclGetUniqueId(&ncclId));
   }
@@ -963,7 +956,7 @@ testResult_t run() {
   int errors[nThreads];
   double bw[nThreads];
   double* delta;
-  CUDACHECK(cudaHostAlloc(&delta, sizeof(double)*nThreads, cudaHostAllocPortable | cudaHostAllocMapped));
+  CUDACHECK(cudaHostAlloc(&delta, sizeof(double)*nThreads*NUM_BLOCKS, cudaHostAllocPortable | cudaHostAllocMapped));
   int bw_count[nThreads];
   for (int t=0; t<nThreads; t++)
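
The resynced deltaKern trades the single 512-thread block for NUM_BLOCKS blocks with a grid-stride loop, so each block writes its own partial maximum and the host finishes the reduction — which is also why the devmax allocation grows to nThreads*NUM_BLOCKS doubles. A condensed, runnable sketch of that two-level max-reduction pattern (a hypothetical kernel, not the test's own code):

    #include <cmath>
    #include <cstdio>
    #include <cuda_runtime.h>

    #define NUM_BLOCKS 32
    #define BSIZE 256

    __global__ void maxDiff(const float* a, const float* b, size_t n, double* out) {
      __shared__ double tmp[BSIZE];
      double m = 0.0;
      // Grid-stride loop: every thread scans a strided slice of the input.
      for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
           i += (size_t)blockDim.x * gridDim.x) {
        double d = fabs((double)a[i] - (double)b[i]);
        if (d > m) m = d;
      }
      tmp[threadIdx.x] = m;
      // Shared-memory tree reduction within the block.
      for (int s = BSIZE / 2; s > 0; s >>= 1) {
        __syncthreads();
        if (threadIdx.x < s && tmp[threadIdx.x + s] > tmp[threadIdx.x])
          tmp[threadIdx.x] = tmp[threadIdx.x + s];
      }
      if (threadIdx.x == 0) out[blockIdx.x] = tmp[0];  // one partial per block
    }

    int main() {
      const size_t n = 1 << 20;
      float *a, *b;
      double* partial;
      cudaMallocManaged(&a, n * sizeof(float));
      cudaMallocManaged(&b, n * sizeof(float));
      cudaMallocManaged(&partial, NUM_BLOCKS * sizeof(double));
      for (size_t i = 0; i < n; i++) { a[i] = 1.0f; b[i] = (i == 7) ? 1.5f : 1.0f; }

      maxDiff<<<NUM_BLOCKS, BSIZE>>>(a, b, n, partial);
      cudaDeviceSynchronize();
      // Host folds the per-block partials, as CheckDelta now does.
      for (int i = 1; i < NUM_BLOCKS; i++)
        if (partial[i] > partial[0]) partial[0] = partial[i];
      printf("max delta = %f\n", partial[0]);  // 0.5
      return 0;
    }

Splitting the work across blocks matters for large buffers: one block caps the check at 512 threads, while a grid of 32 blocks keeps the whole GPU busy during validation.
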
From: David Addison
Date: Wed, 30 Jun 2021 19:36:07 -0700
Subject: [PATCH 7/7] Added new option to report average iteration time

---
 src/common.cu | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/src/common.cu b/src/common.cu
index 7aad2c1..d9f0368 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -66,6 +66,10 @@ static int ncclroot = 0;
 static int parallel_init = 0;
 static int blocking_coll = 0;
 static int cudaGraphLaunches = 0;
+#ifdef MPI_SUPPORT
+// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX)
+static int average = 1;
+#endif
 
 #define NUM_BLOCKS 32
 
@@ -533,6 +537,23 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
   deltaSec = deltaSec/(iters*agg_iters);
   if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches;
+#ifdef MPI_SUPPORT
+  switch (average) {
+    case 1:
+      // Calculate the average time across all ranks
+      MPI_Allreduce(MPI_IN_PLACE, &deltaSec, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+      deltaSec = deltaSec/(args->nProcs*args->nThreads*args->nGpus);
+      break;
+    case 2:
+      // Obtain the minimum time across all ranks
+      MPI_Allreduce(MPI_IN_PLACE, &deltaSec, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+      break;
+    case 3:
+      // Obtain the maximum time across all ranks
+      MPI_Allreduce(MPI_IN_PLACE, &deltaSec, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+      break;
+  }
+#endif
 
   if (cudaGraphLaunches >= 1) {
     //destroy cuda graph
@@ -752,12 +773,13 @@ int main(int argc, char* argv[]) {
     {"root", required_argument, 0, 'r'},
     {"blocking", required_argument, 0, 'z'},
     {"cudagraph", required_argument, 0, 'G'},
+    {"average", required_argument, 0, 'a'},
     {"help", no_argument, 0, 'h'}
   };
 
   while(1) {
     int c;
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex);
 
     if (c == -1)
       break;
@@ -819,6 +841,11 @@ int main(int argc, char* argv[]) {
         printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n");
 #endif
         break;
+#ifdef MPI_SUPPORT
+      case 'a':
+        average = (int)strtol(optarg, NULL, 0);
+        break;
+#endif
       default:
         if (c != 'h') printf("invalid option '%c'\n", c);
         printf("USAGE: %s \n\t"
@@ -842,6 +869,9 @@ int main(int argc, char* argv[]) {
         "[-r,--root <root>] \n\t"
         "[-z,--blocking <0/1>] \n\t"
         "[-G,--cudagraph <num graph launches>] \n\t"
+#ifdef MPI_SUPPORT
+        "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
+#endif
         "[-h,--help]\n",
         basename(argv[0]));
       return 0;
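
With ranks free-running, per-rank timings can disagree, so the -a option reduces deltaSec across ranks before it is reported. The mechanism is a scalar in-place MPI_Allreduce with SUM, MIN, or MAX, as in this standalone sketch (built with an MPI compiler wrapper; the per-rank value here is a hypothetical stand-in for a measured time):

    #include <cstdio>
    #include <mpi.h>

    int main(int argc, char** argv) {
      MPI_Init(&argc, &argv);
      int rank, nranks;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &nranks);

      double t = 1.0 + rank;  // stand-in for this rank's measured deltaSec
      double avg = t, mn = t, mx = t;
      MPI_Allreduce(MPI_IN_PLACE, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
      avg /= nranks;          // -a 1: mean across ranks
      MPI_Allreduce(MPI_IN_PLACE, &mn, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);  // -a 2
      MPI_Allreduce(MPI_IN_PLACE, &mx, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);  // -a 3
      if (rank == 0) printf("avg=%g min=%g max=%g\n", avg, mn, mx);
      MPI_Finalize();
      return 0;
    }

Defaulting to the mean (-a 1) makes multi-node reports stable, while MIN and MAX remain useful for spotting stragglers.
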