From 119a0ecf600f30d6b82897126f6301e15b6582b8 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Tue, 17 Mar 2020 12:00:19 -0700
Subject: [PATCH] Add alltoall perf test

---
 src/Makefile    |   2 +-
 src/alltoall.cu | 117 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/common.cu   |   4 +-
 src/common.h    |  14 +++---
 4 files changed, 129 insertions(+), 8 deletions(-)
 create mode 100644 src/alltoall.cu

diff --git a/src/Makefile b/src/Makefile
index 034cc67..33ca479 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -60,7 +60,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%)
 DST_DIR := $(BUILDDIR)
 SRC_FILES := $(wildcard *.cu)
 OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
-BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce
+BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall
 BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
 
 build: ${BIN_FILES}
diff --git a/src/alltoall.cu b/src/alltoall.cu
new file mode 100644
index 0000000..aea9370
--- /dev/null
+++ b/src/alltoall.cu
@@ -0,0 +1,117 @@
+/*************************************************************************
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+
+void print_header() {
+  PRINT("# %10s  %12s  %6s  %6s            out-of-place                       in-place          \n", "", "", "", "");
+  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "redop",
+      "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
+  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",
+      "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
+}
+
+void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
+  PRINT("%12li  %12li  %6s  %6s", size, count, typeName, opName);
+}
+
+void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+  *sendcount = (count/nranks)*nranks;
+  *recvcount = (count/nranks)*nranks;
+  *sendInplaceOffset = 0;
+  *recvInplaceOffset = 0;
+  *paramcount = count/nranks;
+}
+
+testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
+  int nranks = args->nProcs*args->nThreads*args->nGpus;
+
+  for (int i=0; i<args->nGpus; i++) {
+    char* str = getenv("NCCL_TESTS_DEVICE");
+    int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    CUDACHECK(cudaSetDevice(gpuid));
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
+    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    for (int j=0; j<nranks; j++) {
+      TESTCHECK(InitData(((char*)args->expected[i])+args->sendBytes/nranks*j,
+          sendcount/nranks, type, rep+rank*sendcount/nranks, j));
+    }
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+  // We don't support in-place alltoall
+  args->reportErrors = in_place ? 0 : 1;
+  return testSuccess;
+}
+
+void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec;
+
+  *algBw = baseBw;
+  double factor = ((double)(nranks-1))/((double)(nranks));
+  *busBw = baseBw * factor;
+}
+
+testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+  int nRanks;
+  NCCLCHECK(ncclCommCount(comm, &nRanks));
+  size_t rankOffset = count * wordSize(type);
+  if (count == 0) return testSuccess;
+
+  NCCLCHECK(ncclGroupStart());
+  for (int r=0; r<nRanks; r++) {
+    NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, count, type, r, comm, stream));
+    NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, count, type, r, comm, stream));
+  }
+  NCCLCHECK(ncclGroupEnd());
+  return testSuccess;
+}
+
+struct testColl alltoAllTest = {
+  "AlltoAll",
+  AlltoAllGetCollByteCount,
+  AlltoAllInitData,
+  AlltoAllGetBw,
+  AlltoAllRunColl
+};
+
+void AlltoAllGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
+  AlltoAllGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+}
+
+testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &alltoAllTest;
+  ncclDataType_t *run_types;
+  const char **run_typenames;
+  int type_count;
+
+  if ((int)type != -1) {
+    type_count = 1;
+    run_types = &type;
+    run_typenames = &typeName;
+  } else {
+    type_count = ncclNumTypes;
+    run_types = test_types;
+    run_typenames = test_typenames;
+  }
+
+  for (int i=0; i<type_count; i++) {
+    TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", -1));
+  }
+  return testSuccess;
+}
+
+struct testEngine alltoAllEngine = {
+  AlltoAllGetBuffSize,
+  AlltoAllRunTest
+};
+
+#pragma weak ncclTestEngine=alltoAllEngine
diff --git a/src/common.cu b/src/common.cu
--- a/src/common.cu
+++ b/src/common.cu
@@ -493,7 +493,7 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int in_place, double *delta) {
     maxDelta = std::max(delta, maxDelta);
   }
   int nranks = args->nProcs*args->nThreads*args->nGpus;
-  if (maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++;
+  if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++;
   *delta = maxDelta;
   return testSuccess;
 }
@@ -832,6 +832,8 @@ testResult_t run() {
     threads[t].args.bw=bw+t;
     threads[t].args.bw_count=bw_count+t;
 
+    threads[t].args.reportErrors = 1;
+
     threads[t].func = parallel_init ? threadInit : threadRunTests;
     if (t) TESTCHECK(threadLaunch(threads+t));
 
diff --git a/src/common.h b/src/common.h
index 8fb5b8c..a2d7ae2 100644
--- a/src/common.h
+++ b/src/common.h
@@ -17,25 +17,25 @@
 #include "nccl1_compat.h"
 
 #define CUDACHECK(cmd) do {                         \
-    cudaError_t e = cmd;                            \
-    if( e != cudaSuccess ) {                        \
+    cudaError_t err = cmd;                          \
+    if( err != cudaSuccess ) {                      \
         char hostname[1024];                        \
         getHostName(hostname, 1024);                \
         printf("%s: Test CUDA failure %s:%d '%s'\n",         \
             hostname,                               \
-            __FILE__,__LINE__,cudaGetErrorString(e));   \
+            __FILE__,__LINE__,cudaGetErrorString(err));   \
         return testCudaError;                       \
     }                                               \
 } while(0)
 
 #define NCCLCHECK(cmd) do {                         \
-    ncclResult_t r = cmd;                           \
-    if (r!= ncclSuccess) {                          \
+    ncclResult_t res = cmd;                         \
+    if (res != ncclSuccess) {                       \
         char hostname[1024];                        \
         getHostName(hostname, 1024);                \
         printf("%s: Test NCCL failure %s:%d '%s'\n",         \
             hostname,                               \
-            __FILE__,__LINE__,ncclGetErrorString(r));   \
+            __FILE__,__LINE__,ncclGetErrorString(res));   \
         return testNcclError;                       \
     }                                               \
 } while(0)
@@ -124,6 +124,8 @@ struct threadArgs {
   double* bw;
   int* bw_count;
 
+  int reportErrors;
+
   struct testColl* collTest;
 };
 