Resync all tests with test code from NCCL 2.4

Major rework to merge most of the changes from the NCCL internal
tests into the public ones

Added "-m <agg_iters>" operation aggregation option.
Data integrity checking is now much more performant at scale.
Startup times at scale are improved.
Test latency units are now displayed in usec.
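
For reference, aggregation of this kind is usually expressed with NCCL's group API: calls issued between ncclGroupStart() and ncclGroupEnd() are submitted as one batch, so launch overhead is paid once per aggregated group rather than once per operation. The sketch below is illustrative only; the helper name and parameters are hypothetical and not taken from this commit.

    #include <nccl.h>

    // Hypothetical helper showing the aggregation pattern behind an "-m <agg_iters>" style option.
    ncclResult_t runAggregatedAllReduce(void* sendbuff, void* recvbuff, size_t count,
                                        ncclComm_t comm, cudaStream_t stream, int agg_iters) {
      ncclResult_t res = ncclGroupStart();
      if (res != ncclSuccess) return res;
      for (int a = 0; a < agg_iters; a++) {
        // Each call is only queued here; nothing is launched until ncclGroupEnd().
        res = ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum, comm, stream);
        if (res != ncclSuccess) return res;
      }
      return ncclGroupEnd();
    }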
David Addison 2019-03-06 18:17:20 -08:00
parent dcf818955f
commit cbe7f65400
10 changed files with 949 additions and 1097 deletions

README.md

@@ -1,6 +1,6 @@
# NCCL Tests
These tests check both the performance and the correctness of NCCL operations. They can be compiled against [NCCL 1](http://github.com/nvidia/nccl) and [NCCL 2](http://developer.nvidia.com/nccl).
These tests check both the performance and the correctness of NCCL operations. They can be compiled against [NCCL](http://github.com/nvidia/nccl).
## Build
@@ -20,7 +20,7 @@ $ make MPI=1 MPI_HOME=/path/to/mpi CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nc
## Usage
NCCL tests can run on multiple processes, multiple threads, and multiple CUDA devices per thread. The number of process is managed by MPI and is therefore not passed to the tests as argument. The total number of ranks (=CUDA devices) will be equal to (number of processes)\*(number of threads)\*(number of gpus per thread).
NCCL tests can run on multiple processes, multiple threads, and multiple CUDA devices per thread. The number of processes is managed by MPI and is therefore not passed to the tests as an argument. The total number of ranks (=CUDA devices) will be equal to (number of processes)\*(number of threads)\*(number of GPUs per thread).
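For example, a run with 2 MPI processes, each spawning 2 threads that each drive 2 GPUs, uses 2\*2\*2 = 8 ranks.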
### Quick examples
@@ -44,7 +44,7 @@ All tests support the same set of arguments :
* Number of GPUs
* `-t,--nthreads <num threads>` number of threads per process. Default : 1.
* `-g,--ngpus <gpus per thread>` number of gpus per thread. Default : 1.
* `-g,--ngpus <GPUs per thread>` number of GPUs per thread. Default : 1.
* Sizes to scan
* `-b,--minbytes <min size in bytes>` minimum size to start with. Default : 32M.
* `-e,--maxbytes <max size in bytes>` maximum size to end at. Default : 32M.
@@ -55,16 +55,16 @@ All tests support the same set of arguments :
* `-o,--op <sum/prod/min/max/all>` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum.
* `-d,--datatype <nccltype/all>` Specify which datatype to use. Default : Float.
* `-r,--root <root/all>` Specify which root to use. Only for operations with a root like broadcast or reduce. Default : 0.
* Performance
* Performance
* `-n,--iters <iteration count>` number of iterations. Default : 20.
* `-w,--warmup_iters <warmup iteration count>` number of warmup iterations (not timed). Default : 5.
* `-m,--agg_iters <aggregation count>` number of operations to aggregate together in each iteration. Default : 1.
* Test operation
* `-s,--swap_args <0/1>` when used with multiple threads, have threads manage different GPUs for each iteration. Default : 0.
* `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0.
* `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1.
* `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
## Copyright
NCCL tests are provided under the BSD licence. All source code and accompanying documentation is copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.

src/Makefile

@ -1,7 +1,7 @@
#
# Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENCE.txt for license information
# See LICENSE.txt for license information
#
CUDA_HOME ?= /usr/local/cuda
@@ -18,10 +18,10 @@ NVCC = $(CUDA_HOME)/bin/nvcc
NVCC_GENCODE ?= -gencode=arch=compute_30,code=sm_30 \
-gencode=arch=compute_35,code=sm_35 \
-gencode=arch=compute_50,code=sm_50 \
-gencode=arch=compute_52,code=sm_52 \
-gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_61,code=sm_61 \
-gencode=arch=compute_61,code=compute_61
-gencode=arch=compute_70,code=compute_70 \
-gencode=arch=compute_70,code=sm_70
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11
@@ -29,14 +29,16 @@ LDFLAGS := -L${CUDA_LIB} -lcudart -lrt
NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt
ifeq ($(DEBUG), 0)
NVCUFLAGS += -O3
CXXFLAGS += -O3
NVCUFLAGS += -O3 -g
CXXFLAGS += -O3 -g
else
NVCUFLAGS += -O0 -G -g
CXXFLAGS += -O0 -g -ggdb3
endif
ifeq ($(VERBOSE), 0)
ifneq ($(VERBOSE), 0)
NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
else
.SILENT:
endif
@@ -45,7 +47,7 @@ endif
BUILDDIR ?= ../build
ifneq ($(NCCL_HOME), "")
NVCUFLAGS += -I$(NCCL_HOME)/include/
NVLDFLAGS += -L$(NCCL_HOME)/lib
NVLDFLAGS += -L$(NCCL_HOME)/lib
endif
ifeq ($(MPI), 1)
@@ -53,7 +55,7 @@ NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include
NVLDFLAGS += -L$(MPI_HOME)/lib -lmpi
endif
LIBRARIES += curand nccl nvToolsExt
NVLDFLAGS += $(LIBRARIES:%=-l%)
NVLDFLAGS += $(LIBRARIES:%=-l%)
DST_DIR := $(BUILDDIR)
SRC_FILES := $(wildcard *.cu)
@@ -66,7 +68,7 @@ build: ${BIN_FILES}
clean:
rm -rf ${DST_DIR}
${DST_DIR}/%.o: %.cu
${DST_DIR}/%.o: %.cu common.h
@printf "Compiling %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
$(NVCC) -o $@ $(NVCUFLAGS) -c $<
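# Note (added for clarity): listing common.h as a prerequisite makes every object
# rebuild when the shared test header changes, not only when its own .cu file does.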

src/all_gather.cu

@@ -1,79 +1,53 @@
/*************************************************************************
* Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENCE.txt for license information
* See LICENSE.txt for license information
************************************************************************/
#include "cuda_runtime.h"
#include "common.h"
void print_header() {
PRINT("# %10s %12s %6s %6s out-of-place in-place\n", "", "", "", "");
PRINT("# %10s %12s %6s %7s %5s %5s %7s %7s %5s %5s %7s\n", "bytes", "N", "type",
"time", "algbw", "busbw", "res", "time", "algbw", "busbw", "res");
PRINT("# %10s %12s %6s out-of-place in-place \n", "", "", "");
PRINT("# %10s %12s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %6s", size, count, typeName);
}
void getCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t *procSharedCount, int *sameExpected, size_t count, int nranks) {
*sendcount = count/nranks;
*recvcount = (count/nranks)*nranks;
*sameExpected = 1;
*procSharedCount = 0;
*sendInplaceOffset = count/nranks;
*recvInplaceOffset = 0;
*paramcount = *sendcount;
void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count/nranks;
*recvcount = (count/nranks)*nranks;
*sendInplaceOffset = count/nranks;
*recvInplaceOffset = 0;
*paramcount = *sendcount;
}
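// Note (added for clarity): for AllGather each rank sends count/nranks elements and
// receives count/nranks elements from every rank, i.e. (count/nranks)*nranks in total.
// In place, a rank sends from its own slot of the receive buffer, hence the
// sendInplaceOffset of count/nranks (scaled by the rank when the buffer is selected).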
void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first) {
size_t nBytes = args->nbytes;
size_t count = nBytes / wordSize(type);
int proc = args->proc;
int nThreads = args->nThreads;
int t = args->thread;
int nGpus = args->nGpus;
testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
int nranks = args->nProcs*args->nThreads*args->nGpus;
while (args->sync[args->sync_idx] != t) pthread_yield();
for (int i=0; i<nGpus; i++) {
int device;
for (int i=0; i<args->nGpus; i++) {
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
CUDACHECK(cudaSetDevice(gpuid));
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
CUDACHECK(cudaSetDevice(device));
void* data = in_place ? (void *)((uintptr_t)args->recvbuffs[i] + args->sendInplaceOffset*rank) : args->sendbuffs[i];
CUDACHECK(cudaMemcpy((void *)((uintptr_t)args->expectedHost[0] + ((proc*nThreads + t)*nGpus + i)*nBytes),
data,
nBytes, cudaMemcpyDeviceToHost));
if (in_place == 0) {
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
TESTCHECK(InitData(data, sendcount, type, rep, rank));
for (int j=0; j<nranks; j++) {
TESTCHECK(InitData(((char*)args->expected[i])+args->sendBytes*j, sendcount, type, rep, j));
}
CUDACHECK(cudaDeviceSynchronize());
}
args->sync[args->sync_idx] = t + 1;
if (t+1 == nThreads) {
#ifdef MPI_SUPPORT
// Last thread does the MPI allgather
MPI_Allgather(MPI_IN_PLACE, nBytes*nThreads*nGpus, MPI_BYTE,
args->expectedHost[0],
nBytes*nThreads*nGpus, MPI_BYTE, MPI_COMM_WORLD);
#endif
args->sync[args->sync_idx] = 0;
} else {
while (args->sync[args->sync_idx]) pthread_yield();
}
args->sync_idx=!args->sync_idx;
return testSuccess;
}
void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec;
*algBw = baseBw;
@@ -81,26 +55,49 @@ void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw,
*busBw = baseBw * factor;
}
void RunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream));
return testSuccess;
}
void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
struct testColl allGatherTest = {
"AllGather",
AllGatherGetCollByteCount,
AllGatherInitData,
AllGatherGetBw,
AllGatherRunColl
};
void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
AllGatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
args->collTest = &allGatherTest;
ncclDataType_t *run_types;
const char **run_typenames;
int type_count;
if ((int)type != -1) {
if ((int)type != -1) {
type_count = 1;
run_types = &type;
run_typenames = &typeName;
} else {
} else {
type_count = ncclNumTypes;
run_types = test_types;
run_typenames = test_typenames;
}
for (int i=0; i<type_count; i++) {
TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, NULL, 0, 1);
}
for (int i=0; i<type_count; i++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
}
return testSuccess;
}
struct testEngine allGatherEngine = {
AllGatherGetBuffSize,
AllGatherRunTest
};
#pragma weak ncclTestEngine=allGatherEngine

src/all_reduce.cu

@@ -1,89 +1,51 @@
/*************************************************************************
* Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENCE.txt for license information
* See LICENSE.txt for license information
************************************************************************/
#include "cuda_runtime.h"
#include "common.h"
void print_header() {
PRINT("# %10s %12s %6s %6s out-of-place in-place\n", "", "", "", "");
PRINT("# %10s %12s %6s %6s %7s %5s %5s %7s %7s %5s %5s %7s\n", "bytes", "N", "type", "op",
"time", "algbw", "busbw", "res", "time", "algbw", "busbw", "res");
PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", "");
PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %6s %6s", size, count, typeName, opName);
}
void getCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t *procSharedCount, int *sameExpected, size_t count, int nranks) {
*sendcount = count;
*recvcount = count;
*sameExpected = 1;
*procSharedCount = 0;
*sendInplaceOffset = 0;
*recvInplaceOffset = 0;
*paramcount = *sendcount;
}
void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first) {
size_t count = args->nbytes / wordSize(type);
while (args->sync[args->sync_idx] != args->thread) pthread_yield();
for (int i=0; i<args->nGpus; i++) {
int device;
NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
CUDACHECK(cudaSetDevice(device));
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
if (is_first && i == 0) {
CUDACHECK(cudaMemcpy(args->expected[0], data, count*wordSize(type), cudaMemcpyDeviceToHost));
} else {
Accumulate(args->expected[0], data, count, type, op);
}
if (in_place == 0) {
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->nbytes));
}
CUDACHECK(cudaDeviceSynchronize());
}
args->sync[args->sync_idx] = args->thread + 1;
if (args->thread+1 == args->nThreads) {
#ifdef MPI_SUPPORT
// Last thread does the MPI reduction
if (args->nbytes > 0) {
void* remote, *remoteHost = malloc(args->nbytes);
void* myInitialData = malloc(args->nbytes);
memcpy(myInitialData, args->expectedHost[0], args->nbytes);
CUDACHECK(cudaHostRegister(remoteHost, args->nbytes, cudaHostRegisterPortable | cudaHostRegisterMapped));
CUDACHECK(cudaHostGetDevicePointer(&remote, remoteHost, 0));
for (int i=0; i<args->nProcs; i++) {
if (i == args->proc) {
MPI_Bcast(myInitialData, args->nbytes, MPI_BYTE, i, MPI_COMM_WORLD);
free(myInitialData);
} else {
MPI_Bcast(remoteHost, args->nbytes, MPI_BYTE, i, MPI_COMM_WORLD);
Accumulate(args->expected[0], remote, count, type, op);
cudaDeviceSynchronize();
}
}
CUDACHECK(cudaHostUnregister(remoteHost));
free(remoteHost);
}
#endif
args->sync[args->sync_idx] = 0;
} else {
while (args->sync[args->sync_idx]) pthread_yield();
}
args->sync_idx = !args->sync_idx;
void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count;
*recvcount = count;
*sendInplaceOffset = 0;
*recvInplaceOffset = 0;
*paramcount = *sendcount;
}
void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
int nranks = args->nProcs*args->nThreads*args->nGpus;
for (int i=0; i<args->nGpus; i++) {
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
CUDACHECK(cudaSetDevice(gpuid));
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
CUDACHECK(cudaDeviceSynchronize());
}
return testSuccess;
}
void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * typesize) / 1.0E9 / sec;
*algBw = baseBw;
@@ -91,40 +53,62 @@ void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw,
*busBw = baseBw * factor;
}
void RunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream));
return testSuccess;
}
struct testColl allReduceTest = {
"AllReduce",
AllReduceGetCollByteCount,
AllReduceInitData,
AllReduceGetBw,
AllReduceRunColl
};
void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
args->collTest = &allReduceTest;
ncclDataType_t *run_types;
ncclRedOp_t *run_ops;
const char **run_typenames, **run_opnames;
int type_count, op_count;
if ((int)type != -1) {
if ((int)type != -1) {
type_count = 1;
run_types = &type;
run_typenames = &typeName;
} else {
} else {
type_count = ncclNumTypes;
run_types = test_types;
run_typenames = test_typenames;
}
if ((int)op != -1) {
if ((int)op != -1) {
op_count = 1;
run_ops = &op;
run_opnames = &opName;
} else {
} else {
op_count = ncclNumOps;
run_ops = test_ops;
run_opnames = test_opnames;
}
for (int i=0; i<type_count; i++) {
for (int j=0; j<op_count; j++) {
TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], 0, 1);
}
}
for (int i=0; i<type_count; i++) {
for (int j=0; j<op_count; j++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
}
}
return testSuccess;
}
struct testEngine allReduceEngine = {
AllReduceGetBuffSize,
AllReduceRunTest
};
#pragma weak ncclTestEngine=allReduceEngine

src/broadcast.cu

@@ -1,78 +1,50 @@
/*************************************************************************
* Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
*
* See LICENCE.txt for license information
* See LICENSE.txt for license information
************************************************************************/
#include "cuda_runtime.h"
#include "common.h"
#include <assert.h>
void print_header() {
PRINT("# %10s %12s %6s %6s out-of-place\n", "", "", "", "");
PRINT("# %10s %12s %6s %6s %7s %5s %5s %7s\n", "bytes", "N", "type", "root",
"time", "algbw", "busbw", "res");
PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", "");
PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %6s %6i", size, count, typeName, root);
}
void getCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t *procSharedCount, int *sameExpected, size_t count, int nranks) {
*sendcount = count;
*recvcount = count;
*sameExpected = 0;
*procSharedCount = count;
*sendInplaceOffset = 0;
*recvInplaceOffset = 0;
*paramcount = *sendcount;
void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count;
*recvcount = count;
*sendInplaceOffset = 0;
*recvInplaceOffset = 0;
*paramcount = *sendcount;
}
void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first) {
int root_proc = root/(args->nThreads*args->nGpus);
int root_thread = (root/args->nGpus)%(args->nThreads);
int root_gpu = root%args->nGpus;
assert(args->expectedBytes == args->nbytes);
if (root_thread == args->thread) {
if (root_proc == args->proc) {
CUDACHECK(cudaMemcpy(args->procSharedHost,
args->sendbuffs[root_gpu],
args->nbytes, cudaMemcpyDeviceToHost));
}
#ifdef MPI_SUPPORT
MPI_Bcast(args->procSharedHost, args->nbytes, MPI_BYTE, root_proc, MPI_COMM_WORLD);
#endif
args->sync[0] = 0;
}
Barrier(args);
testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
for (int i=0; i<args->nGpus; i++) {
int device;
NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
CUDACHECK(cudaSetDevice(device));
//set expected buf to zero at root, copy over source data at others
if ((root_proc == args->proc)
&& (root_thread == args->thread)
&& (root_gpu == i)) {
memset(args->expectedHost[i], 0, args->nbytes);
} else {
memcpy(args->expectedHost[i], args->procSharedHost, args->nbytes);
}
//reset recvbufs to zero
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->nbytes));
CUDACHECK(cudaDeviceSynchronize());
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
CUDACHECK(cudaSetDevice(gpuid));
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(args->expected[i], recvcount, type, rep, root));
CUDACHECK(cudaDeviceSynchronize());
}
Barrier(args);
return testSuccess;
}
void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * typesize) / 1.0E9 / sec;
*algBw = baseBw;
@@ -80,42 +52,69 @@ void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw,
*busBw = baseBw * factor;
}
void RunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
int rank;
testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
if (rank == root) {
#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2
NCCLCHECK(ncclBroadcast(sendbuff, recvbuff, count, type, root, comm, stream));
#else
if (rank == root) {
NCCLCHECK(ncclBcast(sendbuff, count, type, root, comm, stream));
} else {
} else {
NCCLCHECK(ncclBcast(recvbuff, count, type, root, comm, stream));
}
}
#endif
return testSuccess;
}
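// Note (added for clarity): NCCL >= 2.2 exposes the out-of-place ncclBroadcast(), so every
// rank passes both buffers. Older NCCL only provides the in-place ncclBcast(), so the root
// broadcasts from sendbuff while every other rank receives into recvbuff.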
void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
struct testColl broadcastTest = {
"Broadcast",
BroadcastGetCollByteCount,
BroadcastInitData,
BroadcastGetBw,
BroadcastRunColl
};
void BroadcastGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
BroadcastGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
args->collTest = &broadcastTest;
ncclDataType_t *run_types;
const char **run_typenames;
int type_count;
int begin_root, end_root;
int begin_root, end_root;
if ((int)type != -1) {
if ((int)type != -1) {
type_count = 1;
run_types = &type;
run_typenames = &typeName;
} else {
} else {
type_count = ncclNumTypes;
run_types = test_types;
run_typenames = test_typenames;
}
if (root != -1) {
begin_root = end_root = root;
} else {
begin_root = 0;
end_root = args->nProcs*args->nThreads*args->nGpus-1;
if (root != -1) {
begin_root = end_root = root;
} else {
begin_root = 0;
end_root = args->nProcs*args->nThreads*args->nGpus-1;
}
for (int i=0; i<type_count; i++) {
for (int j=begin_root; j<=end_root; j++) {
TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, NULL, j, 0);
}
}
for (int i=0; i<type_count; i++) {
for (int j=begin_root; j<=end_root; j++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", j));
}
}
return testSuccess;
}
struct testEngine broadcastEngine = {
BroadcastGetBuffSize,
BroadcastRunTest
};
#pragma weak ncclTestEngine=broadcastEngine

src/common.cu (file diff suppressed because it is too large)

src/common.h

@@ -1,8 +1,10 @@
/*************************************************************************
* Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENCE.txt for license information
* See LICENSE.txt for license information
************************************************************************/
#ifndef __COMMON_H__
#define __COMMON_H__
#include "nccl.h"
#include <stdio.h>
@@ -17,23 +19,75 @@
#define CUDACHECK(cmd) do { \
cudaError_t e = cmd; \
if( e != cudaSuccess ) { \
printf("Cuda failure %s:%d '%s'\n", \
char hostname[1024]; \
getHostName(hostname, 1024); \
printf("%s: Test CUDA failure %s:%d '%s'\n", \
hostname, \
__FILE__,__LINE__,cudaGetErrorString(e)); \
exit(EXIT_FAILURE); \
return testCudaError; \
} \
} while(0)
#define NCCLCHECK(cmd) do { \
ncclResult_t r = cmd; \
if (r!= ncclSuccess) { \
printf("NCCL failure %s:%d '%s'\n", \
char hostname[1024]; \
getHostName(hostname, 1024); \
printf("%s: Test NCCL failure %s:%d '%s'\n", \
hostname, \
__FILE__,__LINE__,ncclGetErrorString(r)); \
exit(EXIT_FAILURE); \
return testNcclError; \
} \
} while(0)
struct threadArgs_t {
void *proc_args;
typedef enum {
testSuccess = 0,
testInternalError = 1,
testCudaError = 2,
testNcclError = 3,
testCuRandError = 4
} testResult_t;
// Relay errors up and trace
#define TESTCHECK(cmd) do { \
testResult_t r = cmd; \
if (r!= testSuccess) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
printf(" .. %s: Test failure %s:%d\n", \
hostname, \
__FILE__,__LINE__); \
return r; \
} \
} while(0)
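// Illustrative use (not part of this header): with these macros a failing CUDA or NCCL
// call no longer exits the process; it becomes a testResult_t that is relayed upward:
//   testResult_t step(float* buf, size_t n, ncclComm_t comm, cudaStream_t s) {
//     CUDACHECK(cudaMemsetAsync(buf, 0, n*sizeof(float), s));             // returns testCudaError on failure
//     NCCLCHECK(ncclAllReduce(buf, buf, n, ncclFloat, ncclSum, comm, s)); // returns testNcclError on failure
//     return testSuccess;
//   }
//   TESTCHECK(step(buf, n, comm, s)); // caller prints a trace line and forwards the error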
struct testColl {
const char name[20];
void (*getCollByteCount)(
size_t *sendcount, size_t *recvcount, size_t *paramcount,
size_t *sendInplaceOffset, size_t *recvInplaceOffset,
size_t count, int nranks);
testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type,
ncclRedOp_t op, int root, int rep, int in_place);
void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
};
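// Each collective implementation fills in one of these descriptors: element counts and
// in-place offsets, reference-data initialization, algorithm/bus bandwidth conversion,
// and the NCCL call itself. The shared harness drives the timing loop through these
// callbacks instead of duplicating it per collective.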
extern struct testColl allReduceTest;
extern struct testColl allGatherTest;
extern struct testColl reduceScatterTest;
extern struct testColl broadcastTest;
extern struct testColl reduceTest;
struct testEngine {
void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks);
testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type,
const char* typeName, ncclRedOp_t op, const char* opName);
};
extern struct testEngine ncclTestEngine;
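// Each *_perf binary defines its own engine (e.g. allReduceEngine) and aliases it to
// ncclTestEngine with "#pragma weak ncclTestEngine=<engine>", so the shared harness can
// call getBuffSize()/runTest() without knowing which collective it was linked against.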
struct threadArgs {
size_t nbytes;
size_t minbytes;
size_t maxbytes;
@@ -55,11 +109,8 @@ struct threadArgs_t {
ncclComm_t* comms;
cudaStream_t* streams;
void** expectedHost;
void** expected;
size_t expectedBytes;
void* procSharedHost;
void* procShared;
volatile int* sync;
int sync_idx;
volatile int* barrier;
@@ -72,27 +123,28 @@ struct threadArgs_t {
int* errors;
double* bw;
int* bw_count;
struct testColl* collTest;
};
typedef testResult_t (*threadFunc_t)(struct threadArgs* args);
struct testThread {
pthread_t thread;
threadFunc_t func;
struct threadArgs args;
testResult_t ret;
};
#include <chrono>
// Provided by common.cu
extern void Barrier(struct threadArgs_t* args);
extern void TimeTest(struct threadArgs_t* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root, int inPlace);
extern void Randomize(void* ptr, size_t count, ncclDataType_t type, int seed);
extern void Accumulate(void* out, void* in, size_t n, ncclDataType_t type, ncclRedOp_t op);
extern void CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax);
extern double DeltaMaxValue(ncclDataType_t type);
extern void Barrier(struct threadArgs* args);
extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root);
extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks);
extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank);
extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks);
// Provided by each coll
void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName);
extern void GetBw(size_t count, int typeSize, double sec, double* algBw, double* busBw, int nranks);
extern void RunColl(void* sendbuf, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
extern void InitData(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int in_place, int is_first);
extern double CheckData(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op);
extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks);
extern void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first);
extern void getCollByteCount(size_t *sendbytes, size_t *recvbytes, size_t *parambytes, size_t *sendInlineOffset, size_t *recvInlineOffset, size_t *procSharedBytes, int *sameexpected, size_t nbytes, int nranks);
extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root);
extern void print_header();
@@ -152,7 +204,33 @@ extern const char *test_typenames[ncclNumTypes];
extern ncclRedOp_t test_ops[ncclNumOps];
extern const char *test_opnames[ncclNumOps];
static int ncclstringtotype(char *str) {
for (int t=0; t<ncclNumTypes; t++) {
if (strcmp(str, test_typenames[t]) == 0) {
return t;
}
}
if (strcmp(str, "all") == 0) {
return -1;
}
printf("invalid type %s, defaulting to %s .. \n", str, test_typenames[ncclFloat]);
return ncclFloat;
}
static int ncclstringtoop (char *str) {
for (int o=0; o<ncclNumOps; o++) {
if (strcmp(str, test_opnames[o]) == 0) {
return o;
}
}
if (strcmp(str, "all") == 0) {
return -1;
}
printf("invalid op %s, defaulting to %s .. \n", str, test_opnames[ncclSum]);
return ncclSum;
}
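// These helpers map the -d/-o command-line strings to NCCL enum values; "all" maps to -1,
// which the per-collective RunTest functions treat as "sweep every type/op".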
extern thread_local int is_main_thread;
#define PRINT if (is_main_thread) printf
#endif

src/nccl1_compat.h

@@ -1,7 +1,7 @@
/*************************************************************************
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENCE.txt for license information
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL1_COMPAT_H

src/reduce.cu

@@ -1,159 +1,123 @@
/*************************************************************************
* Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENCE.txt for license information
* See LICENSE.txt for license information
************************************************************************/
#include <assert.h>
#include "cuda_runtime.h"
#include "common.h"
void print_header() {
PRINT("# %10s %12s %6s %6s out-of-place in-place\n", "", "", "", "");
PRINT("# %10s %12s %6s %6s %6s %7s %5s %5s %7s %7s %5s %5s %7s\n", "bytes", "N", "type", "op", "root",
"time", "algbw", "busbw", "res", "time", "algbw", "busbw", "res");
PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", "");
PRINT("# %10s %12s %6s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "root",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %6s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %6s %6s %6i", size, count, typeName, opName, root);
}
void getCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t *procSharedCount, int *sameExpected, size_t count, int nranks) {
*sendcount = count;
*recvcount = count;
*sameExpected = 0;
*procSharedCount = count;
*sendInplaceOffset = 0;
*recvInplaceOffset = 0;
*paramcount = *sendcount;
}
void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first) {
size_t count = args->expectedBytes / wordSize(type);
int root_gpu = root%args->nGpus;
assert(args->expectedBytes == args->nbytes);
while (args->sync[args->sync_idx] != args->thread) pthread_yield();
for (int i=0; i<args->nGpus; i++) {
int device;
NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
CUDACHECK(cudaSetDevice(device));
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
if (is_first && i == 0) {
CUDACHECK(cudaMemcpy(args->procSharedHost, data, count*wordSize(type), cudaMemcpyDeviceToHost));
} else {
Accumulate(args->procShared, data, count, type, op);
}
if (in_place == 0) {
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
}
CUDACHECK(cudaDeviceSynchronize());
}
args->sync[args->sync_idx] = args->thread + 1;
if (args->thread+1 == args->nThreads) {
#ifdef MPI_SUPPORT
int root_proc = root/(args->nThreads*args->nGpus);
if (args->expectedBytes) {
// Last thread does the MPI reduction
if (root_proc == args->proc) {
void* temp, *tempHost = malloc(args->expectedBytes);
CUDACHECK(cudaHostRegister(tempHost, args->expectedBytes, 0));
CUDACHECK(cudaHostGetDevicePointer(&temp, tempHost, 0));
for (int i=0; i<args->nProcs; i++) {
if (i == args->proc) continue;
MPI_Recv(tempHost, args->expectedBytes, MPI_BYTE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
Accumulate(args->procShared, temp, count, type, op);
CUDACHECK(cudaDeviceSynchronize());
}
CUDACHECK(cudaHostUnregister(tempHost));
free(tempHost);
} else {
MPI_Send(args->procSharedHost, args->expectedBytes, MPI_BYTE, root_proc, 0, MPI_COMM_WORLD);
}
}
#endif
args->sync[args->sync_idx] = 0;
} else {
while (args->sync[args->sync_idx]) pthread_yield();
}
//if root fill expected bytes with reduced data
// else if in_place, leave fill it with original data, else set to zero
for (int i=0; i<args->nGpus; i++) {
int rank = (args->proc*args->nThreads + args->thread)*args->nGpus + i;
if (rank == root) {
memcpy(args->expectedHost[root_gpu], args->procSharedHost, args->expectedBytes);
} else {
if (in_place == 1) {
CUDACHECK(cudaMemcpy(args->expectedHost[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDeviceToHost));
} else {
memset(args->expectedHost[i], 0, args->expectedBytes);
}
}
}
args->sync_idx = !args->sync_idx;
void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count;
*recvcount = count;
*sendInplaceOffset = 0;
*recvInplaceOffset = 0;
*paramcount = *sendcount;
}
void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
int nranks = args->nProcs*args->nThreads*args->nGpus;
for (int i=0; i<args->nGpus; i++) {
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
CUDACHECK(cudaSetDevice(gpuid));
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
TESTCHECK(InitData(data, sendcount, type, rep, rank));
CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
if (rank == root) TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
CUDACHECK(cudaDeviceSynchronize());
}
return testSuccess;
}
void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * typesize) / 1.0E9 / sec;
*algBw = baseBw;
*busBw = baseBw;
}
void RunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream));
return testSuccess;
}
struct testColl reduceTest = {
"Reduce",
ReduceGetCollByteCount,
ReduceInitData,
ReduceGetBw,
ReduceRunColl
};
void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
void ReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
ReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
args->collTest = &reduceTest;
ncclDataType_t *run_types;
ncclRedOp_t *run_ops;
const char **run_typenames, **run_opnames;
int type_count, op_count;
int begin_root, end_root;
if ((int)type != -1) {
if ((int)type != -1) {
type_count = 1;
run_types = &type;
run_typenames = &typeName;
} else {
} else {
type_count = ncclNumTypes;
run_types = test_types;
run_typenames = test_typenames;
}
if ((int)op != -1) {
if ((int)op != -1) {
op_count = 1;
run_ops = &op;
run_opnames = &opName;
} else {
} else {
op_count = ncclNumOps;
run_ops = test_ops;
run_opnames = test_opnames;
}
if (root != -1) {
begin_root = end_root = root;
} else {
begin_root = 0;
end_root = args->nProcs*args->nThreads*args->nGpus-1;
if (root != -1) {
begin_root = end_root = root;
} else {
begin_root = 0;
end_root = args->nProcs*args->nThreads*args->nGpus-1;
}
for (int i=0; i<type_count; i++) {
for (int j=0; j<op_count; j++) {
for (int k=begin_root; k<=end_root; k++) {
TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], k, 1);
}
for (int i=0; i<type_count; i++) {
for (int j=0; j<op_count; j++) {
for (int k=begin_root; k<=end_root; k++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], k));
}
}
}
}
return testSuccess;
}
struct testEngine reduceEngine = {
ReduceGetBuffSize,
ReduceRunTest
};
#pragma weak ncclTestEngine=reduceEngine

src/reduce_scatter.cu

@@ -1,99 +1,52 @@
/*************************************************************************
* Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENCE.txt for license information
* See LICENSE.txt for license information
************************************************************************/
#include "cuda_runtime.h"
#include "common.h"
void print_header() {
PRINT("# %10s %12s %6s %6s out-of-place in-place\n", "", "", "", "");
PRINT("# %10s %12s %6s %6s %7s %5s %5s %7s %7s %5s %5s %7s\n", "bytes", "N", "type", "op",
"time", "algbw", "busbw", "res", "time", "algbw", "busbw", "res");
PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", "");
PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %6s %6s", size, count, typeName, opName);
}
void getCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t *procSharedCount, int *sameExpected, size_t count, int nranks) {
*sendcount = (count/nranks)*nranks;
*recvcount = count/nranks;
*sameExpected = 0;
*procSharedCount = *sendcount;
*sendInplaceOffset = 0;
*recvInplaceOffset = count/nranks;
*paramcount = *recvcount;
void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = (count/nranks)*nranks;
*recvcount = count/nranks;
*sendInplaceOffset = 0;
*recvInplaceOffset = count/nranks;
*paramcount = *recvcount;
}
void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first) {
size_t recvbytes = args->expectedBytes;
size_t recvcount = args->expectedBytes / wordSize(type);
size_t sendbytes = args->sendBytes;
testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
while (args->sync[args->sync_idx] != args->thread) pthread_yield();
size_t recvcount = args->expectedBytes / wordSize(type);
int nranks = args->nProcs*args->nThreads*args->nGpus;
for (int i=0; i<args->nGpus; i++) {
int device;
NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
CUDACHECK(cudaSetDevice(device));
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
CUDACHECK(cudaSetDevice(gpuid));
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
if (is_first && i == 0) {
CUDACHECK(cudaMemcpy(args->procSharedHost, data, sendbytes, cudaMemcpyDeviceToHost));
} else {
Accumulate(args->procShared, data, sendcount, type, op);
}
CUDACHECK(cudaDeviceSynchronize());
if (in_place == 0) {
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, recvbytes));
}
TESTCHECK(InitData(data, sendcount, type, rep, rank));
CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
TESTCHECK(InitDataReduce(args->expected[i], recvcount, rank*recvcount, type, op, rep, nranks));
CUDACHECK(cudaDeviceSynchronize());
}
args->sync[args->sync_idx] = args->thread + 1;
if (args->thread+1 == args->nThreads) {
#ifdef MPI_SUPPORT
if (sendbytes > 0) {
// Last thread does the MPI reduction
void* remote, *remoteHost = malloc(sendbytes);
void* myInitialData = malloc(sendbytes);
memcpy(myInitialData, args->procSharedHost, sendbytes);
CUDACHECK(cudaHostRegister(remoteHost, sendbytes, 0));
CUDACHECK(cudaHostGetDevicePointer(&remote, remoteHost, 0));
for (int i=0; i<args->nProcs; i++) {
if (i == args->proc) {
MPI_Bcast(myInitialData, sendbytes, MPI_BYTE, i, MPI_COMM_WORLD);
free(myInitialData);
} else {
MPI_Bcast(remoteHost, sendbytes, MPI_BYTE, i, MPI_COMM_WORLD);
Accumulate(args->procShared, remote, sendcount, type, op);
cudaDeviceSynchronize();
}
}
CUDACHECK(cudaHostUnregister(remoteHost));
free(remoteHost);
}
#endif
args->sync[args->sync_idx] = 0;
} else {
while (args->sync[args->sync_idx]) pthread_yield();
}
for (int i=0; i<args->nGpus; i++) {
int offset = ((args->proc*args->nThreads + args->thread)*args->nGpus + i)*recvbytes;
memcpy(args->expectedHost[i], (void *)((uintptr_t)args->procSharedHost + offset), recvbytes);
}
args->sync_idx = !args->sync_idx;
return testSuccess;
}
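// Note (added for clarity): InitDataReduce() is called with offset rank*recvcount, so the
// expected buffer of rank r is the reduction, across all ranks, of chunk r of the input.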
void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec;
*algBw = baseBw;
@@ -101,17 +54,32 @@ void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw,
*busBw = baseBw * factor;
}
void RunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream));
return testSuccess;
}
void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
struct testColl reduceScatterTest = {
"ReduceScatter",
ReduceScatterGetCollByteCount,
ReduceScatterInitData,
ReduceScatterGetBw,
ReduceScatterRunColl
};
void ReduceScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
ReduceScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
args->collTest = &reduceScatterTest;
ncclDataType_t *run_types;
ncclRedOp_t *run_ops;
const char **run_typenames, **run_opnames;
int type_count, op_count;
if ((int)type != -1) {
if ((int)type != -1) {
type_count = 1;
run_types = &type;
run_typenames = &typeName;
@@ -121,19 +89,27 @@ void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const cha
run_typenames = test_typenames;
}
if ((int)op != -1) {
if ((int)op != -1) {
run_ops = &op;
run_opnames = &opName;
op_count = 1;
} else {
} else {
op_count = sizeof(test_ops)/sizeof(test_ops[0]);
run_ops = test_ops;
run_opnames = test_opnames;
}
for (int i=0; i<type_count; i++) {
for (int j=0; j<op_count; j++) {
TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], 0, 1);
}
}
for (int i=0; i<type_count; i++) {
for (int j=0; j<op_count; j++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
}
}
return testSuccess;
}
struct testEngine reduceScatterEngine = {
ReduceScatterGetBuffSize,
ReduceScatterRunTest
};
#pragma weak ncclTestEngine=reduceScatterEngine