NCCL_TESTS_VERSION 2.17.8

Clarified use of Mebibytes and Gibibytes for sizes
NCCL_TESTS_VERSION 2.17.7
2026-01-14 02:47:21 +08:00 · 2026-01-06 15:00:17 -08:00 · 2026-01-06 14:59:17 -08:00 · 2025-12-30 20:18:25 +00:00 · 2025-12-30 20:18:25 +00:00 · 2025-12-30 20:18:25 +00:00
16 changed files with 1366 additions and 278 deletions
--- a/README.md
+++ b/README.md
@ -32,13 +32,14 @@ NCCL tests can run on multiple processes, multiple threads, and multiple CUDA de

 ### Quick examples

-Run on single node with 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
+Run on single node with 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MiB (Mebibytes), doubling between each test (`-f 2`) :

 ```shell
 $ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
 ```

-Run 64 MPI processes on nodes with 8 GPUs each, for a total of 64 GPUs spread across 8 nodes :
+Run 64 MPI processes on nodes with 8 GPUs each, for a total of 64 GPUs spread across 8 nodes.
+Scanning from 8 Bytes to 32GiB (Gibibytes), doubling between each test (`-f 2`).
 (NB: The nccl-tests binaries must be compiled with `MPI=1` for this case)

 ```shell
@ -57,10 +58,10 @@ All tests support the same set of arguments :
  * `-t,--nthreads <num threads>` number of threads per process. Default : 1.
  * `-g,--ngpus <GPUs per thread>` number of gpus per thread. Default : 1.
 * Sizes to scan
-  * `-b,--minbytes <min size in bytes>` minimum size to start with. Default : 32M.
-  * `-e,--maxbytes <max size in bytes>` maximum size to end at. Default : 32M.
-  * Increments can be either fixed or a multiplication factor. Only one of those should be used
-    * `-i,--stepbytes <increment size>` fixed increment between sizes. Default : 1M.
+  * `-b,--minbytes <min size in bytes>` minimum size to start with. Default : 32M (Mebibytes).
+  * `-e,--maxbytes <max size in bytes>` maximum size to end at. Default : 32M (Mebibytes).
+  * Increments can be either fixed or a multiplication factor. Only one of those should be used.
+    * `-i,--stepbytes <increment size>` fixed increment between sizes. Default : 1M (Mebibytes).
    * `-f,--stepfactor <increment factor>` multiplication factor between sizes. Default : disabled.
 * NCCL operations arguments
  * `-o,--op <sum/prod/min/max/avg/all>` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum.
@ -79,6 +80,8 @@ All tests support the same set of arguments :
  * `-G,--cudagraph <num graph launches>` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0.
  * `-C,--report_cputime <0/1>` Report CPU time instead of latency. Default : 0.
  * `-R,--local_register <0/1/2>` enable local (1) or symmetric (2) buffer registration on send/recv buffers. Default : 0.
+  * `-S,--report_timestamps <0/1>` Add timestamp (`"%Y-%m-%d %H:%M:%S"`) to each performance report line. Default : 0.
+  * `-J,--output_file <file>` Write [JSON] output to filepath. Infer type from suffix (only `json` supported presently).
  * `-T,--timeout <time in seconds>` timeout each test after specified number of seconds. Default : disabled.

 ### Running multiple operations in parallel
--- a/src/Makefile
+++ b/src/Makefile
@ -48,12 +48,12 @@ include ../verifiable/verifiable.mk

 .PRECIOUS: ${DST_DIR}/%.o

-${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS)
+${DST_DIR}/%.o: %.cu common.h util.h $(TEST_VERIFIABLE_HDRS)
 	@printf "Compiling  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
 	$(NVCC) -o $@ $(NVCUFLAGS) -c $<

-${DST_DIR}/%$(NAME_SUFFIX).o: %.cu common.h $(TEST_VERIFIABLE_HDRS)
+${DST_DIR}/%$(NAME_SUFFIX).o: %.cu common.h util.h $(TEST_VERIFIABLE_HDRS)
 	@printf "Compiling  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
 	$(NVCC) -o $@ $(NVCUFLAGS) -c $<
@ -64,12 +64,12 @@ ${DST_DIR}/timer.o: timer.cc timer.h
 	$(CXX) $(CXXFLAGS) -o $@ -c $<

 ifeq ($(DSO), 1)
-${DST_DIR}/%_perf$(NAME_SUFFIX): ${DST_DIR}/%.o ${DST_DIR}/common$(NAME_SUFFIX).o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_LIBS)
+${DST_DIR}/%_perf$(NAME_SUFFIX): ${DST_DIR}/%.o ${DST_DIR}/common$(NAME_SUFFIX).o ${DST_DIR}/util$(NAME_SUFFIX).o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_LIBS)
 	@printf "Linking  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
 	$(NVCC) -o $@ $(NVCUFLAGS) $^ -L$(TEST_VERIFIABLE_BUILDDIR) -lverifiable ${NVLDFLAGS} -Xlinker "--enable-new-dtags" -Xlinker "-rpath,\$$ORIGIN:\$$ORIGIN/verifiable"
 else
-${DST_DIR}/%_perf$(NAME_SUFFIX):${DST_DIR}/%.o ${DST_DIR}/common$(NAME_SUFFIX).o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS)
+${DST_DIR}/%_perf$(NAME_SUFFIX):${DST_DIR}/%.o ${DST_DIR}/common$(NAME_SUFFIX).o ${DST_DIR}/util$(NAME_SUFFIX).o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS)
 	@printf "Linking  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
 	$(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS}
--- a/src/all_gather.cu
+++ b/src/all_gather.cu
@ -90,8 +90,8 @@ testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t
 }

 struct testEngine allGatherEngine = {
-  AllGatherGetBuffSize,
-  AllGatherRunTest
+  .getBuffSize = AllGatherGetBuffSize,
+  .runTest = AllGatherRunTest
 };

 #pragma weak ncclTestEngine=allGatherEngine
--- a/src/all_reduce.cu
+++ b/src/all_reduce.cu
@ -64,6 +64,49 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl
  *busBw = baseBw * factor;
 }

+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,29,0)
+ // set devComm reqs for allreduce device kernels
+testResult_t AllReduceGetDevCommRequirements(int deviceImpl, ncclDevCommRequirements* reqs, ncclCommProperties_t* commProperties) {
+  if (!reqs || !commProperties) return testInternalError;
+
+  switch(deviceImpl) {
+    case 1: // allReduceLsaKernel
+    case 2: // allReduceLsaVectorizedKernel
+      reqs->lsaBarrierCount = deviceCtaCount;
+      return testSuccess;
+    case 3: // allReduceMultimemKernel
+    case 4: // allReduceMultimemVectorizedKernel
+      if (!commProperties->multimemSupport) {
+        fprintf(stderr, "This test requires multimem support, but multimem support is not enabled for this communicator.\n");
+        return testInternalError;
+      }
+      reqs->lsaMultimem = true;
+      reqs->lsaBarrierCount = deviceCtaCount;
+      return testSuccess;
+    default:
+      return testNotImplemented;
+  }
+}
+#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
+ bool AllReduceGetDevCommRequirements(int deviceImpl, ncclDevCommRequirements* reqs) {
+   if (!reqs) return false;
+   memset(reqs, 0, sizeof(*reqs));
+   switch(deviceImpl) {
+    case 1: // allReduceLsaKernel
+    case 2: // allReduceLsaVectorizedKernel
+      reqs->lsaBarrierCount = deviceCtaCount;
+      return true;
+    case 3: // allReduceMultimemKernel
+    case 4: // allReduceMultimemVectorizedKernelMultimem = true;
+    reqs->lsaMultimem = true;
+    reqs->lsaBarrierCount = deviceCtaCount;
+      return true;
+    default:
+      return false;
+  }
+ }
+#endif
+
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
 /*
 * Kernel 1: allReduceLsaKernel - Basic LSA-based AllReduce
@ -453,19 +496,19 @@ testResult_t AllReduceRunColl(void* sendbuff, size_t sendoffset, void* recvbuff,
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
  case 1:
    TESTCHECK(testLaunchDeviceKernel(SPECIALIZE_KERNEL(allReduceLsaKernel, type, op),
-               sendbuff, sendoffset, recvbuff, recvoffset, count, type, op, root, comm, stream, 0));
+               sendbuff, sendoffset, recvbuff, recvoffset, count, type, op, root, comm, stream));
    return testSuccess;
  case 2:
    TESTCHECK(testLaunchDeviceKernel(SPECIALIZE_KERNEL(allReduceLsaVectorizedKernel, type, op),
-               sendbuff, sendoffset, recvbuff, recvoffset, count, type, op, root, comm, stream, 0));
+               sendbuff, sendoffset, recvbuff, recvoffset, count, type, op, root, comm, stream));
    return testSuccess;
  case 3:
    TESTCHECK(testLaunchDeviceKernel(SPECIALIZE_KERNEL(allReduceMultimemKernel, type, op),
-               sendbuff, sendoffset, recvbuff, recvoffset, count, type, op, root, comm, stream, 1));
+               sendbuff, sendoffset, recvbuff, recvoffset, count, type, op, root, comm, stream));
    return testSuccess;
  case 4:
    TESTCHECK(testLaunchDeviceKernel(SPECIALIZE_KERNEL(allReduceMultimemVectorizedKernel, type, op),
-               sendbuff, sendoffset, recvbuff, recvoffset, count, type, op, root, comm, stream, 1));
+               sendbuff, sendoffset, recvbuff, recvoffset, count, type, op, root, comm, stream));
    return testSuccess;
 #endif
  }
@ -522,8 +565,11 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t
 }

 struct testEngine allReduceEngine = {
-  AllReduceGetBuffSize,
-  AllReduceRunTest
+  .getBuffSize = AllReduceGetBuffSize,
+  .runTest = AllReduceRunTest,
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
+  .getDevCommRequirements = AllReduceGetDevCommRequirements
+#endif
 };

 #pragma weak ncclTestEngine=allReduceEngine
--- a/src/alltoall.cu
+++ b/src/alltoall.cu
@ -11,6 +11,8 @@
 #include "vector_types.h"
 #endif

+#pragma weak ncclAlltoAll
+
 void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
  *paramcount = (count/nranks) & -(16/eltSize);
  *sendcount = nranks*(*paramcount);
@ -49,6 +51,53 @@ void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double
  *busBw = baseBw * factor;
 }

+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,29,0)
+// set devComm reqs for alltoall device kernels
+testResult_t AlltoAllGetDevCommRequirements(int deviceImpl, ncclDevCommRequirements* reqs, ncclCommProperties_t* commProperties) {
+  if (!reqs || !commProperties) return testInternalError;
+
+  switch(deviceImpl) {
+    case 1: // NvlAlltoAllKernel
+    case 2: // NvlAlltoAllKernelOptimized
+      reqs->lsaBarrierCount = deviceCtaCount;
+      return testSuccess;
+    case 3: // GinAlltoAllKernel
+    case 4: // HybridAlltoAllKernel (LSA+GIN)
+      if (commProperties->ginType == NCCL_GIN_TYPE_NONE) {
+        fprintf(stderr, "This test requires GIN support, but GIN support is not enabled for this communicator.\n");
+        return testInternalError;
+      }
+      reqs->barrierCount = deviceCtaCount;
+      reqs->ginSignalCount = deviceCtaCount;
+      return testSuccess;
+    default:
+      return testNotImplemented;
+  }
+}
+#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
+// set devComm reqs for alltoall device kernels
+bool AlltoAllGetDevCommRequirements(int deviceImpl, ncclDevCommRequirements* reqs) {
+  if (!reqs) return false;
+  memset(reqs, 0, sizeof(*reqs));
+
+  switch(deviceImpl) {
+    case 1: // NvlAlltoAllKernel
+    case 2: // NvlAlltoAllKernelOptimized
+      reqs->lsaBarrierCount = deviceCtaCount;
+      return true;
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,7)
+    case 3: // GinAlltoAllKernel
+    case 4: // HybridAlltoAllKernel (LSA+GIN)
+      reqs->barrierCount = deviceCtaCount;
+      reqs->ginSignalCount = deviceCtaCount;
+      return true;
+#endif
+    default:
+      return false;
+  }
+}
+#endif
+
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
 // shared scalar AlltoAll implementation used by both kernels
 template <typename T>
@ -159,6 +208,86 @@ __global__ void NvlAlltoAllKernelOptimized(ncclWindow_t sendwin, size_t sendoffs

  bar.sync(ncclCoopCta(), cuda::memory_order_release);
 }
+
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,7)
+template <typename T>
+__global__ void GinAlltoAllKernel(ncclWindow_t sendwin, size_t sendoffset, ncclWindow_t recvwin, size_t recvoffset, size_t count, int root, struct ncclDevComm devComm) {
+  int ginContext = 0;
+  unsigned int signalIndex = 0;
+  ncclGin gin { devComm, ginContext };
+  uint64_t signalValue = gin.readSignal(signalIndex);
+
+  ncclBarrierSession<ncclCoopCta> bar { ncclCoopCta(), ncclTeamTagWorld(), gin, blockIdx.x };
+  bar.sync(ncclCoopCta(), cuda::memory_order_relaxed, ncclGinFenceLevel::Relaxed);
+
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int nthreads = blockDim.x * gridDim.x;
+
+  /* send to all peers via GIN */
+  const size_t size = count * sizeof(T);
+  for (int r=tid; r<devComm.nRanks; r+=nthreads) {
+    gin.put(ncclTeamWorld(devComm), r,
+        recvwin, recvoffset + devComm.rank * size,
+        sendwin, sendoffset + r * size,
+        size, ncclGin_SignalInc{signalIndex});
+  }
+
+  gin.waitSignal(ncclCoopCta(), signalIndex, signalValue + devComm.nRanks);
+  gin.flush(ncclCoopCta());
+
+  bar.sync(ncclCoopCta(), cuda::memory_order_release, ncclGinFenceLevel::Relaxed);
+}
+
+template <typename T>
+__global__ void HybridAlltoAllKernel(ncclWindow_t sendwin, size_t sendoffset, ncclWindow_t recvwin, size_t recvoffset, size_t count, int root, struct ncclDevComm devComm) {
+  int ginContext = 0;
+  unsigned int signalIndex = 0;
+  ncclGin gin { devComm, ginContext };
+  uint64_t signalValue = gin.readSignal(signalIndex);
+
+  ncclBarrierSession<ncclCoopCta> bar { ncclCoopCta(), ncclTeamTagWorld(), gin, blockIdx.x };
+  bar.sync(ncclCoopCta(), cuda::memory_order_relaxed, ncclGinFenceLevel::Relaxed);
+
+  int tid = threadIdx.x + blockIdx.x*blockDim.x;
+  int nthreads = blockDim.x * gridDim.x;
+
+  ncclTeam world = ncclTeamWorld(devComm);
+  ncclTeam lsa = ncclTeamLsa(devComm);
+  const int startLsa = world.rank - lsa.rank;
+  const int lsaSize  = lsa.nRanks;
+
+  /* handle remote peers (i.e., non-LSA) using GIN */
+  const size_t size = count * sizeof(T);
+  for (int r = tid; r < startLsa; r += nthreads) {
+    gin.put(world, r,
+        recvwin, recvoffset + world.rank * size,
+        sendwin, sendoffset + r * size,
+        size, ncclGin_SignalInc{signalIndex});
+  }
+  for (int r = startLsa + lsaSize + tid; r < world.nRanks; r += nthreads) {
+    gin.put(world, r,
+        recvwin, recvoffset + world.rank * size,
+        sendwin, sendoffset + r * size,
+        size, ncclGin_SignalInc{signalIndex});
+  }
+
+  /* handle local peers with LSA */
+  T* sendLocal = (T*)ncclGetLocalPointer(sendwin, sendoffset);
+  for (size_t offset = tid; offset < count; offset += nthreads) {
+    for (int lp = 0; lp < lsa.nRanks; lp++) {
+      int wr = startLsa + lp;
+      T* recvPtr = (T*)ncclGetLsaPointer(recvwin, recvoffset, lp);
+      recvPtr[world.rank * count + offset] = sendLocal[wr * count + offset];
+    }
+  }
+
+  int numRemotePeers = world.nRanks - lsa.nRanks;
+  gin.waitSignal(ncclCoopCta(), signalIndex, signalValue + numRemotePeers);
+  gin.flush(ncclCoopCta());
+
+  bar.sync(ncclCoopCta(), cuda::memory_order_release, ncclGinFenceLevel::Relaxed);
+}
+#endif
 #endif

 testResult_t AlltoAllRunColl(void* sendbuff, size_t sendoffset, void* recvbuff, size_t recvoffset, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, int deviceImpl) {
@ -166,8 +295,13 @@ testResult_t AlltoAllRunColl(void* sendbuff, size_t sendoffset, void* recvbuff,
    char* sptr = (char*)sendbuff + sendoffset;
    char* rptr = (char*)recvbuff + recvoffset;
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
-    NCCLCHECK(ncclAlltoAll(sptr, rptr, count, type, comm, stream));
-#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,7,0)
+    if (test_ncclVersion >= NCCL_VERSION(2,28,0)) {
+      NCCLCHECK(ncclAlltoAll(sptr, rptr, count, type, comm, stream));
+      return testSuccess;
+    }
+    // fall-through to send/recv implementation if ncclAlltoAll is not available
+#endif
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,7,0)
    int nRanks;
    NCCLCHECK(ncclCommCount(comm, &nRanks));
    size_t rankOffset = count * wordSize(type);
@ -183,12 +317,22 @@ testResult_t AlltoAllRunColl(void* sendbuff, size_t sendoffset, void* recvbuff,
 #endif
  } else {
    switch(deviceImpl) {
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
      case 1:
-        TESTCHECK(testLaunchDeviceKernel(SPECIALIZE_KERNEL(NvlAlltoAllKernel, type, op), sendbuff, sendoffset, recvbuff, recvoffset, count, type, op, root, comm, stream, 0));
+        TESTCHECK(testLaunchDeviceKernel(SPECIALIZE_KERNEL(NvlAlltoAllKernel, type, op), sendbuff, sendoffset, recvbuff, recvoffset, count, type, op, root, comm, stream));
        return testSuccess;
      case 2:
-        TESTCHECK(testLaunchDeviceKernel(SPECIALIZE_KERNEL(NvlAlltoAllKernelOptimized, type, op), sendbuff, sendoffset, recvbuff, recvoffset, count, type, op, root, comm, stream, 0));
+        TESTCHECK(testLaunchDeviceKernel(SPECIALIZE_KERNEL(NvlAlltoAllKernelOptimized, type, op), sendbuff, sendoffset, recvbuff, recvoffset, count, type, op, root, comm, stream));
        return testSuccess;
+#endif
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,7)
+      case 3:
+        TESTCHECK(testLaunchDeviceKernel(SPECIALIZE_KERNEL(GinAlltoAllKernel, type, op), sendbuff, sendoffset, recvbuff, recvoffset, count, type, op, root, comm, stream));
+        return testSuccess;
+      case 4:
+        TESTCHECK(testLaunchDeviceKernel(SPECIALIZE_KERNEL(HybridAlltoAllKernel, type, op), sendbuff, sendoffset, recvbuff, recvoffset, count, type, op, root, comm, stream));
+        return testSuccess;
+#endif
      default:
        return testNotImplemented;
    }
@ -232,8 +376,11 @@ testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t t
 }

 struct testEngine alltoAllEngine = {
-  AlltoAllGetBuffSize,
-  AlltoAllRunTest
+  .getBuffSize = AlltoAllGetBuffSize,
+  .runTest = AlltoAllRunTest,
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
+  .getDevCommRequirements = AlltoAllGetDevCommRequirements
+#endif
 };

 #pragma weak ncclTestEngine=alltoAllEngine
--- a/src/broadcast.cu
+++ b/src/broadcast.cu
@ -107,8 +107,8 @@ testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t
 }

 struct testEngine broadcastEngine = {
-  BroadcastGetBuffSize,
-  BroadcastRunTest
+  .getBuffSize = BroadcastGetBuffSize,
+  .runTest = BroadcastRunTest
 };

 #pragma weak ncclTestEngine=broadcastEngine
--- a/src/common.cu
+++ b/src/common.cu
@ -8,6 +8,7 @@
 #include <pthread.h>
 #include <cstdio>
 #include <type_traits>
+#include <limits>
 #include <getopt.h>
 #include <libgen.h>
 #include <string.h>
@ -15,12 +16,14 @@
 #include "cuda.h"
 #include <errno.h>     /* program_invocation_short_name */

+#include "util.h"
 #include "../verifiable/verifiable.h"

 #pragma weak ncclCommWindowRegister
 #pragma weak ncclCommWindowDeregister
 #pragma weak ncclDevCommCreate
 #pragma weak ncclDevCommDestroy
+#pragma weak ncclCommQueryProperties

 #define DIVUP(x, y) \
    (((x)+(y)-1)/(y))
@ -76,30 +79,31 @@ int is_main_proc = 0;
 thread_local int is_main_thread = 0;

 // Command line parameter defaults
-static int nThreads = 1;
-static int nGpus = 1;
-static size_t minBytes = 32*1024*1024;
-static size_t maxBytes = 32*1024*1024;
-static size_t stepBytes = 1*1024*1024;
-static size_t stepFactor = 1;
-static int datacheck = 1;
-static int warmup_iters = 1;
-static int iters = 20;
-static int agg_iters = 1;
+int nThreads = 1;
+int nGpus = 1;
+size_t minBytes = 32*1024*1024;
+size_t maxBytes = 32*1024*1024;
+size_t stepBytes = 1*1024*1024;
+size_t stepFactor = 1;
+int datacheck = 1;
+int warmup_iters = 1;
+int iters = 20;
+int agg_iters = 1;
 static int run_cycles = 1;
 static int ncclop = ncclSum;
 static int nccltype = ncclFloat;
 static int ncclroot = 0;
-static int parallel_init = 0;
-static int blocking_coll = 0;
+int parallel_init = 0;
+int blocking_coll = 0;
 static int streamnull = 0;
 static int timeout = 0;
-static int cudaGraphLaunches = 0;
+int cudaGraphLaunches = 0;
 static int report_cputime = 0;
+static int report_timestamps = 0;
 static int deviceImpl = 0;
+int memory_report = 0;

 int deviceCtaCount = 16; // Default number of CTAs for device implementation
-bool deviceMultimemEnabled = false; // Track whether multimem was successfully enabled

 // Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX)
 static int average = 1;
@ -111,8 +115,102 @@ static int ctaPolicy = -1;
 #endif
 static int minCudaArch = 1<<30;

-#define NUM_BLOCKS 32
+enum output_file_type_t {
+  JSON_FILE_OUTPUT,
+  UNSPECIFIED_FILE_OUTPUT
+};

+// Return pointer to extension in `path` if one is found. An extension
+// is the last `.` in the `path`, if there is no `/` following the `.`
+// and there are characters after `.`.
+//
+// Therefore: returns 0 if no meaningful extension was found, or returns offset
+// into string where extension begins
+static const char *getExtension(const char *path) {
+  if (path == nullptr) return nullptr;
+  int last_dot = -1;
+  int last_slash = -1;
+
+  int pos;
+  for (pos = 0; path[pos] != '\0'; ++pos) {
+    switch (path[pos]) {
+    case '.':
+      last_dot = pos;
+      break;
+    case '/':
+      last_slash = pos;
+      break;
+    default:
+      break;
+    }
+  }
+
+  if (last_dot > last_slash && last_dot + 1 != pos) {
+    return path + last_dot + 1;
+  }
+
+  return nullptr;
+}
+
+static output_file_type_t classifyOutputFile(const char *filename) {
+  const char *extension = getExtension(filename);
+  if (extension != nullptr && strcasecmp(extension, "json") == 0) {
+    return JSON_FILE_OUTPUT;
+  }
+
+  return UNSPECIFIED_FILE_OUTPUT;
+}
+
+static void outputFileInit(output_file_type_t output_file_type,
+                           const char *output_file, char argc, char **argv, char **envp) {
+  switch (output_file_type) {
+  case JSON_FILE_OUTPUT:
+    jsonOutputInit(output_file, argc, argv, envp);
+    break;
+  case UNSPECIFIED_FILE_OUTPUT:
+  default:
+    break;
+  }
+}
+
+static void outputFileFinalize(output_file_type_t output_file_type) {
+  switch (output_file_type) {
+  case JSON_FILE_OUTPUT:
+    jsonOutputFinalize();
+    break;
+  case UNSPECIFIED_FILE_OUTPUT:
+  default:
+    break;
+  }
+}
+
+testResult_t initComms(ncclComm_t* comms, int nComms, int firstRank, int nRanks, int* cudaDevs, ncclUniqueId& ncclId) {
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,14,0)
+  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
+  if (ctaPolicy >= 0)
+    config.CTAPolicy = ctaPolicy;
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
+  config.nvlinkCentricSched = 1;
+#endif
+#endif
+#endif
+
+  NCCLCHECK(ncclGroupStart());
+  for (int i=0; i<nComms; i++) {
+    int rank = firstRank + i;
+    CUDACHECK(cudaSetDevice(cudaDevs[i]));
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,14,0)
+    NCCLCHECK(ncclCommInitRankConfig(comms+i, nRanks, ncclId, rank, &config));
+#else
+    NCCLCHECK(ncclCommInitRank(comms+i, nRanks, ncclId, rank));
+#endif
+  }
+  NCCLCHECK(ncclGroupEnd());
+  return testSuccess;
+}
+
+// NOTE: We use the binary system, so M=Mebibytes and G=Gibibytes
 static double parsesize(const char *value) {
    long long int units;
    double size;
@ -593,19 +691,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
  }

  double timeUsec = (report_cputime ? cputimeSec : deltaSec)*1.0E6;
-  char timeStr[100];
-  if (timeUsec >= 10000.0) {
-    sprintf(timeStr, "%7.0f", timeUsec);
-  } else if (timeUsec >= 100.0) {
-    sprintf(timeStr, "%7.1f", timeUsec);
-  } else {
-    sprintf(timeStr, "%7.2f", timeUsec);
-  }
-  if (args->reportErrors) {
-    PRINT("  %7s  %6.2f  %6.2f  %5g", timeStr, algBw, busBw, (double)wrongElts);
-  } else {
-    PRINT("  %7s  %6.2f  %6.2f  %5s", timeStr, algBw, busBw, "N/A");
-  }
+  writeBenchmarkLineBody(timeUsec, algBw, busBw, args->reportErrors, wrongElts, report_cputime, report_timestamps, in_place==0);

  args->bw[0] += busBw;
  args->bw_count[0]++;
@ -644,57 +730,72 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
  do {
    for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
      setupArgs(size, type, args);
-      char rootName[100];
-      sprintf(rootName, "%6i", root);
-      PRINT("%12li  %12li  %8s  %6s  %6s", max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
+      writeBenchmarkLinePreamble(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root);
      TESTCHECK(BenchTime(args, type, op, root, 0));
      TESTCHECK(BenchTime(args, type, op, root, 1));
-      PRINT("\n");
+      writeBenchmarkLineTerminator(iters, "");
    }
  } while (--repeat);

  return testSuccess;
 }

+static void getGPUMemoryInfo(int64_t* ptotalGpuMem, int64_t* pfreeGpuMem) {
+  size_t freeGpuMem, totalGpuMem = 0;
+  cudaMemGetInfo(&freeGpuMem, &totalGpuMem);
+  if (ptotalGpuMem != nullptr) *ptotalGpuMem = totalGpuMem;
+  if (pfreeGpuMem != nullptr) *pfreeGpuMem = freeGpuMem;
+}
+
 testResult_t threadRunTests(struct threadArgs* args) {
+  //  capture the free memory before
+  int64_t* totalGpuFreeMem = (int64_t*)calloc(args->nGpus*2, sizeof(int64_t));
+  for (int g = 0; g < args->nGpus; ++g) {
+    CUDACHECK(cudaSetDevice(args->gpus[g]));
+    getGPUMemoryInfo(nullptr, &totalGpuFreeMem[g]);
+  }
+
  // Set device to the first of our GPUs. If we don't do that, some operations
  // will be done on the current GPU (by default : 0) and if the GPUs are in
  // exclusive mode those operations will fail.
  CUDACHECK(cudaSetDevice(args->gpus[0]));
  TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]));
+
+  // Capture the memory used by the GPUs
+  for (int g = 0; g < args->nGpus; ++g) {
+    CUDACHECK(cudaSetDevice(args->gpus[g]));
+    getGPUMemoryInfo(nullptr, &totalGpuFreeMem[g + args->nGpus]);
+    *args->devMemUsed = std::max(*args->devMemUsed, totalGpuFreeMem[g] - totalGpuFreeMem[g + args->nGpus]);
+  }
+  free(totalGpuFreeMem);
  return testSuccess;
 }

 testResult_t threadInit(struct threadArgs* args) {
-  char hostname[1024];
-  getHostName(hostname, 1024);
  int nranks =  args->nProcs*args->nThreads*args->nGpus;

  //set main thread again
  is_main_thread = (is_main_proc && args->thread == 0) ? 1 : 0;

-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,14,0)
-     ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
-     if (ctaPolicy >= 0)
-       config.CTAPolicy = ctaPolicy;
-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
-     config.nvlinkCentricSched = 1;
-#endif
-#endif
-#endif
+  jsonIdentifyWriter(is_main_thread);

-  NCCLCHECK(ncclGroupStart());
-  for (int i=0; i<args->nGpus; i++) {
-    int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(args->gpus[i]));
-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,14,0)
-    NCCLCHECK(ncclCommInitRankConfig(args->comms+i, nranks, args->ncclId, rank, &config));
-#else
-    NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank));
-#endif
+  // Capture GPU memory before initializing the NCCL communicators
+  int64_t* initFreeGpuMem = (int64_t*)calloc(args->nGpus*3, sizeof(int64_t));
+  for (int g = 0; g < args->nGpus; ++g) {
+    CUDACHECK(cudaSetDevice(args->gpus[g]));
+    getGPUMemoryInfo(nullptr, &initFreeGpuMem[g]);
  }
-  NCCLCHECK(ncclGroupEnd());
+
+  int firstRank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus;
+  TESTCHECK(initComms(args->comms, args->nGpus, firstRank, nranks, args->gpus, args->ncclId));
+
+  // Capture the memory used by the GPUs after initializing the NCCL communicators
+  for (int g = 0; g < args->nGpus; ++g) {
+    CUDACHECK(cudaSetDevice(args->gpus[g]));
+    getGPUMemoryInfo(nullptr, &initFreeGpuMem[g + args->nGpus]);
+    *args->initGpuMem = std::max(*args->initGpuMem, initFreeGpuMem[g] - initFreeGpuMem[g + args->nGpus]);
+  }
+
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
  NCCLCHECK(ncclGroupStart());
  for (int i=0; i<args->nGpus; i++) {
@ -711,54 +812,63 @@ testResult_t threadInit(struct threadArgs* args) {
  }
  NCCLCHECK(ncclGroupEnd());
 #endif
+  // Capture memory used by test buffers
+  for (int g = 0; g < args->nGpus; ++g) {
+    CUDACHECK(cudaSetDevice(args->gpus[g]));
+    getGPUMemoryInfo(nullptr, &initFreeGpuMem[g + args->nGpus*2]);
+    args->bufferMemory[args->thread] = std::max(args->bufferMemory[args->thread], initFreeGpuMem[g + args->nGpus] - initFreeGpuMem[g + args->nGpus*2]);
+  }
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
-  /* Create device communicators with multimem fallback */
+  /* Create device communicators based on test-specific requirements */
  if (deviceImpl) {
-    // Duplicate comms so our checks here do not affect the originals
-    ncclComm_t tmpComms[args->nGpus];
-    memset(tmpComms, 0, sizeof(tmpComms));
-    NCCLCHECK(ncclGroupStart());
-    for (int i = 0; i < args->nGpus; i++) {
-      int rank;
-      NCCLCHECK(ncclCommUserRank(args->comms[i], &rank));
-      NCCLCHECK(ncclCommSplit(args->comms[i], 0, rank, &tmpComms[i], NULL));
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,29,0)
+    if (test_ncclVersion < NCCL_VERSION(2,29,0)) {
+      fprintf(stderr,
+        "Incompatible NCCL versions. nccl-tests was compiled with NCCL %d, but is running with NCCL %d. "
+        "The %d Device API is not compatible with versions before 2.29.\n",
+        NCCL_VERSION_CODE, test_ncclVersion, NCCL_VERSION_CODE);
+      return testInvalidUsage;
    }
-    NCCLCHECK(ncclGroupEnd());
-
-    // Check multimem support on the duplicated comms
-    bool checkMultimemFailed = false;
-    ncclResult_t result;
-    ncclDevComm tmpDevComms[args->nGpus];
-    memset(tmpDevComms, 0, sizeof(tmpDevComms));
-    NCCLCHECK(ncclGroupStart());
-    for (int i = 0; i < args->nGpus; i++) {
-      ncclDevCommRequirements reqs;
-      memset(&reqs, 0, sizeof(reqs));
-      reqs.lsaBarrierCount = deviceCtaCount;
-      reqs.lsaMultimem = true;
-      result = ncclDevCommCreate(tmpComms[i], &reqs, &tmpDevComms[i]);
-      if (result != ncclInProgress && result != ncclSuccess) {
-        checkMultimemFailed = true;
-      }
+    ncclDevCommRequirements reqs = NCCL_DEV_COMM_REQUIREMENTS_INITIALIZER;
+    if (!ncclTestEngine.getDevCommRequirements) {
+      fprintf(stderr, "Device implementation %d is not supported by this test\n", deviceImpl);
+      return testNotImplemented;
    }
-    result = ncclGroupEnd();
-    if (result != ncclSuccess) checkMultimemFailed = true;
-    deviceMultimemEnabled = !checkMultimemFailed;
+    ncclCommProperties commProperties = NCCL_COMM_PROPERTIES_INITIALIZER;
+    NCCLCHECK(ncclCommQueryProperties(args->comms[0], &commProperties));
+    TESTCHECK(ncclTestEngine.getDevCommRequirements(deviceImpl, &reqs, &commProperties));
+#else
+    if (test_ncclVersion >= NCCL_VERSION(2,29,0)) {
+      fprintf(stderr, "Incompatible NCCL versions. nccl-tests was compiled with NCCL 2.28, but is running with NCCL %d. "
+        "The 2.28 Device API is not compatible with later.\n",
+        test_ncclVersion);
+      return testInvalidUsage;
+    }
+    ncclDevCommRequirements reqs = {};
+    if (!ncclTestEngine.getDevCommRequirements ||
+        !ncclTestEngine.getDevCommRequirements(deviceImpl, &reqs)) {
+      fprintf(stderr, "Device implementation %d is not supported by this test\n", deviceImpl);
+      return testNotImplemented;
+    }
+#endif

-    // Create final dev comms with correct multimem setting and cleanup temps
    NCCLCHECK(ncclGroupStart());
    for (int i = 0; i < args->nGpus; i++) {
-      ncclDevCommRequirements reqs;
-      memset(&reqs, 0, sizeof(reqs));
-      reqs.lsaBarrierCount = deviceCtaCount;
-      reqs.lsaMultimem = deviceMultimemEnabled;
      NCCLCHECK(ncclDevCommCreate(args->comms[i], &reqs, args->devComms+i));
-      NCCLCHECK(ncclDevCommDestroy(tmpComms[i], &tmpDevComms[i]));
-      NCCLCHECK(ncclCommDestroy(tmpComms[i]));
    }
    NCCLCHECK(ncclGroupEnd());
  }
+  // Capture memory used by test buffers
+  int64_t deviceCommMaxMem = 0;
+  for (int g = 0; g < args->nGpus; ++g) {
+    CUDACHECK(cudaSetDevice(args->gpus[g]));
+    int64_t freeGpuMem;
+    getGPUMemoryInfo(nullptr, &freeGpuMem);
+    deviceCommMaxMem = std::max(deviceCommMaxMem, initFreeGpuMem[g + args->nGpus*2] - freeGpuMem);
+  }
+  *args->initGpuMem += deviceCommMaxMem;
 #endif
+  free(initFreeGpuMem);

  TESTCHECK(threadRunTests(args));

@ -790,7 +900,7 @@ testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, s

 testResult_t run(); // Main function

-int main(int argc, char* argv[]) {
+int main(int argc, char* argv[], char **envp) {
  // Make sure everyline is flushed so that we see the progress of the test
  setlinebuf(stdout);

@ -799,7 +909,7 @@ int main(int argc, char* argv[]) {
  #else
    test_ncclVersion = NCCL_VERSION_CODE;
  #endif
-  //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion);
+  //printf("# nccl-tests version %s NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_TESTS_VERSION, NCCL_VERSION_CODE, test_ncclVersion);
  #if NCCL_VERSION_CODE >= NCCL_VERSION(2,0,0)
    test_opnum = 4;
    test_typenum = 9;
@ -824,6 +934,8 @@ int main(int argc, char* argv[]) {
  // Parse args
  double parsed;
  int longindex;
+  char *output_file = nullptr;
+
  static struct option longopts[] = {
    {"nthreads", required_argument, 0, 't'},
    {"ngpus", required_argument, 0, 'g'},
@ -845,18 +957,22 @@ int main(int argc, char* argv[]) {
    {"timeout", required_argument, 0, 'T'},
    {"cudagraph", required_argument, 0, 'G'},
    {"report_cputime", required_argument, 0, 'C'},
+    {"report_timestamps", required_argument, 0, 'S'},
+    {"output_file", required_argument, 0, 'J'},
    {"average", required_argument, 0, 'a'},
    {"local_register", required_argument, 0, 'R'},
    {"cta_policy", required_argument, 0, 'x'},
    {"device_implementation", required_argument, 0, 'D'},
    {"device_cta_count", required_argument, 0, 'V'},
+    {"memory", required_argument, 0, 'M'},
+
    {"help", no_argument, 0, 'h'},
    {}
  };

  while(1) {
    int c;
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:x:D:V:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:x:D:V:J:S:M:", longopts, &longindex);

    if (c == -1)
      break;
@ -945,6 +1061,12 @@ int main(int argc, char* argv[]) {
      case 'C':
        report_cputime = strtol(optarg, NULL, 0);
        break;
+      case 'J':
+        output_file = strdup(optarg);
+        break;
+      case 'S':
+        report_timestamps = strtol(optarg, NULL, 0);
+        break;
      case 'a':
        average = (int)strtol(optarg, NULL, 0);
        break;
@ -959,6 +1081,9 @@ int main(int argc, char* argv[]) {
        printf("Option -R (register) is not supported before NCCL 2.19. Ignoring\n");
 #endif
        break;
+      case 'M':
+        memory_report = (int)strtol(optarg, NULL, 0);
+        break;
      case 'x':
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
        ctaPolicy = (int)strtol(optarg, NULL, 0);
@ -973,8 +1098,7 @@ int main(int argc, char* argv[]) {
      case 'D':
 	if (test_ncclVersion >= NCCL_VERSION(2,28,0)) {
          deviceImpl = (int)strtol(optarg, NULL, 0);
-	}
-	else {
+        } else {
          fprintf(stderr, "Option -D (device implementation) requires NCCL >= 2.28.0\n");
          return -1;
        }
@ -986,9 +1110,8 @@ int main(int argc, char* argv[]) {
            fprintf(stderr, "device_cta_count (-V) must be positive and less than 128, got %d. "
                    "Using default value 16.\n", deviceCtaCount);
            deviceCtaCount = 16;
-	  }
-        }
-	else {
+          }
+        } else {
          fprintf(stderr, "Option -V (device CTA count) requires NCCL >= 2.28.0\n");
          return -1;
 	}
@ -1023,11 +1146,14 @@ int main(int argc, char* argv[]) {
            "[-T,--timeout <time in seconds>] \n\t"
            "[-G,--cudagraph <num graph launches>] \n\t"
            "[-C,--report_cputime <0/1>] \n\t"
+            "[-S,--report_timestamps <0/1> report timestamps (default 0)] \n\t"
+            "[-J,--output_file <file> write output to filepath, if accessible. Infer type from suffix (only json supported presently.)] \n\t"
            "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
            "[-R,--local_register <0/1/2> enable local (1) or symmetric (2) buffer registration on send/recv buffers (default: disable (0))] \n\t"
            "[-x,--cta_policy <0/1/2> set CTA policy (NCCL_CTA_POLICY_DEFAULT (0), NCCL_CTA_POLICY_EFFICIENCY (1), NCCL_CTA_POLICY_ZERO (2)) (default: do not set)] \n\t"
            "[-D,--device_implementation <implementation number> enable device implementation (default: 0, use NCCL implementation; requires -R 2 if > 0)] \n\t"
            "[-V,--device_cta_count <number> set number of CTAs for device implementation (default: 16)] \n\t"
+            "[-M,--memory_report <0/1> enable memory usage report (default: 0)] \n\t"
            "[-h,--help]\n",
          basename(argv[0]));
        return 0;
@ -1047,7 +1173,21 @@ int main(int argc, char* argv[]) {
 #ifdef MPI_SUPPORT
  MPI_Init(&argc, &argv);
 #endif
-  TESTCHECK(run());
+
+  const output_file_type_t output_file_type = classifyOutputFile(output_file);
+  outputFileInit(output_file_type, output_file, argc, argv, envp);
+
+  if(output_file) {
+    free(output_file);
+    output_file = nullptr;
+  }
+
+  testResult_t result = run();
+
+  outputFileFinalize(output_file_type);
+
+  TESTCHECK(result);
+
  return 0;
 }

@ -1121,58 +1261,14 @@ testResult_t run() {
 #endif
  is_main_thread = is_main_proc = (proc == 0) ? 1 : 0;

-  PRINT("# Collective test starting: %s\n", program_invocation_short_name);
-  PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d agg iters: %d validation: %d graph: %d\n",
-        nThreads, nGpus, minBytes, maxBytes,
-        (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes",
-        warmup_iters, iters, agg_iters, datacheck, cudaGraphLaunches);
-  if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n");
-  if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n");
-  PRINT("#\n");
+  jsonIdentifyWriter(is_main_thread);

-  PRINT("# Using devices\n");
-#define MAX_LINE 2048
-  char line[MAX_LINE];
-  int len = 0;
  size_t maxMem = ~0;
-  char* envstr = getenv("NCCL_TESTS_DEVICE");
-  int gpu0 = envstr ? atoi(envstr) : -1;
-  int available_devices;
-  CUDACHECK(cudaGetDeviceCount(&available_devices));
-  for (int i=0; i<nThreads*nGpus; i++) {
-    int cudaDev = (gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i;
-    int rank = proc*nThreads*nGpus+i;
-    if (cudaDev >= available_devices) {
-      fprintf(stderr,
-              "Invalid number of GPUs: %d requested but only %d were found.\n",
-              (gpu0 != -1 ? gpu0 : localRank * nThreads * nGpus) +
-                  nThreads * nGpus,
-              available_devices);
-      fprintf(stderr,
-              "Please check the number of processes and GPUs per process.\n");
-      return testNotImplemented;
-    }
-    cudaDeviceProp prop;
-    CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
-    len += snprintf(line+len, MAX_LINE-len, "#  Rank %2d Group %2d Pid %6d on %10s device %2d [%04x:%02x:%02x] %s\n",
-                    rank, color, getpid(), hostname, cudaDev, prop.pciDomainID, prop.pciBusID, prop.pciDeviceID, prop.name);
-    maxMem = std::min(maxMem, prop.totalGlobalMem);
+  testResult_t report_result = writeDeviceReport(&maxMem, localRank, proc, totalProcs, color, hostname, program_invocation_short_name);
+  if(report_result != testSuccess) {
+    return report_result;
  }

-#if MPI_SUPPORT
-  char *lines = (proc == 0) ? (char *)malloc(totalProcs*MAX_LINE) : NULL;
-  // Gather all output in rank order to root (0)
-  MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, MPI_COMM_WORLD);
-  if (proc == 0) {
-    for (int p = 0; p < totalProcs; p++)
-      PRINT("%s", lines+MAX_LINE*p);
-    free(lines);
-  }
-  MPI_Allreduce(MPI_IN_PLACE, &maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);
-#else
-  PRINT("%s", line);
-#endif
-
  // Reserve 1GiB of memory for each 16GiB installed, but limit to a max of 4GiB
  const size_t GB = (1ULL << 30);
  size_t reserveMem =  std::min(DIVUP(maxMem, 16*GB) * 1*GB, 4*GB);
@ -1201,12 +1297,11 @@ testResult_t run() {

  ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)ncclProcs*nGpus*nThreads);

-  envstr = getenv("NCCL_TESTS_DEVICE");
-  gpu0 = envstr ? atoi(envstr) : -1;
+  char* envstr = getenv("NCCL_TESTS_DEVICE");
+  int gpu0 = envstr ? atoi(envstr) : -1;
  for (int i=0; i<nGpus*nThreads; i++) {
    gpus[i] = (gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i;
    CUDACHECK(cudaSetDevice(gpus[i]));
-    TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes));
    if (streamnull) {
      streams[i] = NULL;
    }
@ -1249,30 +1344,33 @@ testResult_t run() {
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
  ncclDevComm devComms[nThreads*nGpus];
 #endif
+  int64_t initGpuMem[nThreads] = {0};
+  int64_t bufferMemory[nThreads] = {0};
  if (!parallel_init) {
-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,14,0)
-     ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
-     if (ctaPolicy >= 0)
-       config.CTAPolicy = ctaPolicy;
-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
-     config.nvlinkCentricSched = 1;
-#endif
-#endif
-#endif
-     NCCLCHECK(ncclGroupStart());
-     for (int i=0; i<nGpus*nThreads; i++) {
-       CUDACHECK(cudaSetDevice(gpus[i]));
-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,14,0)
-       NCCLCHECK(ncclCommInitRankConfig(comms+i, ncclProcs*nThreads*nGpus, ncclId, ncclProc*nThreads*nGpus+i, &config));
-#else
-       NCCLCHECK(ncclCommInitRank(comms+i, ncclProcs*nThreads*nGpus, ncclId, ncclProc*nThreads*nGpus+i));
-#endif
+    // Capture the memory used by the GPUs before initializing the NCCL communicators
+    int64_t* initFreeGpuMem = (int64_t*)calloc(nGpus*3, sizeof(int64_t));
+    for (int g = 0; g < nGpus; ++g) {
+      CUDACHECK(cudaSetDevice(gpus[g]));
+      getGPUMemoryInfo(nullptr, &initFreeGpuMem[g]);
+    }
+    //if parallel init is not selected, use main thread to initialize NCCL
+    TESTCHECK(initComms(comms, nGpus*nThreads, ncclProc*nThreads*nGpus, ncclProcs*nThreads*nGpus, gpus, ncclId));
+
+     // Capture the memory used by the GPUs after initializing the NCCL communicators
+     for (int g = 0; g < nGpus; ++g) {
+       CUDACHECK(cudaSetDevice(gpus[g]));
+       getGPUMemoryInfo(nullptr, &initFreeGpuMem[g + nGpus]);
+     }
+     for ( size_t t = 0; t < nThreads; ++t) {
+       for (int g = 0; g < nGpus; ++g) {
+         initGpuMem[t] = std::max(initGpuMem[t], initFreeGpuMem[g] - initFreeGpuMem[g + nGpus]);
+       }
     }
-     NCCLCHECK(ncclGroupEnd());
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
     NCCLCHECK(ncclGroupStart());
     for (int i=0; i<nGpus*nThreads; i++) {
+       CUDACHECK(cudaSetDevice(gpus[i]));
+       TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes));
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
       if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
         NCCLCHECK(ncclCommWindowRegister(comms[i], sendbuffs[i], maxBytes, (ncclWindow_t*)&sendRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
@ -1286,73 +1384,82 @@ testResult_t run() {
     }
     NCCLCHECK(ncclGroupEnd());
 #endif
+     // Capture memory used by after allocating buffers
+     for (int g = 0; g < nGpus; ++g) {
+       CUDACHECK(cudaSetDevice(gpus[g]));
+       getGPUMemoryInfo(nullptr, &initFreeGpuMem[g + nGpus*2]);
+     }
+     for ( size_t t = 0; t < nThreads; ++t) {
+      for (int g = 0; g < nGpus; ++g) {
+        bufferMemory[t] = std::max(bufferMemory[t], initFreeGpuMem[g + nGpus] - initFreeGpuMem[g + nGpus*2]);
+      }
+     }
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
-     /* Create device communicators with multimem fallback */
+     /* Create device communicators based on test-specific requirements */
     if (deviceImpl) {
-       // Duplicate comms so our checks here do not affect the originals
-       ncclComm_t tmpComms[nGpus * nThreads];
-       memset(tmpComms, 0, sizeof(tmpComms));
-       NCCLCHECK(ncclGroupStart());
-       for (int i = 0; i < nGpus * nThreads; i++) {
-         int rank;
-         NCCLCHECK(ncclCommUserRank(comms[i], &rank));
-         NCCLCHECK(ncclCommSplit(comms[i], 0, rank, &tmpComms[i], NULL));
-       }
-       NCCLCHECK(ncclGroupEnd());
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,29,0)
+      if (test_ncclVersion < NCCL_VERSION(2,29,0)) {
+        fprintf(stderr,
+          "Incompatible NCCL versions. nccl-tests was compiled with NCCL %d, but is running with NCCL %d. "
+          "The %d Device API is not compatible with versions before 2.29.\n",
+          NCCL_VERSION_CODE, test_ncclVersion, NCCL_VERSION_CODE);
+        return testInvalidUsage;
+      }
+      ncclDevCommRequirements reqs = NCCL_DEV_COMM_REQUIREMENTS_INITIALIZER;
+      if (!ncclTestEngine.getDevCommRequirements) {
+        fprintf(stderr, "Device implementation %d is not supported by this test\n", deviceImpl);
+        return testNotImplemented;
+      }
+      ncclCommProperties commProperties = NCCL_COMM_PROPERTIES_INITIALIZER;
+      NCCLCHECK(ncclCommQueryProperties(comms[0], &commProperties));
+      TESTCHECK(ncclTestEngine.getDevCommRequirements(deviceImpl, &reqs, &commProperties));
+#else
+      if (test_ncclVersion >= NCCL_VERSION(2,29,0)) {
+        fprintf(stderr, "Incompatible NCCL versions. nccl-tests was compiled with NCCL 2.28, but is running with NCCL %d. "
+          "The 2.28 Device API is not compatible with later versions.\n", test_ncclVersion);
+        return testInvalidUsage;
+      }
+      ncclDevCommRequirements reqs = {};
+      if (!ncclTestEngine.getDevCommRequirements ||
+          !ncclTestEngine.getDevCommRequirements(deviceImpl, &reqs)) {
+        fprintf(stderr, "Device implementation %d is not supported by this test\n", deviceImpl);
+        return testNotImplemented;
+      }
+#endif

-       // Check multimem support on the duplicated comms
-       bool checkMultimemFailed = false;
-       ncclResult_t result;
-       ncclDevComm tmpDevComms[nGpus * nThreads];
-       memset(tmpDevComms, 0, sizeof(tmpDevComms));
       NCCLCHECK(ncclGroupStart());
       for (int i = 0; i < nGpus * nThreads; i++) {
-         ncclDevCommRequirements reqs;
-         memset(&reqs, 0, sizeof(reqs));
-         reqs.lsaBarrierCount = deviceCtaCount;
-         reqs.lsaMultimem = true;
-         result = ncclDevCommCreate(tmpComms[i], &reqs, &tmpDevComms[i]);
-         if (result != ncclInProgress && result != ncclSuccess) {
-           checkMultimemFailed = true;
-         }
-       }
-       result = ncclGroupEnd();
-       if (result != ncclSuccess) checkMultimemFailed = true;
-       deviceMultimemEnabled = !checkMultimemFailed;
-
-       // Create final dev comms with correct multimem setting and cleanup temps
-       NCCLCHECK(ncclGroupStart());
-       for (int i = 0; i < nGpus * nThreads; i++) {
-         ncclDevCommRequirements reqs;
-         memset(&reqs, 0, sizeof(reqs));
-         reqs.lsaBarrierCount = deviceCtaCount;
-         reqs.lsaMultimem = deviceMultimemEnabled;
         NCCLCHECK(ncclDevCommCreate(comms[i], &reqs, devComms+i));
-         NCCLCHECK(ncclDevCommDestroy(tmpComms[i], &tmpDevComms[i]));
-         NCCLCHECK(ncclCommDestroy(tmpComms[i]));
       }
       NCCLCHECK(ncclGroupEnd());
     }
+     int64_t deviceCommMaxMem = 0;
+     for (int g = 0; g < nGpus; ++g) {
+       CUDACHECK(cudaSetDevice(gpus[g]));
+       int64_t freeGpuMem;
+       getGPUMemoryInfo(nullptr, &freeGpuMem);
+       deviceCommMaxMem = std::max(deviceCommMaxMem, initFreeGpuMem[g + nGpus*2] - freeGpuMem);
+     }
+     for ( size_t t = 0; t < nThreads; ++t) {
+       initGpuMem[t] += deviceCommMaxMem;
+     }
 #endif
+    free(initFreeGpuMem);
  }

  int errors[nThreads];
  double bw[nThreads];
+  int64_t devMemUsed[nThreads];
  int bw_count[nThreads];
  for (int t=0; t<nThreads; t++) {
    bw[t] = 0.0;
    errors[t] = bw_count[t] = 0;
+    devMemUsed[t] = std::numeric_limits<int64_t>::min();
  }

  fflush(stdout);

-  const char* timeStr = report_cputime ? "cputime" : "time";
-  PRINT("#\n");
-  PRINT("# %10s  %12s  %8s  %6s  %6s           out-of-place                       in-place          \n", "", "", "", "", "");
-  PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s %6s  %7s  %6s  %6s %6s\n", "size", "count", "type", "redop", "root",
-      timeStr, "algbw", "busbw", "#wrong", timeStr, "algbw", "busbw", "#wrong");
-  PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "", "",
-      "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
+  writeResultHeader(report_cputime, report_timestamps);

  struct testThread threads[nThreads];
  memset(threads, 0, sizeof(struct testThread)*nThreads);
@ -1388,6 +1495,9 @@ testResult_t run() {
    threads[t].args.errors=errors+t;
    threads[t].args.bw=bw+t;
    threads[t].args.bw_count=bw_count+t;
+    threads[t].args.initGpuMem = initGpuMem + t;
+    threads[t].args.bufferMemory = bufferMemory + t;
+    threads[t].args.devMemUsed = devMemUsed + t;

    threads[t].args.reportErrors = datacheck;

@ -1406,11 +1516,17 @@ testResult_t run() {
      errors[0] += errors[t];
      bw[0] += bw[t];
      bw_count[0] += bw_count[t];
+      devMemUsed[0] = std::max(devMemUsed[0], devMemUsed[t]);
+      initGpuMem[0] = std::max(initGpuMem[0], initGpuMem[t]);
+      bufferMemory[0] = std::max(bufferMemory[0], bufferMemory[t]);
    }
  }

 #ifdef MPI_SUPPORT
  MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(MPI_IN_PLACE, &devMemUsed[0], 1, MPI_INT64_T, MPI_MAX, MPI_COMM_WORLD);
+  MPI_Allreduce(MPI_IN_PLACE, &initGpuMem[0], 1, MPI_INT64_T, MPI_MAX, MPI_COMM_WORLD);
+  MPI_Allreduce(MPI_IN_PLACE, &bufferMemory[0], 1, MPI_INT64_T, MPI_MAX, MPI_COMM_WORLD);
 #endif

  if (!parallel_init) {
@ -1446,25 +1562,31 @@ testResult_t run() {
  }

  envstr = getenv("NCCL_TESTS_MIN_BW");
-  double check_avg_bw = envstr ? atof(envstr) : -1;
+  const double check_avg_bw = envstr ? atof(envstr) : -1;
  bw[0] /= bw_count[0];

-  PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK");
-  PRINT("# Avg bus bandwidth    : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
-  PRINT("#\n");
-  PRINT("# Collective test concluded: %s\n", program_invocation_short_name);
+  writeResultFooter(errors, bw, check_avg_bw, program_invocation_short_name);
+  if (memory_report) {
+    memInfo_t memInfos[3];
+    memInfos[0] = { initGpuMem[0], "Initialization" };
+    memInfos[1] = { bufferMemory[0], "User-Allocated" };
+    memInfos[2] = { devMemUsed[0], "Collective" };
+    writeMemInfo(memInfos, 3);
+  }
+  finalizeFooter();
+
 #ifdef MPI_SUPPORT
  MPI_Comm_free(&mpi_comm);
  MPI_Finalize();
 #endif

-  PRINT("%s\n", ncclGetLastError(NULL));
+  writeErrors();

  // 'cuda-memcheck --leak-check full' requires this
  cudaDeviceReset();

  if (errors[0] || bw[0] < check_avg_bw*(0.9))
-    exit(EXIT_FAILURE);
+    return testNumResults;
  else
-    exit(EXIT_SUCCESS);
+    return testSuccess;
 }
--- a/src/common.h
+++ b/src/common.h
@ -6,6 +6,8 @@
 #ifndef __COMMON_H__
 #define __COMMON_H__

+#define NCCL_TESTS_VERSION "2.17.8"
+
 #include "nccl.h"
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
 #include "nccl_device.h"
@ -70,7 +72,8 @@ typedef enum {
  testNcclError = 3,
  testTimeout = 4,
  testNotImplemented = 5,
-  testNumResults = 6
+  testInvalidUsage = 6,
+  testNumResults = 7, // Must be last
 } testResult_t;

 // Relay errors up and trace
@ -109,6 +112,12 @@ struct testEngine {
  void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks);
  testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type,
      const char* typeName, ncclRedOp_t op, const char* opName);
+
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,29,0)
+  testResult_t (*getDevCommRequirements)(int deviceImpl, ncclDevCommRequirements* reqs, ncclCommProperties_t* commProperties);
+#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
+  bool (*getDevCommRequirements)(int deviceImpl, ncclDevCommRequirements* reqs);
+#endif
 };

 extern struct testEngine ncclTestEngine;
@ -150,6 +159,10 @@ struct threadArgs {

  struct testColl* collTest;

+  int64_t* initGpuMem;
+  int64_t* bufferMemory;
+  int64_t* devMemUsed;
+
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
  void** sendRegHandles;
  void** recvRegHandles;
@ -176,6 +189,9 @@ extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, voi
 static void getHostName(char* hostname, int maxlen) {
  gethostname(hostname, maxlen);
  for (int i=0; i< maxlen; i++) {
+    if (hostname[i] == '\0') {
+      return;
+    }
    if (hostname[i] == '.') {
      hostname[i] = '\0';
      return;
@ -276,7 +292,6 @@ static size_t wordSize(ncclDataType_t type) {

 extern int test_ncclVersion; // init'd with ncclGetVersion()
 extern int deviceCtaCount; // number of CTAs for device implementation
-extern bool deviceMultimemEnabled; // whether multimem was successfully enabled
 constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 1 : 0);
 extern int test_opnum;
 extern int test_typenum;
@ -313,27 +328,13 @@ static int ncclstringtoop (char *str) {

 extern int is_main_proc;
 extern thread_local int is_main_thread;
-#define PRINT if (is_main_thread) printf

 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
 template <typename F>
-testResult_t testLaunchDeviceKernel(F kernel, void* sendbuff, size_t sendoffset, void* recvbuff, size_t recvoffset, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, int useMultimem) {
+testResult_t testLaunchDeviceKernel(F kernel, void* sendbuff, size_t sendoffset, void* recvbuff, size_t recvoffset, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  if (kernel == nullptr) return testNotImplemented;
  ncclDevComm* devComm = (ncclDevComm*)comm;

-  // Check if multimem is enabled for this kernel
-  if (useMultimem && !deviceMultimemEnabled) {
-    printf("[KERNEL_LAUNCH_ERROR] Device kernel requires multimem but it was not available during "
-           "DevComm creation. Multimem support may not be available on this hardware.\n");
-    return testInternalError;
-  }
-
-  // Only check mcBasePtr if multimem is active for this kernel
-  if (useMultimem && devComm->lsaMultimem.mcBasePtr == nullptr) {
-    printf("[KERNEL_LAUNCH_ERROR] Device kernel requires multimem, which may not be available.\n");
-    return testInternalError;
-  }
-
  ncclWindow_t sendwin = (ncclWindow_t)sendbuff;
  ncclWindow_t recvwin = (ncclWindow_t)recvbuff;
  kernel<<<deviceCtaCount, 512, 0, stream>>>(sendwin, sendoffset, recvwin, recvoffset, count, root, *devComm);
@ -355,7 +356,7 @@ testResult_t testLaunchDeviceKernel(F kernel, void* sendbuff, size_t sendoffset,
  )
 #else
 template <typename F>
-testResult_t testLaunchDeviceKernel(F kernel, void* sendbuff, size_t sendoffset, void* recvbuff, size_t recvoffset, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, int useMultimem) {
+testResult_t testLaunchDeviceKernel(F kernel, void* sendbuff, size_t sendoffset, void* recvbuff, size_t recvoffset, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  return testNotImplemented;
 }
 #define SPECIALIZE_KERNEL(kernel, type, op) nullptr
--- a/src/gather.cu
+++ b/src/gather.cu
@ -121,8 +121,8 @@ testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t typ
 }

 struct testEngine gatherEngine = {
-  GatherGetBuffSize,
-  GatherRunTest
+  .getBuffSize = GatherGetBuffSize,
+  .runTest = GatherRunTest
 };

 #pragma weak ncclTestEngine=gatherEngine
--- a/src/hypercube.cu
+++ b/src/hypercube.cu
@ -115,8 +115,8 @@ testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t
 }

 struct testEngine hyperCubeEngine = {
-  HyperCubeGetBuffSize,
-  HyperCubeRunTest
+  .getBuffSize = HyperCubeGetBuffSize,
+  .runTest = HyperCubeRunTest
 };

 #pragma weak ncclTestEngine=hyperCubeEngine
--- a/src/reduce.cu
+++ b/src/reduce.cu
@ -109,8 +109,8 @@ testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t typ
 }

 struct testEngine reduceEngine = {
-  ReduceGetBuffSize,
-  ReduceRunTest
+  .getBuffSize = ReduceGetBuffSize,
+  .runTest = ReduceRunTest
 };

 #pragma weak ncclTestEngine=reduceEngine
--- a/src/reduce_scatter.cu
+++ b/src/reduce_scatter.cu
@ -102,8 +102,8 @@ testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataTyp
 }

 struct testEngine reduceScatterEngine = {
-  ReduceScatterGetBuffSize,
-  ReduceScatterRunTest
+  .getBuffSize = ReduceScatterGetBuffSize,
+  .runTest = ReduceScatterRunTest
 };

 #pragma weak ncclTestEngine=reduceScatterEngine
--- a/src/scatter.cu
+++ b/src/scatter.cu
@ -117,8 +117,8 @@ testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t ty
 }

 struct testEngine scatterEngine = {
-  ScatterGetBuffSize,
-  ScatterRunTest
+  .getBuffSize = ScatterGetBuffSize,
+  .runTest = ScatterRunTest
 };

 #pragma weak ncclTestEngine=scatterEngine
--- a/src/sendrecv.cu
+++ b/src/sendrecv.cu
@ -113,8 +113,8 @@ testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t t
 }

 struct testEngine sendRecvEngine = {
-  SendRecvGetBuffSize,
-  SendRecvRunTest
+  .getBuffSize = SendRecvGetBuffSize,
+  .runTest = SendRecvRunTest
 };

 #pragma weak ncclTestEngine=sendRecvEngine
--- a/src/util.cu
+++ b/src/util.cu
@ -0,0 +1,725 @@
+/*************************************************************************
+ * Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+// This contains an utlities to handle output both to stdout and to
+// json files.
+//
+// An ad-hoc, libc-based approach to writing json has been adopted to
+// keep things simple and to avoid injecting a dependency on the
+// library for an external JSON utility.
+//
+// However, this means that the code is a brittle to changes and care
+// should be taken when adding/removing things. We also essentially
+// give up when passed non-ASCII strings and non-printable characters
+// except some of the usual ones.
+
+#include "nccl.h"
+#include "util.h"
+#include <assert.h>
+#include <errno.h>
+#include <string>
+#include <iomanip>
+
+#define PRINT if (is_main_thread) printf
+
+extern int nThreads;
+extern int nGpus;
+extern size_t minBytes;
+extern size_t maxBytes;
+extern size_t stepBytes;
+extern size_t stepFactor;
+extern int datacheck;
+extern int warmup_iters;
+extern int iters;
+extern int agg_iters;
+extern int parallel_init;
+extern int blocking_coll;
+extern int cudaGraphLaunches;
+
+static FILE *json_report_fp;
+static thread_local bool write_json;
+
+#define JSON_FILE_VERSION 1
+
+#define TIME_STRING_FORMAT "%Y-%m-%d %H:%M:%S"
+
+typedef enum {
+  JSON_NONE, // A pseudo-state meaning that the document is empty
+  JSON_KEY,
+  JSON_OBJECT_EMPTY,
+  JSON_OBJECT_SOME,
+  JSON_LIST_EMPTY,
+  JSON_LIST_SOME,
+} json_state_t;
+
+// We use these statics to maintain a stack of states where we are writing.
+// the init_json_output function gets this set up, and it's the finalize_json_output function's job to clean this up.
+json_state_t *states = nullptr;
+size_t state_cap = 0; // Allocated stack capacity
+size_t state_n = 0;   // # of items in the stack.
+
+// This tries to sanitize/quote a string from 'in' into 'out',
+// assuming 'out' has length 'lim'.  We mainly quote ",/,\,\t,\n, and
+// bail if we encounter non-printable stuff or non-ASCII stuff.
+// 'in' should be null-terminated, of course.
+//
+// We return false if we were not able to copy all of 'in', either for
+// length reasons or for unhandled characters.
+static bool sanitizeJson(char out[], int lim, const char *in) {
+  int c = 0;
+  while(*in) {
+    if(c+1 >= lim) {
+      out[c] = 0;
+      return false;
+    }
+    switch(*in) {
+    case '"':
+    case '\\':
+    case '/':
+    case '\t':
+    case '\n':
+      if(c + 2 > lim) {
+        out[c] = 0;
+        return false;
+      }
+
+      out[c++] = '\\';
+      if(*in == '\n') {
+        out[c++] = 'n';
+      }
+      else if( *in == '\t') {
+        out[c++] = 't';
+      }
+      else {
+        out[c++] = *in;
+      }
+      break;
+    default:
+      if (*in >= 0x7F || *in <= 0x1F) {
+        out[c] = 0;
+        return false;
+      }
+      out[c++] = *in;
+      break;
+    }
+    ++in;
+  }
+  out[c] = 0;
+  return true;
+}
+
+// Push state onto the state stack. Reallocate for extra storage if needed.
+// Because JSON_NONE is a pseudo-state, don't allow it to be pushed.
+static void jsonPushState(json_state_t state) {
+  assert(state != JSON_NONE);
+  if(state_cap <= (state_n+1)) {
+    state_cap = max((size_t)16, state_cap*2);
+    states = (json_state_t *)realloc(states, sizeof(json_state_t)*state_cap);
+    assert(states);
+  }
+  states[state_n++] = state;
+}
+
+// Return the current state at the top of the stack
+static json_state_t jsonCurrState() {
+  if(state_n == 0) {
+    return JSON_NONE;
+  }
+  return states[state_n-1];
+}
+
+// Replace the stack with state (equivalent to a pop & push if stack is not empty)
+static void jsonReplaceState(json_state_t state) {
+  assert(state != JSON_NONE);
+  assert(state_n != 0);
+  states[state_n-1] = state;
+}
+
+// Pop the top state off the stack, or return that the state is empty
+static json_state_t jsonPopState() {
+  if(state_n == 0) {
+    return JSON_NONE;
+  }
+  return states[--state_n];
+}
+
+// Emit a key and separator. Santize the key.
+// This is only acceptable if the top state is an object
+// Emit a ',' separator of we aren't the first item.
+static void jsonKey(const char *name) {
+  switch(jsonCurrState()) {
+  case JSON_OBJECT_EMPTY:
+    jsonReplaceState(JSON_OBJECT_SOME);
+    break;
+  case JSON_OBJECT_SOME:
+    fprintf(json_report_fp, ",");
+    break;
+  default:
+    assert(0);
+    break;
+  }
+  char tmp[2048];
+  sanitizeJson(tmp, sizeof(tmp), name);
+  fprintf(json_report_fp, "\"%s\":", tmp);
+  jsonPushState(JSON_KEY);
+}
+
+// Helper function for inserting values.
+// Only acceptable after keys, top-level, or in lists.
+// Emit preceeding ',' if in a list and not first item.
+static void jsonValHelper() {
+  switch(jsonCurrState()) {
+  case JSON_LIST_EMPTY:
+    jsonReplaceState(JSON_LIST_SOME);
+    break;
+  case JSON_LIST_SOME:
+    fprintf(json_report_fp, ",");
+    break;
+  case JSON_KEY:
+    jsonPopState();
+    break;
+  case JSON_NONE:
+    break;
+  default:
+    assert(0);
+  }
+}
+
+// Start an object
+static void jsonStartObject() {
+  jsonValHelper();
+  fprintf(json_report_fp, "{");
+  jsonPushState(JSON_OBJECT_EMPTY);
+}
+
+// Close an object
+static void jsonFinishObject() {
+  switch(jsonPopState()) {
+  case JSON_OBJECT_EMPTY:
+  case JSON_OBJECT_SOME:
+    break;
+  default:
+    assert(0);
+  }
+  fprintf(json_report_fp, "}");
+}
+
+// Start a list
+static void jsonStartList() {
+  jsonValHelper();
+  fprintf(json_report_fp, "[");
+  jsonPushState(JSON_LIST_EMPTY);
+}
+
+// Close a list
+static void jsonFinishList() {
+  switch(jsonPopState()) {
+  case JSON_LIST_EMPTY:
+  case JSON_LIST_SOME:
+    break;
+  default:
+    assert(0);
+  }
+  fprintf(json_report_fp, "]");
+}
+
+// Write a null value
+static void jsonNull() {
+  jsonValHelper();
+  fprintf(json_report_fp, "null");
+}
+
+// Write a (sanititzed) string
+static void jsonStr(const char *str) {
+  if(str == nullptr) {
+    jsonNull();
+    return;
+  }
+  jsonValHelper();
+  char tmp[2048];
+  sanitizeJson(tmp, sizeof(tmp), str);
+  fprintf(json_report_fp, "\"%s\"", tmp);
+}
+
+// Write a bool as "true" or "false" strings.
+static void jsonBool(bool val) {
+  jsonStr(val ? "true" : "false");
+}
+
+// Write an integer value
+static void jsonInt(const int val) {
+  jsonValHelper();
+  fprintf(json_report_fp, "%d", val);
+}
+
+// Write a size_t value
+static void jsonSize_t(const size_t val) {
+  jsonValHelper();
+  fprintf(json_report_fp, "%zu", val);
+}
+
+// Write a double value
+static void jsonDouble(const double val) {
+  jsonValHelper();
+  if(val != val) {
+    fprintf(json_report_fp, "\"nan\"");
+  }
+  else {
+    fprintf(json_report_fp, "%lf", val);
+  }
+}
+
+// Fill buff with a formatted time string corresponding to 'now.
+// Write len or fewer bytes.
+void formatNow(char *buff, int len) {
+  time_t now;
+  time(&now);
+  struct tm *timeinfo = localtime(&now);
+
+  strftime(buff, len, TIME_STRING_FORMAT, timeinfo);
+}
+
+// We provide some status line to stdout.
+// The JSON stream is left with a trailing comma and the top-level
+// object open for the next set of top-level items (config and
+// results).
+
+// This uses unguarded 'printf' rather than the PRINT() macro because
+// is_main_thread is not set up at this point.
+void jsonOutputInit(const char *in_path,
+                    int argc, char **argv,
+                    char **envp) {
+  if(in_path == nullptr) {
+    return;
+  }
+
+  #ifdef MPI_SUPPORT
+  int proc;
+  MPI_Comm_rank(MPI_COMM_WORLD, &proc);
+  if(proc != 0) {
+    return;
+  }
+  #endif
+
+  char *try_path = strdup(in_path);
+  int try_count = 0;
+  json_report_fp = fopen(try_path, "wx");
+  while(json_report_fp == NULL) {
+    if(errno != EEXIST) {
+      printf("# skipping json output; %s not accessible\n", try_path);
+      free(try_path);
+      return;
+    }
+    free(try_path);
+    if(asprintf(&try_path, "%s.%d", in_path, try_count++) == -1) {
+      printf("# skipping json output; failed to probe destination\n");
+      return;
+    }
+    json_report_fp = fopen(try_path, "wx");
+  }
+
+  printf("# Writing JSON output to %s\n", try_path);
+  free(try_path);
+
+  write_json = true;
+
+  jsonStartObject(); // will be closed finalize_json_output
+
+  jsonKey("version"); jsonInt(JSON_FILE_VERSION);
+
+  jsonKey("start_time");
+  {
+    char timebuffer[128];
+    formatNow(timebuffer, sizeof(timebuffer));
+    jsonStr(timebuffer);
+  }
+
+  jsonKey("args");
+  jsonStartList();
+  for(int i = 0; i < argc; i++) {
+    jsonStr(argv[i]);
+  }
+  jsonFinishList();
+
+  jsonKey("env");
+  jsonStartList();
+  for(char **e = envp; *e; e++) {
+    jsonStr(*e);
+  }
+  jsonFinishList();
+  jsonKey("nccl_version"); jsonInt(test_ncclVersion);
+}
+
+void jsonIdentifyWriter(bool is_writer) {
+  write_json &= is_writer;
+}
+
+// This cleans up the json output, finishing the object and closing the file.
+// If we were not writing json output, we don't do anything.
+void jsonOutputFinalize() {
+  if(write_json) {
+
+    jsonKey("end_time");
+    char timebuffer[128];
+    formatNow(timebuffer, sizeof(timebuffer));
+    jsonStr(timebuffer);
+
+    jsonFinishObject();
+
+    assert(jsonCurrState() == JSON_NONE);
+    free(states);
+    states = nullptr;
+    state_n = 0;
+    state_cap = 0;
+
+    fclose(json_report_fp);
+    json_report_fp = nullptr;
+  }
+}
+
+struct rankInfo_t {
+  int rank;
+  int group;
+  int pid;
+  char hostname[1024];
+  int device;
+  char device_hex[128];
+  char devinfo[1024];
+};
+
+// Helper function to parse the device info lines passed via MPI to the root rank.
+// This fills 'rank' with the parsed contents of 'instring'.
+static int parseRankInfo(rankInfo_t *rank, const char *instring) {
+  int end;
+  sscanf(instring,
+         "#  Rank %d Group %d Pid %d on %1024s device %d [%128[^]]] %1024[^\n]\n%n",
+         &rank->rank,
+         &rank->group,
+         &rank->pid,
+         rank->hostname,
+         &rank->device,
+         rank->device_hex,
+         rank->devinfo,
+         &end);
+  return end;
+}
+
+static void jsonRankInfo(const rankInfo_t *ri) {
+  jsonStartObject();
+  jsonKey("rank");        jsonInt(ri->rank);
+  jsonKey("group");       jsonInt(ri->group);
+  jsonKey("pid");         jsonInt(ri->pid);
+  jsonKey("hostname");    jsonStr(ri->hostname);
+  jsonKey("device");      jsonInt(ri->device);
+  jsonKey("device_hex");  jsonStr(ri->device_hex);
+  jsonKey("device_info"); jsonStr(ri->devinfo);
+  jsonFinishObject();
+}
+
+// Write the start of a benchmark output line containing the bytes &
+// op type, both to stdout and to json if we are writing there.
+void writeBenchmarkLinePreamble(size_t nBytes, size_t nElem, const char typeName[], const char opName[], int root) {
+  char rootName[100];
+  sprintf(rootName, "%6i", root);
+  PRINT("%12li  %12li  %8s  %6s  %6s", nBytes, nElem, typeName, opName, rootName);
+
+  if(write_json) {
+    jsonStartObject();
+    jsonKey("size");  jsonSize_t(nBytes);
+    jsonKey("count"); jsonSize_t(nElem);
+    jsonKey("type");  jsonStr(typeName);
+    jsonKey("redop"); jsonStr(opName);
+    jsonKey("root");  jsonStr(rootName);
+  }
+}
+
+// Finish a result record we were writing to stdout/json
+void writeBenchmarkLineTerminator(int actualIters, const char *name) {
+  PRINT("\n");
+  if(write_json) {
+    jsonKey("actual_iterations"); jsonInt(actualIters);
+    jsonKey("experiment_name");   jsonStr(name);
+    jsonFinishObject();
+  }
+}
+
+// Handle a cases where we don't write out of place results
+void writeBenchMarkLineNullBody() {
+  PRINT("                                ");  // only do in-place for trace replay
+  if(write_json) {
+    jsonKey("out_of_place"); jsonNull();
+  }
+}
+
+void getFloatStr(double value, int width, char* str) {
+  int power = 0;
+  for (uint64_t val = 1; value >= val; val *= 10) power++;
+
+  if (power < width-2) sprintf(str, "%*.2f", width, value);
+  else if (power < width-1) sprintf(str, "%*.1f", width, value);
+  else if (power < width+1) sprintf(str, "%*.0f", width, value);
+  else if (width >= 7) sprintf(str, "%*.1e", width, value);
+  else if (width >= 8) sprintf(str, "%*.2e", width, value);
+  else sprintf(str, "%*.0e", width, value);
+}
+
+// Write the performance-related payload to stdout/json.
+// We call this function twice at the top level per test: once for out-of-place, and once for in-place.
+// The Json output assumes out-of-place happens first.
+void writeBenchmarkLineBody(double timeUsec, double algBw, double busBw, bool reportErrors, int64_t wrongElts, bool report_cputime, bool report_timestamps, bool out_of_place) {
+  char timeStr[8];
+  getFloatStr(timeUsec, 7, timeStr);
+
+  char algBwStr[7];
+  getFloatStr(algBw, 6, algBwStr);
+
+  char busBwStr[7];
+  getFloatStr(busBw, 6, busBwStr);
+
+  if (reportErrors) {
+    PRINT("  %7s  %6s  %6s  %6g", timeStr, algBwStr, busBwStr, (double)wrongElts);
+  } else {
+    PRINT("  %7s  %6s  %6s    N/A", timeStr, algBwStr, busBwStr);
+  }
+
+  if (!out_of_place && report_timestamps) {
+    char timebuffer[128];
+    formatNow(timebuffer, sizeof(timebuffer));
+    PRINT("%21s", timebuffer);
+  }
+
+  if(write_json) {
+    jsonKey(out_of_place ? "out_of_place" : "in_place");
+    jsonStartObject();
+    jsonKey(report_cputime ? "cpu_time" : "time"); jsonDouble(timeUsec);
+    jsonKey("alg_bw");                             jsonDouble(algBw);
+    jsonKey("bus_bw");                             jsonDouble(busBw);
+    jsonKey("nwrong");                             (reportErrors ? jsonDouble((double)wrongElts) : jsonNull());
+    jsonFinishObject();
+  }
+}
+
+// This writes out a report about the run parameters and devices
+// involved to stdout and json.  For MPI, this will use a collective
+// to gather from each rank to the root.
+
+// Root then consumes this output, printing raw lines for stdout and
+// parsing them for JSON for proper formatting.
+
+// Perhaps actually sending records around instead of formatted
+// strings would be smarter/easier, but I chose to adapt what was
+// already in place.
+testResult_t writeDeviceReport(size_t *maxMem, int localRank, int proc, int totalProcs, int color, const char hostname[], const char *program_name) {
+  PRINT("# nccl-tests version %s nccl-headers=%d nccl-library=%d\n", NCCL_TESTS_VERSION, NCCL_VERSION_CODE, test_ncclVersion);
+  PRINT("# Collective test starting: %s\n", program_name);
+  PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d agg iters: %d validation: %d graph: %d\n",
+        nThreads, nGpus, minBytes, maxBytes,
+        (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes",
+        warmup_iters, iters, agg_iters, datacheck, cudaGraphLaunches);
+  if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n");
+  if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n");
+  PRINT("#\n");
+
+  if(write_json) {
+    jsonKey("config");
+    jsonStartObject();
+    jsonKey("nthreads");      jsonInt(nThreads);
+    jsonKey("ngpus");         jsonInt(nGpus);
+    jsonKey("minimum_bytes"); jsonSize_t(minBytes);
+    jsonKey("maximum_bytes"); jsonSize_t(maxBytes);
+    if(stepFactor > 1) {
+      jsonKey("step_factor");   jsonInt(stepFactor);
+    }
+    else {
+      jsonKey("step_bytes");  jsonSize_t(stepBytes);
+    }
+
+    jsonKey("warmup_iters");          jsonInt(warmup_iters);
+    jsonKey("iterations");            jsonInt(iters);
+    jsonKey("aggregated_iterations"); jsonInt(agg_iters);
+    jsonKey("validation");            jsonInt(datacheck);
+    jsonKey("graph");                 jsonInt(cudaGraphLaunches);
+    jsonKey("blocking_collectives");  jsonBool(blocking_coll);
+    jsonKey("parallel_init");         jsonBool(parallel_init);
+  }
+
+  PRINT("# Using devices\n");
+#define MAX_LINE 2048
+  char line[MAX_LINE];
+  int len = 0;
+  const char* envstr = getenv("NCCL_TESTS_DEVICE");
+  const int gpu0 = envstr ? atoi(envstr) : -1;
+  int available_devices;
+  CUDACHECK(cudaGetDeviceCount(&available_devices));
+  for (int i=0; i<nThreads*nGpus; i++) {
+    const int cudaDev = (gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i;
+    const int rank = proc*nThreads*nGpus+i;
+    cudaDeviceProp prop;
+    if (cudaDev >= available_devices) {
+      fprintf(stderr, "Invalid number of GPUs: %d requested but only %d were found.\n",
+              (gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + nThreads*nGpus, available_devices);
+      fprintf(stderr, "Please check the number of processes and GPUs per process.\n");
+      return testNotImplemented;
+    }
+    CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
+    if (len < MAX_LINE) {
+      len += snprintf(line+len, MAX_LINE-len, "#  Rank %2d Group %2d Pid %6d on %10s device %2d [%04x:%02x:%02x] %s\n",
+                      rank, color, getpid(), hostname, cudaDev, prop.pciDomainID, prop.pciBusID, prop.pciDeviceID, prop.name);
+    }
+    *maxMem = std::min(*maxMem, prop.totalGlobalMem);
+  }
+  if (len >= MAX_LINE) {
+    strcpy(line+MAX_LINE-5, "...\n");
+  }
+
+#if MPI_SUPPORT
+  char *lines = (proc == 0) ? (char *)malloc(totalProcs*MAX_LINE) : NULL;
+  // Gather all output in rank order to root (0)
+  MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, MPI_COMM_WORLD);
+  if (proc == 0) {
+    if(write_json) {
+      jsonKey("devices");
+      jsonStartList();
+    }
+    for (int p = 0; p < totalProcs; p++) {
+      PRINT("%s", lines+MAX_LINE*p);
+      if(write_json) {
+        rankInfo_t rankinfo;
+        parseRankInfo(&rankinfo, lines + MAX_LINE*p);
+        jsonRankInfo(&rankinfo);
+      }
+    }
+    if(write_json) {
+      jsonFinishList();
+    }
+    free(lines);
+  }
+  MPI_Allreduce(MPI_IN_PLACE, maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);
+#else
+  PRINT("%s", line);
+  if(write_json) {
+    rankInfo_t rankinfo;
+    parseRankInfo(&rankinfo, line);
+    jsonKey("devices");
+    jsonStartList();
+    jsonRankInfo(&rankinfo);
+    jsonFinishList();
+  }
+#endif
+  if(write_json) {
+    jsonFinishObject();
+  }
+
+  return testSuccess;
+}
+
+// Write a result header to stdout/json.
+// Json results object and contained table list are left open
+void writeResultHeader(bool report_cputime, bool report_timestamps) {
+  const char* tsLbl  = report_timestamps ? "timestamp" : "";
+  const int tsPad = report_timestamps ? 19 : 0;
+  const char* tsFmt = report_timestamps ? TIME_STRING_FORMAT : "";
+  const char* timeStr = report_cputime ? "cputime" : "time";
+  PRINT("#\n");
+  PRINT("# %10s  %12s  %8s  %6s  %6s           out-of-place                       in-place          \n", "", "", "", "", "");
+  PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s  %6s  %7s  %6s  %6s  %6s %*s\n", "size", "count", "type", "redop", "root",
+        timeStr, "algbw", "busbw", "#wrong", timeStr, "algbw", "busbw", "#wrong", tsPad, tsLbl);
+  PRINT("# %10s  %12s  %8s  %6s  %6s  %7s  %6s  %6s  %6s  %7s  %6s  %6s  %6s %*s\n", "(B)", "(elements)", "", "", "",
+        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "", tsPad, tsFmt);
+
+  if(write_json) {
+    jsonKey("results"); jsonStartList();
+  }
+}
+
+// Write the footer for results to stdout/json.
+// We close the table list and write out the summary items.
+// Results object is left open for errors.
+void writeResultFooter(const int errors[], const double bw[], double check_avg_bw, const char *program_name) {
+
+  if(write_json) {
+    jsonFinishList();
+  }
+
+  PRINT("# %-20s : %d %s\n", "Out of bounds values", errors[0], errors[0] ? "FAILED" : "OK");
+  PRINT("# %-20s : %g %s\n", "Avg bus bandwidth", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
+  PRINT("#\n");
+  PRINT("# Collective test concluded: %s\n", program_name);
+
+  if(write_json) {
+    jsonKey("out_of_bounds");
+    jsonStartObject();
+    jsonKey("count");      jsonInt(errors[0]);
+    jsonKey("okay");       jsonBool(errors[0] == 0);
+    jsonFinishObject();
+    jsonKey("average_bus_bandwidith");
+    jsonStartObject();
+    jsonKey("bandwidith"); jsonDouble(bw[0]);
+    jsonKey("okay");       check_avg_bw == -1 ? jsonStr("unchecked") : jsonBool(bw[0] >= check_avg_bw*(0.9));
+    jsonFinishObject();
+  }
+}
+
+std::string getMemString(double amount) {
+  std::string postfix = " B";
+  if (abs(amount) >= 1024.0*1024.0*1024.0) {
+    postfix = " GB";
+    amount /= 1024.0 * 1024.0 * 1024.0;
+  } else if (abs(amount) >= 1024.0*1024.0) {
+    postfix = " MB";
+    amount /= 1024.0 * 1024.0;
+  } else if (abs(amount) >= 1024.0) {
+    postfix = " KB";
+    amount /= 1024.0;
+  }
+  int precision = 0;
+  if (abs(amount) < 10.0) {
+    precision = 2;
+  } else if (abs(amount) < 100.0) {
+    precision = 1;
+  }
+  std::stringstream ss;
+  ss << std::fixed << std::setprecision(precision) << amount << postfix;
+  return ss.str();
+}
+
+void writeMemInfo(memInfo_t* memInfos, int numMemInfos) {
+
+  std::stringstream ss;
+  uint64_t maxAmount = 0;
+  for (int i = 0; i < numMemInfos; i++) {
+    ss << memInfos[i].name << " "
+      << getMemString(memInfos[i].amount)
+      << " ";
+    if (i < numMemInfos - 1) {
+      ss << "| ";
+    }
+    maxAmount += memInfos[i].amount;
+  }
+  ss << "| Total  " << getMemString(maxAmount);
+  PRINT("# %-20s : %s\n", "GPU memory usage", ss.str().c_str());
+}
+
+// Write out remaining errors to stdout/json.
+void writeErrors() {
+  const char *error = ncclGetLastError(NULL);
+  if(error && strlen(error) > 0) {
+    PRINT("# error: %s\n", error);
+  } else {
+    PRINT("\n");
+  }
+  if(write_json) {
+    jsonKey("errors");
+    jsonStartList();
+    if(error) {
+      jsonStr(error);
+    }
+    jsonFinishList();
+  }
+}
+
+void finalizeFooter() {
+  PRINT("#\n");
+}
--- a/src/util.h
+++ b/src/util.h
@ -0,0 +1,44 @@
+/*************************************************************************
+ * Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef __UTIL_H__
+#define __UTIL_H__
+
+#include "common.h"
+
+struct memInfo_t {
+  int64_t amount;
+  const char* name;
+};
+
+// Try to set up JSON file output. If MPI is used, only rank 0 will proceed.
+// This should be called by only a single thread.
+// If 'in_path' is NULL, we stop.
+// Otherwise, we borrow 'in_path' and try to open it as a new file.
+// If it already exists, we probe for new files by appending integers
+// until we succeed.
+// Then we write argv and envp to the json output, santizing them. We also
+// write the nccl version.
+// The top-level object remains open for the rest of the output.
+void jsonOutputInit(const char *path, int argc, char **argv, char **envp);
+
+// Should be called to identify main thread after threads are started to ensure we don't duplicate output
+void jsonIdentifyWriter(bool is_writer);
+
+// Write end time and close top-level object. Reset json state and close output file.
+void jsonOutputFinalize();
+
+void writeBenchmarkLinePreamble(size_t nBytes, size_t nElem, const char typeName[], const char opName[], int root);
+void writeBenchmarkLineTerminator(int actualIters, const char *name);
+void writeBenchMarkLineNullBody();
+void writeBenchmarkLineBody(double timeUsec, double algBw, double busBw, bool reportErrors, int64_t wrongElts, bool report_cputime, bool report_timestamps, bool out_of_place);
+testResult_t writeDeviceReport(size_t *maxMem, int localRank, int proc, int totalProcs, int color, const char hostname[], const char *program_name);
+void writeResultHeader(bool report_cputime, bool report_timestamps);
+void writeResultFooter(const int errors[], const double bw[], double check_avg_bw, const char *program_name);
+void finalizeFooter();
+void writeMemInfo(memInfo_t* memInfos, int numMemInfos);
+void writeErrors();
+
+#endif
Author	SHA1	Message	Date
David Addison	81463c58d0	NCCL_TESTS_VERSION 2.17.8	2026-01-06 15:00:17 -08:00
David Addison	7278698c1b	Clarified use of Mebibytes and Gibibytes for sizes	2026-01-06 14:59:17 -08:00
Katie Gioioso	2656c58421	NCCL_TESTS_VERSION 2.17.7	2025-12-30 20:18:25 +00:00
Katie Gioioso	070d17528c	refactor comm init	2025-12-30 20:18:25 +00:00
Katie Gioioso	332e61896f	device api 2.28 is not compatible with 2.29. Check versions and print error if there is a mismatch	2025-12-30 20:18:25 +00:00
Katie Gioioso	24874bdaa8	Compatibility with 2.29 device API: use NCCL_DEV_COMM_REQUIREMENTS_INTIIALIZER, query properties to check for device api support	2025-12-30 20:18:24 +00:00
David Addison	7106245178	Add include of <limits> due to compilation error	2025-12-30 20:13:13 +00:00
David Addison	760c467f12	Add memory usage report option Use -M 1 to dump library memory usage information	2025-12-30 20:12:58 +00:00
David Addison	4bc314aa27	Add README.md text for -J option	2025-11-21 11:31:48 -08:00
David Addison	51f2e7ed7c	Remove trailing WS when timestamp option not used	2025-11-03 11:23:52 -08:00
David Addison	da0b547b1b	NCCL_TESTS_VERSION 2.17.6	2025-10-28 10:22:08 -07:00
David Addison	e2af90af76	Add new report_timestamps option to README.md	2025-10-28 10:21:58 -07:00
David Addison	a62c975681	Add option to suffix a timestamp to each perf line Based on code from yakovdyadkin & Scott Moe in MR 349 Adds -S 1 option to suffix each performance report line with a timestamp. Format is "%Y-%m-%d %H:%M:%S" This is especially useful when using the -N 0 option and looking for hangs or failure events.	2025-10-28 10:11:23 -07:00
David Addison	0bb567cc02	NCCL_TESTS_VERSION 2.17.5	2025-10-28 09:34:56 -07:00
Shane Snyder	013c49e930	add necessary ifdef guards for device API tests	2025-10-28 09:34:26 -07:00
Shane Snyder	f66d20e360	add runtime guards for ncclAlltoAll()	2025-10-28 09:32:17 -07:00
David Addison	3744121a2d	NCCL_TESTS_VERSION 2.17.4	2025-10-24 17:11:08 -07:00
David Addison	9641693e9b	Add PRINT of nccl-tests, NCCL header and library versions	2025-10-24 17:10:57 -07:00
Shane Snyder	9829ea42b5	add GIN-based device API kernels to alltoall - add GIN-only A2A kernel implementation - add hybrid LSA+GIN A2A kernel implementation - update perf test cases to expose a function for setting devCommRequirements for each device implementation and simplify devCommCreate code path to use this directly instead of complex fallback logic - add missing call to devCommDestroy	2025-10-24 17:10:34 -07:00
David Addison	00f52811b8	Add support for JSON output to perf test framework This adds support for writing structured information about the run to a JSON file. Enable with -J <filename>.json If the target JSON filename already exists then an incrementing numeric suffix will be added to create <filename>.json.<n>	2025-10-17 12:01:25 -07:00