From a1efb427e764241bc43d2d91be875c9f55da03a5 Mon Sep 17 00:00:00 2001
From: Giuseppe Congiu <gcongiu@nvidia.com>
Date: Wed, 28 Feb 2024 05:18:40 -0800
Subject: [PATCH 1/6] Add -R option to register user buffers

---
 src/common.cu | 63 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 61 insertions(+), 2 deletions(-)
diff --git a/src/common.cu b/src/common.cu
index 4ac00fb..fc5af1e 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -80,6 +80,9 @@ static int cudaGraphLaunches = 0;
 static int report_cputime = 0;
 // Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX)
 static int average = 1;
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+static int local_register = 0;
+#endif
 
 #define NUM_BLOCKS 32
 
@@ -631,10 +634,22 @@ testResult_t threadInit(struct threadArgs* args) {
     NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank));
   }
   NCCLCHECK(ncclGroupEnd());
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+  void **sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*args->nGpus) : NULL;
+  void **recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*args->nGpus) : NULL;
+  for (int i=0; i<args->nGpus; i++) {
+    if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->sendbuffs[i], args->maxbytes, &sendRegHandles[i]));
+    if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->recvbuffs[i], args->maxbytes, &recvRegHandles[i]));
+  }
+#endif
 
   TESTCHECK(threadRunTests(args));
 
   for (int i=0; i<args->nGpus; i++) {
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+    if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], sendRegHandles[i]));
+    if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], recvRegHandles[i]));
+#endif
     NCCLCHECK(ncclCommDestroy(args->comms[i]));
   }
   return testSuccess;
@@ -651,9 +666,15 @@ testResult_t threadLaunch(struct testThread* thread) {
 }
 
 testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) {
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+    NCCLCHECK(ncclMemAlloc(sendbuff, nbytes));
+    NCCLCHECK(ncclMemAlloc(recvbuff, nbytes));
+    if (datacheck) NCCLCHECK(ncclMemAlloc(expected, recvBytes));
+#else
     CUDACHECK(cudaMalloc(sendbuff, nbytes));
     CUDACHECK(cudaMalloc(recvbuff, nbytes));
     if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes));
+#endif
     return testSuccess;
 }
 
@@ -707,13 +728,14 @@ int main(int argc, char* argv[]) {
     {"cudagraph", required_argument, 0, 'G'},
     {"report_cputime", required_argument, 0, 'C'},
     {"average", required_argument, 0, 'a'},
+    {"local_register", required_argument, 0, 'R'},
     {"help", no_argument, 0, 'h'},
     {}
   };
 
   while(1) {
     int c;
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex);
 
     if (c == -1)
       break;
@@ -797,6 +819,15 @@ int main(int argc, char* argv[]) {
       case 'a':
         average = (int)strtol(optarg, NULL, 0);
         break;
+      case 'R':
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+        if ((int)strtol(optarg, NULL, 0)) {
+          local_register = 1;
+        }
+#else
+        printf("Option -R (register) is not supported before NCCL 2.19. Ignoring\n");
+#endif
+        break;
       case 'h':
       default:
         if (c != 'h') printf("invalid option '%c'\n", c);
@@ -827,6 +858,7 @@ int main(int argc, char* argv[]) {
             "[-G,--cudagraph <num graph launches>] \n\t"
             "[-C,--report_cputime <0/1>] \n\t"
             "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
+            "[-R,--local_register <1/0> enable local buffer registration on send/recv buffers (default: disable)] \n\t"
             "[-h,--help]\n",
           basename(argv[0]));
         return 0;
@@ -949,6 +981,10 @@ testResult_t run() {
 
   //if parallel init is not selected, use main thread to initialize NCCL
   ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus);
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+  void **sendRegHandles = NULL;
+  void **recvRegHandles = NULL;
+#endif
   if (!parallel_init) {
      if (ncclProcs == 1) {
        NCCLCHECK(ncclCommInitAll(comms, nGpus*nThreads, gpus));
@@ -960,6 +996,14 @@ testResult_t run() {
        }
        NCCLCHECK(ncclGroupEnd());
      }
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+     sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*nThreads*nGpus) : NULL;
+     recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*nThreads*nGpus) : NULL;
+     for (int i=0; i<nGpus*nThreads; i++) {
+       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], sendbuffs[i], sendBytes, &sendRegHandles[i]));
+       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], recvbuffs[i], recvBytes, &recvRegHandles[i]));
+     }
+#endif
   }
 
   int errors[nThreads];
@@ -1035,18 +1079,33 @@ testResult_t run() {
 #endif
 
   if (!parallel_init) {
-    for(int i=0; i<nGpus*nThreads; ++i)
+    for(int i=0; i<nGpus*nThreads; ++i) {
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+      if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], sendRegHandles[i]));
+      if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], recvRegHandles[i]));
+#endif
       NCCLCHECK(ncclCommDestroy(comms[i]));
+    }
     free(comms);
   }
 
   // Free off CUDA allocated memory
   for (int i=0; i<nGpus*nThreads; i++) {
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+    if (sendbuffs[i]) NCCLCHECK(ncclMemFree((char*)sendbuffs[i]));
+    if (recvbuffs[i]) NCCLCHECK(ncclMemFree((char*)recvbuffs[i]));
+    if (datacheck) NCCLCHECK(ncclMemFree(expected[i]));
+#else
     if (sendbuffs[i]) CUDACHECK(cudaFree((char*)sendbuffs[i]));
     if (recvbuffs[i]) CUDACHECK(cudaFree((char*)recvbuffs[i]));
     if (datacheck) CUDACHECK(cudaFree(expected[i]));
+#endif
   }
   CUDACHECK(cudaFreeHost(delta));
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+  free(sendRegHandles);
+  free(recvRegHandles);
+#endif
 
   envstr = getenv("NCCL_TESTS_MIN_BW");
   double check_avg_bw = envstr ? atof(envstr) : -1;

From d028efcf35101c6663ae8c5f33ad41bad00efb4d Mon Sep 17 00:00:00 2001
From: Kaiming Ouyang <kouyang@nvidia.com>
Date: Thu, 6 Jun 2024 04:59:28 -0700
Subject: [PATCH 2/6] Change ncclCommRegister size to maxBytes in serial comm
 init

---
 src/common.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index fc5af1e..04e8142 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -1000,8 +1000,8 @@ testResult_t run() {
      sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*nThreads*nGpus) : NULL;
      recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*nThreads*nGpus) : NULL;
      for (int i=0; i<nGpus*nThreads; i++) {
-       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], sendbuffs[i], sendBytes, &sendRegHandles[i]));
-       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], recvbuffs[i], recvBytes, &recvRegHandles[i]));
+       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], sendbuffs[i], maxBytes, &sendRegHandles[i]));
+       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], recvbuffs[i], maxBytes, &recvRegHandles[i]));
      }
 #endif
   }

From 746549b28d3b654e0670feca0065f51affdb7db8 Mon Sep 17 00:00:00 2001
From: Stefano Salsano <stefano.salsano@uniroma2.it>
Date: Fri, 14 Jun 2024 11:28:55 +0200
Subject: [PATCH 3/6] improve parsing of stepbytes (increment size) argument

---
 src/common.cu | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/common.cu b/src/common.cu
index 04e8142..7706dd9 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -764,7 +764,12 @@ int main(int argc, char* argv[]) {
         maxBytes = (size_t)parsed;
         break;
       case 'i':
-        stepBytes = strtol(optarg, NULL, 0);
+        parsed = parsesize(optarg);
+        if (parsed < 0) {
+          fprintf(stderr, "invalid size specified for 'stepBytes'\n");
+          return -1;
+        }
+        stepBytes = (size_t)parsed;
         break;
       case 'f':
         stepFactor = strtol(optarg, NULL, 0);

From c6eb15875f508076f3f26de4f7da3899701bc4db Mon Sep 17 00:00:00 2001
From: Oren <47992694+OrenLeung@users.noreply.github.com>
Date: Wed, 24 Jul 2024 22:55:00 -0400
Subject: [PATCH 4/6] doc: add all2all factor

---
 doc/PERFORMANCE.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/PERFORMANCE.md b/doc/PERFORMANCE.md
index 21fef60..942f054 100644
--- a/doc/PERFORMANCE.md
+++ b/doc/PERFORMANCE.md
@@ -140,5 +140,6 @@ To obtain a bus bandwidth which should be independent of the number of ranks _n_
 * AllGather : (_n_-1)/_n_
 * Broadcast : 1
 * Reduce : 1
+* AlltoAll: (_n_-1)/_n_
 
 The bus bandwidth should reflect the speed of the hardware bottleneck : NVLink, PCI, QPI, or network.

From d2d40cc8249378efa4d7e2c949528c15eeb7d8e7 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Thu, 25 Jul 2024 21:47:40 -0700
Subject: [PATCH 5/6] Added -N,--run_cycles option

---
 src/common.cu | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index 04e8142..872a18a 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -69,6 +69,7 @@ static int datacheck = 1;
 static int warmup_iters = 5;
 static int iters = 20;
 static int agg_iters = 1;
+static int run_cycles = 1;
 static int ncclop = ncclSum;
 static int nccltype = ncclFloat;
 static int ncclroot = 0;
@@ -598,7 +599,9 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
   TESTCHECK(completeColl(args));
 
   // Benchmark
-  for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
+  long repeat = run_cycles;
+  do {
+    for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
       setupArgs(size, type, args);
       char rootName[100];
       sprintf(rootName, "%6i", root);
@@ -606,7 +609,9 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
       TESTCHECK(BenchTime(args, type, op, root, 0));
       TESTCHECK(BenchTime(args, type, op, root, 1));
       PRINT("\n");
-  }
+    }
+  } while (--repeat);
+
   return testSuccess;
 }
 
@@ -717,6 +722,7 @@ int main(int argc, char* argv[]) {
     {"iters", required_argument, 0, 'n'},
     {"agg_iters", required_argument, 0, 'm'},
     {"warmup_iters", required_argument, 0, 'w'},
+    {"run_cycles", required_argument, 0, 'N'},
     {"parallel_init", required_argument, 0, 'p'},
     {"check", required_argument, 0, 'c'},
     {"op", required_argument, 0, 'o'},
@@ -735,7 +741,7 @@ int main(int argc, char* argv[]) {
 
   while(1) {
     int c;
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex);
 
     if (c == -1)
       break;
@@ -782,6 +788,9 @@ int main(int argc, char* argv[]) {
       case 'w':
         warmup_iters = (int)strtol(optarg, NULL, 0);
         break;
+      case 'N':
+        run_cycles = (int)strtol(optarg, NULL, 0);
+        break;
       case 'c':
         datacheck = (int)strtol(optarg, NULL, 0);
         break;
@@ -841,6 +850,7 @@ int main(int argc, char* argv[]) {
             "[-n,--iters <iteration count>] \n\t"
             "[-m,--agg_iters <aggregated iteration count>] \n\t"
             "[-w,--warmup_iters <warmup iteration count>] \n\t"
+            "[-N,--run_cycles <cycle count> run & print each cycle (default: 1; 0=infinite)] \n\t"
             "[-p,--parallel_init <0/1>] \n\t"
             "[-c,--check <check iteration count>] \n\t"
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)

From 0d86b5a6e755c52be6f23ef3f4792385f5e255b1 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Tue, 30 Jul 2024 14:50:45 -0700
Subject: [PATCH 6/6] Added some missing command line options to README.md Also
 updated single and multi-node examples.

---
 README.md | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 4281799..44e406a 100644
--- a/README.md
+++ b/README.md
@@ -24,14 +24,15 @@ NCCL tests can run on multiple processes, multiple threads, and multiple CUDA de
 
 ### Quick examples
 
-Run on 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
+Run on single node with 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
 ```shell
 $ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
 ```
 
-Run with MPI on 10 processes (potentially on multiple nodes) with 4 GPUs each, for a total of 40 GPUs:
+Run 64 MPI processes on nodes with 8 GPUs each, for a total of 64 GPUs spread across 8 nodes :
+(NB: The nccl-tests binaries must be compiled with `MPI=1` for this case)
 ```shell
-$ mpirun -np 10 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
+$ mpirun -np 64 -N 8 ./build/all_reduce_perf -b 8 -e 8G -f 2 -g 1
 ```
 
 ### Performance
@@ -59,14 +60,18 @@ All tests support the same set of arguments :
   * `-n,--iters <iteration count>` number of iterations. Default : 20.
   * `-w,--warmup_iters <warmup iteration count>` number of warmup iterations (not timed). Default : 5.
   * `-m,--agg_iters <aggregation count>` number of operations to aggregate together in each iteration. Default : 1.
+  * `-N,--run_cycles <cycle count>` run & print each cycle. Default : 1; 0=infinite.
   * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1.
 * Test operation
   * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0.
   * `-c,--check <check iteration count>` perform count iterations, checking correctness of results on each iteration. This can be quite slow on large numbers of GPUs. Default : 1.
   * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
   * `-G,--cudagraph <num graph launches>` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0.
+  * `-C,--report_cputime <0/1>]` Report CPU time instead of latency. Default : 0.
+  * `-R,--local_register <1/0>` enable local buffer registration on send/recv buffers. Default : 0.
+  * `-T,--timeout <time in seconds>` timeout each test after specified number of seconds. Default : disabled.
 
 ## Copyright
 
-NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.