From a1efb427e764241bc43d2d91be875c9f55da03a5 Mon Sep 17 00:00:00 2001 From: Giuseppe Congiu Date: Wed, 28 Feb 2024 05:18:40 -0800 Subject: [PATCH 1/6] Add -R option to register user buffers --- src/common.cu | 63 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 2 deletions(-) diff --git a/src/common.cu b/src/common.cu index 4ac00fb..fc5af1e 100644 --- a/src/common.cu +++ b/src/common.cu @@ -80,6 +80,9 @@ static int cudaGraphLaunches = 0; static int report_cputime = 0; // Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) static int average = 1; +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0) +static int local_register = 0; +#endif #define NUM_BLOCKS 32 @@ -631,10 +634,22 @@ testResult_t threadInit(struct threadArgs* args) { NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); } NCCLCHECK(ncclGroupEnd()); +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0) + void **sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*args->nGpus) : NULL; + void **recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*args->nGpus) : NULL; + for (int i=0; inGpus; i++) { + if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->sendbuffs[i], args->maxbytes, &sendRegHandles[i])); + if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->recvbuffs[i], args->maxbytes, &recvRegHandles[i])); + } +#endif TESTCHECK(threadRunTests(args)); for (int i=0; inGpus; i++) { +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0) + if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], sendRegHandles[i])); + if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], recvRegHandles[i])); +#endif NCCLCHECK(ncclCommDestroy(args->comms[i])); } return testSuccess; @@ -651,9 +666,15 @@ testResult_t threadLaunch(struct testThread* thread) { } testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) { +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0) + NCCLCHECK(ncclMemAlloc(sendbuff, nbytes)); + NCCLCHECK(ncclMemAlloc(recvbuff, nbytes)); + if (datacheck) NCCLCHECK(ncclMemAlloc(expected, recvBytes)); +#else CUDACHECK(cudaMalloc(sendbuff, nbytes)); CUDACHECK(cudaMalloc(recvbuff, nbytes)); if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes)); +#endif return testSuccess; } @@ -707,13 +728,14 @@ int main(int argc, char* argv[]) { {"cudagraph", required_argument, 0, 'G'}, {"report_cputime", required_argument, 0, 'C'}, {"average", required_argument, 0, 'a'}, + {"local_register", required_argument, 0, 'R'}, {"help", no_argument, 0, 'h'}, {} }; while(1) { int c; - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:", longopts, &longindex); + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex); if (c == -1) break; @@ -797,6 +819,15 @@ int main(int argc, char* argv[]) { case 'a': average = (int)strtol(optarg, NULL, 0); break; + case 'R': +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0) + if ((int)strtol(optarg, NULL, 0)) { + local_register = 1; + } +#else + printf("Option -R (register) is not supported before NCCL 2.19. Ignoring\n"); +#endif + break; case 'h': default: if (c != 'h') printf("invalid option '%c'\n", c); @@ -827,6 +858,7 @@ int main(int argc, char* argv[]) { "[-G,--cudagraph ] \n\t" "[-C,--report_cputime <0/1>] \n\t" "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" + "[-R,--local_register <1/0> enable local buffer registration on send/recv buffers (default: disable)] \n\t" "[-h,--help]\n", basename(argv[0])); return 0; @@ -949,6 +981,10 @@ testResult_t run() { //if parallel init is not selected, use main thread to initialize NCCL ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus); +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0) + void **sendRegHandles = NULL; + void **recvRegHandles = NULL; +#endif if (!parallel_init) { if (ncclProcs == 1) { NCCLCHECK(ncclCommInitAll(comms, nGpus*nThreads, gpus)); @@ -960,6 +996,14 @@ testResult_t run() { } NCCLCHECK(ncclGroupEnd()); } +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0) + sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*nThreads*nGpus) : NULL; + recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*nThreads*nGpus) : NULL; + for (int i=0; i= NCCL_VERSION(2,19,0) + if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], sendRegHandles[i])); + if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], recvRegHandles[i])); +#endif NCCLCHECK(ncclCommDestroy(comms[i])); + } free(comms); } // Free off CUDA allocated memory for (int i=0; i= NCCL_VERSION(2,19,0) + if (sendbuffs[i]) NCCLCHECK(ncclMemFree((char*)sendbuffs[i])); + if (recvbuffs[i]) NCCLCHECK(ncclMemFree((char*)recvbuffs[i])); + if (datacheck) NCCLCHECK(ncclMemFree(expected[i])); +#else if (sendbuffs[i]) CUDACHECK(cudaFree((char*)sendbuffs[i])); if (recvbuffs[i]) CUDACHECK(cudaFree((char*)recvbuffs[i])); if (datacheck) CUDACHECK(cudaFree(expected[i])); +#endif } CUDACHECK(cudaFreeHost(delta)); +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0) + free(sendRegHandles); + free(recvRegHandles); +#endif envstr = getenv("NCCL_TESTS_MIN_BW"); double check_avg_bw = envstr ? atof(envstr) : -1; From d028efcf35101c6663ae8c5f33ad41bad00efb4d Mon Sep 17 00:00:00 2001 From: Kaiming Ouyang Date: Thu, 6 Jun 2024 04:59:28 -0700 Subject: [PATCH 2/6] Change ncclCommRegister size to maxBytes in serial comm init --- src/common.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common.cu b/src/common.cu index fc5af1e..04e8142 100644 --- a/src/common.cu +++ b/src/common.cu @@ -1000,8 +1000,8 @@ testResult_t run() { sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*nThreads*nGpus) : NULL; recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*nThreads*nGpus) : NULL; for (int i=0; i Date: Fri, 14 Jun 2024 11:28:55 +0200 Subject: [PATCH 3/6] improve parsing of stepbytes (increment size) argument --- src/common.cu | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/common.cu b/src/common.cu index 04e8142..7706dd9 100644 --- a/src/common.cu +++ b/src/common.cu @@ -764,7 +764,12 @@ int main(int argc, char* argv[]) { maxBytes = (size_t)parsed; break; case 'i': - stepBytes = strtol(optarg, NULL, 0); + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'stepBytes'\n"); + return -1; + } + stepBytes = (size_t)parsed; break; case 'f': stepFactor = strtol(optarg, NULL, 0); From c6eb15875f508076f3f26de4f7da3899701bc4db Mon Sep 17 00:00:00 2001 From: Oren <47992694+OrenLeung@users.noreply.github.com> Date: Wed, 24 Jul 2024 22:55:00 -0400 Subject: [PATCH 4/6] doc: add all2all factor --- doc/PERFORMANCE.md | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/PERFORMANCE.md b/doc/PERFORMANCE.md index 21fef60..942f054 100644 --- a/doc/PERFORMANCE.md +++ b/doc/PERFORMANCE.md @@ -140,5 +140,6 @@ To obtain a bus bandwidth which should be independent of the number of ranks _n_ * AllGather : (_n_-1)/_n_ * Broadcast : 1 * Reduce : 1 +* AlltoAll: (_n_-1)/_n_ The bus bandwidth should reflect the speed of the hardware bottleneck : NVLink, PCI, QPI, or network. From d2d40cc8249378efa4d7e2c949528c15eeb7d8e7 Mon Sep 17 00:00:00 2001 From: David Addison Date: Thu, 25 Jul 2024 21:47:40 -0700 Subject: [PATCH 5/6] Added -N,--run_cycles option --- src/common.cu | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/common.cu b/src/common.cu index 04e8142..872a18a 100644 --- a/src/common.cu +++ b/src/common.cu @@ -69,6 +69,7 @@ static int datacheck = 1; static int warmup_iters = 5; static int iters = 20; static int agg_iters = 1; +static int run_cycles = 1; static int ncclop = ncclSum; static int nccltype = ncclFloat; static int ncclroot = 0; @@ -598,7 +599,9 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* TESTCHECK(completeColl(args)); // Benchmark - for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { + long repeat = run_cycles; + do { + for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { setupArgs(size, type, args); char rootName[100]; sprintf(rootName, "%6i", root); @@ -606,7 +609,9 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* TESTCHECK(BenchTime(args, type, op, root, 0)); TESTCHECK(BenchTime(args, type, op, root, 1)); PRINT("\n"); - } + } + } while (--repeat); + return testSuccess; } @@ -717,6 +722,7 @@ int main(int argc, char* argv[]) { {"iters", required_argument, 0, 'n'}, {"agg_iters", required_argument, 0, 'm'}, {"warmup_iters", required_argument, 0, 'w'}, + {"run_cycles", required_argument, 0, 'N'}, {"parallel_init", required_argument, 0, 'p'}, {"check", required_argument, 0, 'c'}, {"op", required_argument, 0, 'o'}, @@ -735,7 +741,7 @@ int main(int argc, char* argv[]) { while(1) { int c; - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex); + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex); if (c == -1) break; @@ -782,6 +788,9 @@ int main(int argc, char* argv[]) { case 'w': warmup_iters = (int)strtol(optarg, NULL, 0); break; + case 'N': + run_cycles = (int)strtol(optarg, NULL, 0); + break; case 'c': datacheck = (int)strtol(optarg, NULL, 0); break; @@ -841,6 +850,7 @@ int main(int argc, char* argv[]) { "[-n,--iters ] \n\t" "[-m,--agg_iters ] \n\t" "[-w,--warmup_iters ] \n\t" + "[-N,--run_cycles run & print each cycle (default: 1; 0=infinite)] \n\t" "[-p,--parallel_init <0/1>] \n\t" "[-c,--check ] \n\t" #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) From 0d86b5a6e755c52be6f23ef3f4792385f5e255b1 Mon Sep 17 00:00:00 2001 From: David Addison Date: Tue, 30 Jul 2024 14:50:45 -0700 Subject: [PATCH 6/6] Added some missing command line options to README.md Also updated single and multi-node examples. --- README.md | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 4281799..44e406a 100644 --- a/README.md +++ b/README.md @@ -24,14 +24,15 @@ NCCL tests can run on multiple processes, multiple threads, and multiple CUDA de ### Quick examples -Run on 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes : +Run on single node with 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes : ```shell $ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8 ``` -Run with MPI on 10 processes (potentially on multiple nodes) with 4 GPUs each, for a total of 40 GPUs: +Run 64 MPI processes on nodes with 8 GPUs each, for a total of 64 GPUs spread across 8 nodes : +(NB: The nccl-tests binaries must be compiled with `MPI=1` for this case) ```shell -$ mpirun -np 10 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4 +$ mpirun -np 64 -N 8 ./build/all_reduce_perf -b 8 -e 8G -f 2 -g 1 ``` ### Performance @@ -59,14 +60,18 @@ All tests support the same set of arguments : * `-n,--iters ` number of iterations. Default : 20. * `-w,--warmup_iters ` number of warmup iterations (not timed). Default : 5. * `-m,--agg_iters ` number of operations to aggregate together in each iteration. Default : 1. + * `-N,--run_cycles ` run & print each cycle. Default : 1; 0=infinite. * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1. * Test operation * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0. * `-c,--check ` perform count iterations, checking correctness of results on each iteration. This can be quite slow on large numbers of GPUs. Default : 1. * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0. * `-G,--cudagraph ` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0. + * `-C,--report_cputime <0/1>]` Report CPU time instead of latency. Default : 0. + * `-R,--local_register <1/0>` enable local buffer registration on send/recv buffers. Default : 0. + * `-T,--timeout