Merge branch 'master' of github.com:x41lakazam/nccl-tests into bisection_test

This commit is contained in:
Eyal Chocron 2024-12-01 11:05:03 +02:00
commit a599734dbd
3 changed files with 89 additions and 9 deletions

View File

@ -24,14 +24,15 @@ NCCL tests can run on multiple processes, multiple threads, and multiple CUDA de
### Quick examples ### Quick examples
Run on 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes : Run on single node with 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
```shell ```shell
$ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8 $ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
``` ```
Run with MPI on 10 processes (potentially on multiple nodes) with 4 GPUs each, for a total of 40 GPUs: Run 64 MPI processes on nodes with 8 GPUs each, for a total of 64 GPUs spread across 8 nodes :
(NB: The nccl-tests binaries must be compiled with `MPI=1` for this case)
```shell ```shell
$ mpirun -np 10 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4 $ mpirun -np 64 -N 8 ./build/all_reduce_perf -b 8 -e 8G -f 2 -g 1
``` ```
### Performance ### Performance
@ -59,14 +60,18 @@ All tests support the same set of arguments :
* `-n,--iters <iteration count>` number of iterations. Default : 20. * `-n,--iters <iteration count>` number of iterations. Default : 20.
* `-w,--warmup_iters <warmup iteration count>` number of warmup iterations (not timed). Default : 5. * `-w,--warmup_iters <warmup iteration count>` number of warmup iterations (not timed). Default : 5.
* `-m,--agg_iters <aggregation count>` number of operations to aggregate together in each iteration. Default : 1. * `-m,--agg_iters <aggregation count>` number of operations to aggregate together in each iteration. Default : 1.
* `-N,--run_cycles <cycle count>` run & print each cycle. Default : 1; 0=infinite.
* `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1. * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1.
* Test operation * Test operation
* `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0. * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0.
* `-c,--check <check iteration count>` perform count iterations, checking correctness of results on each iteration. This can be quite slow on large numbers of GPUs. Default : 1. * `-c,--check <check iteration count>` perform count iterations, checking correctness of results on each iteration. This can be quite slow on large numbers of GPUs. Default : 1.
* `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0. * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
* `-G,--cudagraph <num graph launches>` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0. * `-G,--cudagraph <num graph launches>` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0.
* `-C,--report_cputime <0/1>` Report CPU time instead of latency. Default : 0.
* `-R,--local_register <1/0>` enable local buffer registration on send/recv buffers. Default : 0.
* `-T,--timeout <time in seconds>` timeout each test after specified number of seconds. Default : disabled.
## Copyright ## Copyright
NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.

View File

@ -150,6 +150,7 @@ To obtain a bus bandwidth which should be independent of the number of ranks _n_
* AllGather : (_n_-1)/_n_ * AllGather : (_n_-1)/_n_
* Broadcast : 1 * Broadcast : 1
* Reduce : 1 * Reduce : 1
* AlltoAll : (_n_-1)/_n_
* Bisection : 1 * Bisection : 1
The bus bandwidth should reflect the speed of the hardware bottleneck : NVLink, PCI, QPI, or network. The bus bandwidth should reflect the speed of the hardware bottleneck : NVLink, PCI, QPI, or network.

View File

@ -69,6 +69,7 @@ static int datacheck = 1;
static int warmup_iters = 5; static int warmup_iters = 5;
static int iters = 20; static int iters = 20;
static int agg_iters = 1; static int agg_iters = 1;
static int run_cycles = 1;
static int ncclop = ncclSum; static int ncclop = ncclSum;
static int nccltype = ncclFloat; static int nccltype = ncclFloat;
static int ncclroot = 0; static int ncclroot = 0;
@ -80,6 +81,9 @@ static int cudaGraphLaunches = 0;
static int report_cputime = 0; static int report_cputime = 0;
// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) // Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX)
static int average = 1; static int average = 1;
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
static int local_register = 0;
#endif
#define NUM_BLOCKS 32 #define NUM_BLOCKS 32
@ -595,7 +599,9 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
TESTCHECK(completeColl(args)); TESTCHECK(completeColl(args));
// Benchmark // Benchmark
for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { long repeat = run_cycles;
do {
for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
setupArgs(size, type, args); setupArgs(size, type, args);
char rootName[100]; char rootName[100];
sprintf(rootName, "%6i", root); sprintf(rootName, "%6i", root);
@ -603,7 +609,9 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
TESTCHECK(BenchTime(args, type, op, root, 0)); TESTCHECK(BenchTime(args, type, op, root, 0));
TESTCHECK(BenchTime(args, type, op, root, 1)); TESTCHECK(BenchTime(args, type, op, root, 1));
PRINT("\n"); PRINT("\n");
} }
} while (--repeat);
return testSuccess; return testSuccess;
} }
@ -631,10 +639,22 @@ testResult_t threadInit(struct threadArgs* args) {
NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank));
} }
NCCLCHECK(ncclGroupEnd()); NCCLCHECK(ncclGroupEnd());
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
void **sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*args->nGpus) : NULL;
void **recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*args->nGpus) : NULL;
for (int i=0; i<args->nGpus; i++) {
if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->sendbuffs[i], args->maxbytes, &sendRegHandles[i]));
if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->recvbuffs[i], args->maxbytes, &recvRegHandles[i]));
}
#endif
TESTCHECK(threadRunTests(args)); TESTCHECK(threadRunTests(args));
for (int i=0; i<args->nGpus; i++) { for (int i=0; i<args->nGpus; i++) {
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], sendRegHandles[i]));
if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], recvRegHandles[i]));
#endif
NCCLCHECK(ncclCommDestroy(args->comms[i])); NCCLCHECK(ncclCommDestroy(args->comms[i]));
} }
return testSuccess; return testSuccess;
@ -651,9 +671,15 @@ testResult_t threadLaunch(struct testThread* thread) {
} }
testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) { testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) {
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
NCCLCHECK(ncclMemAlloc(sendbuff, nbytes));
NCCLCHECK(ncclMemAlloc(recvbuff, nbytes));
if (datacheck) NCCLCHECK(ncclMemAlloc(expected, recvBytes));
#else
CUDACHECK(cudaMalloc(sendbuff, nbytes)); CUDACHECK(cudaMalloc(sendbuff, nbytes));
CUDACHECK(cudaMalloc(recvbuff, nbytes)); CUDACHECK(cudaMalloc(recvbuff, nbytes));
if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes)); if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes));
#endif
return testSuccess; return testSuccess;
} }
@ -696,6 +722,7 @@ int main(int argc, char* argv[]) {
{"iters", required_argument, 0, 'n'}, {"iters", required_argument, 0, 'n'},
{"agg_iters", required_argument, 0, 'm'}, {"agg_iters", required_argument, 0, 'm'},
{"warmup_iters", required_argument, 0, 'w'}, {"warmup_iters", required_argument, 0, 'w'},
{"run_cycles", required_argument, 0, 'N'},
{"parallel_init", required_argument, 0, 'p'}, {"parallel_init", required_argument, 0, 'p'},
{"check", required_argument, 0, 'c'}, {"check", required_argument, 0, 'c'},
{"op", required_argument, 0, 'o'}, {"op", required_argument, 0, 'o'},
@ -707,13 +734,14 @@ int main(int argc, char* argv[]) {
{"cudagraph", required_argument, 0, 'G'}, {"cudagraph", required_argument, 0, 'G'},
{"report_cputime", required_argument, 0, 'C'}, {"report_cputime", required_argument, 0, 'C'},
{"average", required_argument, 0, 'a'}, {"average", required_argument, 0, 'a'},
{"local_register", required_argument, 0, 'R'},
{"help", no_argument, 0, 'h'}, {"help", no_argument, 0, 'h'},
{} {}
}; };
while(1) { while(1) {
int c; int c;
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:", longopts, &longindex); c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex);
if (c == -1) if (c == -1)
break; break;
@ -742,7 +770,12 @@ int main(int argc, char* argv[]) {
maxBytes = (size_t)parsed; maxBytes = (size_t)parsed;
break; break;
case 'i': case 'i':
stepBytes = strtol(optarg, NULL, 0); parsed = parsesize(optarg);
if (parsed < 0) {
fprintf(stderr, "invalid size specified for 'stepBytes'\n");
return -1;
}
stepBytes = (size_t)parsed;
break; break;
case 'f': case 'f':
stepFactor = strtol(optarg, NULL, 0); stepFactor = strtol(optarg, NULL, 0);
@ -760,6 +793,9 @@ int main(int argc, char* argv[]) {
case 'w': case 'w':
warmup_iters = (int)strtol(optarg, NULL, 0); warmup_iters = (int)strtol(optarg, NULL, 0);
break; break;
case 'N':
run_cycles = (int)strtol(optarg, NULL, 0);
break;
case 'c': case 'c':
datacheck = (int)strtol(optarg, NULL, 0); datacheck = (int)strtol(optarg, NULL, 0);
break; break;
@ -797,6 +833,15 @@ int main(int argc, char* argv[]) {
case 'a': case 'a':
average = (int)strtol(optarg, NULL, 0); average = (int)strtol(optarg, NULL, 0);
break; break;
case 'R':
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if ((int)strtol(optarg, NULL, 0)) {
local_register = 1;
}
#else
printf("Option -R (register) is not supported before NCCL 2.19. Ignoring\n");
#endif
break;
case 'h': case 'h':
default: default:
if (c != 'h') printf("invalid option '%c'\n", c); if (c != 'h') printf("invalid option '%c'\n", c);
@ -810,6 +855,7 @@ int main(int argc, char* argv[]) {
"[-n,--iters <iteration count>] \n\t" "[-n,--iters <iteration count>] \n\t"
"[-m,--agg_iters <aggregated iteration count>] \n\t" "[-m,--agg_iters <aggregated iteration count>] \n\t"
"[-w,--warmup_iters <warmup iteration count>] \n\t" "[-w,--warmup_iters <warmup iteration count>] \n\t"
"[-N,--run_cycles <cycle count> run & print each cycle (default: 1; 0=infinite)] \n\t"
"[-p,--parallel_init <0/1>] \n\t" "[-p,--parallel_init <0/1>] \n\t"
"[-c,--check <check iteration count>] \n\t" "[-c,--check <check iteration count>] \n\t"
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
@ -827,6 +873,7 @@ int main(int argc, char* argv[]) {
"[-G,--cudagraph <num graph launches>] \n\t" "[-G,--cudagraph <num graph launches>] \n\t"
"[-C,--report_cputime <0/1>] \n\t" "[-C,--report_cputime <0/1>] \n\t"
"[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
"[-R,--local_register <1/0> enable local buffer registration on send/recv buffers (default: disable)] \n\t"
"[-h,--help]\n", "[-h,--help]\n",
basename(argv[0])); basename(argv[0]));
return 0; return 0;
@ -949,6 +996,10 @@ testResult_t run() {
//if parallel init is not selected, use main thread to initialize NCCL //if parallel init is not selected, use main thread to initialize NCCL
ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus); ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus);
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
void **sendRegHandles = NULL;
void **recvRegHandles = NULL;
#endif
if (!parallel_init) { if (!parallel_init) {
if (ncclProcs == 1) { if (ncclProcs == 1) {
NCCLCHECK(ncclCommInitAll(comms, nGpus*nThreads, gpus)); NCCLCHECK(ncclCommInitAll(comms, nGpus*nThreads, gpus));
@ -960,6 +1011,14 @@ testResult_t run() {
} }
NCCLCHECK(ncclGroupEnd()); NCCLCHECK(ncclGroupEnd());
} }
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*nThreads*nGpus) : NULL;
recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*nThreads*nGpus) : NULL;
for (int i=0; i<nGpus*nThreads; i++) {
if (local_register) NCCLCHECK(ncclCommRegister(comms[i], sendbuffs[i], maxBytes, &sendRegHandles[i]));
if (local_register) NCCLCHECK(ncclCommRegister(comms[i], recvbuffs[i], maxBytes, &recvRegHandles[i]));
}
#endif
} }
int errors[nThreads]; int errors[nThreads];
@ -1035,18 +1094,33 @@ testResult_t run() {
#endif #endif
if (!parallel_init) { if (!parallel_init) {
for(int i=0; i<nGpus*nThreads; ++i) for(int i=0; i<nGpus*nThreads; ++i) {
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], sendRegHandles[i]));
if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], recvRegHandles[i]));
#endif
NCCLCHECK(ncclCommDestroy(comms[i])); NCCLCHECK(ncclCommDestroy(comms[i]));
}
free(comms); free(comms);
} }
// Free off CUDA allocated memory // Free off CUDA allocated memory
for (int i=0; i<nGpus*nThreads; i++) { for (int i=0; i<nGpus*nThreads; i++) {
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (sendbuffs[i]) NCCLCHECK(ncclMemFree((char*)sendbuffs[i]));
if (recvbuffs[i]) NCCLCHECK(ncclMemFree((char*)recvbuffs[i]));
if (datacheck) NCCLCHECK(ncclMemFree(expected[i]));
#else
if (sendbuffs[i]) CUDACHECK(cudaFree((char*)sendbuffs[i])); if (sendbuffs[i]) CUDACHECK(cudaFree((char*)sendbuffs[i]));
if (recvbuffs[i]) CUDACHECK(cudaFree((char*)recvbuffs[i])); if (recvbuffs[i]) CUDACHECK(cudaFree((char*)recvbuffs[i]));
if (datacheck) CUDACHECK(cudaFree(expected[i])); if (datacheck) CUDACHECK(cudaFree(expected[i]));
#endif
} }
CUDACHECK(cudaFreeHost(delta)); CUDACHECK(cudaFreeHost(delta));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
free(sendRegHandles);
free(recvRegHandles);
#endif
envstr = getenv("NCCL_TESTS_MIN_BW"); envstr = getenv("NCCL_TESTS_MIN_BW");
double check_avg_bw = envstr ? atof(envstr) : -1; double check_avg_bw = envstr ? atof(envstr) : -1;