From d313d20a2695b7a9be9b22bd9417fe2e201fef3f Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Tue, 20 Sep 2022 02:21:36 -0700
Subject: [PATCH] Update NCCL tests

---
 src/Makefile          |   9 ++-
 src/all_gather.cu     |  18 ++---
 src/all_reduce.cu     |   5 +-
 src/alltoall.cu       |   9 +--
 src/broadcast.cu      |   7 +-
 src/common.cu         | 151 ++++++++++++++++++++++++++++--------
 src/common.h          |  56 ++++------------
 src/gather.cu         |   7 +-
 src/hypercube.cu      |   5 +-
 src/reduce.cu         |   5 +-
 src/reduce_scatter.cu |  16 +++--
 src/scatter.cu        |   7 +-
 src/sendrecv.cu       |   5 +-
 src/timer.cc          |  28 ++++
 src/timer.h           |  15 +++++
 15 files changed, 206 insertions(+), 137 deletions(-)
 create mode 100644 src/timer.cc
 create mode 100644 src/timer.h

diff --git a/src/Makefile b/src/Makefile
index 137b9d7..6d8b1ef 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -92,7 +92,12 @@ ${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS)
 	@mkdir -p ${DST_DIR}
 	$(NVCC) -o $@ $(NVCUFLAGS) -c $<
 
-${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o $(TEST_VERIFIABLE_OBJS)
+${DST_DIR}/timer.o: timer.cc timer.h
+	@printf "Compiling %-35s > %s\n" $< $@
+	@mkdir -p ${DST_DIR}
+	$(CXX) $(CXXFLAGS) -o $@ -c timer.cc
+
+${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS)
 	@printf "Linking %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
 	$(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS}
diff --git a/src/all_gather.cu b/src/all_gather.cu
index 1eaafdd..0831207 100644
--- a/src/all_gather.cu
+++ b/src/all_gather.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -7,12 +7,15 @@
 #include "cuda_runtime.h"
 #include "common.h"
 
+#define ALIGN 4
+
 void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
-  *sendcount = count/nranks;
-  *recvcount = (count/nranks)*nranks;
-  *sendInplaceOffset = count/nranks;
+  size_t base = (count/(ALIGN*nranks))*ALIGN;
+  *sendcount = base;
+  *recvcount = base*nranks;
+  *sendInplaceOffset = base;
   *recvInplaceOffset = 0;
-  *paramcount = *sendcount;
+  *paramcount = base;
 }
 
 testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
@@ -21,8 +24,7 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
@@ -78,7 +80,7 @@ testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t
   }
 
   for (int i=0; i<type_count; i++) {
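The ALIGN change in AllGatherGetCollByteCount above rounds each rank's send count down to a multiple of 4 elements, so both sendcount and recvcount (= base*nranks) stay 4-element aligned for any nranks. A standalone C++ illustration of the arithmetic (not part of the patch; the numbers are only examples):

    // Standalone illustration of the new per-rank count rounding (hypothetical
    // example values; mirrors the AllGatherGetCollByteCount hunk above).
    #include <stdio.h>
    #include <stddef.h>

    #define ALIGN 4

    int main(void) {
      size_t count = 1048575, nranks = 8;
      size_t old_sendcount = count/nranks;           // 131071: not a multiple of ALIGN
      size_t base = (count/(ALIGN*nranks))*ALIGN;    // 131068: multiple of ALIGN
      printf("old=%zu new=%zu recvcount=%zu\n", old_sendcount, base, base*nranks);
      return 0;
    }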
diff --git a/src/all_reduce.cu b/src/all_reduce.cu
--- a/src/all_reduce.cu
+++ b/src/all_reduce.cu
@@ ... @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
diff --git a/src/alltoall.cu b/src/alltoall.cu
index 0eae1b0..41c7c4a 100644
--- a/src/alltoall.cu
+++ b/src/alltoall.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -21,9 +21,7 @@ testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, nccl
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    char* str = getenv("NCCL_TESTS_DEVICE");
-    int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
@@ -51,7 +49,6 @@ testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclD
   int nRanks;
   NCCLCHECK(ncclCommCount(comm, &nRanks));
   size_t rankOffset = count * wordSize(type);
-  if (count == 0) return testSuccess;
 
 #if NCCL_MAJOR < 2 || NCCL_MINOR < 7
   printf("NCCL 2.7 or later is needed for alltoall. This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR);
@@ -97,7 +94,7 @@ testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t t
   }
 
   for (int i=0; i<type_count; i++) {
diff --git a/src/broadcast.cu b/src/broadcast.cu
--- a/src/broadcast.cu
+++ b/src/broadcast.cu
@@ ... @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc
   size_t recvcount = args->expectedBytes / wordSize(type);
 
   for (int i=0; i<args->nGpus; i++) {
-    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
@@ -94,7 +93,7 @@ testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t
   for (int i=0; i<type_count; i++) {
diff --git a/src/common.cu b/src/common.cu
--- a/src/common.cu
+++ b/src/common.cu
@@ ... @@ void Allreduce(struct threadArgs* args, double* value, int average)
-  if(average == 1) accumulator[epoch] /= args->nProcs*args->nThreads;
+  if(average == 1) accumulator[epoch] /= args->totalProcs*args->nThreads;
   counter[epoch] = 0;
   pthread_cond_broadcast(&cond[epoch]);
 }
@@ -220,10 +229,8 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   CUDACHECK(cudaHostAlloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), cudaHostAllocMapped));
 
   for (int i=0; i<args->nGpus; i++) {
-    int device;
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
-    CUDACHECK(cudaSetDevice(device));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i];
     TESTCHECK(CheckDelta(data, args->expected[i], count, 0, type, op, 0, nranks, wrongPerGpu+i));
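A pattern worth noting across the hunks above: every call site used to recompute the CUDA device id from localRank/thread arithmetic (alltoall.cu even consulted NCCL_TESTS_DEVICE on its own), and each copy had to agree. The patch replaces all of them with a single table lookup, cudaSetDevice(args->gpus[i]). The code that fills args->gpus falls outside this excerpt; a hypothetical C++ sketch of an equivalent initialization, assuming the old formula stays the default mapping:

    // Hypothetical sketch -- the real initialization is not shown in this patch.
    // Fill the per-thread device table once so call sites stop recomputing it.
    static void initGpuTable(struct threadArgs* args) {
      for (int i = 0; i < args->nGpus; i++)
        args->gpus[i] = args->localRank*args->nThreads*args->nGpus
                      + args->thread*args->nGpus + i;
    }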
@@ -266,6 +273,8 @@ testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t*
   int remaining = ngpus;
   int* done = (int*)malloc(sizeof(int)*ngpus);
   memset(done, 0, sizeof(int)*ngpus);
+  timer tim;
+
   while (remaining) {
    int idle = 1;
    for (int i=0; i<ngpus; i++) {
@@ ... @@ testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t*
+    if (tim.elapsed() > timeout && timeout > 0) {
+      for (int i=0; i<ngpus; i++) {
@@ ... @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   if (args->nGpus > 1) NCCLCHECK(ncclGroupStart());
   for (int i = 0; i < args->nGpus; i++) {
 #ifndef NCCL_MAJOR
-    int cudaDev;
-    NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev));
-    CUDACHECK(cudaSetDevice(cudaDev));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
 #endif
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     char* recvBuff = ((char*)args->recvbuffs[i]) + shift;
@@ -411,7 +431,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 #endif
 
   // Performance Benchmark
-  auto start = std::chrono::high_resolution_clock::now();
+  timer tim;
   for (int iter = 0; iter < iters; iter++) {
     if (agg_iters>1) NCCLCHECK(ncclGroupStart());
     for (int aiter = 0; aiter < agg_iters; aiter++) {
@@ -432,7 +452,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     }
     // Resync CPU, restart timing, launch cuda graph
     Barrier(args);
-    start = std::chrono::high_resolution_clock::now();
+    tim.reset();
     for (int l=0; l<cudaGraphLaunches; l++) {
       for (int i=0; i<args->nGpus; i++) {
         CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
@@ ... @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       }
     }
 #endif
 
+  double cputimeSec = tim.elapsed()/(iters*agg_iters);
   TESTCHECK(completeColl(args));
 
-  auto delta = std::chrono::high_resolution_clock::now() - start;
-  double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
+  double deltaSec = tim.elapsed();
   deltaSec = deltaSec/(iters*agg_iters);
   if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches;
   Allreduce(args, &deltaSec, average);
@@ -520,7 +540,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     wrongElts = wrongElts1;
   }
 
-  double timeUsec = deltaSec*1.0E6;
+  double timeUsec = (report_cputime ? cputimeSec : deltaSec)*1.0E6;
   char timeStr[100];
   if (timeUsec >= 10000.0) {
     sprintf(timeStr, "%7.0f", timeUsec);
@@ -555,6 +575,9 @@ void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) {
 }
 
 testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) {
+  // Sync to avoid first-call timeout
+  Barrier(args);
+
   // Warm-up for large size
   setupArgs(args->maxbytes, type, args);
   for (int iter = 0; iter < warmup_iters; iter++) {
@@ -586,8 +609,7 @@ testResult_t threadRunTests(struct threadArgs* args) {
   // Set device to the first of our GPUs. If we don't do that, some operations
   // will be done on the current GPU (by default : 0) and if the GPUs are in
   // exclusive mode those operations will fail.
-  int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus;
-  CUDACHECK(cudaSetDevice(gpuid));
+  CUDACHECK(cudaSetDevice(args->gpus[0]));
   TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]));
   return testSuccess;
 }
@@ -598,13 +620,12 @@ testResult_t threadInit(struct threadArgs* args) {
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   //set main thread again
-  is_main_thread = (args->proc == 0 && args->thread == 0) ? 1 : 0;
+  is_main_thread = (is_main_proc && args->thread == 0) ? 1 : 0;
 
   NCCLCHECK(ncclGroupStart());
   for (int i=0; i<args->nGpus; i++) {
     int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank));
   }
   NCCLCHECK(ncclGroupEnd());
@@ -679,7 +700,10 @@ int main(int argc, char* argv[]) {
   {"datatype", required_argument, 0, 'd'},
   {"root", required_argument, 0, 'r'},
   {"blocking", required_argument, 0, 'z'},
+  {"stream_null", required_argument, 0, 'y'},
+  {"timeout", required_argument, 0, 'T'},
   {"cudagraph", required_argument, 0, 'G'},
+  {"report_cputime", required_argument, 0, 'C'},
   {"average", required_argument, 0, 'a'},
   {"help", no_argument, 0, 'h'},
   {}
@@ -687,7 +711,7 @@ int main(int argc, char* argv[]) {
 
   while(1) {
     int c;
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:", longopts, &longindex);
 
     if (c == -1)
       break;
@@ -752,6 +776,12 @@ int main(int argc, char* argv[]) {
       case 'z':
         blocking_coll = strtol(optarg, NULL, 0);
         break;
+      case 'y':
+        streamnull = strtol(optarg, NULL, 0);
+        break;
+      case 'T':
+        timeout = strtol(optarg, NULL, 0);
+        break;
       case 'G':
 #if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030
         cudaGraphLaunches = strtol(optarg, NULL, 0);
@@ -759,6 +789,9 @@ int main(int argc, char* argv[]) {
         printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n");
 #endif
         break;
+      case 'C':
+        report_cputime = strtol(optarg, NULL, 0);
+        break;
       case 'a':
         average = (int)strtol(optarg, NULL, 0);
         break;
@@ -787,11 +820,14 @@ int main(int argc, char* argv[]) {
         "[-d,--datatype <nccltype/all>] \n\t"
         "[-r,--root <root>] \n\t"
         "[-z,--blocking <0/1>] \n\t"
+        "[-y,--stream_null <0/1>] \n\t"
+        "[-T,--timeout
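The new src/timer.cc and src/timer.h are created by this patch, but their bodies fall outside the excerpt. Judging from the call sites above (timer tim; tim.reset(); tim.elapsed() compared against a timeout in seconds and divided by iteration counts), a minimal C++ sketch of a compatible interface could look like the following; this is an assumption-based reconstruction, not the actual file contents:

    // Hypothetical sketch of timer.h; the real 15-line header may differ.
    #include <chrono>

    class timer {
      std::chrono::steady_clock::time_point start;
    public:
      // Starts timing on construction, matching `timer tim;` in BenchTime().
      timer() : start(std::chrono::steady_clock::now()) {}
      // Restarts the stopwatch, matching tim.reset() after the Barrier().
      void reset() { start = std::chrono::steady_clock::now(); }
      // Seconds elapsed since construction or the last reset().
      double elapsed() const {
        return std::chrono::duration_cast<std::chrono::duration<double>>(
            std::chrono::steady_clock::now() - start).count();
      }
    };

Whatever the real implementation, the observable behavior in this patch is that cputimeSec is read before completeColl() (CPU launch time only) while deltaSec is read after it (wall time including GPU completion), and the new -C/--report_cputime flag selects which of the two gets printed.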