diff --git a/src/common.cu b/src/common.cu index 566b13b..07f3890 100644 --- a/src/common.cu +++ b/src/common.cu @@ -801,6 +801,16 @@ testResult_t threadInit(struct threadArgs* args) { int firstRank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus; TESTCHECK(initComms(args->comms, args->nGpus, firstRank, nranks, args->gpus, args->ncclId)); + /* Allocate buffers for each GPU (parallel_init: each thread allocates its own) */ + size_t sendBytes, recvBytes; + ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)args->maxbytes, (size_t)nranks); + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < args->nGpus; i++) { + CUDACHECK(cudaSetDevice(args->gpus[i])); + TESTCHECK(AllocateBuffs(args->sendbuffs + i, sendBytes, args->recvbuffs + i, recvBytes, args->expected + i, (size_t)args->maxbytes)); + } + NCCLCHECK(ncclGroupEnd()); + // Capture the memory used by the GPUs after initializing the NCCL communicators for (int g = 0; g < args->nGpus; ++g) { CUDACHECK(cudaSetDevice(args->gpus[g])); diff --git a/src/common.h b/src/common.h index 9476975..41d6c92 100644 --- a/src/common.h +++ b/src/common.h @@ -182,7 +182,7 @@ extern void Barrier(struct threadArgs* args); extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root); extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks); extern testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks, const int rank); -extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks); +extern testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes); #include