Allocate buffers during thread initialization

Signed-off-by: Theofilos Ioannis Manitaras <tmanitaras@nvidia.com>
This commit is contained in:
Theofilos Ioannis Manitaras 2026-03-10 18:33:50 +01:00 committed by David Addison
parent dd0bafd178
commit 8d26b23319
2 changed files with 11 additions and 1 deletion

View File

@@ -801,6 +801,16 @@ testResult_t threadInit(struct threadArgs* args) {
int firstRank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus;
TESTCHECK(initComms(args->comms, args->nGpus, firstRank, nranks, args->gpus, args->ncclId));
/* Allocate buffers for each GPU (parallel_init: each thread allocates its own) */
size_t sendBytes, recvBytes;
ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)args->maxbytes, (size_t)nranks);
NCCLCHECK(ncclGroupStart());
for (int i = 0; i < args->nGpus; i++) {
CUDACHECK(cudaSetDevice(args->gpus[i]));
TESTCHECK(AllocateBuffs(args->sendbuffs + i, sendBytes, args->recvbuffs + i, recvBytes, args->expected + i, (size_t)args->maxbytes));
}
NCCLCHECK(ncclGroupEnd());
// Capture the memory used by the GPUs after initializing the NCCL communicators
for (int g = 0; g < args->nGpus; ++g) {
CUDACHECK(cudaSetDevice(args->gpus[g]));

View File

@@ -182,7 +182,7 @@ extern void Barrier(struct threadArgs* args);
extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root);
extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks);
extern testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks, const int rank);
extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks);
extern testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes);
#include <unistd.h>