diff --git a/src/common.cu b/src/common.cu
index 566b13b..07f3890 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -801,6 +801,16 @@ testResult_t threadInit(struct threadArgs* args) {
   int firstRank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus;
   TESTCHECK(initComms(args->comms, args->nGpus, firstRank, nranks, args->gpus, args->ncclId));
 
+  /* Allocate the send, recv and expected buffers on each of this thread's GPUs (parallel_init: each thread allocates its own) */
+  size_t sendBytes, recvBytes;
+  ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)args->maxbytes, (size_t)nranks);
+  NCCLCHECK(ncclGroupStart());
+  for (int i = 0; i < args->nGpus; i++) {
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
+    TESTCHECK(AllocateBuffs(args->sendbuffs + i, sendBytes, args->recvbuffs + i, recvBytes, args->expected + i, (size_t)args->maxbytes));
+  }
+  NCCLCHECK(ncclGroupEnd());
+
   // Capture the memory used by the GPUs after initializing the NCCL communicators
   for (int g = 0; g < args->nGpus; ++g) {
     CUDACHECK(cudaSetDevice(args->gpus[g]));
diff --git a/src/common.h b/src/common.h
index 9476975..41d6c92 100644
--- a/src/common.h
+++ b/src/common.h
@@ -182,7 +182,7 @@ extern void Barrier(struct threadArgs* args);
 extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op,  const char* opName, int root);
 extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks);
 extern testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks, const int rank);
-extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks);
+extern testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes);
 
 #include <unistd.h>