diff --git a/src/common.cu b/src/common.cu index dcead4d..8588047 100644 --- a/src/common.cu +++ b/src/common.cu @@ -924,6 +924,7 @@ testResult_t run() { } #ifdef MPI_SUPPORT MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, mpi_comm); + MPI_Barrier(MPI_COMM_WORLD); // Ensure Bcast is complete for HCOLL #endif int gpus[nGpus*nThreads]; cudaStream_t streams[nGpus*nThreads];