Allow blocking collectives without MPI_Barrier in timing loop

This commit is contained in:
David Addison 2026-03-11 09:26:08 -07:00
parent 8d26b23319
commit ba52a70492
2 changed files with 4 additions and 3 deletions

View File

@@ -540,7 +540,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
// Complete op before returning
TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms));
}
if (blocking_coll) Barrier(args);
if (blocking_coll == 1) Barrier(args);
return testSuccess;
}
@@ -1167,7 +1167,7 @@ int main(int argc, char* argv[], char **envp) {
#endif
"[-d,--datatype <nccltype/all>] \n\t"
"[-r,--root <root>] \n\t"
"[-z,--blocking <0/1>] \n\t"
"[-z,--blocking <0/1/2> 1=wait for completion and barrier (default behavior), 2=wait without barrier] \n\t"
"[-y,--stream_null <0/1>] \n\t"
"[-T,--timeout <time in seconds>] \n\t"
"[-G,--cudagraph <num graph launches>] \n\t"

View File

@@ -518,7 +518,8 @@ testResult_t writeDeviceReport(size_t *maxMem, int localRank, int proc, int tota
nThreads, nGpus, minBytes, maxBytes,
(stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes",
warmup_iters, iters, agg_iters, datacheck, cudaGraphLaunches);
if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n");
if (blocking_coll == 1) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n");
if (blocking_coll > 1) PRINT("# Blocking Enabled: wait for completion after each collective (no barrier) \n");
if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n");
PRINT("#\n");