mirror of
https://github.com/NVIDIA/nccl-tests.git
synced 2026-04-23 16:08:20 +08:00
Allow blocking collectives without MPI_Barrier in timing loop
This commit is contained in:
parent
8d26b23319
commit
ba52a70492
@@ -540,7 +540,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
// Complete op before returning
|
||||
TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms));
|
||||
}
|
||||
if (blocking_coll) Barrier(args);
|
||||
if (blocking_coll == 1) Barrier(args);
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
@@ -1167,7 +1167,7 @@ int main(int argc, char* argv[], char **envp) {
|
||||
#endif
|
||||
"[-d,--datatype <nccltype/all>] \n\t"
|
||||
"[-r,--root <root>] \n\t"
|
||||
"[-z,--blocking <0/1>] \n\t"
|
||||
"[-z,--blocking <0/1/2> 1=wait for completion and barrier (default behavior), 2=wait without barrier] \n\t"
|
||||
"[-y,--stream_null <0/1>] \n\t"
|
||||
"[-T,--timeout <time in seconds>] \n\t"
|
||||
"[-G,--cudagraph <num graph launches>] \n\t"
|
||||
|
||||
@@ -518,7 +518,8 @@ testResult_t writeDeviceReport(size_t *maxMem, int localRank, int proc, int tota
|
||||
nThreads, nGpus, minBytes, maxBytes,
|
||||
(stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes",
|
||||
warmup_iters, iters, agg_iters, datacheck, cudaGraphLaunches);
|
||||
if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n");
|
||||
if (blocking_coll == 1) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n");
|
||||
if (blocking_coll > 1) PRINT("# Blocking Enabled: wait for completion after each collective (no barrier) \n");
|
||||
if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n");
|
||||
PRINT("#\n");
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user