Allow blocking collectives without MPI_Barrier in timing loop

2026-04-23 16:08:20 +08:00 · 2026-03-11 09:26:08 -07:00 · 2026-03-11 09:26:08 -07:00 · ba52a70492
commit ba52a70492
parent 8d26b23319
2 changed files with 4 additions and 3 deletions
--- a/src/common.cu
+++ b/src/common.cu
@@ -540,7 +540,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
    // Complete op before returning
    TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms));
  }
-  if (blocking_coll) Barrier(args);
+  if (blocking_coll == 1) Barrier(args);
  return testSuccess;
 }

@@ -1167,7 +1167,7 @@ int main(int argc, char* argv[], char **envp) {
 #endif
            "[-d,--datatype <nccltype/all>] \n\t"
            "[-r,--root <root>] \n\t"
-            "[-z,--blocking <0/1>] \n\t"
+            "[-z,--blocking <0/1/2> 1=wait for completion and barrier (default behavior), 2=wait without barrier] \n\t"
            "[-y,--stream_null <0/1>] \n\t"
            "[-T,--timeout <time in seconds>] \n\t"
            "[-G,--cudagraph <num graph launches>] \n\t"
--- a/src/util.cu
+++ b/src/util.cu
@@ -518,7 +518,8 @@ testResult_t writeDeviceReport(size_t *maxMem, int localRank, int proc, int tota
        nThreads, nGpus, minBytes, maxBytes,
        (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes",
        warmup_iters, iters, agg_iters, datacheck, cudaGraphLaunches);
-  if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n");
+  if (blocking_coll == 1) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n");
+  if (blocking_coll > 1)  PRINT("# Blocking Enabled: wait for completion after each collective (no barrier) \n");
  if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n");
  PRINT("#\n");