From 6c46206a478203b6453035fe0d40dc6418acd089 Mon Sep 17 00:00:00 2001 From: David Addison Date: Wed, 13 Sep 2023 11:15:13 -0700 Subject: [PATCH] Make the -c option be a datacheck iteration count parameter Default is 1 --- README.md | 2 +- src/common.cu | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 580996b..4281799 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ All tests support the same set of arguments : * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1. * Test operation * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0. - * `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1. + * `-c,--check <check iteration count>` perform count iterations, checking correctness of results on each iteration. This can be quite slow on large numbers of GPUs. Default : 1. * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0. * `-G,--cudagraph <num graph launches>` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0. 
diff --git a/src/common.cu b/src/common.cu index 48a629c..dcead4d 100644 --- a/src/common.cu +++ b/src/common.cu @@ -487,7 +487,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t int64_t wrongElts = 0; static __thread int rep = 0; rep++; - if (datacheck) { + for (int c = 0; c < datacheck; c++) { // Initialize sendbuffs, recvbuffs and expected TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); @@ -536,8 +536,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t //aggregate delta from all threads and procs long long wrongElts1 = wrongElts; + //if (wrongElts) fprintf(stderr, "\nERROR: Data corruption : rank %d size %ld wrongElts %ld\n", args->proc, args->expectedBytes, wrongElts); Allreduce(args, &wrongElts1, /*sum*/4); wrongElts = wrongElts1; + if (wrongElts) break; } double timeUsec = (report_cputime ? cputimeSec : deltaSec)*1.0E6; @@ -809,7 +811,7 @@ int main(int argc, char* argv[]) { "[-m,--agg_iters ] \n\t" "[-w,--warmup_iters ] \n\t" "[-p,--parallel_init <0/1>] \n\t" - "[-c,--check <0/1>] \n\t" + "[-c,--check ] \n\t" #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) "[-o,--op ] \n\t" #elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)