Make the -c option a datacheck iteration count parameter

The default iteration count is 1.
This commit is contained in:
David Addison
2023-09-13 11:15:13 -07:00
parent 1a5f551ffd
commit 6c46206a47
2 changed files with 5 additions and 3 deletions
+1 -1
View File
@@ -62,7 +62,7 @@ All tests support the same set of arguments :
* `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1. * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1.
* Test operation * Test operation
* `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0. * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0.
* `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1. * `-c,--check <check iteration count>` perform the given number of check iterations, verifying correctness of results on each iteration. This can be quite slow on large numbers of GPUs. Default : 1.
* `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0. * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
* `-G,--cudagraph <num graph launches>` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0. * `-G,--cudagraph <num graph launches>` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0.
+4 -2
View File
@@ -487,7 +487,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
int64_t wrongElts = 0; int64_t wrongElts = 0;
static __thread int rep = 0; static __thread int rep = 0;
rep++; rep++;
if (datacheck) { for (int c = 0; c < datacheck; c++) {
// Initialize sendbuffs, recvbuffs and expected // Initialize sendbuffs, recvbuffs and expected
TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place));
@@ -536,8 +536,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
//aggregate delta from all threads and procs //aggregate delta from all threads and procs
long long wrongElts1 = wrongElts; long long wrongElts1 = wrongElts;
//if (wrongElts) fprintf(stderr, "\nERROR: Data corruption : rank %d size %ld wrongElts %ld\n", args->proc, args->expectedBytes, wrongElts);
Allreduce(args, &wrongElts1, /*sum*/4); Allreduce(args, &wrongElts1, /*sum*/4);
wrongElts = wrongElts1; wrongElts = wrongElts1;
if (wrongElts) break;
} }
double timeUsec = (report_cputime ? cputimeSec : deltaSec)*1.0E6; double timeUsec = (report_cputime ? cputimeSec : deltaSec)*1.0E6;
@@ -809,7 +811,7 @@ int main(int argc, char* argv[]) {
"[-m,--agg_iters <aggregated iteration count>] \n\t" "[-m,--agg_iters <aggregated iteration count>] \n\t"
"[-w,--warmup_iters <warmup iteration count>] \n\t" "[-w,--warmup_iters <warmup iteration count>] \n\t"
"[-p,--parallel_init <0/1>] \n\t" "[-p,--parallel_init <0/1>] \n\t"
"[-c,--check <0/1>] \n\t" "[-c,--check <check iteration count>] \n\t"
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
"[-o,--op <sum/prod/min/max/avg/mulsum/all>] \n\t" "[-o,--op <sum/prod/min/max/avg/mulsum/all>] \n\t"
#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) #elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)