mirror of
https://github.com/NVIDIA/nccl-tests.git
synced 2026-05-03 13:02:36 +00:00
Make the -c option a datacheck iteration count parameter.
The default is 1.
This commit is contained in:
@@ -62,7 +62,7 @@ All tests support the same set of arguments :
|
||||
* `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1.
|
||||
* Test operation
|
||||
* `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0.
|
||||
* `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1.
|
||||
* `-c,--check <check iteration count>` perform the given number of check iterations, verifying the correctness of results on each iteration. This can be quite slow on large numbers of GPUs. Default : 1.
|
||||
* `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
|
||||
* `-G,--cudagraph <num graph launches>` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0.
|
||||
|
||||
|
||||
+4
-2
@@ -487,7 +487,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
int64_t wrongElts = 0;
|
||||
static __thread int rep = 0;
|
||||
rep++;
|
||||
if (datacheck) {
|
||||
for (int c = 0; c < datacheck; c++) {
|
||||
// Initialize sendbuffs, recvbuffs and expected
|
||||
TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place));
|
||||
|
||||
@@ -536,8 +536,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
|
||||
//aggregate delta from all threads and procs
|
||||
long long wrongElts1 = wrongElts;
|
||||
//if (wrongElts) fprintf(stderr, "\nERROR: Data corruption : rank %d size %ld wrongElts %ld\n", args->proc, args->expectedBytes, wrongElts);
|
||||
Allreduce(args, &wrongElts1, /*sum*/4);
|
||||
wrongElts = wrongElts1;
|
||||
if (wrongElts) break;
|
||||
}
|
||||
|
||||
double timeUsec = (report_cputime ? cputimeSec : deltaSec)*1.0E6;
|
||||
@@ -809,7 +811,7 @@ int main(int argc, char* argv[]) {
|
||||
"[-m,--agg_iters <aggregated iteration count>] \n\t"
|
||||
"[-w,--warmup_iters <warmup iteration count>] \n\t"
|
||||
"[-p,--parallel_init <0/1>] \n\t"
|
||||
"[-c,--check <0/1>] \n\t"
|
||||
"[-c,--check <check iteration count>] \n\t"
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
|
||||
"[-o,--op <sum/prod/min/max/avg/mulsum/all>] \n\t"
|
||||
#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
|
||||
|
||||
Reference in New Issue
Block a user