diff --git a/README.md b/README.md
index ce952ef..a1fb8de 100644
--- a/README.md
+++ b/README.md
@@ -68,7 +68,7 @@ All tests support the same set of arguments :
   * `-r,--root ` Specify which root to use. Only for operations with a root like broadcast or reduce. Default : 0.
 * Performance
   * `-n,--iters ` number of iterations. Default : 20.
-  * `-w,--warmup_iters ` number of warmup iterations (not timed). Default : 5.
+  * `-w,--warmup_iters ` number of warmup iterations (not timed). Default : 1.
   * `-m,--agg_iters ` number of operations to aggregate together in each iteration. Default : 1.
   * `-N,--run_cycles ` run & print each cycle. Default : 1; 0=infinite.
   * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1.
diff --git a/src/common.cu b/src/common.cu
index 99b1d59..42bc3eb 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -78,7 +78,7 @@ static size_t maxBytes = 32*1024*1024;
 static size_t stepBytes = 1*1024*1024;
 static size_t stepFactor = 1;
 static int datacheck = 1;
-static int warmup_iters = 5;
+static int warmup_iters = 1;
 static int iters = 20;
 static int agg_iters = 1;
 static int run_cycles = 1;
@@ -607,19 +607,14 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
   // Sync to avoid first-call timeout
   Barrier(args);
 
-  // Warm-up for large size
-  setupArgs(args->maxbytes, type, args);
-  for (int iter = 0; iter < warmup_iters; iter++) {
-    TESTCHECK(startColl(args, type, op, root, 0, iter));
+  // Warm-up for all sizes (using a stepfactor of 2); a 0-byte start steps to 1, since 0*2 would never advance
+  for (size_t size = args->minbytes; size <= args->maxbytes; size = (size ? size * 2 : 1)) {
+    setupArgs(size, type, args);
+    for (int iter = 0; iter < warmup_iters; iter++) {
+      TESTCHECK(startColl(args, type, op, root, 0, iter));
+    }
+    TESTCHECK(completeColl(args));
   }
-  TESTCHECK(completeColl(args));
-
-  // Warm-up for small size
-  setupArgs(args->minbytes, type, args);
-  for (int iter = 0; iter < warmup_iters; iter++) {
-    TESTCHECK(startColl(args, type, op, root, 0, iter));
-  }
-  TESTCHECK(completeColl(args));
 
   // Benchmark
   long repeat = run_cycles;