mirror of
https://github.com/NVIDIA/nccl-tests.git
synced 2026-05-03 13:02:36 +00:00
Modified warmup to run for more message sizes
Loops between minBytes and maxBytes, doubling the size each time. Reduced the default warmup iteration count to 1 (was 5).
This commit is contained in:
@@ -68,7 +68,7 @@ All tests support the same set of arguments :
|
|||||||
* `-r,--root <root/all>` Specify which root to use. Only for operations with a root like broadcast or reduce. Default : 0.
|
* `-r,--root <root/all>` Specify which root to use. Only for operations with a root like broadcast or reduce. Default : 0.
|
||||||
* Performance
|
* Performance
|
||||||
* `-n,--iters <iteration count>` number of iterations. Default : 20.
|
* `-n,--iters <iteration count>` number of iterations. Default : 20.
|
||||||
* `-w,--warmup_iters <warmup iteration count>` number of warmup iterations (not timed). Default : 5.
|
* `-w,--warmup_iters <warmup iteration count>` number of warmup iterations (not timed). Default : 1.
|
||||||
* `-m,--agg_iters <aggregation count>` number of operations to aggregate together in each iteration. Default : 1.
|
* `-m,--agg_iters <aggregation count>` number of operations to aggregate together in each iteration. Default : 1.
|
||||||
* `-N,--run_cycles <cycle count>` run & print each cycle. Default : 1; 0=infinite.
|
* `-N,--run_cycles <cycle count>` run & print each cycle. Default : 1; 0=infinite.
|
||||||
* `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1.
|
* `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1.
|
||||||
|
|||||||
+4
-9
@@ -78,7 +78,7 @@ static size_t maxBytes = 32*1024*1024;
|
|||||||
static size_t stepBytes = 1*1024*1024;
|
static size_t stepBytes = 1*1024*1024;
|
||||||
static size_t stepFactor = 1;
|
static size_t stepFactor = 1;
|
||||||
static int datacheck = 1;
|
static int datacheck = 1;
|
||||||
static int warmup_iters = 5;
|
static int warmup_iters = 1;
|
||||||
static int iters = 20;
|
static int iters = 20;
|
||||||
static int agg_iters = 1;
|
static int agg_iters = 1;
|
||||||
static int run_cycles = 1;
|
static int run_cycles = 1;
|
||||||
@@ -607,19 +607,14 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
|
|||||||
// Sync to avoid first-call timeout
|
// Sync to avoid first-call timeout
|
||||||
Barrier(args);
|
Barrier(args);
|
||||||
|
|
||||||
// Warm-up for large size
|
// Warm-up for all sizes (using a stepfactor of 2)
|
||||||
setupArgs(args->maxbytes, type, args);
|
for (size_t size = args->minbytes; size <= args->maxbytes; size = size * 2) {
|
||||||
|
setupArgs(size, type, args);
|
||||||
for (int iter = 0; iter < warmup_iters; iter++) {
|
for (int iter = 0; iter < warmup_iters; iter++) {
|
||||||
TESTCHECK(startColl(args, type, op, root, 0, iter));
|
TESTCHECK(startColl(args, type, op, root, 0, iter));
|
||||||
}
|
}
|
||||||
TESTCHECK(completeColl(args));
|
TESTCHECK(completeColl(args));
|
||||||
|
|
||||||
// Warm-up for small size
|
|
||||||
setupArgs(args->minbytes, type, args);
|
|
||||||
for (int iter = 0; iter < warmup_iters; iter++) {
|
|
||||||
TESTCHECK(startColl(args, type, op, root, 0, iter));
|
|
||||||
}
|
}
|
||||||
TESTCHECK(completeColl(args));
|
|
||||||
|
|
||||||
// Benchmark
|
// Benchmark
|
||||||
long repeat = run_cycles;
|
long repeat = run_cycles;
|
||||||
|
|||||||
Reference in New Issue
Block a user