diff --git a/src/common.cu b/src/common.cu index 411477c..9dff7af 100644 --- a/src/common.cu +++ b/src/common.cu @@ -101,7 +101,7 @@ int cudaGraphLaunches = 0; static int report_cputime = 0; static int report_timestamps = 0; static int deviceImpl = 0; -static int unalign = 0; +int unalign = 0; int memory_report = 0; int deviceCtaCount = 16; // Default number of CTAs for device implementation diff --git a/src/util.cu b/src/util.cu index d2a604b..5e22092 100644 --- a/src/util.cu +++ b/src/util.cu @@ -37,6 +37,7 @@ extern int agg_iters; extern int parallel_init; extern int blocking_coll; extern int cudaGraphLaunches; +extern int unalign; static FILE *json_report_fp; static thread_local bool write_json; @@ -514,10 +515,10 @@ void writeBenchmarkLineBody(double timeUsec, double algBw, double busBw, bool re testResult_t writeDeviceReport(size_t *maxMem, int localRank, int proc, int totalProcs, int color, const char hostname[], const char *program_name) { PRINT("# nccl-tests version %s nccl-headers=%d nccl-library=%d\n", NCCL_TESTS_VERSION, NCCL_VERSION_CODE, test_ncclVersion); PRINT("# Collective test starting: %s\n", program_name); - PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d agg iters: %d validation: %d graph: %d\n", + PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d agg iters: %d validation: %d graph: %d unalign: %d\n", nThreads, nGpus, minBytes, maxBytes, (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", - warmup_iters, iters, agg_iters, datacheck, cudaGraphLaunches); + warmup_iters, iters, agg_iters, datacheck, cudaGraphLaunches, unalign); if (blocking_coll == 1) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); if (blocking_coll > 1) PRINT("# Blocking Enabled: wait for completion after each collective (no barrier) \n"); if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n");