From 5013240b747d4ee8041520ed3181759fa80b4c84 Mon Sep 17 00:00:00 2001 From: zhiyongww Date: Mon, 9 Jun 2025 11:58:51 -0700 Subject: [PATCH] json output support --- src/common.cu | 38 ++++++++++++++++++++++++++++++++++++++ src/common.h | 4 +++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/src/common.cu b/src/common.cu index 69b892a..b031f64 100644 --- a/src/common.cu +++ b/src/common.cu @@ -13,11 +13,14 @@ #include #include #include "cuda.h" +#include #include "../verifiable/verifiable.h" int test_ncclVersion = 0; // init'd with ncclGetVersion() +std::string structured_output; + #if NCCL_MAJOR >= 2 ncclDataType_t test_types[ncclNumTypes] = { ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble @@ -65,6 +68,7 @@ extern "C" __attribute__((weak)) char const* ncclGetLastError(ncclComm_t comm) { int is_main_proc = 0; thread_local int is_main_thread = 0; +thread_local int json_flag = 0; // Command line parameter defaults static int nThreads = 1; @@ -574,6 +578,12 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } else { sprintf(timeStr, "%7.2f", timeUsec); } + std::string prefix = (in_place == 1) ? "ip" : "oop"; + structured_output += "\"" + prefix + "_time\": " + std::to_string(timeUsec) + ", "; + structured_output += "\"" + prefix + "_algbw\": " + std::to_string(algBw) + ", "; + structured_output += "\"" + prefix + "_busbw\": " + std::to_string(busBw) + ", "; + structured_output += "\"" + prefix + "_error\": " + (args->reportErrors ? std::to_string(double(wrongElts)) : "\"N/A\""); + if (args->reportErrors) { PRINT(" %7s %6.2f %6.2f %5g", timeStr, algBw, busBw, (double)wrongElts); } else { @@ -619,17 +629,28 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* // Benchmark long repeat = run_cycles; + structured_output += "["; do { + structured_output += "{"; for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { setupArgs(size, type, args); char rootName[100]; sprintf(rootName, "%6i", root); + structured_output += "\"" + std::to_string(max(args->sendBytes, args->expectedBytes)) + "\": {"; + structured_output += "\"count\": " + std::to_string(args->nbytes / wordSize(type)) + ", "; + structured_output += "\"type\": \"" + std::string(typeName) + "\", "; + structured_output += "\"redop\": \"" + std::string(opName) + "\", "; + structured_output += "\"root\": \"" + std::to_string(root) + "\", "; PRINT("%12li %12li %8s %6s %6s", max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName); TESTCHECK(BenchTime(args, type, op, root, 0)); + structured_output += ", "; TESTCHECK(BenchTime(args, type, op, root, 1)); + structured_output += (size==args->maxbytes) ? "}" : "}, "; PRINT("\n"); } + structured_output += repeat == 1 ? "}" : "}, "; } while (--repeat); + structured_output += "], "; return testSuccess; } @@ -650,6 +671,7 @@ testResult_t threadInit(struct threadArgs* args) { //set main thread again is_main_thread = (is_main_proc && args->thread == 0) ? 1 : 0; + json_flag = (is_my_main_thread && args->jsonFlag) ? 1 : 0; NCCLCHECK(ncclGroupStart()); for (int i=0; inGpus; i++) { @@ -757,6 +779,7 @@ int main(int argc, char* argv[]) { double parsed; int longindex; static struct option longopts[] = { + {"json", no_argument, &json_flag, 1}, {"nthreads", required_argument, 0, 't'}, {"ngpus", required_argument, 0, 'g'}, {"minbytes", required_argument, 0, 'b'}, @@ -791,6 +814,7 @@ int main(int argc, char* argv[]) { break; switch(c) { + case 0: break; // flags case 't': nThreads = strtol(optarg, NULL, 0); break; @@ -1007,6 +1031,7 @@ testResult_t run() { MPI_Comm_rank(mpi_comm, &ncclProc); #endif is_main_thread = is_main_proc = (proc == 0) ? 1 : 0; + json_flag = (is_main_thread && json_flag) ? 1 : 0; PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d agg iters: %d validation: %d graph: %d\n", nThreads, nGpus, minBytes, maxBytes, @@ -1157,6 +1182,9 @@ testResult_t run() { fflush(stdout); + structured_output += "{\"identifier\": \"nccl-tests structured output\", "; + structured_output += "\"version\": \"" + std::to_string(NCCL_MAJOR) + "." + std::to_string(NCCL_MINOR) + "." + std::to_string(NCCL_PATCH) + "\", "; + structured_output += "\"benchmark\": "; const char* timeStr = report_cputime ? "cputime" : "time"; PRINT("#\n"); PRINT("# %10s %12s %8s %6s %6s out-of-place in-place \n", "", "", "", "", ""); @@ -1258,9 +1286,19 @@ testResult_t run() { double check_avg_bw = envstr ? atof(envstr) : -1; bw[0] /= bw_count[0]; + structured_output += "\"out_of_bounds_value\": " + std::to_string(errors[0]) + ", "; + structured_output += "\"out_of_bounds_value_status\": "; + structured_output += errors[0] != 0 ? "\"FAILED\", " : "\"OK\", "; + structured_output += "\"avg_bus_bw\": " + std::to_string(bw[0]) + ", "; + structured_output += "\"avg_bus_bw_status\": "; + structured_output += (bw[0] < check_avg_bw*(0.9)) ? "\"FAILED\"" : "\"OK\""; + structured_output += "}"; PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK"); PRINT("# Avg bus bandwidth : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK")); PRINT("#\n"); + if (json_flag) { + printf("%s\n", structured_output.c_str()); + } #ifdef MPI_SUPPORT MPI_Comm_free(&mpi_comm); MPI_Finalize(); diff --git a/src/common.h b/src/common.h index ff834f6..3b77c00 100644 --- a/src/common.h +++ b/src/common.h @@ -140,6 +140,7 @@ struct threadArgs { int* bw_count; int reportErrors; + int jsonFlag; struct testColl* collTest; }; @@ -299,6 +300,7 @@ static int ncclstringtoop (char *str) { extern int is_main_proc; extern thread_local int is_main_thread; -#define PRINT if (is_main_thread) printf +extern thread_local int json_flag; +#define PRINT if (is_main_thread && !json_flag) printf #endif