diff --git a/src/common.cu b/src/common.cu
index 7a91441..c932f7f 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -99,6 +99,7 @@ int cudaGraphLaunches = 0;
 static int report_cputime = 0;
 static int report_timestamps = 0;
 static int deviceImpl = 0;
+int memory_report = 0;
 int deviceCtaCount = 16; // Default number of CTAs for device implementation
 
@@ -710,18 +711,38 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
   return testSuccess;
 }
 
+static void getGPUMemoryInfo(int64_t* ptotalGpuMem, int64_t* pfreeGpuMem) {
+  size_t freeGpuMem = 0, totalGpuMem = 0;
+  cudaMemGetInfo(&freeGpuMem, &totalGpuMem);
+  if (ptotalGpuMem != nullptr) *ptotalGpuMem = totalGpuMem;
+  if (pfreeGpuMem != nullptr) *pfreeGpuMem = freeGpuMem;
+}
+
 testResult_t threadRunTests(struct threadArgs* args) {
+  // Capture the free memory on each GPU before running the tests
+  int64_t* totalGpuFreeMem = (int64_t*)calloc(args->nGpus*2, sizeof(int64_t));
+  for (int g = 0; g < args->nGpus; ++g) {
+    CUDACHECK(cudaSetDevice(args->gpus[g]));
+    getGPUMemoryInfo(nullptr, &totalGpuFreeMem[g]);
+  }
+
   // Set device to the first of our GPUs. If we don't do that, some operations
   // will be done on the current GPU (by default : 0) and if the GPUs are in
   // exclusive mode those operations will fail.
   CUDACHECK(cudaSetDevice(args->gpus[0]));
   TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]));
+
+  // Capture the memory used by the GPUs while running the tests
+  for (int g = 0; g < args->nGpus; ++g) {
+    CUDACHECK(cudaSetDevice(args->gpus[g]));
+    getGPUMemoryInfo(nullptr, &totalGpuFreeMem[g + args->nGpus]);
+    *args->devMemUsed = std::max(*args->devMemUsed, totalGpuFreeMem[g] - totalGpuFreeMem[g + args->nGpus]);
+  }
+  free(totalGpuFreeMem);
   return testSuccess;
 }
 
 testResult_t threadInit(struct threadArgs* args) {
-  char hostname[1024];
-  getHostName(hostname, 1024);
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   //set main thread again
@@ -729,6 +750,13 @@ testResult_t threadInit(struct threadArgs* args) {
   jsonIdentifyWriter(is_main_thread);
 
+  // Capture GPU memory before initializing the NCCL communicators
+  int64_t* initFreeGpuMem = (int64_t*)calloc(args->nGpus*3, sizeof(int64_t));
+  for (int g = 0; g < args->nGpus; ++g) {
+    CUDACHECK(cudaSetDevice(args->gpus[g]));
+    getGPUMemoryInfo(nullptr, &initFreeGpuMem[g]);
+  }
+
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,14,0)
   ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
@@ -751,6 +779,14 @@ testResult_t threadInit(struct threadArgs* args) {
 #endif
   }
   NCCLCHECK(ncclGroupEnd());
+
+  // Capture the memory used by the GPUs after initializing the NCCL communicators
+  for (int g = 0; g < args->nGpus; ++g) {
+    CUDACHECK(cudaSetDevice(args->gpus[g]));
+    getGPUMemoryInfo(nullptr, &initFreeGpuMem[g + args->nGpus]);
+    *args->initGpuMem = std::max(*args->initGpuMem, initFreeGpuMem[g] - initFreeGpuMem[g + args->nGpus]);
+  }
+
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
   NCCLCHECK(ncclGroupStart());
   for (int i=0; i<args->nGpus; i++) {
@@ -767,6 +803,12 @@ testResult_t threadInit(struct threadArgs* args) {
   }
   NCCLCHECK(ncclGroupEnd());
 #endif
+  // Capture memory used by the test buffers
+  for (int g = 0; g < args->nGpus; ++g) {
+    CUDACHECK(cudaSetDevice(args->gpus[g]));
+    getGPUMemoryInfo(nullptr, &initFreeGpuMem[g + args->nGpus*2]);
+    *args->bufferMemory = std::max(*args->bufferMemory, initFreeGpuMem[g + args->nGpus] - initFreeGpuMem[g + args->nGpus*2]);
+  }
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
   /* Create device communicators based on test-specific requirements */
   if (deviceImpl) {
@@ -783,7 +825,17 @@ testResult_t threadInit(struct threadArgs* args) {
     }
     NCCLCHECK(ncclGroupEnd());
   }
+  // Capture memory used by the device communicators
+  int64_t deviceCommMaxMem = 0;
+  for (int g = 0; g < args->nGpus; ++g) {
+    CUDACHECK(cudaSetDevice(args->gpus[g]));
+    int64_t freeGpuMem;
+    getGPUMemoryInfo(nullptr, &freeGpuMem);
+    deviceCommMaxMem = std::max(deviceCommMaxMem, initFreeGpuMem[g + args->nGpus*2] - freeGpuMem);
+  }
+  *args->initGpuMem += deviceCommMaxMem;
 #endif
+  free(initFreeGpuMem);
 
   TESTCHECK(threadRunTests(args));
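Commentary: the instrumentation above is a snapshot/delta pattern. cudaMemGetInfo() reports a device's current free and total memory, so the drop in free memory across a phase bounds what that phase allocated. A minimal standalone sketch of the same idea (the helper name and the 64 MiB figure are illustrative, not from this patch):

```cpp
#include <cuda_runtime.h>
#include <cstdio>

// Illustrative helper: returns the drop in free device memory across fn().
// This is only an upper bound on fn's own usage: anything else allocating
// on the same device during the phase (other threads or processes) gets
// attributed to fn too, and the CUDA allocator rounds requests up to its
// allocation granularity.
template <typename F>
long long measureUsedBytes(int device, F fn) {
  size_t freeBefore = 0, freeAfter = 0, total = 0;
  cudaSetDevice(device);
  cudaMemGetInfo(&freeBefore, &total);
  fn();
  cudaSetDevice(device);  // fn() may have switched devices
  cudaMemGetInfo(&freeAfter, &total);
  return (long long)freeBefore - (long long)freeAfter;
}

int main() {
  long long used = measureUsedBytes(0, [] {
    void* p = nullptr;
    cudaMalloc(&p, 64 << 20);  // allocate 64 MiB, deliberately kept alive
  });
  printf("phase used ~%lld bytes\n", used);
  return 0;
}
```

The patch applies this pattern at three boundaries (before init, after init, after buffer allocation) and keeps the std::max over GPUs, i.e. the worst-case device rather than a sum.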
@@ -879,13 +931,15 @@ int main(int argc, char* argv[], char **envp) {
     {"cta_policy", required_argument, 0, 'x'},
     {"device_implementation", required_argument, 0, 'D'},
     {"device_cta_count", required_argument, 0, 'V'},
+    {"memory_report", required_argument, 0, 'M'},
+    {"help", no_argument, 0, 'h'},
     {}
   };
 
   while(1) {
     int c;
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:x:D:V:J:S:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:x:D:V:J:S:M:", longopts, &longindex);
 
     if (c == -1)
       break;
@@ -994,6 +1048,9 @@ int main(int argc, char* argv[], char **envp) {
       printf("Option -R (register) is not supported before NCCL 2.19. Ignoring\n");
 #endif
       break;
+    case 'M':
+      memory_report = (int)strtol(optarg, NULL, 0);
+      break;
     case 'x':
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
       ctaPolicy = (int)strtol(optarg, NULL, 0);
@@ -1008,8 +1065,7 @@ int main(int argc, char* argv[], char **envp) {
     case 'D':
       if (test_ncclVersion >= NCCL_VERSION(2,28,0)) {
         deviceImpl = (int)strtol(optarg, NULL, 0);
-      }
-      else {
+      } else {
         fprintf(stderr, "Option -D (device implementation) requires NCCL >= 2.28.0\n");
         return -1;
       }
@@ -1021,9 +1077,8 @@ int main(int argc, char* argv[], char **envp) {
           fprintf(stderr, "device_cta_count (-V) must be positive and less than 128, got %d. "
                   "Using default value 16.\n", deviceCtaCount);
           deviceCtaCount = 16;
-        }
-      }
-      else {
+        }
+      } else {
         fprintf(stderr, "Option -V (device CTA count) requires NCCL >= 2.28.0\n");
         return -1;
       }
@@ -1065,6 +1120,7 @@ int main(int argc, char* argv[], char **envp) {
       "[-x,--cta_policy <0/1/2> set CTA policy (NCCL_CTA_POLICY_DEFAULT (0), NCCL_CTA_POLICY_EFFICIENCY (1), NCCL_CTA_POLICY_ZERO (2)) (default: do not set)] \n\t"
       "[-D,--device_implementation enable device implementation (default: 0, use NCCL implementation; requires -R 2 if > 0)] \n\t"
      "[-V,--device_cta_count set number of CTAs for device implementation (default: 16)] \n\t"
+      "[-M,--memory_report <0/1> enable memory usage report (default: 0)] \n\t"
      "[-h,--help]\n",
      basename(argv[0]));
     return 0;
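With the option wired up, -M 1 (or --memory_report 1) adds one line to the footer, emitted by writeMemInfo() in src/util.cu further down. An illustrative session follows; the command line is a typical nccl-tests invocation and every byte value is an invented placeholder, not measured output:

```
$ ./all_reduce_perf -b 8 -e 128M -f 2 -g 8 -M 1
...
# Out of bounds values : 0 OK
# Avg bus bandwidth    : 42.13
# GPU memory usage     : Initialization 118 MB | User-Allocated 512 MB | Collective 1.00 MB | Total 631 MB
#
```

The three labels map onto the three accumulators introduced above: Initialization (communicator plus device-communicator setup), User-Allocated (test buffers and registration), and Collective (growth observed while the collectives actually run).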
" "Using default value 16.\n", deviceCtaCount); deviceCtaCount = 16; - } - } - else { + } + } else { fprintf(stderr, "Option -V (device CTA count) requires NCCL >= 2.28.0\n"); return -1; } @@ -1065,6 +1120,7 @@ int main(int argc, char* argv[], char **envp) { "[-x,--cta_policy <0/1/2> set CTA policy (NCCL_CTA_POLICY_DEFAULT (0), NCCL_CTA_POLICY_EFFICIENCY (1), NCCL_CTA_POLICY_ZERO (2)) (default: do not set)] \n\t" "[-D,--device_implementation enable device implementation (default: 0, use NCCL implementation; requires -R 2 if > 0)] \n\t" "[-V,--device_cta_count set number of CTAs for device implementation (default: 16)] \n\t" + "[-M,--memory_report <0/1> enable memory usage report (default: 0)] \n\t" "[-h,--help]\n", basename(argv[0])); return 0; @@ -1213,7 +1269,6 @@ testResult_t run() { for (int i=0; i= NCCL_VERSION(2,28,0) ncclDevComm devComms[nThreads*nGpus]; #endif + int64_t initGpuMem[nThreads] = {0}; + int64_t bufferMemory[nThreads] = {0}; if (!parallel_init) { + // Capture the memory used by the GPUs before initializing the NCCL communicators + int64_t* initFreeGpuMem = (int64_t*)calloc(nGpus*3, sizeof(int64_t)); + for (int g = 0; g < nGpus; ++g) { + CUDACHECK(cudaSetDevice(gpus[g])); + getGPUMemoryInfo(nullptr, &initFreeGpuMem[g]); + } + //if parallel init is not selected, use main thread to initialize NCCL #if NCCL_VERSION_CODE >= NCCL_VERSION(2,14,0) ncclConfig_t config = NCCL_CONFIG_INITIALIZER; #if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0) @@ -1277,9 +1341,22 @@ testResult_t run() { #endif } NCCLCHECK(ncclGroupEnd()); + + // Capture the memory used by the GPUs after initializing the NCCL communicators + for (int g = 0; g < nGpus; ++g) { + CUDACHECK(cudaSetDevice(gpus[g])); + getGPUMemoryInfo(nullptr, &initFreeGpuMem[g + nGpus]); + } + for ( size_t t = 0; t < nThreads; ++t) { + for (int g = 0; g < nGpus; ++g) { + initGpuMem[t] = std::max(initGpuMem[t], initFreeGpuMem[g] - initFreeGpuMem[g + nGpus]); + } + } #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0) NCCLCHECK(ncclGroupStart()); for (int i=0; i= NCCL_VERSION(2,27,0) if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) { NCCLCHECK(ncclCommWindowRegister(comms[i], sendbuffs[i], maxBytes, (ncclWindow_t*)&sendRegHandles[i], NCCL_WIN_COLL_SYMMETRIC)); @@ -1293,6 +1370,16 @@ testResult_t run() { } NCCLCHECK(ncclGroupEnd()); #endif + // Capture memory used by after allocating buffers + for (int g = 0; g < nGpus; ++g) { + CUDACHECK(cudaSetDevice(gpus[g])); + getGPUMemoryInfo(nullptr, &initFreeGpuMem[g + nGpus*2]); + } + for ( size_t t = 0; t < nThreads; ++t) { + for (int g = 0; g < nGpus; ++g) { + bufferMemory[t] = std::max(bufferMemory[t], initFreeGpuMem[g + nGpus] - initFreeGpuMem[g + nGpus*2]); + } + } #if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0) /* Create device communicators based on test-specific requirements */ if (deviceImpl) { @@ -1309,15 +1396,28 @@ testResult_t run() { } NCCLCHECK(ncclGroupEnd()); } + int64_t deviceCommMaxMem = 0; + for (int g = 0; g < nGpus; ++g) { + CUDACHECK(cudaSetDevice(gpus[g])); + int64_t freeGpuMem; + getGPUMemoryInfo(nullptr, &freeGpuMem); + deviceCommMaxMem = std::max(deviceCommMaxMem, initFreeGpuMem[g + nGpus*2] - freeGpuMem); + } + for ( size_t t = 0; t < nThreads; ++t) { + initGpuMem[t] += deviceCommMaxMem; + } #endif + free(initFreeGpuMem); } int errors[nThreads]; double bw[nThreads]; + int64_t devMemUsed[nThreads]; int bw_count[nThreads]; for (int t=0; t::min(); } fflush(stdout); @@ -1358,6 +1458,9 @@ testResult_t run() { threads[t].args.errors=errors+t; 
@@ -1376,11 +1479,17 @@ testResult_t run() {
       errors[0] += errors[t];
       bw[0] += bw[t];
       bw_count[0] += bw_count[t];
+      devMemUsed[0] = std::max(devMemUsed[0], devMemUsed[t]);
+      initGpuMem[0] = std::max(initGpuMem[0], initGpuMem[t]);
+      bufferMemory[0] = std::max(bufferMemory[0], bufferMemory[t]);
     }
   }
 
 #ifdef MPI_SUPPORT
   MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(MPI_IN_PLACE, &devMemUsed[0], 1, MPI_INT64_T, MPI_MAX, MPI_COMM_WORLD);
+  MPI_Allreduce(MPI_IN_PLACE, &initGpuMem[0], 1, MPI_INT64_T, MPI_MAX, MPI_COMM_WORLD);
+  MPI_Allreduce(MPI_IN_PLACE, &bufferMemory[0], 1, MPI_INT64_T, MPI_MAX, MPI_COMM_WORLD);
 #endif
 
   if (!parallel_init) {
@@ -1416,10 +1525,18 @@ testResult_t run() {
   }
 
   envstr = getenv("NCCL_TESTS_MIN_BW");
-  double check_avg_bw = envstr ? atof(envstr) : -1;
+  const double check_avg_bw = envstr ? atof(envstr) : -1;
   bw[0] /= bw_count[0];
 
   writeResultFooter(errors, bw, check_avg_bw, program_invocation_short_name);
+  if (memory_report) {
+    memInfo_t memInfos[3];
+    memInfos[0] = { initGpuMem[0], "Initialization" };
+    memInfos[1] = { bufferMemory[0], "User-Allocated" };
+    memInfos[2] = { devMemUsed[0], "Collective" };
+    writeMemInfo(memInfos, 3);
+  }
+
+  finalizeFooter();
 #ifdef MPI_SUPPORT
   MPI_Comm_free(&mpi_comm);
diff --git a/src/common.h b/src/common.h
index b0b250c..3f3a0e2 100644
--- a/src/common.h
+++ b/src/common.h
@@ -155,6 +155,10 @@ struct threadArgs {
   struct testColl* collTest;
 
+  int64_t* initGpuMem;
+  int64_t* bufferMemory;
+  int64_t* devMemUsed;
+
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
   void** sendRegHandles;
   void** recvRegHandles;
diff --git a/src/util.cu b/src/util.cu
index 1e4d283..5585a70 100644
--- a/src/util.cu
+++ b/src/util.cu
@@ -19,6 +19,9 @@
 #include "util.h"
 #include <string>
 #include <vector>
+#include <sstream>
+#include <iomanip>
+#include <cmath>
 
 #define PRINT if (is_main_thread) printf
@@ -641,8 +643,8 @@ void writeResultFooter(const int errors[], const double bw[], double check_avg_b
     jsonFinishList();
   }
 
-  PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK");
-  PRINT("# Avg bus bandwidth    : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
+  PRINT("# %-20s : %d %s\n", "Out of bounds values", errors[0], errors[0] ? "FAILED" : "OK");
+  PRINT("# %-20s : %g %s\n", "Avg bus bandwidth", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
 
   PRINT("#\n");
   PRINT("# Collective test concluded: %s\n", program_name);
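Commentary: across ranks the three counters are combined with MPI_Allreduce and MPI_MAX, so the final report is the worst case over every rank (threads were already folded into index 0 with std::max above), not a cluster-wide total. A self-contained sketch of that reduction, with made-up per-rank values:

```cpp
#include <mpi.h>
#include <cstdint>
#include <cstdio>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  // Pretend each rank measured a different local peak (hypothetical numbers).
  int64_t devMemUsed = 100 + rank;  // bytes, per-rank maximum

  // In-place max-reduction: every rank ends up holding the global maximum.
  MPI_Allreduce(MPI_IN_PLACE, &devMemUsed, 1, MPI_INT64_T, MPI_MAX, MPI_COMM_WORLD);

  if (rank == 0) printf("worst-case usage: %lld bytes\n", (long long)devMemUsed);
  MPI_Finalize();
  return 0;
}
```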
"FAILED" : "OK")); PRINT("#\n"); PRINT("# Collective test concluded: %s\n", program_name); @@ -660,6 +662,46 @@ void writeResultFooter(const int errors[], const double bw[], double check_avg_b } } +std::string getMemString(double amount) { + std::string postfix = " B"; + if (abs(amount) >= 1024.0*1024.0*1024.0) { + postfix = " GB"; + amount /= 1024.0 * 1024.0 * 1024.0; + } else if (abs(amount) >= 1024.0*1024.0) { + postfix = " MB"; + amount /= 1024.0 * 1024.0; + } else if (abs(amount) >= 1024.0) { + postfix = " KB"; + amount /= 1024.0; + } + int precision = 0; + if (abs(amount) < 10.0) { + precision = 2; + } else if (abs(amount) < 100.0) { + precision = 1; + } + std::stringstream ss; + ss << std::fixed << std::setprecision(precision) << amount << postfix; + return ss.str(); +} + +void writeMemInfo(memInfo_t* memInfos, int numMemInfos) { + + std::stringstream ss; + uint64_t maxAmount = 0; + for (int i = 0; i < numMemInfos; i++) { + ss << memInfos[i].name << " " + << getMemString(memInfos[i].amount) + << " "; + if (i < numMemInfos - 1) { + ss << "| "; + } + maxAmount += memInfos[i].amount; + } + ss << "| Total " << getMemString(maxAmount); + PRINT("# %-20s : %s\n", "GPU memory usage", ss.str().c_str()); +} + // Write out remaining errors to stdout/json. void writeErrors() { const char *error = ncclGetLastError(NULL); @@ -678,3 +720,6 @@ void writeErrors() { } } +void finalizeFooter() { + PRINT("#\n"); +} diff --git a/src/util.h b/src/util.h index bea21ac..059ee7c 100644 --- a/src/util.h +++ b/src/util.h @@ -8,6 +8,11 @@ #include "common.h" +struct memInfo_t { + int64_t amount; + const char* name; +}; + // Try to set up JSON file output. If MPI is used, only rank 0 will proceed. // This should be called by only a single thread. // If 'in_path' is NULL, we stop. @@ -32,6 +37,8 @@ void writeBenchmarkLineBody(double timeUsec, double algBw, double busBw, bool re testResult_t writeDeviceReport(size_t *maxMem, int localRank, int proc, int totalProcs, int color, const char hostname[], const char *program_name); void writeResultHeader(bool report_cputime, bool report_timestamps); void writeResultFooter(const int errors[], const double bw[], double check_avg_bw, const char *program_name); +void finalizeFooter(); +void writeMemInfo(memInfo_t* memInfos, int numMemInfos); void writeErrors(); #endif