mirror of https://github.com/NVIDIA/nccl-tests.git
synced 2026-01-14 02:47:21 +08:00
Add memory usage report option
Use -M 1 to dump library memory usage information
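
For example, with a typical nccl-tests invocation (the binary name, message sizes, and the numbers in the sample output below are illustrative, not taken from this commit):

    ./build/all_reduce_perf -b 8M -e 128M -f 2 -g 4 -M 1

With the option enabled, the result footer gains one extra line, produced by writeMemInfo() below, of the form:

    # GPU memory usage   : Initialization 86.0 MB | User-Allocated 256 MB | Collective 12.0 MB | Total 354 MB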
This commit is contained in:
parent 4bc314aa27
commit 760c467f12
src/common.cu (137 changed lines)
@@ -99,6 +99,7 @@ int cudaGraphLaunches = 0;
 static int report_cputime = 0;
 static int report_timestamps = 0;
 static int deviceImpl = 0;
+int memory_report = 0;
 
 int deviceCtaCount = 16; // Default number of CTAs for device implementation
 
@@ -710,18 +711,38 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
   return testSuccess;
 }
 
+static void getGPUMemoryInfo(int64_t* ptotalGpuMem, int64_t* pfreeGpuMem) {
+  size_t freeGpuMem, totalGpuMem = 0;
+  cudaMemGetInfo(&freeGpuMem, &totalGpuMem);
+  if (ptotalGpuMem != nullptr) *ptotalGpuMem = totalGpuMem;
+  if (pfreeGpuMem != nullptr) *pfreeGpuMem = freeGpuMem;
+}
+
 testResult_t threadRunTests(struct threadArgs* args) {
+  // capture the free memory before
+  int64_t* totalGpuFreeMem = (int64_t*)calloc(args->nGpus*2, sizeof(int64_t));
+  for (int g = 0; g < args->nGpus; ++g) {
+    CUDACHECK(cudaSetDevice(args->gpus[g]));
+    getGPUMemoryInfo(nullptr, &totalGpuFreeMem[g]);
+  }
+
   // Set device to the first of our GPUs. If we don't do that, some operations
   // will be done on the current GPU (by default : 0) and if the GPUs are in
   // exclusive mode those operations will fail.
   CUDACHECK(cudaSetDevice(args->gpus[0]));
   TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]));
+
+  // Capture the memory used by the GPUs
+  for (int g = 0; g < args->nGpus; ++g) {
+    CUDACHECK(cudaSetDevice(args->gpus[g]));
+    getGPUMemoryInfo(nullptr, &totalGpuFreeMem[g + args->nGpus]);
+    *args->devMemUsed = std::max(*args->devMemUsed, totalGpuFreeMem[g] - totalGpuFreeMem[g + args->nGpus]);
+  }
+  free(totalGpuFreeMem);
   return testSuccess;
 }
 
 testResult_t threadInit(struct threadArgs* args) {
   char hostname[1024];
   getHostName(hostname, 1024);
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   //set main thread again
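Note on the measurement idiom introduced above: cudaMemGetInfo() reports device-wide free/total memory, so a before/after delta attributes to the test any allocation made on the GPU in between, including allocations by other processes sharing the device. A minimal standalone sketch of the idiom (the 64 MB cudaMalloc is a stand-in for the work being measured, not part of the commit):

    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
      size_t freeBefore, freeAfter, total;
      cudaMemGetInfo(&freeBefore, &total);  // device-wide free memory, not per-process
      void* p = nullptr;
      cudaMalloc(&p, 64 << 20);             // stand-in for NCCL init / buffer allocation
      cudaMemGetInfo(&freeAfter, &total);
      printf("used: %zu bytes\n", freeBefore - freeAfter);
      cudaFree(p);
      return 0;
    }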
@@ -729,6 +750,13 @@ testResult_t threadInit(struct threadArgs* args) {
 
   jsonIdentifyWriter(is_main_thread);
 
+  // Capture GPU memory before initializing the NCCL communicators
+  int64_t* initFreeGpuMem = (int64_t*)calloc(args->nGpus*3, sizeof(int64_t));
+  for (int g = 0; g < args->nGpus; ++g) {
+    CUDACHECK(cudaSetDevice(args->gpus[g]));
+    getGPUMemoryInfo(nullptr, &initFreeGpuMem[g]);
+  }
+
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,14,0)
   ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
@@ -751,6 +779,14 @@ testResult_t threadInit(struct threadArgs* args) {
 #endif
   }
   NCCLCHECK(ncclGroupEnd());
+
+  // Capture the memory used by the GPUs after initializing the NCCL communicators
+  for (int g = 0; g < args->nGpus; ++g) {
+    CUDACHECK(cudaSetDevice(args->gpus[g]));
+    getGPUMemoryInfo(nullptr, &initFreeGpuMem[g + args->nGpus]);
+    *args->initGpuMem = std::max(*args->initGpuMem, initFreeGpuMem[g] - initFreeGpuMem[g + args->nGpus]);
+  }
+
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
   NCCLCHECK(ncclGroupStart());
   for (int i=0; i<args->nGpus; i++) {
@@ -767,6 +803,12 @@ testResult_t threadInit(struct threadArgs* args) {
   }
   NCCLCHECK(ncclGroupEnd());
 #endif
+  // Capture memory used by test buffers
+  for (int g = 0; g < args->nGpus; ++g) {
+    CUDACHECK(cudaSetDevice(args->gpus[g]));
+    getGPUMemoryInfo(nullptr, &initFreeGpuMem[g + args->nGpus*2]);
+    args->bufferMemory[args->thread] = std::max(args->bufferMemory[args->thread], initFreeGpuMem[g + args->nGpus] - initFreeGpuMem[g + args->nGpus*2]);
+  }
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
   /* Create device communicators based on test-specific requirements */
   if (deviceImpl) {
@@ -783,7 +825,17 @@ testResult_t threadInit(struct threadArgs* args) {
     }
     NCCLCHECK(ncclGroupEnd());
   }
+  // Capture memory used by the device communicators
+  int64_t deviceCommMaxMem = 0;
+  for (int g = 0; g < args->nGpus; ++g) {
+    CUDACHECK(cudaSetDevice(args->gpus[g]));
+    int64_t freeGpuMem;
+    getGPUMemoryInfo(nullptr, &freeGpuMem);
+    deviceCommMaxMem = std::max(deviceCommMaxMem, initFreeGpuMem[g + args->nGpus*2] - freeGpuMem);
+  }
+  *args->initGpuMem += deviceCommMaxMem;
 #endif
+  free(initFreeGpuMem);
 
   TESTCHECK(threadRunTests(args));
 
@@ -879,13 +931,15 @@ int main(int argc, char* argv[], char **envp) {
     {"cta_policy", required_argument, 0, 'x'},
     {"device_implementation", required_argument, 0, 'D'},
     {"device_cta_count", required_argument, 0, 'V'},
+    {"memory", required_argument, 0, 'M'},
     {"help", no_argument, 0, 'h'},
     {}
   };
 
   while(1) {
     int c;
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:x:D:V:J:S:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:x:D:V:J:S:M:", longopts, &longindex);
 
     if (c == -1)
       break;
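Note the pairing across the hunks above and below: the new long option {"memory", required_argument, 0, 'M'} maps --memory onto the same switch case as the short flag, and the "M:" appended to the getopt_long option string (the trailing colon marks the option as taking a required argument) lets -M receive the 0/1 value that case 'M' parses with strtol.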
@@ -994,6 +1048,9 @@ int main(int argc, char* argv[], char **envp) {
       printf("Option -R (register) is not supported before NCCL 2.19. Ignoring\n");
 #endif
       break;
+    case 'M':
+      memory_report = (int)strtol(optarg, NULL, 0);
+      break;
     case 'x':
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
       ctaPolicy = (int)strtol(optarg, NULL, 0);
@@ -1008,8 +1065,7 @@ int main(int argc, char* argv[], char **envp) {
     case 'D':
       if (test_ncclVersion >= NCCL_VERSION(2,28,0)) {
         deviceImpl = (int)strtol(optarg, NULL, 0);
-      }
-      else {
+      } else {
         fprintf(stderr, "Option -D (device implementation) requires NCCL >= 2.28.0\n");
         return -1;
       }
@@ -1021,9 +1077,8 @@ int main(int argc, char* argv[], char **envp) {
           fprintf(stderr, "device_cta_count (-V) must be positive and less than 128, got %d. "
                   "Using default value 16.\n", deviceCtaCount);
           deviceCtaCount = 16;
         }
-      }
-      else {
+      } else {
         fprintf(stderr, "Option -V (device CTA count) requires NCCL >= 2.28.0\n");
         return -1;
       }
@@ -1065,6 +1120,7 @@ int main(int argc, char* argv[], char **envp) {
         "[-x,--cta_policy <0/1/2> set CTA policy (NCCL_CTA_POLICY_DEFAULT (0), NCCL_CTA_POLICY_EFFICIENCY (1), NCCL_CTA_POLICY_ZERO (2)) (default: do not set)] \n\t"
         "[-D,--device_implementation <implementation number> enable device implementation (default: 0, use NCCL implementation; requires -R 2 if > 0)] \n\t"
         "[-V,--device_cta_count <number> set number of CTAs for device implementation (default: 16)] \n\t"
+        "[-M,--memory_report <0/1> enable memory usage report (default: 0)] \n\t"
         "[-h,--help]\n",
         basename(argv[0]));
     return 0;
@@ -1213,7 +1269,6 @@ testResult_t run() {
   for (int i=0; i<nGpus*nThreads; i++) {
     gpus[i] = (gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i;
     CUDACHECK(cudaSetDevice(gpus[i]));
-    TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes));
     if (streamnull) {
       streams[i] = NULL;
     }
@@ -1256,7 +1311,16 @@ testResult_t run() {
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
   ncclDevComm devComms[nThreads*nGpus];
 #endif
+  int64_t initGpuMem[nThreads] = {0};
+  int64_t bufferMemory[nThreads] = {0};
   if (!parallel_init) {
+    // Capture the memory used by the GPUs before initializing the NCCL communicators
+    int64_t* initFreeGpuMem = (int64_t*)calloc(nGpus*3, sizeof(int64_t));
+    for (int g = 0; g < nGpus; ++g) {
+      CUDACHECK(cudaSetDevice(gpus[g]));
+      getGPUMemoryInfo(nullptr, &initFreeGpuMem[g]);
+    }
     //if parallel init is not selected, use main thread to initialize NCCL
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,14,0)
     ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
@@ -1277,9 +1341,22 @@ testResult_t run() {
 #endif
     }
     NCCLCHECK(ncclGroupEnd());
+
+    // Capture the memory used by the GPUs after initializing the NCCL communicators
+    for (int g = 0; g < nGpus; ++g) {
+      CUDACHECK(cudaSetDevice(gpus[g]));
+      getGPUMemoryInfo(nullptr, &initFreeGpuMem[g + nGpus]);
+    }
+    for (size_t t = 0; t < nThreads; ++t) {
+      for (int g = 0; g < nGpus; ++g) {
+        initGpuMem[t] = std::max(initGpuMem[t], initFreeGpuMem[g] - initFreeGpuMem[g + nGpus]);
+      }
+    }
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
     NCCLCHECK(ncclGroupStart());
     for (int i=0; i<nGpus*nThreads; i++) {
+      CUDACHECK(cudaSetDevice(gpus[i]));
+      TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes));
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
       if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
         NCCLCHECK(ncclCommWindowRegister(comms[i], sendbuffs[i], maxBytes, (ncclWindow_t*)&sendRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
@@ -1293,6 +1370,16 @@ testResult_t run() {
     }
     NCCLCHECK(ncclGroupEnd());
 #endif
+    // Capture memory used after allocating buffers
+    for (int g = 0; g < nGpus; ++g) {
+      CUDACHECK(cudaSetDevice(gpus[g]));
+      getGPUMemoryInfo(nullptr, &initFreeGpuMem[g + nGpus*2]);
+    }
+    for (size_t t = 0; t < nThreads; ++t) {
+      for (int g = 0; g < nGpus; ++g) {
+        bufferMemory[t] = std::max(bufferMemory[t], initFreeGpuMem[g + nGpus] - initFreeGpuMem[g + nGpus*2]);
+      }
+    }
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
     /* Create device communicators based on test-specific requirements */
     if (deviceImpl) {
@@ -1309,15 +1396,28 @@ testResult_t run() {
       }
       NCCLCHECK(ncclGroupEnd());
     }
+    int64_t deviceCommMaxMem = 0;
+    for (int g = 0; g < nGpus; ++g) {
+      CUDACHECK(cudaSetDevice(gpus[g]));
+      int64_t freeGpuMem;
+      getGPUMemoryInfo(nullptr, &freeGpuMem);
+      deviceCommMaxMem = std::max(deviceCommMaxMem, initFreeGpuMem[g + nGpus*2] - freeGpuMem);
+    }
+    for (size_t t = 0; t < nThreads; ++t) {
+      initGpuMem[t] += deviceCommMaxMem;
+    }
 #endif
+    free(initFreeGpuMem);
   }
 
   int errors[nThreads];
   double bw[nThreads];
+  int64_t devMemUsed[nThreads];
   int bw_count[nThreads];
   for (int t=0; t<nThreads; t++) {
     bw[t] = 0.0;
     errors[t] = bw_count[t] = 0;
+    devMemUsed[t] = std::numeric_limits<int64_t>::min();
   }
 
   fflush(stdout);
@@ -1358,6 +1458,9 @@ testResult_t run() {
     threads[t].args.errors=errors+t;
     threads[t].args.bw=bw+t;
     threads[t].args.bw_count=bw_count+t;
+    threads[t].args.initGpuMem = initGpuMem + t;
+    threads[t].args.bufferMemory = bufferMemory + t;
+    threads[t].args.devMemUsed = devMemUsed + t;
 
     threads[t].args.reportErrors = datacheck;
 
@@ -1376,11 +1479,17 @@ testResult_t run() {
       errors[0] += errors[t];
       bw[0] += bw[t];
       bw_count[0] += bw_count[t];
+      devMemUsed[0] = std::max(devMemUsed[0], devMemUsed[t]);
+      initGpuMem[0] = std::max(initGpuMem[0], initGpuMem[t]);
+      bufferMemory[0] = std::max(bufferMemory[0], bufferMemory[t]);
     }
   }
 
 #ifdef MPI_SUPPORT
   MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(MPI_IN_PLACE, &devMemUsed[0], 1, MPI_INT64_T, MPI_MAX, MPI_COMM_WORLD);
+  MPI_Allreduce(MPI_IN_PLACE, &initGpuMem[0], 1, MPI_INT64_T, MPI_MAX, MPI_COMM_WORLD);
+  MPI_Allreduce(MPI_IN_PLACE, &bufferMemory[0], 1, MPI_INT64_T, MPI_MAX, MPI_COMM_WORLD);
 #endif
 
   if (!parallel_init) {
@@ -1416,10 +1525,18 @@ testResult_t run() {
   }
 
   envstr = getenv("NCCL_TESTS_MIN_BW");
-  double check_avg_bw = envstr ? atof(envstr) : -1;
+  const double check_avg_bw = envstr ? atof(envstr) : -1;
   bw[0] /= bw_count[0];
 
   writeResultFooter(errors, bw, check_avg_bw, program_invocation_short_name);
+  if (memory_report) {
+    memInfo_t memInfos[3];
+    memInfos[0] = { initGpuMem[0], "Initialization" };
+    memInfos[1] = { bufferMemory[0], "User-Allocated" };
+    memInfos[2] = { devMemUsed[0], "Collective" };
+    writeMemInfo(memInfos, 3);
+  }
+  finalizeFooter();
 
 #ifdef MPI_SUPPORT
   MPI_Comm_free(&mpi_comm);
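Taken together with the capture points above, the three buckets reported under -M 1 are: Initialization, the free-memory drop across NCCL communicator initialization (plus the device-communicator memory when device implementations are enabled); User-Allocated, the drop across AllocateBuffs for the send/receive/expected test buffers; and Collective, the largest additional usage observed across a test run in threadRunTests. Each bucket is reduced with a max over GPUs and threads and, when MPI support is compiled in, with MPI_MAX over ranks.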
src/common.h

@@ -155,6 +155,10 @@ struct threadArgs {
 
   struct testColl* collTest;
 
+  int64_t* initGpuMem;
+  int64_t* bufferMemory;
+  int64_t* devMemUsed;
+
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
   void** sendRegHandles;
   void** recvRegHandles;
src/util.cu (49 changed lines)
@@ -19,6 +19,8 @@
 #include "util.h"
 #include <assert.h>
 #include <errno.h>
+#include <string>
+#include <iomanip>
 
 #define PRINT if (is_main_thread) printf
 
@@ -641,8 +643,8 @@ void writeResultFooter(const int errors[], const double bw[], double check_avg_b
     jsonFinishList();
   }
 
-  PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK");
-  PRINT("# Avg bus bandwidth : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
+  PRINT("# %-20s : %d %s\n", "Out of bounds values", errors[0], errors[0] ? "FAILED" : "OK");
+  PRINT("# %-20s : %g %s\n", "Avg bus bandwidth", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
   PRINT("#\n");
   PRINT("# Collective test concluded: %s\n", program_name);
@@ -660,6 +662,46 @@ void writeResultFooter(const int errors[], const double bw[], double check_avg_b
   }
 }
 
+std::string getMemString(double amount) {
+  std::string postfix = " B";
+  if (abs(amount) >= 1024.0*1024.0*1024.0) {
+    postfix = " GB";
+    amount /= 1024.0 * 1024.0 * 1024.0;
+  } else if (abs(amount) >= 1024.0*1024.0) {
+    postfix = " MB";
+    amount /= 1024.0 * 1024.0;
+  } else if (abs(amount) >= 1024.0) {
+    postfix = " KB";
+    amount /= 1024.0;
+  }
+  int precision = 0;
+  if (abs(amount) < 10.0) {
+    precision = 2;
+  } else if (abs(amount) < 100.0) {
+    precision = 1;
+  }
+  std::stringstream ss;
+  ss << std::fixed << std::setprecision(precision) << amount << postfix;
+  return ss.str();
+}
+
+void writeMemInfo(memInfo_t* memInfos, int numMemInfos) {
+
+  std::stringstream ss;
+  uint64_t maxAmount = 0;
+  for (int i = 0; i < numMemInfos; i++) {
+    ss << memInfos[i].name << " "
+       << getMemString(memInfos[i].amount)
+       << " ";
+    if (i < numMemInfos - 1) {
+      ss << "| ";
+    }
+    maxAmount += memInfos[i].amount;
+  }
+  ss << "| Total " << getMemString(maxAmount);
+  PRINT("# %-20s : %s\n", "GPU memory usage", ss.str().c_str());
+}
+
 // Write out remaining errors to stdout/json.
 void writeErrors() {
   const char *error = ncclGetLastError(NULL);
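As a worked example of the scaling above (the input value is chosen for illustration): getMemString(90177536) falls in the MB range since 90177536 >= 1024*1024 but < 1024^3, divides to exactly 86.0, and because the scaled value is below 100 it is printed with one decimal place, giving "86.0 MB". The function takes a double, so the int64_t amounts stored in memInfo_t convert implicitly at the call sites.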
@@ -678,3 +720,6 @@ void writeErrors() {
   }
 }
 
+void finalizeFooter() {
+  PRINT("#\n");
+}
src/util.h

@@ -8,6 +8,11 @@
 
 #include "common.h"
 
+struct memInfo_t {
+  int64_t amount;
+  const char* name;
+};
+
 // Try to set up JSON file output. If MPI is used, only rank 0 will proceed.
 // This should be called by only a single thread.
 // If 'in_path' is NULL, we stop.
@@ -32,6 +37,8 @@ void writeBenchmarkLineBody(double timeUsec, double algBw, double busBw, bool re
 testResult_t writeDeviceReport(size_t *maxMem, int localRank, int proc, int totalProcs, int color, const char hostname[], const char *program_name);
 void writeResultHeader(bool report_cputime, bool report_timestamps);
 void writeResultFooter(const int errors[], const double bw[], double check_avg_bw, const char *program_name);
+void finalizeFooter();
+void writeMemInfo(memInfo_t* memInfos, int numMemInfos);
 void writeErrors();
 
 #endif