Add memory usage report option

Use -M 1 to dump library memory usage information
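With -M 1, the summary footer gains one extra line produced by the new writeMemInfo helper (added below). An illustrative example, with hypothetical values:

# GPU memory usage     : Initialization 1.18 GB | User-Allocated 512 MB | Collective 98.4 MB | Total 1.78 GB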
David Addison 2025-11-02 20:32:20 -08:00 committed by Katie Gioioso
parent 4bc314aa27
commit 760c467f12
4 changed files with 185 additions and 12 deletions

View File

@@ -99,6 +99,7 @@ int cudaGraphLaunches = 0;
static int report_cputime = 0;
static int report_timestamps = 0;
static int deviceImpl = 0;
int memory_report = 0;
int deviceCtaCount = 16; // Default number of CTAs for device implementation
@@ -710,18 +711,38 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
return testSuccess;
}
static void getGPUMemoryInfo(int64_t* ptotalGpuMem, int64_t* pfreeGpuMem) {
size_t freeGpuMem = 0, totalGpuMem = 0;
cudaMemGetInfo(&freeGpuMem, &totalGpuMem);
if (ptotalGpuMem != nullptr) *ptotalGpuMem = totalGpuMem;
if (pfreeGpuMem != nullptr) *pfreeGpuMem = freeGpuMem;
}
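The helper above only snapshots cudaMemGetInfo; each phase's footprint is then derived as the drop in free memory between two snapshots. A minimal standalone sketch of that delta technique (hypothetical 256 MiB phase; error checking omitted; assumes a CUDA-capable device):

#include <cstdio>
#include <cuda_runtime.h>
int main() {
size_t freeBefore = 0, freeAfter = 0, total = 0;
cudaMemGetInfo(&freeBefore, &total); // snapshot free memory before the phase
void* buf = nullptr;
cudaMalloc(&buf, 256ull << 20); // the measured "phase": a 256 MiB allocation
cudaMemGetInfo(&freeAfter, &total); // snapshot free memory after the phase
// The delta approximates the phase's footprint; allocator granularity and
// other processes sharing the GPU make this an estimate, not an exact count.
printf("phase used ~%lld bytes\n", (long long)freeBefore - (long long)freeAfter);
cudaFree(buf);
return 0;
}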
testResult_t threadRunTests(struct threadArgs* args) {
// Capture free GPU memory before running the tests
int64_t* totalGpuFreeMem = (int64_t*)calloc(args->nGpus*2, sizeof(int64_t));
for (int g = 0; g < args->nGpus; ++g) {
CUDACHECK(cudaSetDevice(args->gpus[g]));
getGPUMemoryInfo(nullptr, &totalGpuFreeMem[g]);
}
// Set device to the first of our GPUs. If we don't do that, some operations
// will be done on the current GPU (by default : 0) and if the GPUs are in
// exclusive mode those operations will fail.
CUDACHECK(cudaSetDevice(args->gpus[0]));
TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]));
// Capture free memory after the tests and compute the peak memory used
for (int g = 0; g < args->nGpus; ++g) {
CUDACHECK(cudaSetDevice(args->gpus[g]));
getGPUMemoryInfo(nullptr, &totalGpuFreeMem[g + args->nGpus]);
*args->devMemUsed = std::max(*args->devMemUsed, totalGpuFreeMem[g] - totalGpuFreeMem[g + args->nGpus]);
}
free(totalGpuFreeMem);
return testSuccess;
}
testResult_t threadInit(struct threadArgs* args) {
char hostname[1024];
getHostName(hostname, 1024);
int nranks = args->nProcs*args->nThreads*args->nGpus;
//set main thread again
@@ -729,6 +750,13 @@ testResult_t threadInit(struct threadArgs* args) {
jsonIdentifyWriter(is_main_thread);
// Capture GPU memory before initializing the NCCL communicators
int64_t* initFreeGpuMem = (int64_t*)calloc(args->nGpus*3, sizeof(int64_t));
for (int g = 0; g < args->nGpus; ++g) {
CUDACHECK(cudaSetDevice(args->gpus[g]));
getGPUMemoryInfo(nullptr, &initFreeGpuMem[g]);
}
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,14,0)
ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
@@ -751,6 +779,14 @@ testResult_t threadInit(struct threadArgs* args) {
#endif
}
NCCLCHECK(ncclGroupEnd());
// Capture the memory used by the GPUs after initializing the NCCL communicators
for (int g = 0; g < args->nGpus; ++g) {
CUDACHECK(cudaSetDevice(args->gpus[g]));
getGPUMemoryInfo(nullptr, &initFreeGpuMem[g + args->nGpus]);
*args->initGpuMem = std::max(*args->initGpuMem, initFreeGpuMem[g] - initFreeGpuMem[g + args->nGpus]);
}
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
NCCLCHECK(ncclGroupStart());
for (int i=0; i<args->nGpus; i++) {
@@ -767,6 +803,12 @@ testResult_t threadInit(struct threadArgs* args) {
}
NCCLCHECK(ncclGroupEnd());
#endif
// Capture memory used by test buffers
for (int g = 0; g < args->nGpus; ++g) {
CUDACHECK(cudaSetDevice(args->gpus[g]));
getGPUMemoryInfo(nullptr, &initFreeGpuMem[g + args->nGpus*2]);
*args->bufferMemory = std::max(*args->bufferMemory, initFreeGpuMem[g + args->nGpus] - initFreeGpuMem[g + args->nGpus*2]);
}
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
/* Create device communicators based on test-specific requirements */
if (deviceImpl) {
@@ -783,7 +825,17 @@ testResult_t threadInit(struct threadArgs* args) {
}
NCCLCHECK(ncclGroupEnd());
}
// Capture memory used by the device communicators
int64_t deviceCommMaxMem = 0;
for (int g = 0; g < args->nGpus; ++g) {
CUDACHECK(cudaSetDevice(args->gpus[g]));
int64_t freeGpuMem;
getGPUMemoryInfo(nullptr, &freeGpuMem);
deviceCommMaxMem = std::max(deviceCommMaxMem, initFreeGpuMem[g + args->nGpus*2] - freeGpuMem);
}
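// Fold device-communicator memory into the initialization figure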
*args->initGpuMem += deviceCommMaxMem;
#endif
free(initFreeGpuMem);
TESTCHECK(threadRunTests(args));
@@ -879,13 +931,15 @@ int main(int argc, char* argv[], char **envp) {
{"cta_policy", required_argument, 0, 'x'},
{"device_implementation", required_argument, 0, 'D'},
{"device_cta_count", required_argument, 0, 'V'},
{"memory", required_argument, 0, 'M'},
{"help", no_argument, 0, 'h'},
{}
};
while(1) {
int c;
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:x:D:V:J:S:M:", longopts, &longindex);
if (c == -1)
break;
@@ -994,6 +1048,9 @@ int main(int argc, char* argv[], char **envp) {
printf("Option -R (register) is not supported before NCCL 2.19. Ignoring\n");
#endif
break;
case 'M':
memory_report = (int)strtol(optarg, NULL, 0);
break;
case 'x':
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
ctaPolicy = (int)strtol(optarg, NULL, 0);
@@ -1008,8 +1065,7 @@ int main(int argc, char* argv[], char **envp) {
case 'D':
if (test_ncclVersion >= NCCL_VERSION(2,28,0)) {
deviceImpl = (int)strtol(optarg, NULL, 0);
} else {
fprintf(stderr, "Option -D (device implementation) requires NCCL >= 2.28.0\n");
return -1;
}
@@ -1021,9 +1077,8 @@ int main(int argc, char* argv[], char **envp) {
fprintf(stderr, "device_cta_count (-V) must be positive and less than 128, got %d. "
"Using default value 16.\n", deviceCtaCount);
deviceCtaCount = 16;
}
} else {
fprintf(stderr, "Option -V (device CTA count) requires NCCL >= 2.28.0\n");
return -1;
}
@@ -1065,6 +1120,7 @@ int main(int argc, char* argv[], char **envp) {
"[-x,--cta_policy <0/1/2> set CTA policy (NCCL_CTA_POLICY_DEFAULT (0), NCCL_CTA_POLICY_EFFICIENCY (1), NCCL_CTA_POLICY_ZERO (2)) (default: do not set)] \n\t"
"[-D,--device_implementation <implementation number> enable device implementation (default: 0, use NCCL implementation; requires -R 2 if > 0)] \n\t"
"[-V,--device_cta_count <number> set number of CTAs for device implementation (default: 16)] \n\t"
"[-M,--memory_report <0/1> enable memory usage report (default: 0)] \n\t"
"[-h,--help]\n",
basename(argv[0]));
return 0;
@@ -1213,7 +1269,6 @@ testResult_t run() {
for (int i=0; i<nGpus*nThreads; i++) {
gpus[i] = (gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i;
CUDACHECK(cudaSetDevice(gpus[i]));
if (streamnull) {
streams[i] = NULL;
}
@@ -1256,7 +1311,16 @@
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
ncclDevComm devComms[nThreads*nGpus];
#endif
// Variable-length arrays cannot take an initializer; zero them explicitly.
int64_t initGpuMem[nThreads];
int64_t bufferMemory[nThreads];
for (int t = 0; t < nThreads; t++) initGpuMem[t] = bufferMemory[t] = 0;
if (!parallel_init) {
// Capture the GPUs' free memory before initializing the NCCL communicators
int64_t* initFreeGpuMem = (int64_t*)calloc(nGpus*3, sizeof(int64_t));
for (int g = 0; g < nGpus; ++g) {
CUDACHECK(cudaSetDevice(gpus[g]));
getGPUMemoryInfo(nullptr, &initFreeGpuMem[g]);
}
//if parallel init is not selected, use main thread to initialize NCCL
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,14,0)
ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
@@ -1277,9 +1341,22 @@
#endif
}
NCCLCHECK(ncclGroupEnd());
// Capture the GPUs' free memory after initializing the NCCL communicators
for (int g = 0; g < nGpus; ++g) {
CUDACHECK(cudaSetDevice(gpus[g]));
getGPUMemoryInfo(nullptr, &initFreeGpuMem[g + nGpus]);
}
for (int t = 0; t < nThreads; ++t) {
for (int g = 0; g < nGpus; ++g) {
initGpuMem[t] = std::max(initGpuMem[t], initFreeGpuMem[g] - initFreeGpuMem[g + nGpus]);
}
}
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
NCCLCHECK(ncclGroupStart());
for (int i=0; i<nGpus*nThreads; i++) {
CUDACHECK(cudaSetDevice(gpus[i]));
TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
NCCLCHECK(ncclCommWindowRegister(comms[i], sendbuffs[i], maxBytes, (ncclWindow_t*)&sendRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
@@ -1293,6 +1370,16 @@
}
NCCLCHECK(ncclGroupEnd());
#endif
// Capture memory used after allocating the test buffers
for (int g = 0; g < nGpus; ++g) {
CUDACHECK(cudaSetDevice(gpus[g]));
getGPUMemoryInfo(nullptr, &initFreeGpuMem[g + nGpus*2]);
}
for (int t = 0; t < nThreads; ++t) {
for (int g = 0; g < nGpus; ++g) {
bufferMemory[t] = std::max(bufferMemory[t], initFreeGpuMem[g + nGpus] - initFreeGpuMem[g + nGpus*2]);
}
}
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
/* Create device communicators based on test-specific requirements */
if (deviceImpl) {
@@ -1309,15 +1396,28 @@
}
NCCLCHECK(ncclGroupEnd());
}
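// Capture memory used by the device communicators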
int64_t deviceCommMaxMem = 0;
for (int g = 0; g < nGpus; ++g) {
CUDACHECK(cudaSetDevice(gpus[g]));
int64_t freeGpuMem;
getGPUMemoryInfo(nullptr, &freeGpuMem);
deviceCommMaxMem = std::max(deviceCommMaxMem, initFreeGpuMem[g + nGpus*2] - freeGpuMem);
}
for (int t = 0; t < nThreads; ++t) {
initGpuMem[t] += deviceCommMaxMem;
}
#endif
free(initFreeGpuMem);
}
int errors[nThreads];
double bw[nThreads];
int64_t devMemUsed[nThreads];
int bw_count[nThreads];
for (int t=0; t<nThreads; t++) {
bw[t] = 0.0;
errors[t] = bw_count[t] = 0;
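// Seed with the most negative value so the first measured delta survives the std::max reduction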
devMemUsed[t] = std::numeric_limits<int64_t>::min();
}
fflush(stdout);
@@ -1358,6 +1458,9 @@ testResult_t run() {
threads[t].args.errors=errors+t;
threads[t].args.bw=bw+t;
threads[t].args.bw_count=bw_count+t;
threads[t].args.initGpuMem = initGpuMem + t;
threads[t].args.bufferMemory = bufferMemory + t;
threads[t].args.devMemUsed = devMemUsed + t;
threads[t].args.reportErrors = datacheck;
@@ -1376,11 +1479,17 @@
errors[0] += errors[t];
bw[0] += bw[t];
bw_count[0] += bw_count[t];
devMemUsed[0] = std::max(devMemUsed[0], devMemUsed[t]);
initGpuMem[0] = std::max(initGpuMem[0], initGpuMem[t]);
bufferMemory[0] = std::max(bufferMemory[0], bufferMemory[t]);
}
}
#ifdef MPI_SUPPORT
MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
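// Error counts are summed across ranks; the memory figures below take the per-rank maximum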
MPI_Allreduce(MPI_IN_PLACE, &devMemUsed[0], 1, MPI_INT64_T, MPI_MAX, MPI_COMM_WORLD);
MPI_Allreduce(MPI_IN_PLACE, &initGpuMem[0], 1, MPI_INT64_T, MPI_MAX, MPI_COMM_WORLD);
MPI_Allreduce(MPI_IN_PLACE, &bufferMemory[0], 1, MPI_INT64_T, MPI_MAX, MPI_COMM_WORLD);
#endif
if (!parallel_init) {
@@ -1416,10 +1525,18 @@
}
envstr = getenv("NCCL_TESTS_MIN_BW");
const double check_avg_bw = envstr ? atof(envstr) : -1;
bw[0] /= bw_count[0];
writeResultFooter(errors, bw, check_avg_bw, program_invocation_short_name);
if (memory_report) {
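// Three figures are reported: communicator initialization (including device
// communicators when enabled), user buffer allocation, and memory consumed
// while running the collectives.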
memInfo_t memInfos[3];
memInfos[0] = { initGpuMem[0], "Initialization" };
memInfos[1] = { bufferMemory[0], "User-Allocated" };
memInfos[2] = { devMemUsed[0], "Collective" };
writeMemInfo(memInfos, 3);
}
finalizeFooter();
#ifdef MPI_SUPPORT
MPI_Comm_free(&mpi_comm);

View File

@@ -155,6 +155,10 @@ struct threadArgs {
struct testColl* collTest;
int64_t* initGpuMem;
int64_t* bufferMemory;
int64_t* devMemUsed;
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
void** sendRegHandles;
void** recvRegHandles;

View File

@@ -19,6 +19,8 @@
#include "util.h"
#include <assert.h>
#include <errno.h>
#include <string>
#include <iomanip>
#include <sstream> // for std::stringstream used below
#include <cmath> // for std::fabs used below
#define PRINT if (is_main_thread) printf
@@ -641,8 +643,8 @@ void writeResultFooter(const int errors[], const double bw[], double check_avg_b
jsonFinishList();
}
PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK");
PRINT("# Avg bus bandwidth : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
PRINT("# %-20s : %d %s\n", "Out of bounds values", errors[0], errors[0] ? "FAILED" : "OK");
PRINT("# %-20s : %g %s\n", "Avg bus bandwidth", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
PRINT("#\n");
PRINT("# Collective test concluded: %s\n", program_name);
@@ -660,6 +662,46 @@ void writeResultFooter(const int errors[], const double bw[], double check_avg_b
}
}
std::string getMemString(double amount) {
std::string postfix = " B";
if (std::fabs(amount) >= 1024.0*1024.0*1024.0) {
postfix = " GB";
amount /= 1024.0 * 1024.0 * 1024.0;
} else if (std::fabs(amount) >= 1024.0*1024.0) {
postfix = " MB";
amount /= 1024.0 * 1024.0;
} else if (std::fabs(amount) >= 1024.0) {
postfix = " KB";
amount /= 1024.0;
}
// Show more digits for small magnitudes: x.xx below 10, x.x below 100.
int precision = 0;
if (std::fabs(amount) < 10.0) {
precision = 2;
} else if (std::fabs(amount) < 100.0) {
precision = 1;
}
std::stringstream ss;
ss << std::fixed << std::setprecision(precision) << amount << postfix;
return ss.str();
}
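// Illustrative outputs (not from the commit): getMemString(1536.0) returns
// "1.50 KB", and getMemString(3.5*1024*1024*1024) returns "3.50 GB".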
void writeMemInfo(memInfo_t* memInfos, int numMemInfos) {
std::stringstream ss;
int64_t totalAmount = 0;
for (int i = 0; i < numMemInfos; i++) {
ss << memInfos[i].name << " "
<< getMemString(memInfos[i].amount)
<< " ";
if (i < numMemInfos - 1) {
ss << "| ";
}
totalAmount += memInfos[i].amount;
}
ss << "| Total " << getMemString(totalAmount);
PRINT("# %-20s : %s\n", "GPU memory usage", ss.str().c_str());
}
// Write out remaining errors to stdout/json.
void writeErrors() {
const char *error = ncclGetLastError(NULL);
@@ -678,3 +720,6 @@ void writeErrors() {
}
}
void finalizeFooter() {
PRINT("#\n");
}

View File

@@ -8,6 +8,11 @@
#include "common.h"
struct memInfo_t {
int64_t amount;
const char* name;
};
// Try to set up JSON file output. If MPI is used, only rank 0 will proceed.
// This should be called by only a single thread.
// If 'in_path' is NULL, we stop.
@@ -32,6 +37,8 @@ void writeBenchmarkLineBody(double timeUsec, double algBw, double busBw, bool re
testResult_t writeDeviceReport(size_t *maxMem, int localRank, int proc, int totalProcs, int color, const char hostname[], const char *program_name);
void writeResultHeader(bool report_cputime, bool report_timestamps);
void writeResultFooter(const int errors[], const double bw[], double check_avg_bw, const char *program_name);
void finalizeFooter();
void writeMemInfo(memInfo_t* memInfos, int numMemInfos);
void writeErrors();
#endif