diff --git a/src/common.cu b/src/common.cu
index 7a91441..c932f7f 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -99,6 +99,7 @@ int cudaGraphLaunches = 0;
 static int report_cputime = 0;
 static int report_timestamps = 0;
 static int deviceImpl = 0;
+int memory_report = 0;
 int deviceCtaCount = 16; // Default number of CTAs for device implementation
 
@@ -710,18 +711,38 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
   return testSuccess;
 }
 
+static void getGPUMemoryInfo(int64_t* ptotalGpuMem, int64_t* pfreeGpuMem) {
+  size_t freeGpuMem = 0, totalGpuMem = 0;
+  cudaMemGetInfo(&freeGpuMem, &totalGpuMem);
+  if (ptotalGpuMem != nullptr) *ptotalGpuMem = totalGpuMem;
+  if (pfreeGpuMem != nullptr) *pfreeGpuMem = freeGpuMem;
+}
+
 testResult_t threadRunTests(struct threadArgs* args) {
+  // Capture the free memory on each GPU before running the tests
+  int64_t* totalGpuFreeMem = (int64_t*)calloc(args->nGpus*2, sizeof(int64_t));
+  for (int g = 0; g < args->nGpus; ++g) {
+    CUDACHECK(cudaSetDevice(args->gpus[g]));
+    getGPUMemoryInfo(nullptr, &totalGpuFreeMem[g]);
+  }
+
   // Set device to the first of our GPUs. If we don't do that, some operations
   // will be done on the current GPU (by default : 0) and if the GPUs are in
   // exclusive mode those operations will fail.
   CUDACHECK(cudaSetDevice(args->gpus[0]));
   TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]));
+
+  // Capture the memory used by the GPUs while running the tests
+  for (int g = 0; g < args->nGpus; ++g) {
+    CUDACHECK(cudaSetDevice(args->gpus[g]));
+    getGPUMemoryInfo(nullptr, &totalGpuFreeMem[g + args->nGpus]);
+    *args->devMemUsed = std::max(*args->devMemUsed, totalGpuFreeMem[g] - totalGpuFreeMem[g + args->nGpus]);
+  }
+  free(totalGpuFreeMem);
   return testSuccess;
 }
 
 testResult_t threadInit(struct threadArgs* args) {
-  char hostname[1024];
-  getHostName(hostname, 1024);
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   //set main thread again
@@ -729,6 +750,13 @@ testResult_t threadInit(struct threadArgs* args) {
   jsonIdentifyWriter(is_main_thread);
 
+  // Capture GPU memory before initializing the NCCL communicators
+  int64_t* initFreeGpuMem = (int64_t*)calloc(args->nGpus*3, sizeof(int64_t));
+  for (int g = 0; g < args->nGpus; ++g) {
+    CUDACHECK(cudaSetDevice(args->gpus[g]));
+    getGPUMemoryInfo(nullptr, &initFreeGpuMem[g]);
+  }
+
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,14,0)
   ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
@@ -751,6 +779,14 @@ testResult_t threadInit(struct threadArgs* args) {
 #endif
   }
   NCCLCHECK(ncclGroupEnd());
+
+  // Capture the memory used by the GPUs after initializing the NCCL communicators
+  for (int g = 0; g < args->nGpus; ++g) {
+    CUDACHECK(cudaSetDevice(args->gpus[g]));
+    getGPUMemoryInfo(nullptr, &initFreeGpuMem[g + args->nGpus]);
+    *args->initGpuMem = std::max(*args->initGpuMem, initFreeGpuMem[g] - initFreeGpuMem[g + args->nGpus]);
+  }
+
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
   NCCLCHECK(ncclGroupStart());
   for (int i=0; i<args->nGpus; i++) {
@@ -767,6 +803,12 @@ testResult_t threadInit(struct threadArgs* args) {
   }
   NCCLCHECK(ncclGroupEnd());
 #endif
+  // Capture memory used by the test buffers
+  for (int g = 0; g < args->nGpus; ++g) {
+    CUDACHECK(cudaSetDevice(args->gpus[g]));
+    getGPUMemoryInfo(nullptr, &initFreeGpuMem[g + args->nGpus*2]);
+    *args->bufferMemory = std::max(*args->bufferMemory, initFreeGpuMem[g + args->nGpus] - initFreeGpuMem[g + args->nGpus*2]);
+  }
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0)
   /* Create device communicators based on test-specific requirements */
   if (deviceImpl) {
@@ -783,7 +825,17 @@ testResult_t threadInit(struct threadArgs* args) {
     }
     NCCLCHECK(ncclGroupEnd());
   }
+  // Capture memory used by the device communicators
+  int64_t deviceCommMaxMem = 0;
+  for (int g = 0; g < args->nGpus; ++g) {
+    CUDACHECK(cudaSetDevice(args->gpus[g]));
+    int64_t freeGpuMem;
+    getGPUMemoryInfo(nullptr, &freeGpuMem);
+    deviceCommMaxMem = std::max(deviceCommMaxMem, initFreeGpuMem[g + args->nGpus*2] - freeGpuMem);
+  }
+  *args->initGpuMem += deviceCommMaxMem;
 #endif
+  free(initFreeGpuMem);
 
   TESTCHECK(threadRunTests(args));
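Commentary: the instrumentation above is a snapshot/delta pattern. cudaMemGetInfo() reports a device's current free and total memory, so the drop in free memory across a phase bounds what that phase allocated. A minimal standalone sketch of the same idea (the helper name and the 64 MiB figure are illustrative, not from this patch):

```cpp
#include <cuda_runtime.h>
#include <cstdio>

// Illustrative helper: returns the drop in free device memory across fn().
// This is only an upper bound on fn's own usage: anything else allocating
// on the same device during the phase (other threads or processes) gets
// attributed to fn too, and the CUDA allocator rounds requests up to its
// allocation granularity.
template <typename F>
long long measureUsedBytes(int device, F fn) {
  size_t freeBefore = 0, freeAfter = 0, total = 0;
  cudaSetDevice(device);
  cudaMemGetInfo(&freeBefore, &total);
  fn();
  cudaSetDevice(device);  // fn() may have switched devices
  cudaMemGetInfo(&freeAfter, &total);
  return (long long)freeBefore - (long long)freeAfter;
}

int main() {
  long long used = measureUsedBytes(0, [] {
    void* p = nullptr;
    cudaMalloc(&p, 64 << 20);  // allocate 64 MiB, deliberately kept alive
  });
  printf("phase used ~%lld bytes\n", used);
  return 0;
}
```

The patch applies this pattern at three boundaries (before init, after init, after buffer allocation) and keeps the std::max over GPUs, i.e. the worst-case device rather than a sum.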
@@ -879,13 +931,15 @@ int main(int argc, char* argv[], char **envp) {
     {"cta_policy", required_argument, 0, 'x'},
     {"device_implementation", required_argument, 0, 'D'},
     {"device_cta_count", required_argument, 0, 'V'},
+    {"memory_report", required_argument, 0, 'M'},
+    {"help", no_argument, 0, 'h'},
     {}
   };
 
   while(1) {
     int c;
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:x:D:V:J:S:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:x:D:V:J:S:M:", longopts, &longindex);
 
     if (c == -1)
       break;
@@ -994,6 +1048,9 @@ int main(int argc, char* argv[], char **envp) {
       printf("Option -R (register) is not supported before NCCL 2.19. Ignoring\n");
 #endif
       break;
+    case 'M':
+      memory_report = (int)strtol(optarg, NULL, 0);
+      break;
     case 'x':
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
       ctaPolicy = (int)strtol(optarg, NULL, 0);
@@ -1008,8 +1065,7 @@ int main(int argc, char* argv[], char **envp) {
     case 'D':
       if (test_ncclVersion >= NCCL_VERSION(2,28,0)) {
         deviceImpl = (int)strtol(optarg, NULL, 0);
-      }
-      else {
+      } else {
         fprintf(stderr, "Option -D (device implementation) requires NCCL >= 2.28.0\n");
         return -1;
       }
@@ -1021,9 +1077,8 @@ int main(int argc, char* argv[], char **envp) {
           fprintf(stderr, "device_cta_count (-V) must be positive and less than 128, got %d. "
                   "Using default value 16.\n", deviceCtaCount);
           deviceCtaCount = 16;
-        }
-      }
-      else {
+        }
+      } else {
         fprintf(stderr, "Option -V (device CTA count) requires NCCL >= 2.28.0\n");
         return -1;
       }
@@ -1065,6 +1120,7 @@ int main(int argc, char* argv[], char **envp) {
       "[-x,--cta_policy <0/1/2> set CTA policy (NCCL_CTA_POLICY_DEFAULT (0), NCCL_CTA_POLICY_EFFICIENCY (1), NCCL_CTA_POLICY_ZERO (2)) (default: do not set)] \n\t"
       "[-D,--device_implementation enable device implementation (default: 0, use NCCL implementation; requires -R 2 if > 0)] \n\t"
      "[-V,--device_cta_count set number of CTAs for device implementation (default: 16)] \n\t"
+      "[-M,--memory_report <0/1> enable memory usage report (default: 0)] \n\t"
      "[-h,--help]\n",
      basename(argv[0]));
     return 0;
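With the option wired up, -M 1 (or --memory_report 1) adds one line to the footer, emitted by writeMemInfo() in src/util.cu further down. An illustrative session follows; the command line is a typical nccl-tests invocation and every byte value is an invented placeholder, not measured output:

```
$ ./all_reduce_perf -b 8 -e 128M -f 2 -g 8 -M 1
...
# Out of bounds values : 0 OK
# Avg bus bandwidth    : 42.13
# GPU memory usage     : Initialization 118 MB | User-Allocated 512 MB | Collective 1.00 MB | Total 631 MB
#
```

The three labels map onto the three accumulators introduced above: Initialization (communicator plus device-communicator setup), User-Allocated (test buffers and registration), and Collective (growth observed while the collectives actually run).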
" "Using default value 16.\n", deviceCtaCount); deviceCtaCount = 16; - } - } - else { + } + } else { fprintf(stderr, "Option -V (device CTA count) requires NCCL >= 2.28.0\n"); return -1; } @@ -1065,6 +1120,7 @@ int main(int argc, char* argv[], char **envp) { "[-x,--cta_policy <0/1/2> set CTA policy (NCCL_CTA_POLICY_DEFAULT (0), NCCL_CTA_POLICY_EFFICIENCY (1), NCCL_CTA_POLICY_ZERO (2)) (default: do not set)] \n\t" "[-D,--device_implementation enable device implementation (default: 0, use NCCL implementation; requires -R 2 if > 0)] \n\t" "[-V,--device_cta_count set number of CTAs for device implementation (default: 16)] \n\t" + "[-M,--memory_report <0/1> enable memory usage report (default: 0)] \n\t" "[-h,--help]\n", basename(argv[0])); return 0; @@ -1213,7 +1269,6 @@ testResult_t run() { for (int i=0; i= NCCL_VERSION(2,28,0) ncclDevComm devComms[nThreads*nGpus]; #endif + int64_t initGpuMem[nThreads] = {0}; + int64_t bufferMemory[nThreads] = {0}; if (!parallel_init) { + // Capture the memory used by the GPUs before initializing the NCCL communicators + int64_t* initFreeGpuMem = (int64_t*)calloc(nGpus*3, sizeof(int64_t)); + for (int g = 0; g < nGpus; ++g) { + CUDACHECK(cudaSetDevice(gpus[g])); + getGPUMemoryInfo(nullptr, &initFreeGpuMem[g]); + } + //if parallel init is not selected, use main thread to initialize NCCL #if NCCL_VERSION_CODE >= NCCL_VERSION(2,14,0) ncclConfig_t config = NCCL_CONFIG_INITIALIZER; #if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0) @@ -1277,9 +1341,22 @@ testResult_t run() { #endif } NCCLCHECK(ncclGroupEnd()); + + // Capture the memory used by the GPUs after initializing the NCCL communicators + for (int g = 0; g < nGpus; ++g) { + CUDACHECK(cudaSetDevice(gpus[g])); + getGPUMemoryInfo(nullptr, &initFreeGpuMem[g + nGpus]); + } + for ( size_t t = 0; t < nThreads; ++t) { + for (int g = 0; g < nGpus; ++g) { + initGpuMem[t] = std::max(initGpuMem[t], initFreeGpuMem[g] - initFreeGpuMem[g + nGpus]); + } + } #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0) NCCLCHECK(ncclGroupStart()); for (int i=0; i= NCCL_VERSION(2,27,0) if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) { NCCLCHECK(ncclCommWindowRegister(comms[i], sendbuffs[i], maxBytes, (ncclWindow_t*)&sendRegHandles[i], NCCL_WIN_COLL_SYMMETRIC)); @@ -1293,6 +1370,16 @@ testResult_t run() { } NCCLCHECK(ncclGroupEnd()); #endif + // Capture memory used by after allocating buffers + for (int g = 0; g < nGpus; ++g) { + CUDACHECK(cudaSetDevice(gpus[g])); + getGPUMemoryInfo(nullptr, &initFreeGpuMem[g + nGpus*2]); + } + for ( size_t t = 0; t < nThreads; ++t) { + for (int g = 0; g < nGpus; ++g) { + bufferMemory[t] = std::max(bufferMemory[t], initFreeGpuMem[g + nGpus] - initFreeGpuMem[g + nGpus*2]); + } + } #if NCCL_VERSION_CODE >= NCCL_VERSION(2,28,0) /* Create device communicators based on test-specific requirements */ if (deviceImpl) { @@ -1309,15 +1396,28 @@ testResult_t run() { } NCCLCHECK(ncclGroupEnd()); } + int64_t deviceCommMaxMem = 0; + for (int g = 0; g < nGpus; ++g) { + CUDACHECK(cudaSetDevice(gpus[g])); + int64_t freeGpuMem; + getGPUMemoryInfo(nullptr, &freeGpuMem); + deviceCommMaxMem = std::max(deviceCommMaxMem, initFreeGpuMem[g + nGpus*2] - freeGpuMem); + } + for ( size_t t = 0; t < nThreads; ++t) { + initGpuMem[t] += deviceCommMaxMem; + } #endif + free(initFreeGpuMem); } int errors[nThreads]; double bw[nThreads]; + int64_t devMemUsed[nThreads]; int bw_count[nThreads]; for (int t=0; t::min(); } fflush(stdout); @@ -1358,6 +1458,9 @@ testResult_t run() { threads[t].args.errors=errors+t; 
@@ -1376,11 +1479,17 @@ testResult_t run() {
       errors[0] += errors[t];
       bw[0] += bw[t];
       bw_count[0] += bw_count[t];
+      devMemUsed[0] = std::max(devMemUsed[0], devMemUsed[t]);
+      initGpuMem[0] = std::max(initGpuMem[0], initGpuMem[t]);
+      bufferMemory[0] = std::max(bufferMemory[0], bufferMemory[t]);
     }
   }
 
 #ifdef MPI_SUPPORT
   MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(MPI_IN_PLACE, &devMemUsed[0], 1, MPI_INT64_T, MPI_MAX, MPI_COMM_WORLD);
+  MPI_Allreduce(MPI_IN_PLACE, &initGpuMem[0], 1, MPI_INT64_T, MPI_MAX, MPI_COMM_WORLD);
+  MPI_Allreduce(MPI_IN_PLACE, &bufferMemory[0], 1, MPI_INT64_T, MPI_MAX, MPI_COMM_WORLD);
 #endif
 
   if (!parallel_init) {
@@ -1416,10 +1525,18 @@ testResult_t run() {
   }
 
   envstr = getenv("NCCL_TESTS_MIN_BW");
-  double check_avg_bw = envstr ? atof(envstr) : -1;
+  const double check_avg_bw = envstr ? atof(envstr) : -1;
   bw[0] /= bw_count[0];
 
   writeResultFooter(errors, bw, check_avg_bw, program_invocation_short_name);
+  if (memory_report) {
+    memInfo_t memInfos[3];
+    memInfos[0] = { initGpuMem[0], "Initialization" };
+    memInfos[1] = { bufferMemory[0], "User-Allocated" };
+    memInfos[2] = { devMemUsed[0], "Collective" };
+    writeMemInfo(memInfos, 3);
+  }
+
+  finalizeFooter();
 #ifdef MPI_SUPPORT
   MPI_Comm_free(&mpi_comm);
diff --git a/src/common.h b/src/common.h
index b0b250c..3f3a0e2 100644
--- a/src/common.h
+++ b/src/common.h
@@ -155,6 +155,10 @@ struct threadArgs {
   struct testColl* collTest;
 
+  int64_t* initGpuMem;
+  int64_t* bufferMemory;
+  int64_t* devMemUsed;
+
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
   void** sendRegHandles;
   void** recvRegHandles;
diff --git a/src/util.cu b/src/util.cu
index 1e4d283..5585a70 100644
--- a/src/util.cu
+++ b/src/util.cu
@@ -19,6 +19,9 @@
 #include "util.h"
 #include <string>
 #include <vector>
+#include <sstream>
+#include <iomanip>
+#include <cmath>
 
 #define PRINT if (is_main_thread) printf
@@ -641,8 +643,8 @@ void writeResultFooter(const int errors[], const double bw[], double check_avg_b
     jsonFinishList();
   }
 
-  PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK");
-  PRINT("# Avg bus bandwidth    : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
+  PRINT("# %-20s : %d %s\n", "Out of bounds values", errors[0], errors[0] ? "FAILED" : "OK");
+  PRINT("# %-20s : %g %s\n", "Avg bus bandwidth", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
 
   PRINT("#\n");
   PRINT("# Collective test concluded: %s\n", program_name);
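Commentary: across ranks the three counters are combined with MPI_Allreduce and MPI_MAX, so the final report is the worst case over every rank (threads were already folded into index 0 with std::max above), not a cluster-wide total. A self-contained sketch of that reduction, with made-up per-rank values:

```cpp
#include <mpi.h>
#include <cstdint>
#include <cstdio>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  // Pretend each rank measured a different local peak (hypothetical numbers).
  int64_t devMemUsed = 100 + rank;  // bytes, per-rank maximum

  // In-place max-reduction: every rank ends up holding the global maximum.
  MPI_Allreduce(MPI_IN_PLACE, &devMemUsed, 1, MPI_INT64_T, MPI_MAX, MPI_COMM_WORLD);

  if (rank == 0) printf("worst-case usage: %lld bytes\n", (long long)devMemUsed);
  MPI_Finalize();
  return 0;
}
```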
"FAILED" : "OK")); PRINT("#\n"); PRINT("# Collective test concluded: %s\n", program_name); @@ -660,6 +662,46 @@ void writeResultFooter(const int errors[], const double bw[], double check_avg_b } } +std::string getMemString(double amount) { + std::string postfix = " B"; + if (abs(amount) >= 1024.0*1024.0*1024.0) { + postfix = " GB"; + amount /= 1024.0 * 1024.0 * 1024.0; + } else if (abs(amount) >= 1024.0*1024.0) { + postfix = " MB"; + amount /= 1024.0 * 1024.0; + } else if (abs(amount) >= 1024.0) { + postfix = " KB"; + amount /= 1024.0; + } + int precision = 0; + if (abs(amount) < 10.0) { + precision = 2; + } else if (abs(amount) < 100.0) { + precision = 1; + } + std::stringstream ss; + ss << std::fixed << std::setprecision(precision) << amount << postfix; + return ss.str(); +} + +void writeMemInfo(memInfo_t* memInfos, int numMemInfos) { + + std::stringstream ss; + uint64_t maxAmount = 0; + for (int i = 0; i < numMemInfos; i++) { + ss << memInfos[i].name << " " + << getMemString(memInfos[i].amount) + << " "; + if (i < numMemInfos - 1) { + ss << "| "; + } + maxAmount += memInfos[i].amount; + } + ss << "| Total " << getMemString(maxAmount); + PRINT("# %-20s : %s\n", "GPU memory usage", ss.str().c_str()); +} + // Write out remaining errors to stdout/json. void writeErrors() { const char *error = ncclGetLastError(NULL); @@ -678,3 +720,6 @@ void writeErrors() { } } +void finalizeFooter() { + PRINT("#\n"); +} diff --git a/src/util.h b/src/util.h index bea21ac..059ee7c 100644 --- a/src/util.h +++ b/src/util.h @@ -8,6 +8,11 @@ #include "common.h" +struct memInfo_t { + int64_t amount; + const char* name; +}; + // Try to set up JSON file output. If MPI is used, only rank 0 will proceed. // This should be called by only a single thread. // If 'in_path' is NULL, we stop. @@ -32,6 +37,8 @@ void writeBenchmarkLineBody(double timeUsec, double algBw, double busBw, bool re testResult_t writeDeviceReport(size_t *maxMem, int localRank, int proc, int totalProcs, int color, const char hostname[], const char *program_name); void writeResultHeader(bool report_cputime, bool report_timestamps); void writeResultFooter(const int errors[], const double bw[], double check_avg_bw, const char *program_name); +void finalizeFooter(); +void writeMemInfo(memInfo_t* memInfos, int numMemInfos); void writeErrors(); #endif