diff --git a/src/common.cu b/src/common.cu
index 69b892a..b7c3e0c 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -16,6 +16,10 @@
 
 #include "../verifiable/verifiable.h"
 
+// Integer ceiling division: (x + y - 1) / y. Arguments are evaluated more than once.
+#define DIVUP(x, y) \
+    (((x)+(y)-1)/(y))
+
 int test_ncclVersion = 0; // init'd with ncclGetVersion()
 
 #if NCCL_MAJOR >= 2
@@ -1047,10 +1051,19 @@ testResult_t run() {
   PRINT("%s", line);
 #endif
 
+  // Reserve 1GiB of memory for each 16GiB installed, but limit to a max of 4GiB
+  const size_t GB = (1ULL << 30);
+  size_t reserveMem = std::min(DIVUP(maxMem, 16*GB) * 1*GB, 4*GB);
+  // Total headroom is the scaled reserve plus a fixed 1GiB. Clamp it to maxMem:
+  // the subtraction below is unsigned and would otherwise wrap around to a huge
+  // value when less than that much memory is available, disabling the cap.
+  size_t headroom = std::min<size_t>(maxMem, reserveMem + 1*GB);
   // We need sendbuff, recvbuff, expected (when datacheck enabled), plus 1G for the rest.
-  size_t memMaxBytes = (maxMem - (1<<30)) / (datacheck ? 3 : 2);
+  size_t memMaxBytes = (maxMem - headroom) / (datacheck ? 3 : 2);
   if (maxBytes > memMaxBytes) {
     maxBytes = memMaxBytes;
+    // Keep the size range well-formed after lowering its upper bound.
+    if (minBytes > maxBytes) minBytes = maxBytes;
     if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes);
   }
 