mirror of
https://github.com/NVIDIA/nccl-tests.git
synced 2026-06-06 00:04:43 +00:00
Fix all tests that divide buffers by nranks so that they trim the per-rank buffer size down to a multiple of 16 bytes.
This ensures that, when the number of ranks is not a power of two, each rank's buffer address is still suitably aligned for performance.
This commit is contained in:
+3
-5
@@ -7,10 +7,8 @@
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
#define ALIGN 4
|
||||
|
||||
void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
size_t base = (count/(ALIGN*nranks))*ALIGN;
|
||||
void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
|
||||
size_t base = (count/nranks) & -(16/eltSize);
|
||||
*sendcount = base;
|
||||
*recvcount = base*nranks;
|
||||
*sendInplaceOffset = base;
|
||||
@@ -60,7 +58,7 @@ struct testColl allGatherTest = {
|
||||
|
||||
void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
||||
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
|
||||
AllGatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
|
||||
AllGatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
|
||||
}
|
||||
|
||||
testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
|
||||
|
||||
+2
-2
@@ -7,7 +7,7 @@
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
|
||||
*sendcount = count;
|
||||
*recvcount = count;
|
||||
*sendInplaceOffset = 0;
|
||||
@@ -55,7 +55,7 @@ struct testColl allReduceTest = {
|
||||
|
||||
void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
||||
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
|
||||
AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
|
||||
AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
|
||||
}
|
||||
|
||||
testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
|
||||
|
||||
+5
-5
@@ -7,12 +7,12 @@
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
*sendcount = (count/nranks)*nranks;
|
||||
*recvcount = (count/nranks)*nranks;
|
||||
void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
|
||||
*paramcount = (count/nranks) & -(16/eltSize);
|
||||
*sendcount = nranks*(*paramcount);
|
||||
*recvcount = *sendcount;
|
||||
*sendInplaceOffset = 0;
|
||||
*recvInplaceOffset = 0;
|
||||
*paramcount = count/nranks;
|
||||
}
|
||||
|
||||
testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
|
||||
@@ -74,7 +74,7 @@ struct testColl alltoAllTest = {
|
||||
|
||||
void AlltoAllGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
||||
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
|
||||
AlltoAllGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
|
||||
AlltoAllGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
|
||||
}
|
||||
|
||||
testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
|
||||
|
||||
+2
-2
@@ -7,7 +7,7 @@
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
|
||||
*sendcount = count;
|
||||
*recvcount = count;
|
||||
*sendInplaceOffset = 0;
|
||||
@@ -64,7 +64,7 @@ struct testColl broadcastTest = {
|
||||
|
||||
void BroadcastGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
||||
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
|
||||
BroadcastGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
|
||||
BroadcastGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
|
||||
}
|
||||
|
||||
testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
|
||||
|
||||
+1
-1
@@ -571,7 +571,7 @@ void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) {
|
||||
size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset;
|
||||
|
||||
count = size / wordSize(type);
|
||||
args->collTest->getCollByteCount(&sendCount, &recvCount, &paramCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks);
|
||||
args->collTest->getCollByteCount(&sendCount, &recvCount, &paramCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, wordSize(type), (size_t)nranks);
|
||||
|
||||
args->nbytes = paramCount * wordSize(type);
|
||||
args->sendBytes = sendCount * wordSize(type);
|
||||
|
||||
+1
-1
@@ -87,7 +87,7 @@ struct testColl {
|
||||
void (*getCollByteCount)(
|
||||
size_t *sendcount, size_t *recvcount, size_t *paramcount,
|
||||
size_t *sendInplaceOffset, size_t *recvInplaceOffset,
|
||||
size_t count, int nranks);
|
||||
size_t count, size_t eltSize, int nranks);
|
||||
testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type,
|
||||
ncclRedOp_t op, int root, int rep, int in_place);
|
||||
void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
|
||||
|
||||
+6
-6
@@ -7,12 +7,12 @@
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
*sendcount = count/nranks;
|
||||
*recvcount = (count/nranks)*nranks;
|
||||
*sendInplaceOffset = count/nranks;
|
||||
void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
|
||||
*sendcount = (count/nranks) & -(16/eltSize);
|
||||
*recvcount = (*sendcount)*nranks;
|
||||
*sendInplaceOffset = *sendcount;
|
||||
*recvInplaceOffset = 0;
|
||||
*paramcount = count/nranks;
|
||||
*paramcount = *sendcount;
|
||||
}
|
||||
|
||||
testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
|
||||
@@ -73,7 +73,7 @@ struct testColl gatherTest = {
|
||||
|
||||
void GatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
||||
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
|
||||
GatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
|
||||
GatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
|
||||
}
|
||||
|
||||
testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
|
||||
|
||||
+3
-3
@@ -9,8 +9,8 @@
|
||||
|
||||
#define ALIGN 4
|
||||
|
||||
void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
size_t base = (count/(ALIGN*nranks))*ALIGN;
|
||||
void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
|
||||
size_t base = (count/nranks) & -(16/eltSize);
|
||||
*sendcount = base;
|
||||
*recvcount = base*nranks;
|
||||
*sendInplaceOffset = base;
|
||||
@@ -78,7 +78,7 @@ struct testColl hyperCubeTest = {
|
||||
|
||||
void HyperCubeGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
||||
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
|
||||
HyperCubeGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
|
||||
HyperCubeGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
|
||||
}
|
||||
|
||||
testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
|
||||
|
||||
+2
-2
@@ -7,7 +7,7 @@
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
|
||||
*sendcount = count;
|
||||
*recvcount = count;
|
||||
*sendInplaceOffset = 0;
|
||||
@@ -54,7 +54,7 @@ struct testColl reduceTest = {
|
||||
|
||||
void ReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
||||
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
|
||||
ReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
|
||||
ReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
|
||||
}
|
||||
|
||||
testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
|
||||
|
||||
@@ -7,10 +7,8 @@
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
#define ALIGN 4
|
||||
|
||||
void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
size_t base = (count/(ALIGN*nranks))*ALIGN;
|
||||
void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
|
||||
size_t base = (count/nranks) & -(16/eltSize);
|
||||
*sendcount = base*nranks;
|
||||
*recvcount = base;
|
||||
*sendInplaceOffset = 0;
|
||||
@@ -59,7 +57,7 @@ struct testColl reduceScatterTest = {
|
||||
|
||||
void ReduceScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
||||
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
|
||||
ReduceScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
|
||||
ReduceScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
|
||||
}
|
||||
|
||||
testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
|
||||
|
||||
+6
-6
@@ -7,12 +7,12 @@
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
*sendcount = (count/nranks)*nranks;
|
||||
*recvcount = count/nranks;
|
||||
void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
|
||||
*recvcount = (count/nranks) & -(16/eltSize);
|
||||
*sendcount = (*recvcount)*nranks;
|
||||
*sendInplaceOffset = 0;
|
||||
*recvInplaceOffset = count/nranks;
|
||||
*paramcount = count/nranks;
|
||||
*recvInplaceOffset = *recvcount;
|
||||
*paramcount = *recvcount;
|
||||
}
|
||||
|
||||
testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
|
||||
@@ -69,7 +69,7 @@ struct testColl scatterTest = {
|
||||
|
||||
void ScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
||||
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
|
||||
ScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
|
||||
ScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
|
||||
}
|
||||
|
||||
testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
|
||||
|
||||
+2
-2
@@ -7,7 +7,7 @@
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
|
||||
*sendcount = count;
|
||||
*recvcount = count;
|
||||
*sendInplaceOffset = 0;
|
||||
@@ -68,7 +68,7 @@ struct testColl sendRecvTest = {
|
||||
|
||||
void SendRecvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
||||
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
|
||||
SendRecvGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
|
||||
SendRecvGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
|
||||
}
|
||||
|
||||
testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
|
||||
|
||||
Reference in New Issue
Block a user