mirror of
https://github.com/NVIDIA/nccl-tests.git
synced 2026-04-23 16:08:20 +08:00
Merge 2803191af6 into 501a149d57
This commit is contained in:
commit
44f2be9989
11
doc/PERFORMANCE.md
Normal file → Executable file
11
doc/PERFORMANCE.md
Normal file → Executable file
@ -130,6 +130,16 @@ Similarly to broadcast, all data need to be sent to the root, hence :
|
||||
And :
|
||||
|
||||
`B = S/t`
|
||||
|
||||
### Bisection
|
||||
|
||||
In the bisection operation, each rank is paired with a single other rank and sends him one message of size S, hence:
|
||||
|
||||
t = S/B
|
||||
|
||||
And :
|
||||
|
||||
B = S/t
|
||||
|
||||
### Summary
|
||||
|
||||
@ -141,5 +151,6 @@ To obtain a bus bandwidth which should be independent of the number of ranks _n_
|
||||
* Broadcast : 1
|
||||
* Reduce : 1
|
||||
* AlltoAll: (_n_-1)/_n_
|
||||
* Bisection : 1
|
||||
|
||||
The bus bandwidth should reflect the speed of the hardware bottleneck : NVLink, PCI, QPI, or network.
|
||||
|
||||
2
src/Makefile
Normal file → Executable file
2
src/Makefile
Normal file → Executable file
@ -91,7 +91,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%)
|
||||
DST_DIR := $(BUILDDIR)
|
||||
SRC_FILES := $(wildcard *.cu)
|
||||
OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
|
||||
BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube
|
||||
BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube bisection
|
||||
BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
|
||||
|
||||
build: ${BIN_FILES}
|
||||
|
||||
121
src/bisection.cu
Normal file
121
src/bisection.cu
Normal file
@ -0,0 +1,121 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
void BisectionGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
*sendcount = count;
|
||||
*recvcount = count;
|
||||
*sendInplaceOffset = 0;
|
||||
*recvInplaceOffset = 0;
|
||||
*paramcount = count;
|
||||
}
|
||||
|
||||
int getPeer(int rank, int n_ranks){
|
||||
return (rank + n_ranks/2) % n_ranks;
|
||||
}
|
||||
|
||||
|
||||
testResult_t BisectionInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
|
||||
size_t sendcount = args->sendBytes / wordSize(type);
|
||||
size_t recvcount = args->expectedBytes / wordSize(type);
|
||||
int nranks = args->nProcs*args->nThreads*args->nGpus;
|
||||
|
||||
if (nranks % 2 != 0){
|
||||
print("Bisection test should run on an even number of ranks.\n");
|
||||
return testNcclError;
|
||||
}
|
||||
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
CUDACHECK(cudaSetDevice(args->gpus[i]));
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
|
||||
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
|
||||
TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
|
||||
int peer = getPeer(rank, nranks);
|
||||
TESTCHECK(InitData(args->expected[i], recvcount, peer*recvcount, type, ncclSum, rep, 1, 0));
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
// We don't support in-place sendrecv
|
||||
args->reportErrors = in_place ? 0 : 1;
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
void BisectionGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
|
||||
*busBw = *algBw = (double)(count * typesize) / 1.0E9 / sec;
|
||||
}
|
||||
|
||||
testResult_t BisectionRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
int n_ranks, comm_rank, peer;
|
||||
|
||||
NCCLCHECK(ncclCommUserRank(comm, &comm_rank));
|
||||
NCCLCHECK(ncclCommCount(comm, &n_ranks));
|
||||
|
||||
peer = getPeer(comm_rank, n_ranks);
|
||||
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
NCCLCHECK(ncclSend(sendbuff, count, type, peer, comm, stream));
|
||||
NCCLCHECK(ncclRecv(recvbuff, count, type, peer, comm, stream));
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
struct testColl bisectionTest = {
|
||||
"Bisection",
|
||||
BisectionGetCollByteCount,
|
||||
BisectionInitData,
|
||||
BisectionGetBw,
|
||||
BisectionRunColl
|
||||
};
|
||||
|
||||
void BisectionGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
||||
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
|
||||
BisectionGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
|
||||
}
|
||||
|
||||
testResult_t BisectionRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
|
||||
args->collTest = &bisectionTest;
|
||||
ncclDataType_t *run_types;
|
||||
ncclRedOp_t *run_ops;
|
||||
const char **run_typenames, **run_opnames;
|
||||
int type_count, op_count;
|
||||
|
||||
if ((int)type != -1) {
|
||||
type_count = 1;
|
||||
run_types = &type;
|
||||
run_typenames = &typeName;
|
||||
} else {
|
||||
type_count = test_typenum;
|
||||
run_types = test_types;
|
||||
run_typenames = test_typenames;
|
||||
}
|
||||
|
||||
if ((int)op != -1) {
|
||||
op_count = 1;
|
||||
run_ops = &op;
|
||||
run_opnames = &opName;
|
||||
} else {
|
||||
op_count = test_opnum;
|
||||
run_ops = test_ops;
|
||||
run_opnames = test_opnames;
|
||||
}
|
||||
|
||||
for (int i=0; i<type_count; i++) {
|
||||
for (int j=0; j<op_count; j++) {
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
|
||||
}
|
||||
}
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
struct testEngine bisectionEngine = {
|
||||
BisectionGetBuffSize,
|
||||
BisectionRunTest
|
||||
};
|
||||
|
||||
#pragma weak ncclTestEngine=bisectionEngine
|
||||
Loading…
Reference in New Issue
Block a user