mirror of
https://github.com/NVIDIA/nccl-tests.git
synced 2026-04-25 08:58:18 +08:00
Merge branch 'bfloat16'
This commit is contained in:
commit
f476f4a17a
@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
@ -70,7 +70,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%)
|
||||
DST_DIR := $(BUILDDIR)
|
||||
SRC_FILES := $(wildcard *.cu)
|
||||
OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
|
||||
BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall
|
||||
BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter sendrecv hypercube
|
||||
BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
|
||||
|
||||
build: ${BIN_FILES}
|
||||
|
||||
@ -8,15 +8,15 @@
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %6s out-of-place in-place \n", "", "", "");
|
||||
PRINT("# %10s %12s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type",
|
||||
PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", "");
|
||||
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "",
|
||||
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %6s", size, count, typeName);
|
||||
PRINT("%12li %12li %8s", size, count, typeName);
|
||||
}
|
||||
|
||||
void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
@ -84,7 +84,7 @@ testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t
|
||||
run_types = &type;
|
||||
run_typenames = &typeName;
|
||||
} else {
|
||||
type_count = ncclNumTypes;
|
||||
type_count = test_typenum;
|
||||
run_types = test_types;
|
||||
run_typenames = test_typenames;
|
||||
}
|
||||
|
||||
@ -8,15 +8,15 @@
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
|
||||
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %6s %6s", size, count, typeName, opName);
|
||||
PRINT("%12li %12li %8s %6s", size, count, typeName, opName);
|
||||
}
|
||||
|
||||
void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
@ -83,7 +83,7 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t
|
||||
run_types = &type;
|
||||
run_typenames = &typeName;
|
||||
} else {
|
||||
type_count = ncclNumTypes;
|
||||
type_count = test_typenum;
|
||||
run_types = test_types;
|
||||
run_typenames = test_typenames;
|
||||
}
|
||||
@ -93,7 +93,7 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t
|
||||
run_ops = &op;
|
||||
run_opnames = &opName;
|
||||
} else {
|
||||
op_count = ncclNumOps;
|
||||
op_count = test_opnum;
|
||||
run_ops = test_ops;
|
||||
run_opnames = test_opnames;
|
||||
}
|
||||
|
||||
@ -8,15 +8,15 @@
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
|
||||
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %6s %6s", size, count, typeName, opName);
|
||||
PRINT("%12li %12li %8s %6s", size, count, typeName, opName);
|
||||
}
|
||||
|
||||
void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
@ -102,7 +102,7 @@ testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t t
|
||||
run_types = &type;
|
||||
run_typenames = &typeName;
|
||||
} else {
|
||||
type_count = ncclNumTypes;
|
||||
type_count = test_typenum;
|
||||
run_types = test_types;
|
||||
run_typenames = test_typenames;
|
||||
}
|
||||
|
||||
@ -8,15 +8,15 @@
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root",
|
||||
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %6s %6i", size, count, typeName, root);
|
||||
PRINT("%12li %12li %8s %6i", size, count, typeName, root);
|
||||
}
|
||||
|
||||
void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
@ -92,7 +92,7 @@ testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t
|
||||
run_types = &type;
|
||||
run_typenames = &typeName;
|
||||
} else {
|
||||
type_count = ncclNumTypes;
|
||||
type_count = test_typenum;
|
||||
run_types = test_types;
|
||||
run_typenames = test_typenames;
|
||||
}
|
||||
|
||||
306
src/common.cu
306
src/common.cu
@ -11,15 +11,41 @@
|
||||
#include <libgen.h>
|
||||
#include "cuda.h"
|
||||
|
||||
int test_ncclVersion = 0; // init'd with ncclGetVersion()
|
||||
|
||||
#if NCCL_MAJOR >= 2
|
||||
ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble};
|
||||
const char *test_typenames[ncclNumTypes] = {"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"};
|
||||
ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble,
|
||||
#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
|
||||
ncclBfloat16
|
||||
#endif
|
||||
};
|
||||
const char *test_typenames[ncclNumTypes] = {"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double",
|
||||
#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
|
||||
"bfloat16"
|
||||
#endif
|
||||
};
|
||||
|
||||
#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
|
||||
int test_typenum = 10;
|
||||
#else
|
||||
int test_typenum = 9;
|
||||
#endif
|
||||
|
||||
#else
|
||||
ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64};
|
||||
const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"};
|
||||
int test_typenum = 7;
|
||||
#endif
|
||||
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
|
||||
ncclRedOp_t test_ops[ncclNumOps] = {ncclSum, ncclProd, ncclMax, ncclMin, ncclAvg};
|
||||
const char *test_opnames[ncclNumOps] = {"sum", "prod", "max", "min", "avg"};
|
||||
int test_opnum = 5;
|
||||
#else
|
||||
ncclRedOp_t test_ops[ncclNumOps] = {ncclSum, ncclProd, ncclMax, ncclMin};
|
||||
const char *test_opnames[ncclNumOps] = {"sum", "prod", "max", "min"};
|
||||
int test_opnum = 4;
|
||||
#endif
|
||||
|
||||
thread_local int is_main_thread = 0;
|
||||
|
||||
@ -39,6 +65,13 @@ static int nccltype = ncclFloat;
|
||||
static int ncclroot = 0;
|
||||
static int parallel_init = 0;
|
||||
static int blocking_coll = 0;
|
||||
static int cudaGraphLaunches = 0;
|
||||
#ifdef MPI_SUPPORT
|
||||
// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX)
|
||||
static int average = 1;
|
||||
#endif
|
||||
|
||||
#define NUM_BLOCKS 32
|
||||
|
||||
static double parsesize(const char *value) {
|
||||
long long int units;
|
||||
@ -79,6 +112,9 @@ static double parsesize(const char *value) {
|
||||
double DeltaMaxValue(ncclDataType_t type) {
|
||||
switch(type) {
|
||||
case ncclHalf: return 1e-2;
|
||||
#if defined(__CUDA_BF16_TYPES_EXIST__)
|
||||
case ncclBfloat16: return 1e-2;
|
||||
#endif
|
||||
case ncclFloat: return 1e-5;
|
||||
case ncclDouble: return 1e-12;
|
||||
case ncclInt:
|
||||
@ -113,25 +149,32 @@ template<> __device__
|
||||
float toFloat(half a) {
|
||||
return __half2float(a);
|
||||
}
|
||||
#if defined(__CUDA_BF16_TYPES_EXIST__)
|
||||
template<> __device__
|
||||
float toFloat(__nv_bfloat16 a) {
|
||||
return __bfloat162float(a);
|
||||
}
|
||||
#endif
|
||||
|
||||
template<typename T, int BSIZE> __global__
|
||||
void deltaKern(void* A_, void* B_, size_t count, double* max) {
|
||||
const T* A = (const T*)A_;
|
||||
const T* B = (const T*)B_;
|
||||
__shared__ double temp[BSIZE];
|
||||
int tid = threadIdx.x;
|
||||
int tid = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
double locmax = 0.0;
|
||||
for(int i=tid; i<count; i+=blockDim.x) {
|
||||
for(size_t i=tid; i<count; i+=blockDim.x*gridDim.x) {
|
||||
|
||||
double delta = absDiff(A[i], B[i]);
|
||||
if( delta > locmax ) {
|
||||
locmax = delta;
|
||||
#ifdef DEBUG_PRINT
|
||||
if (delta > .1) printf("Error at %d/%ld : %f != %f\n", i, count, toFloat(A[i]), toFloat(B[i]));
|
||||
if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i]));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
tid = threadIdx.x;
|
||||
temp[tid] = locmax;
|
||||
for(int stride = BSIZE/2; stride > 1; stride>>=1) {
|
||||
__syncthreads();
|
||||
@ -140,34 +183,38 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
|
||||
}
|
||||
__syncthreads();
|
||||
if( threadIdx.x == 0)
|
||||
*max = temp[0] > temp[1] ? temp[0] : temp[1];
|
||||
max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1];
|
||||
}
|
||||
|
||||
|
||||
testResult_t CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax) {
|
||||
testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) {
|
||||
switch (type) {
|
||||
#if defined(__CUDA_BF16_TYPES_EXIST__)
|
||||
case ncclBfloat16:
|
||||
deltaKern<__nv_bfloat16, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
|
||||
#endif
|
||||
case ncclHalf:
|
||||
deltaKern<half, 512><<<1, 512>>>(results, expected, count, devmax); break;
|
||||
deltaKern<half, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
|
||||
case ncclFloat:
|
||||
deltaKern<float, 512><<<1, 512>>>(results, expected, count, devmax); break;
|
||||
deltaKern<float, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
|
||||
case ncclDouble:
|
||||
deltaKern<double, 512><<<1, 512>>>(results, expected, count, devmax); break;
|
||||
deltaKern<double, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
|
||||
|
||||
case ncclChar:
|
||||
#if NCCL_MAJOR >= 2
|
||||
case ncclUint8:
|
||||
#endif
|
||||
deltaKern<uint8_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
|
||||
deltaKern<uint8_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
|
||||
case ncclInt:
|
||||
#if NCCL_MAJOR >= 2
|
||||
case ncclUint32:
|
||||
#endif
|
||||
deltaKern<uint32_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
|
||||
deltaKern<uint32_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
|
||||
case ncclInt64:
|
||||
case ncclUint64:
|
||||
deltaKern<uint64_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
|
||||
deltaKern<uint64_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
|
||||
}
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
for (int i=1; i<NUM_BLOCKS; i++) devmax[0] = std::max(devmax[0], devmax[i]);
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
@ -192,6 +239,12 @@ template<>
|
||||
__device__ half testValue<half>(const size_t offset, const int rep, const int rank) {
|
||||
return __float2half(testValue<float>(offset, rep, rank));
|
||||
}
|
||||
#if defined(__CUDA_BF16_TYPES_EXIST__)
|
||||
template<>
|
||||
__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, const int rep, const int rank) {
|
||||
return __float2bfloat16(testValue<float>(offset, rep, rank));
|
||||
}
|
||||
#endif
|
||||
|
||||
// Operations
|
||||
template<typename T>
|
||||
@ -213,22 +266,50 @@ __device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(
|
||||
template<>
|
||||
__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? a : b; }
|
||||
|
||||
template<typename T, T (*Op)(T, T)>
|
||||
template<typename T>
|
||||
__device__ T ncclPostOpIdent(T x, int n) { return x; }
|
||||
|
||||
template<typename T>
|
||||
__device__ T ncclPostOpDiv(T x, int n) { return x/n; }
|
||||
template<>
|
||||
__device__ half ncclPostOpDiv<half>(half x, int n) { return __float2half(__half2float(x)/n); }
|
||||
#if defined(__CUDA_BF16_TYPES_EXIST__)
|
||||
template<>
|
||||
__device__ __nv_bfloat16 ncclPostOpDiv<__nv_bfloat16>(__nv_bfloat16 x, int n) { return __float2bfloat16(__bfloat162float(x)/n); }
|
||||
#endif
|
||||
|
||||
template<typename T, T (*Op)(T, T), T(*PostOp)(T,int)>
|
||||
__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) {
|
||||
for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o<N; o+=gridDim.x*blockDim.x) {
|
||||
T val = testValue<T>(o+offset, rep, 0);
|
||||
for (int i=1; i<nranks; i++) {
|
||||
val = Op(val, testValue<T>(o+offset, rep, i));
|
||||
}
|
||||
data[o] = val;
|
||||
data[o] = PostOp(val, nranks);
|
||||
}
|
||||
}
|
||||
|
||||
#define KERN(type, op) (void*)InitDataReduceKernel<type, op<type>>
|
||||
#define OPS(type) KERN(type, ncclOpSum), KERN(type, ncclOpProd), KERN(type, ncclOpMax), KERN(type, ncclOpMin)
|
||||
#define KERN(type, op, postop) (void*)InitDataReduceKernel<type, op<type>, postop<type> >
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
|
||||
#define OPS(type) \
|
||||
KERN(type, ncclOpSum, ncclPostOpIdent), \
|
||||
KERN(type, ncclOpProd, ncclPostOpIdent), \
|
||||
KERN(type, ncclOpMax, ncclPostOpIdent), \
|
||||
KERN(type, ncclOpMin, ncclPostOpIdent), \
|
||||
KERN(type, ncclOpSum/*Avg*/, ncclPostOpDiv)
|
||||
#else
|
||||
#define OPS(type) \
|
||||
KERN(type, ncclOpSum, ncclPostOpIdent), \
|
||||
KERN(type, ncclOpProd, ncclPostOpIdent), \
|
||||
KERN(type, ncclOpMax, ncclPostOpIdent), \
|
||||
KERN(type, ncclOpMin, ncclPostOpIdent)
|
||||
#endif
|
||||
|
||||
static void* const redInitDataKerns[ncclNumOps*ncclNumTypes] = {
|
||||
OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double)
|
||||
OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double),
|
||||
#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
|
||||
OPS(__nv_bfloat16)
|
||||
#endif
|
||||
};
|
||||
|
||||
testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) {
|
||||
@ -254,7 +335,10 @@ static void* const initDataKerns[ncclNumTypes] = {
|
||||
(void*)InitDataKernel<uint64_t>,
|
||||
(void*)InitDataKernel< half>,
|
||||
(void*)InitDataKernel< float>,
|
||||
(void*)InitDataKernel< double>
|
||||
(void*)InitDataKernel< double>,
|
||||
#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
|
||||
(void*)InitDataKernel<__nv_bfloat16>,
|
||||
#endif
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
@ -352,7 +436,7 @@ testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t*
|
||||
if (cudaErr != cudaErrorNotReady) CUDACHECK(cudaErr);
|
||||
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
|
||||
if (comms) {
|
||||
if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) {
|
||||
ncclResult_t ncclAsyncErr;
|
||||
NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr));
|
||||
if (ncclAsyncErr != ncclSuccess) {
|
||||
@ -379,8 +463,8 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
|
||||
// Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange
|
||||
size_t totalnbytes = max(args->sendBytes, args->expectedBytes);
|
||||
size_t shift = (totalnbytes * iter) % args->maxbytes;
|
||||
if (shift + totalnbytes > args->maxbytes) shift = 0;
|
||||
size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1;
|
||||
size_t shift = totalnbytes * (iter % steps);
|
||||
|
||||
if (args->nGpus > 1) NCCLCHECK(ncclGroupStart());
|
||||
for (int i = 0; i < args->nGpus; i++) {
|
||||
@ -416,6 +500,10 @@ testResult_t completeColl(struct threadArgs* args) {
|
||||
|
||||
testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) {
|
||||
size_t count = args->nbytes / wordSize(type);
|
||||
if (datacheck) {
|
||||
// Initialize sendbuffs, recvbuffs and expected
|
||||
TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place));
|
||||
}
|
||||
|
||||
// Sync
|
||||
TESTCHECK(startColl(args, type, op, root, in_place, 0));
|
||||
@ -423,6 +511,15 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
|
||||
Barrier(args);
|
||||
|
||||
cudaGraph_t graphs[args->nGpus];
|
||||
cudaGraphExec_t graphExec[args->nGpus];
|
||||
if (cudaGraphLaunches >= 1) {
|
||||
// Begin cuda graph capture
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal));
|
||||
}
|
||||
}
|
||||
|
||||
// Performance Benchmark
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
for (int iter = 0; iter < iters; iter++) {
|
||||
@ -432,11 +529,57 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
}
|
||||
if (agg_iters>1) NCCLCHECK(ncclGroupEnd());
|
||||
}
|
||||
|
||||
if (cudaGraphLaunches >= 1) {
|
||||
// End cuda graph capture
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
|
||||
}
|
||||
// Instantiate cuda graph
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
|
||||
}
|
||||
// Resync CPU, restart timing, launch cuda graph
|
||||
Barrier(args);
|
||||
start = std::chrono::high_resolution_clock::now();
|
||||
for (int l=0; l<cudaGraphLaunches; l++) {
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TESTCHECK(completeColl(args));
|
||||
|
||||
auto delta = std::chrono::high_resolution_clock::now() - start;
|
||||
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
|
||||
deltaSec = deltaSec/(iters*agg_iters);
|
||||
if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches;
|
||||
#ifdef MPI_SUPPORT
|
||||
switch (average) {
|
||||
case 1:
|
||||
// Calculate the average time across all ranks
|
||||
MPI_Allreduce(MPI_IN_PLACE, &deltaSec, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
deltaSec = deltaSec/(args->nProcs*args->nThreads*args->nGpus);
|
||||
break;
|
||||
case 2:
|
||||
// Obtain the minimum time across all ranks
|
||||
MPI_Allreduce(MPI_IN_PLACE, &deltaSec, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
|
||||
break;
|
||||
case 3:
|
||||
// Obtain the maximum time across all ranks
|
||||
MPI_Allreduce(MPI_IN_PLACE, &deltaSec, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (cudaGraphLaunches >= 1) {
|
||||
//destroy cuda graph
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
CUDACHECK(cudaGraphExecDestroy(graphExec[i]));
|
||||
CUDACHECK(cudaGraphDestroy(graphs[i]));
|
||||
}
|
||||
}
|
||||
|
||||
double algBw, busBw;
|
||||
args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus);
|
||||
@ -450,10 +593,41 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
// Initialize sendbuffs, recvbuffs and expected
|
||||
TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place));
|
||||
|
||||
if (cudaGraphLaunches >= 1) {
|
||||
// Begin cuda graph capture for data check
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal));
|
||||
}
|
||||
}
|
||||
|
||||
//test validation in single itertion, should ideally be included into the multi-iteration run
|
||||
TESTCHECK(startColl(args, type, op, root, in_place, 0));
|
||||
|
||||
if (cudaGraphLaunches >= 1) {
|
||||
// End cuda graph capture
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
|
||||
}
|
||||
// Instantiate cuda graph
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
|
||||
}
|
||||
// Launch cuda graph
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
|
||||
}
|
||||
}
|
||||
|
||||
TESTCHECK(completeColl(args));
|
||||
|
||||
if (cudaGraphLaunches >= 1) {
|
||||
//destroy cuda graph
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
CUDACHECK(cudaGraphExecDestroy(graphExec[i]));
|
||||
CUDACHECK(cudaGraphDestroy(graphs[i]));
|
||||
}
|
||||
}
|
||||
|
||||
TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta));
|
||||
|
||||
//aggregate delta from all threads and procs
|
||||
@ -470,10 +644,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
}
|
||||
|
||||
double timeUsec = deltaSec*1.0E6;
|
||||
char timeStr[10];
|
||||
char timeStr[100];
|
||||
if (timeUsec > 10000.0) {
|
||||
sprintf(timeStr, "%7.0f", timeUsec);
|
||||
} else if (timeUsec > 100.0) {
|
||||
} else if (timeUsec >= 100.0) {
|
||||
sprintf(timeStr, "%7.1f", timeUsec);
|
||||
} else {
|
||||
sprintf(timeStr, "%7.2f", timeUsec);
|
||||
@ -587,6 +761,17 @@ int main(int argc, char* argv[]) {
|
||||
// Make sure everyline is flushed so that we see the progress of the test
|
||||
setlinebuf(stdout);
|
||||
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
|
||||
ncclGetVersion(&test_ncclVersion);
|
||||
#else
|
||||
test_ncclVersion = NCCL_VERSION_CODE;
|
||||
#endif
|
||||
//printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion);
|
||||
if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion < NCCL_VERSION(2,10,0)) {
|
||||
test_opnum -= 1; // exclude ncclAvg
|
||||
test_typenum -= 1; // exclude bfloat16
|
||||
}
|
||||
|
||||
// Parse args
|
||||
double parsed;
|
||||
int longindex;
|
||||
@ -606,12 +791,14 @@ int main(int argc, char* argv[]) {
|
||||
{"datatype", required_argument, 0, 'd'},
|
||||
{"root", required_argument, 0, 'r'},
|
||||
{"blocking", required_argument, 0, 'z'},
|
||||
{"cudagraph", required_argument, 0, 'G'},
|
||||
{"average", required_argument, 0, 'a'},
|
||||
{"help", no_argument, 0, 'h'}
|
||||
};
|
||||
|
||||
while(1) {
|
||||
int c;
|
||||
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:h", longopts, &longindex);
|
||||
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex);
|
||||
|
||||
if (c == -1)
|
||||
break;
|
||||
@ -649,7 +836,7 @@ int main(int argc, char* argv[]) {
|
||||
iters = (int)strtol(optarg, NULL, 0);
|
||||
break;
|
||||
case 'm':
|
||||
#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2
|
||||
#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2)
|
||||
agg_iters = (int)strtol(optarg, NULL, 0);
|
||||
#else
|
||||
fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n");
|
||||
@ -676,29 +863,21 @@ int main(int argc, char* argv[]) {
|
||||
case 'z':
|
||||
blocking_coll = strtol(optarg, NULL, 0);
|
||||
break;
|
||||
case 'h':
|
||||
fprintf(stderr, "USAGE: %s \n\t"
|
||||
"[-t,--nthreads <num threads>] \n\t"
|
||||
"[-g,--ngpus <gpus per thread>] \n\t"
|
||||
"[-b,--minbytes <min size in bytes>] \n\t"
|
||||
"[-e,--maxbytes <max size in bytes>] \n\t"
|
||||
"[-i,--stepbytes <increment size>] \n\t"
|
||||
"[-f,--stepfactor <increment factor>] \n\t"
|
||||
"[-n,--iters <iteration count>] \n\t"
|
||||
"[-m,--agg_iters <aggregated iteration count>] \n\t"
|
||||
"[-w,--warmup_iters <warmup iteration count>] \n\t"
|
||||
"[-p,--parallel_init <0/1>] \n\t"
|
||||
"[-c,--check <0/1>] \n\t"
|
||||
"[-o,--op <sum/prod/min/max/all>] \n\t"
|
||||
"[-d,--datatype <nccltype/all>] \n\t"
|
||||
"[-r,--root <root>] \n\t"
|
||||
"[-z,--blocking <0/1>] \n\t"
|
||||
"[-h,--help]\n",
|
||||
basename(argv[0]));
|
||||
return 0;
|
||||
case 'G':
|
||||
#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030
|
||||
cudaGraphLaunches = strtol(optarg, NULL, 0);
|
||||
#else
|
||||
printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n");
|
||||
#endif
|
||||
break;
|
||||
#ifdef MPI_SUPPORT
|
||||
case 'a':
|
||||
average = (int)strtol(optarg, NULL, 0);
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
fprintf(stderr, "invalid option \n");
|
||||
fprintf(stderr, "USAGE: %s \n\t"
|
||||
if (c != 'h') printf("invalid option '%c'\n", c);
|
||||
printf("USAGE: %s \n\t"
|
||||
"[-t,--nthreads <num threads>] \n\t"
|
||||
"[-g,--ngpus <gpus per thread>] \n\t"
|
||||
"[-b,--minbytes <min size in bytes>] \n\t"
|
||||
@ -710,10 +889,18 @@ int main(int argc, char* argv[]) {
|
||||
"[-w,--warmup_iters <warmup iteration count>] \n\t"
|
||||
"[-p,--parallel_init <0/1>] \n\t"
|
||||
"[-c,--check <0/1>] \n\t"
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
|
||||
"[-o,--op <sum/prod/min/max/avg/all>] \n\t"
|
||||
#else
|
||||
"[-o,--op <sum/prod/min/max/all>] \n\t"
|
||||
#endif
|
||||
"[-d,--datatype <nccltype/all>] \n\t"
|
||||
"[-r,--root <root>] \n\t"
|
||||
"[-z,--blocking <0/1>] \n\t"
|
||||
"[-G,--cudagraph <num graph launches>] \n\t"
|
||||
#ifdef MPI_SUPPORT
|
||||
"[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
|
||||
#endif
|
||||
"[-h,--help]\n",
|
||||
basename(argv[0]));
|
||||
return 0;
|
||||
@ -728,7 +915,8 @@ int main(int argc, char* argv[]) {
|
||||
#ifdef MPI_SUPPORT
|
||||
MPI_Init(&argc, &argv);
|
||||
#endif
|
||||
return run();
|
||||
TESTCHECK(run());
|
||||
return 0;
|
||||
}
|
||||
|
||||
testResult_t run() {
|
||||
@ -760,6 +948,7 @@ testResult_t run() {
|
||||
#define MAX_LINE 2048
|
||||
char line[MAX_LINE];
|
||||
int len = 0;
|
||||
size_t maxMem = ~0;
|
||||
for (int i=0; i<nThreads*nGpus; i++) {
|
||||
int cudaDev = localRank*nThreads*nGpus+i;
|
||||
int rank = proc*nThreads*nGpus+i;
|
||||
@ -767,6 +956,7 @@ testResult_t run() {
|
||||
CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
|
||||
len += snprintf(line+len, MAX_LINE-len, "# Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n",
|
||||
rank, getpid(), hostname, cudaDev, prop.pciBusID, prop.name);
|
||||
maxMem = std::min(maxMem, prop.totalGlobalMem);
|
||||
}
|
||||
|
||||
#if MPI_SUPPORT
|
||||
@ -778,10 +968,18 @@ testResult_t run() {
|
||||
PRINT("%s", lines+MAX_LINE*p);
|
||||
free(lines);
|
||||
}
|
||||
MPI_Allreduce(MPI_IN_PLACE, &maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);
|
||||
#else
|
||||
PRINT("%s", line);
|
||||
#endif
|
||||
|
||||
// We need sendbuff, recvbuff, expected (when datacheck enabled), plus 1G for the rest.
|
||||
size_t memMaxBytes = (maxMem - (1<<30)) / (datacheck ? 3 : 2);
|
||||
if (maxBytes > memMaxBytes) {
|
||||
maxBytes = memMaxBytes;
|
||||
if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes);
|
||||
}
|
||||
|
||||
ncclUniqueId ncclId;
|
||||
if (proc == 0) {
|
||||
NCCLCHECK(ncclGetUniqueId(&ncclId));
|
||||
@ -823,7 +1021,7 @@ testResult_t run() {
|
||||
int errors[nThreads];
|
||||
double bw[nThreads];
|
||||
double* delta;
|
||||
CUDACHECK(cudaHostAlloc(&delta, sizeof(double)*nThreads, cudaHostAllocPortable | cudaHostAllocMapped));
|
||||
CUDACHECK(cudaHostAlloc(&delta, sizeof(double)*nThreads*NUM_BLOCKS, cudaHostAllocPortable | cudaHostAllocMapped));
|
||||
int bw_count[nThreads];
|
||||
for (int t=0; t<nThreads; t++) {
|
||||
bw[t] = 0.0;
|
||||
@ -863,7 +1061,7 @@ testResult_t run() {
|
||||
threads[t].args.sync = (volatile int*)sync;
|
||||
threads[t].args.sync_idx = 0;
|
||||
threads[t].args.deltaThreads = delta;
|
||||
threads[t].args.deltaHost = (delta + t);
|
||||
threads[t].args.deltaHost = (delta + t*NUM_BLOCKS);
|
||||
threads[t].args.delta = delta;
|
||||
threads[t].args.errors=errors+t;
|
||||
threads[t].args.bw=bw+t;
|
||||
@ -901,8 +1099,8 @@ testResult_t run() {
|
||||
|
||||
// Free off CUDA allocated memory
|
||||
for (int i=0; i<nGpus*nThreads; i++) {
|
||||
CUDACHECK(cudaFree(sendbuffs[i]));
|
||||
CUDACHECK(cudaFree(recvbuffs[i]));
|
||||
if (sendbuffs[i]) CUDACHECK(cudaFree((char*)sendbuffs[i]));
|
||||
if (recvbuffs[i]) CUDACHECK(cudaFree((char*)recvbuffs[i]));
|
||||
if (datacheck) CUDACHECK(cudaFree(expected[i]));
|
||||
}
|
||||
CUDACHECK(cudaFreeHost(delta));
|
||||
|
||||
13
src/common.h
13
src/common.h
@ -54,8 +54,8 @@ typedef enum {
|
||||
if (r!= testSuccess) { \
|
||||
char hostname[1024]; \
|
||||
getHostName(hostname, 1024); \
|
||||
printf(" .. %s: Test failure %s:%d\n", \
|
||||
hostname, \
|
||||
printf(" .. %s pid %d: Test failure %s:%d\n", \
|
||||
hostname, getpid(), \
|
||||
__FILE__,__LINE__); \
|
||||
return r; \
|
||||
} \
|
||||
@ -78,6 +78,7 @@ extern struct testColl allGatherTest;
|
||||
extern struct testColl reduceScatterTest;
|
||||
extern struct testColl broadcastTest;
|
||||
extern struct testColl reduceTest;
|
||||
extern struct testColl alltoAllTest;
|
||||
|
||||
struct testEngine {
|
||||
void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks);
|
||||
@ -213,6 +214,9 @@ static size_t wordSize(ncclDataType_t type) {
|
||||
#endif
|
||||
return 1;
|
||||
case ncclHalf:
|
||||
#if defined(__CUDA_BF16_TYPES_EXIST__)
|
||||
case ncclBfloat16:
|
||||
#endif
|
||||
//case ncclFloat16:
|
||||
return 2;
|
||||
case ncclInt:
|
||||
@ -232,10 +236,13 @@ static size_t wordSize(ncclDataType_t type) {
|
||||
}
|
||||
}
|
||||
|
||||
extern int test_ncclVersion; // init'd with ncclGetVersion()
|
||||
extern ncclDataType_t test_types[ncclNumTypes];
|
||||
extern const char *test_typenames[ncclNumTypes];
|
||||
extern ncclRedOp_t test_ops[ncclNumOps];
|
||||
extern const char *test_opnames[ncclNumOps];
|
||||
extern int test_opnum;
|
||||
extern int test_typenum;
|
||||
|
||||
static int ncclstringtotype(char *str) {
|
||||
for (int t=0; t<ncclNumTypes; t++) {
|
||||
@ -251,7 +258,7 @@ static int ncclstringtotype(char *str) {
|
||||
}
|
||||
|
||||
static int ncclstringtoop (char *str) {
|
||||
for (int o=0; o<ncclNumOps; o++) {
|
||||
for (int o=0; o<test_opnum; o++) {
|
||||
if (strcmp(str, test_opnames[o]) == 0) {
|
||||
return o;
|
||||
}
|
||||
|
||||
124
src/hypercube.cu
Normal file
124
src/hypercube.cu
Normal file
@ -0,0 +1,124 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
#define ALIGN 4
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", "");
|
||||
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %8s", size, count, typeName);
|
||||
}
|
||||
|
||||
void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
size_t base = (count/(ALIGN*nranks))*ALIGN;
|
||||
*sendcount = base;
|
||||
*recvcount = base*nranks;
|
||||
*sendInplaceOffset = base;
|
||||
*recvInplaceOffset = 0;
|
||||
*paramcount = base;
|
||||
}
|
||||
|
||||
testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
|
||||
size_t sendcount = args->sendBytes / wordSize(type);
|
||||
size_t recvcount = args->expectedBytes / wordSize(type);
|
||||
int nranks = args->nProcs*args->nThreads*args->nGpus;
|
||||
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
CUDACHECK(cudaSetDevice(gpuid));
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
|
||||
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
|
||||
TESTCHECK(InitData(data, sendcount, type, rep, rank));
|
||||
for (int j=0; j<nranks; j++) {
|
||||
TESTCHECK(InitData(((char*)args->expected[i])+args->sendBytes*j, sendcount, type, rep, j));
|
||||
}
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
|
||||
double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec;
|
||||
|
||||
*algBw = baseBw;
|
||||
double factor = 1;
|
||||
*busBw = baseBw * factor;
|
||||
}
|
||||
|
||||
testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
char* sbuff = (char*)sendbuff;
|
||||
char* rbuff = (char*)recvbuff;
|
||||
int nRanks;
|
||||
NCCLCHECK(ncclCommCount(comm, &nRanks));
|
||||
int rank;
|
||||
NCCLCHECK(ncclCommUserRank(comm, &rank));
|
||||
size_t rankSize = count * wordSize(type);
|
||||
|
||||
if (rbuff+rank*rankSize != sbuff) CUDACHECK(cudaMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, cudaMemcpyDeviceToDevice, stream));
|
||||
|
||||
// Hypercube AllGather
|
||||
for (int mask=1; mask<nRanks; mask<<=1) {
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
int s = rank & ~(mask-1);
|
||||
int r = s ^ mask;
|
||||
NCCLCHECK(ncclSend(rbuff+s*rankSize, count*mask, type, rank^mask, comm, stream));
|
||||
NCCLCHECK(ncclRecv(rbuff+r*rankSize, count*mask, type, rank^mask, comm, stream));
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
}
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
struct testColl hyperCubeTest = {
|
||||
"HyperCube",
|
||||
HyperCubeGetCollByteCount,
|
||||
HyperCubeInitData,
|
||||
HyperCubeGetBw,
|
||||
HyperCubeRunColl
|
||||
};
|
||||
|
||||
void HyperCubeGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
||||
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
|
||||
HyperCubeGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
|
||||
}
|
||||
|
||||
testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
|
||||
args->collTest = &hyperCubeTest;
|
||||
ncclDataType_t *run_types;
|
||||
const char **run_typenames;
|
||||
int type_count;
|
||||
|
||||
if ((int)type != -1) {
|
||||
type_count = 1;
|
||||
run_types = &type;
|
||||
run_typenames = &typeName;
|
||||
} else {
|
||||
type_count = test_typenum;
|
||||
run_types = test_types;
|
||||
run_typenames = test_typenames;
|
||||
}
|
||||
|
||||
for (int i=0; i<type_count; i++) {
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
|
||||
}
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
struct testEngine hyperCubeEngine = {
|
||||
HyperCubeGetBuffSize,
|
||||
HyperCubeRunTest
|
||||
};
|
||||
|
||||
#pragma weak ncclTestEngine=hyperCubeEngine
|
||||
@ -8,15 +8,15 @@
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %6s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "root",
|
||||
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "root",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %6s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "",
|
||||
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %6s %6s %6i", size, count, typeName, opName, root);
|
||||
PRINT("%12li %12li %8s %6s %6i", size, count, typeName, opName, root);
|
||||
}
|
||||
|
||||
void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
@ -83,7 +83,7 @@ testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t typ
|
||||
run_types = &type;
|
||||
run_typenames = &typeName;
|
||||
} else {
|
||||
type_count = ncclNumTypes;
|
||||
type_count = test_typenum;
|
||||
run_types = test_types;
|
||||
run_typenames = test_typenames;
|
||||
}
|
||||
@ -93,7 +93,7 @@ testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t typ
|
||||
run_ops = &op;
|
||||
run_opnames = &opName;
|
||||
} else {
|
||||
op_count = ncclNumOps;
|
||||
op_count = test_opnum;
|
||||
run_ops = test_ops;
|
||||
run_opnames = test_opnames;
|
||||
}
|
||||
|
||||
@ -8,15 +8,15 @@
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
|
||||
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %6s %6s", size, count, typeName, opName);
|
||||
PRINT("%12li %12li %8s %6s", size, count, typeName, opName);
|
||||
}
|
||||
|
||||
void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
@ -84,7 +84,7 @@ testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataTyp
|
||||
run_types = &type;
|
||||
run_typenames = &typeName;
|
||||
} else {
|
||||
type_count = ncclNumTypes;
|
||||
type_count = test_typenum;
|
||||
run_types = test_types;
|
||||
run_typenames = test_typenames;
|
||||
}
|
||||
@ -94,7 +94,7 @@ testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataTyp
|
||||
run_opnames = &opName;
|
||||
op_count = 1;
|
||||
} else {
|
||||
op_count = sizeof(test_ops)/sizeof(test_ops[0]);
|
||||
op_count = test_opnum;
|
||||
run_ops = test_ops;
|
||||
run_opnames = test_opnames;
|
||||
}
|
||||
|
||||
125
src/scatter.cu
Normal file
125
src/scatter.cu
Normal file
@ -0,0 +1,125 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %8s %6i", size, count, typeName, root);
|
||||
}
|
||||
|
||||
void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
*sendcount = (count/nranks)*nranks;
|
||||
*recvcount = count/nranks;
|
||||
*sendInplaceOffset = 0;
|
||||
*recvInplaceOffset = count/nranks;
|
||||
*paramcount = count/nranks;
|
||||
}
|
||||
|
||||
testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
|
||||
size_t sendcount = args->sendBytes / wordSize(type);
|
||||
size_t recvcount = args->expectedBytes / wordSize(type);
|
||||
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
CUDACHECK(cudaSetDevice(gpuid));
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
|
||||
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
|
||||
if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank));
|
||||
TESTCHECK(InitData(args->expected[i], recvcount, type, rep+rank*recvcount, root));
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
|
||||
double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec;
|
||||
|
||||
*algBw = baseBw;
|
||||
double factor = ((double)(nranks-1))/((double)(nranks));
|
||||
*busBw = baseBw * factor;
|
||||
}
|
||||
|
||||
testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
int nRanks;
|
||||
NCCLCHECK(ncclCommCount(comm, &nRanks));
|
||||
int rank;
|
||||
NCCLCHECK(ncclCommUserRank(comm, &rank));
|
||||
size_t rankOffset = count * wordSize(type);
|
||||
if (count == 0) return testSuccess;
|
||||
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
if (rank == root) {
|
||||
for (int r=0; r<nRanks; r++) {
|
||||
NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, count, type, r, comm, stream));
|
||||
}
|
||||
}
|
||||
NCCLCHECK(ncclRecv(recvbuff, count, type, root, comm, stream));
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
struct testColl scatterTest = {
|
||||
"Scatter",
|
||||
ScatterGetCollByteCount,
|
||||
ScatterInitData,
|
||||
ScatterGetBw,
|
||||
ScatterRunColl
|
||||
};
|
||||
|
||||
void ScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
||||
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
|
||||
ScatterGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
|
||||
}
|
||||
|
||||
testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
|
||||
args->collTest = &scatterTest;
|
||||
ncclDataType_t *run_types;
|
||||
const char **run_typenames;
|
||||
int type_count;
|
||||
int begin_root, end_root;
|
||||
|
||||
if ((int)type != -1) {
|
||||
type_count = 1;
|
||||
run_types = &type;
|
||||
run_typenames = &typeName;
|
||||
} else {
|
||||
type_count = test_typenum;
|
||||
run_types = test_types;
|
||||
run_typenames = test_typenames;
|
||||
}
|
||||
|
||||
if (root != -1) {
|
||||
begin_root = end_root = root;
|
||||
} else {
|
||||
begin_root = 0;
|
||||
end_root = args->nProcs*args->nThreads*args->nGpus-1;
|
||||
}
|
||||
|
||||
for (int i=0; i<type_count; i++) {
|
||||
for (int j=begin_root; j<=end_root; j++) {
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", j));
|
||||
}
|
||||
}
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
struct testEngine scatterEngine = {
|
||||
ScatterGetBuffSize,
|
||||
ScatterRunTest
|
||||
};
|
||||
|
||||
#pragma weak ncclTestEngine=scatterEngine
|
||||
127
src/sendrecv.cu
Normal file
127
src/sendrecv.cu
Normal file
@ -0,0 +1,127 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", "");
|
||||
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %8s", size, count, typeName);
|
||||
}
|
||||
|
||||
void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
*sendcount = count;
|
||||
*recvcount = count;
|
||||
*sendInplaceOffset = 0;
|
||||
*recvInplaceOffset = 0;
|
||||
*paramcount = *sendcount;
|
||||
}
|
||||
|
||||
testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
|
||||
size_t sendcount = args->sendBytes / wordSize(type);
|
||||
size_t recvcount = args->expectedBytes / wordSize(type);
|
||||
int nranks = args->nProcs*args->nThreads*args->nGpus;
|
||||
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
CUDACHECK(cudaSetDevice(gpuid));
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
|
||||
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
|
||||
TESTCHECK(InitData(data, sendcount, type, rep, rank));
|
||||
int peer = (rank-1+nranks)%nranks;
|
||||
TESTCHECK(InitData(args->expected[i], recvcount, type, rep, peer));
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
// We don't support in-place sendrecv
|
||||
args->reportErrors = in_place ? 0 : 1;
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
|
||||
double baseBw = (double)(count * typesize) / 1.0E9 / sec;
|
||||
|
||||
*algBw = baseBw;
|
||||
double factor = 1;
|
||||
*busBw = baseBw * factor;
|
||||
}
|
||||
|
||||
testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
int nRanks;
|
||||
NCCLCHECK(ncclCommCount(comm, &nRanks));
|
||||
int rank;
|
||||
NCCLCHECK(ncclCommUserRank(comm, &rank));
|
||||
int recvPeer = (rank-1+nRanks) % nRanks;
|
||||
int sendPeer = (rank+1) % nRanks;
|
||||
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
NCCLCHECK(ncclSend(sendbuff, count, type, sendPeer, comm, stream));
|
||||
NCCLCHECK(ncclRecv(recvbuff, count, type, recvPeer, comm, stream));
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
struct testColl sendRecvTest = {
|
||||
"SendRecv",
|
||||
SendRecvGetCollByteCount,
|
||||
SendRecvInitData,
|
||||
SendRecvGetBw,
|
||||
SendRecvRunColl
|
||||
};
|
||||
|
||||
void SendRecvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
||||
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
|
||||
SendRecvGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
|
||||
}
|
||||
|
||||
testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
|
||||
args->collTest = &sendRecvTest;
|
||||
ncclDataType_t *run_types;
|
||||
ncclRedOp_t *run_ops;
|
||||
const char **run_typenames, **run_opnames;
|
||||
int type_count, op_count;
|
||||
|
||||
if ((int)type != -1) {
|
||||
type_count = 1;
|
||||
run_types = &type;
|
||||
run_typenames = &typeName;
|
||||
} else {
|
||||
type_count = test_typenum;
|
||||
run_types = test_types;
|
||||
run_typenames = test_typenames;
|
||||
}
|
||||
|
||||
if ((int)op != -1) {
|
||||
op_count = 1;
|
||||
run_ops = &op;
|
||||
run_opnames = &opName;
|
||||
} else {
|
||||
op_count = test_opnum;
|
||||
run_ops = test_ops;
|
||||
run_opnames = test_opnames;
|
||||
}
|
||||
|
||||
for (int i=0; i<type_count; i++) {
|
||||
for (int j=0; j<op_count; j++) {
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
|
||||
}
|
||||
}
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
struct testEngine sendRecvEngine = {
|
||||
SendRecvGetBuffSize,
|
||||
SendRecvRunTest
|
||||
};
|
||||
|
||||
#pragma weak ncclTestEngine=sendRecvEngine
|
||||
Loading…
Reference in New Issue
Block a user