From e37545e4911c210558baba789941ea7bf59db00d Mon Sep 17 00:00:00 2001
From: David Addison
Date: Mon, 15 Mar 2021 14:44:06 -0700
Subject: [PATCH 1/7] Add support for new datatype: bfloat16

---
 src/common.cu | 41 +++++++++++++++++++++++++++++++++++++----
 src/common.h  |  3 +++
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index ff4e1fd..4589593 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -12,8 +12,16 @@
 #include "cuda.h"
 
 #if NCCL_MAJOR >= 2
-ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble};
-const char *test_typenames[ncclNumTypes] = {"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"};
+ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble,
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+  ncclBfloat16
+#endif
+};
+const char *test_typenames[ncclNumTypes] = {"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double",
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+  "bfloat16"
+#endif
+};
 #else
 ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64};
 const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"};
@@ -61,6 +69,9 @@ double parsesize(char *value) {
 double DeltaMaxValue(ncclDataType_t type) {
   switch(type) {
     case ncclHalf: return 1e-2;
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+    case ncclBfloat16: return 1e-2;
+#endif
     case ncclFloat: return 1e-5;
     case ncclDouble: return 1e-12;
     case ncclInt:
@@ -95,6 +106,12 @@ template<> __device__
 float toFloat(half a) {
   return __half2float(a);
 }
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+template<> __device__
+float toFloat(__nv_bfloat16 a) {
+  return __bfloat162float(a);
+}
+#endif
 
 template<typename T, int BSIZE> __global__
 void deltaKern(void* A_, void* B_, size_t count, double* max) {
@@ -128,6 +145,10 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
 
 testResult_t CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax) {
   switch (type) {
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+    case ncclBfloat16:
+      deltaKern<__nv_bfloat16, 512><<<1, 512>>>(results, expected, count, devmax); break;
+#endif
     case ncclHalf:
       deltaKern<half, 512><<<1, 512>>>(results, expected, count, devmax); break;
     case ncclFloat:
       deltaKern<float, 512><<<1, 512>>>(results, expected, count, devmax); break;
@@ -174,6 +195,12 @@ template<>
 __device__ half testValue<half>(const size_t offset, const int rep, const int rank) {
   return __float2half(testValue<float>(offset, rep, rank));
 }
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+template<>
+__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, const int rep, const int rank) {
+  return __float2bfloat16(testValue<float>(offset, rep, rank));
+}
+#endif
 
 // Operations
 template<typename T>
@@ -210,7 +237,10 @@ __global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offse
 #define OPS(type) KERN(type, ncclOpSum), KERN(type, ncclOpProd), KERN(type, ncclOpMax), KERN(type, ncclOpMin)
 
 static void* const redInitDataKerns[ncclNumOps*ncclNumTypes] = {
-  OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double)
+  OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double),
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+  OPS(__nv_bfloat16)
+#endif
 };
 
 testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) {
@@ -236,7 +266,10 @@ static void* const initDataKerns[ncclNumTypes] = {
   (void*)InitDataKernel<uint64_t>,
   (void*)InitDataKernel<  half>,
   (void*)InitDataKernel< float>,
-  (void*)InitDataKernel< double>
+  (void*)InitDataKernel< double>,
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+  (void*)InitDataKernel<__nv_bfloat16>,
+#endif
 };
 template<typename T>
diff --git a/src/common.h b/src/common.h
index 865ee25..c869254 100644
--- a/src/common.h
+++ b/src/common.h
@@ -213,6 +213,9 @@ static size_t wordSize(ncclDataType_t type) {
 #endif
       return 1;
     case ncclHalf:
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+    case ncclBfloat16:
+#endif
     //case ncclFloat16:
       return 2;
     case ncclInt:
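
Why the bfloat16 tolerance in DeltaMaxValue matches half's 1e-2: __nv_bfloat16 keeps float's 8-bit exponent but carries only 7 mantissa bits, so one unit in the last place is roughly 2^-8 of the value. The standalone sketch below is not part of the patch — it assumes a CUDA toolkit whose cuda_bf16.h defines __CUDA_BF16_TYPES_EXIST__ and host-callable conversion intrinsics — and shows the round-trip error the test must tolerate:

    #include <cstdio>
    #include <cuda_bf16.h>

    int main() {
    #if defined(__CUDA_BF16_TYPES_EXIST__)
      float x = 3.14159265f;
      // Round-trip through bfloat16: only 7 mantissa bits survive.
      float y = __bfloat162float(__float2bfloat16(x));
      printf("x=%.8f y=%.8f rel.err=%.2e\n", x, y, (x - y) / x);
    #endif
      return 0;
    }

A relative error around a few 1e-3 is expected, which is why the per-element delta bound is 1e-2 rather than float's 1e-5.
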
From cde7e769c1879a77daddebe9da164513e030105b Mon Sep 17 00:00:00 2001
From: David Addison
Date: Thu, 17 Jun 2021 14:08:43 -0700
Subject: [PATCH 2/7] Add support for ncclAvg operation

---
 src/all_gather.cu     |  2 +-
 src/all_reduce.cu     |  4 +-
 src/alltoall.cu       |  2 +-
 src/broadcast.cu      |  2 +-
 src/common.cu         | 92 ++++++++++++++++++++++++++++++++++++-------
 src/common.h          |  5 ++-
 src/reduce.cu         |  4 +-
 src/reduce_scatter.cu |  4 +-
 8 files changed, 90 insertions(+), 25 deletions(-)

diff --git a/src/all_gather.cu b/src/all_gather.cu
index f5bc44c..ee1d0ea 100644
--- a/src/all_gather.cu
+++ b/src/all_gather.cu
@@ -84,7 +84,7 @@ testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t
     run_types = &type;
     run_typenames = &typeName;
   } else {
-    type_count = ncclNumTypes;
+    type_count = test_typenum;
     run_types = test_types;
     run_typenames = test_typenames;
   }
diff --git a/src/all_reduce.cu b/src/all_reduce.cu
index bd8daaf..52dce89 100644
--- a/src/all_reduce.cu
+++ b/src/all_reduce.cu
@@ -83,7 +83,7 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t
     run_types = &type;
     run_typenames = &typeName;
   } else {
-    type_count = ncclNumTypes;
+    type_count = test_typenum;
     run_types = test_types;
     run_typenames = test_typenames;
   }
@@ -93,7 +93,7 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t
     run_ops = &op;
     run_opnames = &opName;
   } else {
-    op_count = ncclNumOps;
+    op_count = test_opnum;
     run_ops = test_ops;
     run_opnames = test_opnames;
   }
diff --git a/src/alltoall.cu b/src/alltoall.cu
index 31cfca0..4afd3eb 100644
--- a/src/alltoall.cu
+++ b/src/alltoall.cu
@@ -102,7 +102,7 @@ testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t t
     run_types = &type;
     run_typenames = &typeName;
   } else {
-    type_count = ncclNumTypes;
+    type_count = test_typenum;
     run_types = test_types;
     run_typenames = test_typenames;
   }
diff --git a/src/broadcast.cu b/src/broadcast.cu
index c62a99f..f7c0094 100644
--- a/src/broadcast.cu
+++ b/src/broadcast.cu
@@ -92,7 +92,7 @@ testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t
     run_types = &type;
     run_typenames = &typeName;
   } else {
-    type_count = ncclNumTypes;
+    type_count = test_typenum;
     run_types = test_types;
     run_typenames = test_typenames;
   }
diff --git a/src/common.cu b/src/common.cu
index 4589593..1313079 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -11,23 +11,41 @@
 #include <libgen.h>
 #include "cuda.h"
 
+int test_ncclVersion = 0; // init'd with ncclGetVersion()
+
 #if NCCL_MAJOR >= 2
 ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble,
-#if defined(__CUDA_BF16_TYPES_EXIST__)
+#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
   ncclBfloat16
 #endif
 };
 const char *test_typenames[ncclNumTypes] = {"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double",
-#if defined(__CUDA_BF16_TYPES_EXIST__)
+#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
   "bfloat16"
 #endif
 };
+
+#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+int test_typenum = 10;
+#else
+int test_typenum = 9;
+#endif
+
 #else
 ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64};
 const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"};
+int test_typenum = 7;
 #endif
 
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+ncclRedOp_t test_ops[ncclNumOps] = {ncclSum, ncclProd, ncclMax, ncclMin, ncclAvg};
+const char *test_opnames[ncclNumOps] = {"sum", "prod", "max", "min", "avg"};
+int test_opnum = 5;
+#else
 ncclRedOp_t test_ops[ncclNumOps] = {ncclSum, ncclProd, ncclMax, ncclMin};
 const char *test_opnames[ncclNumOps] = {"sum", "prod", "max", "min"};
+int test_opnum = 4;
+#endif
 
 thread_local int is_main_thread = 0;
@@ -126,7 +144,7 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
       if( delta > locmax ) {
         locmax = delta;
 #ifdef DEBUG_PRINT
-        if (delta > .1) printf("Error at %d/%ld : %f != %f\n", i, count, toFloat(A[i]), toFloat(B[i]));
+        if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i]));
 #endif
       }
     }
@@ -222,23 +240,48 @@ __device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(
 template<>
 __device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? a : b; }
 
-template<typename T, T (*Op)(T, T)>
+template<typename T>
+__device__ T ncclPostOpIdent(T x, int n) { return x; }
+
+template<typename T>
+__device__ T ncclPostOpDiv(T x, int n) { return x/n; }
+template<>
+__device__ half ncclPostOpDiv(half x, int n) { return __float2half(__half2float(x)/n); }
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+template<>
+__device__ __nv_bfloat16 ncclPostOpDiv<__nv_bfloat16>(__nv_bfloat16 x, int n) { return __float2bfloat16(__bfloat162float(x)/n); }
+#endif
+
+template<typename T, T (*Op)(T, T), T (*PostOp)(T, int)>
 __global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) {
   for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o<N; o+=gridDim.x*blockDim.x) {
     T val = testValue<T>(o+offset, rep, 0);
     for (int i=1; i<nranks; i++) {
       val = Op(val, testValue<T>(o+offset, rep, i));
     }
-    data[o] = val;
+    data[o] = PostOp(val, nranks);
   }
 }
 
-#define KERN(type, op) (void*)InitDataReduceKernel<type, op<type> >
+#define KERN(type, op, postop) (void*)InitDataReduceKernel<type, op<type>, postop<type> >
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+  #define OPS(type) \
+    KERN(type, ncclOpSum, ncclPostOpIdent), \
+    KERN(type, ncclOpProd, ncclPostOpIdent), \
+    KERN(type, ncclOpMax, ncclPostOpIdent), \
+    KERN(type, ncclOpMin, ncclPostOpIdent), \
+    KERN(type, ncclOpSum/*Avg*/, ncclPostOpDiv)
+#else
+  #define OPS(type) \
+    KERN(type, ncclOpSum, ncclPostOpIdent), \
+    KERN(type, ncclOpProd, ncclPostOpIdent), \
+    KERN(type, ncclOpMax, ncclPostOpIdent), \
+    KERN(type, ncclOpMin, ncclPostOpIdent)
+#endif
 
 static void* const redInitDataKerns[ncclNumOps*ncclNumTypes] = {
   OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double),
-#if defined(__CUDA_BF16_TYPES_EXIST__)
+#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
   OPS(__nv_bfloat16)
 #endif
 };
@@ -267,7 +310,7 @@ static void* const initDataKerns[ncclNumTypes] = {
   (void*)InitDataKernel<uint64_t>,
   (void*)InitDataKernel<  half>,
   (void*)InitDataKernel< float>,
   (void*)InitDataKernel< double>,
-#if defined(__CUDA_BF16_TYPES_EXIST__)
+#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
   (void*)InitDataKernel<__nv_bfloat16>,
 #endif
 };
@@ -367,7 +410,7 @@ testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t*
     if (cudaErr != cudaErrorNotReady) CUDACHECK(cudaErr);
 
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
-    if (comms) {
+    if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) {
       ncclResult_t ncclAsyncErr;
       NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr));
       if (ncclAsyncErr != ncclSuccess) {
@@ -602,6 +645,17 @@ int main(int argc, char* argv[]) {
   // Make sure every line is flushed so that we see the progress of the test
   setlinebuf(stdout);
 
+  #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
+  ncclGetVersion(&test_ncclVersion);
+  #else
+  test_ncclVersion = NCCL_VERSION_CODE;
+  #endif
+  //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion);
+  if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion < NCCL_VERSION(2,10,0)) {
+    test_opnum -= 1;  // exclude ncclAvg
+    test_typenum -= 1; // exclude bfloat16
+  }
+
   // Parse args
   int longindex;
   static struct option longopts[] = {
@@ -653,7 +707,7 @@ int main(int argc, char* argv[]) {
         iters = (int)strtol(optarg, NULL, 0);
         break;
       case 'm':
-#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2
+#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2)
         agg_iters = (int)strtol(optarg, NULL, 0);
 #else
         printf("Option -m not supported before NCCL 2.2. Ignoring\n");
@@ -693,7 +747,11 @@ int main(int argc, char* argv[]) {
             "[-w,--warmup_iters <warmup iteration count>] \n\t"
             "[-p,--parallel_init <0/1>] \n\t"
            "[-c,--check <0/1>] \n\t"
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+            "[-o,--op <sum/prod/min/max/avg/all>] \n\t"
+#else
            "[-o,--op <sum/prod/min/max/all>] \n\t"
+#endif
            "[-d,--datatype <nccltype/all>] \n\t"
            "[-r,--root <root>] \n\t"
            "[-z,--blocking <0/1>] \n\t"
            "[-h,--help]\n",
            basename(argv[0]));
        return 0;
@@ -701,8 +759,8 @@ int main(int argc, char* argv[]) {
       default:
-        printf("invalid option \n");
-        printf("USAGE: %s \n\t"
+        if (c != 'h') printf("invalid option '%c'\n", c);
+        printf("USAGE: %s \n\t"
            "[-t,--nthreads <num threads>] \n\t"
            "[-g,--ngpus <gpus per thread>] \n\t"
            "[-b,--minbytes <min size in bytes>] \n\t"
            "[-e,--maxbytes <max size in bytes>] \n\t"
            "[-i,--stepbytes <increment size>] \n\t"
            "[-f,--stepfactor <increment factor>] \n\t"
            "[-n,--iters <iteration count>] \n\t"
            "[-m,--agg_iters <aggregated iteration count>] \n\t"
            "[-w,--warmup_iters <warmup iteration count>] \n\t"
            "[-p,--parallel_init <0/1>] \n\t"
            "[-c,--check <0/1>] \n\t"
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+            "[-o,--op <sum/prod/min/max/avg/all>] \n\t"
+#else
            "[-o,--op <sum/prod/min/max/all>] \n\t"
+#endif
            "[-d,--datatype <nccltype/all>] \n\t"
            "[-r,--root <root>] \n\t"
            "[-z,--blocking <0/1>] \n\t"
            "[-h,--help]\n",
            basename(argv[0]));
        return 0;
@@ -899,8 +961,8 @@ testResult_t run() {
   // Free off CUDA allocated memory
   for (int i=0; i
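
The ncclAvg verification above deliberately avoids a dedicated reduction kernel: the expected buffer is computed with the existing ncclOpSum accumulator and a post-op divide by nranks (KERN(type, ncclOpSum/*Avg*/, ncclPostOpDiv)), since an average is just a sum followed by a uniform scale. A host-side restatement of that decomposition, with hypothetical helper names (the test's real kernels are the InitDataReduceKernel family), runnable as plain C++:

    #include <cstdio>

    template <typename T> T opSum(T a, T b) { return a + b; }
    template <typename T> T postIdent(T x, int) { return x; }
    template <typename T> T postDiv(T x, int n) { return x / n; }

    // Mirror of InitDataReduceKernel's shape: fold Op across ranks,
    // then apply PostOp exactly once with nranks.
    template <typename T, T (*Op)(T, T), T (*PostOp)(T, int)>
    T reduceRanks(const T* v, int nranks) {
      T acc = v[0];
      for (int i = 1; i < nranks; i++) acc = Op(acc, v[i]);
      return PostOp(acc, nranks);
    }

    int main() {
      float v[4] = {1.f, 2.f, 3.f, 4.f};
      printf("sum=%g avg=%g\n",
             reduceRanks<float, opSum<float>, postIdent<float>>(v, 4),  // 10
             reduceRanks<float, opSum<float>, postDiv<float>>(v, 4));   // 2.5
      return 0;
    }

The half and bfloat16 specializations of ncclPostOpDiv route through float because those types have no native division operator on all toolchains.
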
"error"); - PRINT("# %10s %12s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", + PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); } void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %6s", size, count, typeName); + PRINT("%12li %12li %8s", size, count, typeName); } void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { diff --git a/src/all_reduce.cu b/src/all_reduce.cu index 52dce89..9b6b7f0 100644 --- a/src/all_reduce.cu +++ b/src/all_reduce.cu @@ -8,15 +8,15 @@ #include "common.h" void print_header() { - PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); } void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %6s %6s", size, count, typeName, opName); + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); } void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { diff --git a/src/alltoall.cu b/src/alltoall.cu index 4afd3eb..8650997 100644 --- a/src/alltoall.cu +++ b/src/alltoall.cu @@ -8,15 +8,15 @@ #include "common.h" void print_header() { - PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); } void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %6s %6s", size, count, typeName, opName); + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); } void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { diff --git a/src/broadcast.cu b/src/broadcast.cu index f7c0094..e2b4421 100644 --- a/src/broadcast.cu +++ b/src/broadcast.cu @@ -8,15 +8,15 @@ #include "common.h" void print_header() { - PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root", + PRINT("# %10s %12s %8s %6s out-of-place in-place 
\n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root", "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); } void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %6s %6i", size, count, typeName, root); + PRINT("%12li %12li %8s %6i", size, count, typeName, root); } void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { diff --git a/src/reduce.cu b/src/reduce.cu index e40b501..2787688 100644 --- a/src/reduce.cu +++ b/src/reduce.cu @@ -8,15 +8,15 @@ #include "common.h" void print_header() { - PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %6s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "root", + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "root", "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %6s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "", + PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "", "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); } void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %6s %6s %6i", size, count, typeName, opName, root); + PRINT("%12li %12li %8s %6s %6i", size, count, typeName, opName, root); } void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu index c6de434..b0c4fab 100644 --- a/src/reduce_scatter.cu +++ b/src/reduce_scatter.cu @@ -8,15 +8,15 @@ #include "common.h" void print_header() { - PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); } void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %6s %6s", size, count, typeName, opName); + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); } void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { From e55ad3796d710adcf72778dca02559dc6c9706bb Mon Sep 17 00:00:00 2001 From: David Addison Date: Mon, 28 Jun 2021 
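
The width bump from %6s to %8s exists because printf field widths are minimums, not limits: "bfloat16" is eight characters, so it overflowed the old six-character type column and pushed every later column out of alignment. A two-line demonstration (illustrative only, not from the patch):

    #include <cstdio>

    int main() {
      // %6s cannot truncate: the 8-char "bfloat16" overflows the column.
      printf("[%6s][%6s]\n", "half", "bfloat16");   // columns shift
      printf("[%8s][%8s]\n", "half", "bfloat16");   // columns stay aligned
      return 0;
    }
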
From e55ad3796d710adcf72778dca02559dc6c9706bb Mon Sep 17 00:00:00 2001
From: David Addison
Date: Mon, 28 Jun 2021 14:19:45 -0700
Subject: [PATCH 4/7] Added support for CUDA graph capture/replay (-G)

---
 src/common.cu | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 81 insertions(+), 1 deletion(-)

diff --git a/src/common.cu b/src/common.cu
index 1313079..c180294 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -65,6 +65,7 @@ static int nccltype = ncclFloat;
 static int ncclroot = 0;
 static int parallel_init = 0;
 static int blocking_coll = 0;
+static int cudaGraphLaunches = 0;
 
 double parsesize(char *value) {
   long long int units;
@@ -481,6 +482,15 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 
   Barrier(args);
 
+  cudaGraph_t graphs[args->nGpus];
+  cudaGraphExec_t graphExec[args->nGpus];
+  if (cudaGraphLaunches >= 1) {
+    // Begin cuda graph capture
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal));
+    }
+  }
+
   // Performance Benchmark
   auto start = std::chrono::high_resolution_clock::now();
   for (int iter = 0; iter < iters; iter++) {
@@ -490,11 +500,40 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     }
     if (agg_iters>1) NCCLCHECK(ncclGroupEnd());
   }
+
+  if (cudaGraphLaunches >= 1) {
+    // End cuda graph capture
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
+    }
+    // Instantiate cuda graph
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
+    }
+    // Resync CPU, restart timing, launch cuda graph
+    Barrier(args);
+    start = std::chrono::high_resolution_clock::now();
+    for (int l=0; l<cudaGraphLaunches; l++) {
+      for (int i=0; i<args->nGpus; i++) {
+        CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
+      }
+    }
+  }
+
   TESTCHECK(completeColl(args));
 
   auto delta = std::chrono::high_resolution_clock::now() - start;
   double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
   deltaSec = deltaSec/(iters*agg_iters);
+  if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches;
+
+  if (cudaGraphLaunches >= 1) {
+    //destroy cuda graph
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaGraphExecDestroy(graphExec[i]));
+      CUDACHECK(cudaGraphDestroy(graphs[i]));
+    }
+  }
 
   double algBw, busBw;
   args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus);
@@ -508,10 +547,41 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   // Initialize sendbuffs, recvbuffs and expected
   TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place));
 
+  if (cudaGraphLaunches >= 1) {
+    // Begin cuda graph capture for data check
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal));
+    }
+  }
+
   //test validation in single iteration, should ideally be included into the multi-iteration run
   TESTCHECK(startColl(args, type, op, root, in_place, 0));
+
+  if (cudaGraphLaunches >= 1) {
+    // End cuda graph capture
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
+    }
+    // Instantiate cuda graph
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
+    }
+    // Launch cuda graph
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
+    }
+  }
+
   TESTCHECK(completeColl(args));
 
+  if (cudaGraphLaunches >= 1) {
+    //destroy cuda graph
+    for (int i=0; i<args->nGpus; i++) {
+      CUDACHECK(cudaGraphExecDestroy(graphExec[i]));
+      CUDACHECK(cudaGraphDestroy(graphs[i]));
+    }
+  }
+
   TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta));
 
   //aggregate delta from all threads and procs
@@ -674,12 +744,13 @@ int main(int argc, char* argv[]) {
     {"datatype", required_argument, 0, 'd'},
     {"root", required_argument, 0, 'r'},
     {"blocking", required_argument, 0, 'z'},
+    {"cudagraph", required_argument, 0, 'G'},
     {"help", no_argument, 0, 'h'}
   };
 
   while(1) {
     int c;
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:h", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:", longopts, &longindex);
 
     if (c == -1)
       break;
@@ -734,6 +805,13 @@ int main(int argc, char* argv[]) {
       case 'z':
         blocking_coll = strtol(optarg, NULL, 0);
         break;
+      case 'G':
+#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030
+        cudaGraphLaunches = strtol(optarg, NULL, 0);
+#else
+        printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n");
+#endif
+        break;
       case 'h':
        printf("USAGE: %s \n\t"
            "[-t,--nthreads <num threads>] \n\t"
            "[-g,--ngpus <gpus per thread>] \n\t"
            "[-b,--minbytes <min size in bytes>] \n\t"
            "[-e,--maxbytes <max size in bytes>] \n\t"
            "[-i,--stepbytes <increment size>] \n\t"
            "[-f,--stepfactor <increment factor>] \n\t"
            "[-n,--iters <iteration count>] \n\t"
            "[-m,--agg_iters <aggregated iteration count>] \n\t"
            "[-w,--warmup_iters <warmup iteration count>] \n\t"
            "[-p,--parallel_init <0/1>] \n\t"
            "[-c,--check <0/1>] \n\t"
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
            "[-o,--op <sum/prod/min/max/avg/all>] \n\t"
#else
            "[-o,--op <sum/prod/min/max/all>] \n\t"
#endif
            "[-d,--datatype <nccltype/all>] \n\t"
            "[-r,--root <root>] \n\t"
            "[-z,--blocking <0/1>] \n\t"
+            "[-G,--cudagraph <num graph launches>] \n\t"
            "[-h,--help]\n",
            basename(argv[0]));
        return 0;
       default:
        if (c != 'h') printf("invalid option '%c'\n", c);
        printf("USAGE: %s \n\t"
            "[-t,--nthreads <num threads>] \n\t"
            "[-g,--ngpus <gpus per thread>] \n\t"
            "[-b,--minbytes <min size in bytes>] \n\t"
            "[-e,--maxbytes <max size in bytes>] \n\t"
            "[-i,--stepbytes <increment size>] \n\t"
            "[-f,--stepfactor <increment factor>] \n\t"
            "[-n,--iters <iteration count>] \n\t"
            "[-m,--agg_iters <aggregated iteration count>] \n\t"
            "[-w,--warmup_iters <warmup iteration count>] \n\t"
            "[-p,--parallel_init <0/1>] \n\t"
            "[-c,--check <0/1>] \n\t"
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
            "[-o,--op <sum/prod/min/max/avg/all>] \n\t"
#else
            "[-o,--op <sum/prod/min/max/all>] \n\t"
#endif
            "[-d,--datatype <nccltype/all>] \n\t"
            "[-r,--root <root>] \n\t"
            "[-z,--blocking <0/1>] \n\t"
+            "[-G,--cudagraph <num graph launches>] \n\t"
            "[-h,--help]\n",
            basename(argv[0]));
        return 0;
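
The -G path above follows CUDA's standard capture/replay sequence: record the work enqueued on a stream into a cudaGraph_t, instantiate it once into a cudaGraphExec_t, then relaunch the executable graph with far lower per-launch overhead. A minimal self-contained version of that sequence — a trivial kernel stands in for the NCCL collective, and error checking is trimmed for brevity:

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void inc(int* x) { ++*x; }

    int main() {
      int* x;
      cudaMallocManaged(&x, sizeof(int));
      *x = 0;
      cudaStream_t s;
      cudaStreamCreate(&s);

      cudaGraph_t graph;
      cudaGraphExec_t exec;
      cudaStreamBeginCapture(s, cudaStreamCaptureModeGlobal);
      inc<<<1, 1, 0, s>>>(x);                 // stand-in for the collective
      cudaStreamEndCapture(s, &graph);
      cudaGraphInstantiate(&exec, graph, NULL, NULL, 0);

      for (int l = 0; l < 10; l++)            // replay without re-recording
        cudaGraphLaunch(exec, s);
      cudaStreamSynchronize(s);
      printf("x=%d\n", *x);                   // prints x=10

      cudaGraphExecDestroy(exec);
      cudaGraphDestroy(graph);
      cudaStreamDestroy(s);
      cudaFree(x);
      return 0;
    }

Note why the patch also divides deltaSec by cudaGraphLaunches: each graph launch replays the entire captured iters*agg_iters loop, so the total timed work is iters*agg_iters*cudaGraphLaunches collectives.
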
From 9dae3d3a37a7505a9eb0622be4268e2d2a3cb5f9 Mon Sep 17 00:00:00 2001
From: David Addison
Date: Mon, 28 Jun 2021 16:49:10 -0700
Subject: [PATCH 5/7] Added new tests: scatter, sendrecv, hypercube

---
 src/Makefile     |   4 +-
 src/hypercube.cu | 124 +++++++++++++++++++++++++++++++++++++++++++++
 src/scatter.cu   | 125 ++++++++++++++++++++++++++++++++++++++++++++++
 src/sendrecv.cu  | 127 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 378 insertions(+), 2 deletions(-)
 create mode 100644 src/hypercube.cu
 create mode 100644 src/scatter.cu
 create mode 100644 src/sendrecv.cu

diff --git a/src/Makefile b/src/Makefile
index 52169bb..26e653e 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -70,7 +70,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%)
 DST_DIR := $(BUILDDIR)
 SRC_FILES := $(wildcard *.cu)
 OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
-BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall
+BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter sendrecv hypercube
 BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
 
 build: ${BIN_FILES}

diff --git a/src/hypercube.cu b/src/hypercube.cu
new file mode 100644
index 0000000..142f1a6
--- /dev/null
+++ b/src/hypercube.cu
@@ -0,0 +1,124 @@
+/*************************************************************************
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+
+#define ALIGN 4
+
+void print_header() {
+  PRINT("# %10s %12s %8s            out-of-place                       in-place          \n", "", "", "");
+  PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type",
+      "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
+  PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "",
+      "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
+}
+
+void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
+  PRINT("%12li %12li %8s", size, count, typeName);
+}
+
+void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+  size_t base = (count/(ALIGN*nranks))*ALIGN;
+  *sendcount = base;
+  *recvcount = base*nranks;
+  *sendInplaceOffset = base;
+  *recvInplaceOffset = 0;
+  *paramcount = base;
+}
+
+testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
+  int nranks = args->nProcs*args->nThreads*args->nGpus;
+
+  for (int i=0; i<args->nGpus; i++) {
+    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    CUDACHECK(cudaSetDevice(gpuid));
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
+    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    for (int j=0; j<nranks; j++) {
+      TESTCHECK(InitData(((char*)args->expected[i])+args->sendBytes*j, sendcount, type, rep, j));
+    }
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+  return testSuccess;
+}
+
+void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec;
+
+  *algBw = baseBw;
+  double factor = 1;
+  *busBw = baseBw * factor;
+}
+
+testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+  char* sbuff = (char*)sendbuff;
+  char* rbuff = (char*)recvbuff;
+  int nRanks;
+  NCCLCHECK(ncclCommCount(comm, &nRanks));
+  int rank;
+  NCCLCHECK(ncclCommUserRank(comm, &rank));
+  size_t rankSize = count * wordSize(type);
+
+  if (rbuff+rank*rankSize != sbuff) CUDACHECK(cudaMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, cudaMemcpyDeviceToDevice, stream));
+
+  // Hypercube AllGather
+  for (int mask=1; mask<nRanks; mask<<=1) {
+    int peer = rank ^ mask;
+    NCCLCHECK(ncclGroupStart());
+    NCCLCHECK(ncclSend(rbuff+((rank/mask)*mask)*rankSize, mask*count, type, peer, comm, stream));
+    NCCLCHECK(ncclRecv(rbuff+((peer/mask)*mask)*rankSize, mask*count, type, peer, comm, stream));
+    NCCLCHECK(ncclGroupEnd());
+  }
+  return testSuccess;
+}
+
+struct testColl hyperCubeTest = {
+  "HyperCube",
+  HyperCubeGetCollByteCount,
+  HyperCubeInitData,
+  HyperCubeGetBw,
+  HyperCubeRunColl
+};
+
+void HyperCubeGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
+  HyperCubeGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+}
+
+testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &hyperCubeTest;
+  ncclDataType_t *run_types;
+  const char **run_typenames;
+  int type_count;
+
+  if ((int)type != -1) {
+    type_count = 1;
+    run_types = &type;
+    run_typenames = &typeName;
+  } else {
+    type_count = test_typenum;
+    run_types = test_types;
+    run_typenames = test_typenames;
+  }
+
+  for (int i=0; i<type_count; i++) {
+    TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
+  }
+  return testSuccess;
+}
+
+struct testEngine hyperCubeEngine = {
+  HyperCubeGetBuffSize,
+  HyperCubeRunTest
+};
+
+#pragma weak ncclTestEngine=hyperCubeEngine

diff --git a/src/scatter.cu b/src/scatter.cu
new file mode 100644
--- /dev/null
+++ b/src/scatter.cu
@@ -0,0 +1,125 @@
+/*************************************************************************
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+
+void print_header() {
+  PRINT("# %10s %12s %8s %6s            out-of-place                       in-place          \n", "", "", "", "");
+  PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root",
+      "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
+  PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
+      "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
+}
+
+void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
+  PRINT("%12li %12li %8s %6i", size, count, typeName, root);
+}
+
+void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+  size_t base = count/nranks;
+  *sendcount = base*nranks;
+  *recvcount = base;
+  *sendInplaceOffset = 0;
+  *recvInplaceOffset = base;
+  *paramcount = base;
+}
+
+testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
+
+  for (int i=0; i<args->nGpus; i++) {
+    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    CUDACHECK(cudaSetDevice(gpuid));
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
+    if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    TESTCHECK(InitData(args->expected[i], recvcount, type, rep+rank*recvcount, root));
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+  return testSuccess;
+}
+
+void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec;
+
+  *algBw = baseBw;
+  double factor = ((double)(nranks-1))/((double)(nranks));
+  *busBw = baseBw * factor;
+}
+
+testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+  int nRanks;
+  NCCLCHECK(ncclCommCount(comm, &nRanks));
+  int rank;
+  NCCLCHECK(ncclCommUserRank(comm, &rank));
+  size_t rankOffset = count * wordSize(type);
+  if (count == 0) return testSuccess;
+
+  NCCLCHECK(ncclGroupStart());
+  if (rank == root) {
+    for (int r=0; r<nRanks; r++)
+      NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, count, type, r, comm, stream));
+  }
+  NCCLCHECK(ncclRecv(recvbuff, count, type, root, comm, stream));
+  NCCLCHECK(ncclGroupEnd());
+  return testSuccess;
+}
+
+struct testColl scatterTest = {
+  "Scatter",
+  ScatterGetCollByteCount,
+  ScatterInitData,
+  ScatterGetBw,
+  ScatterRunColl
+};
+
+void ScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
+  ScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+}
+
+testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &scatterTest;
+  ncclDataType_t *run_types;
+  const char **run_typenames;
+  int type_count;
+  int begin_root, end_root;
+
+  if ((int)type != -1) {
+    type_count = 1;
+    run_types = &type;
+    run_typenames = &typeName;
+  } else {
+    type_count = test_typenum;
+    run_types = test_types;
+    run_typenames = test_typenames;
+  }
+
+  if (root != -1) {
+    begin_root = end_root = root;
+  } else {
+    begin_root = 0;
+    end_root = args->nProcs*args->nThreads*args->nGpus-1;
+  }
+
+  for (int i=0; i<type_count; i++) {
+    for (int j=begin_root; j<=end_root; j++) {
+      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", j));
+    }
+  }
+  return testSuccess;
+}
+
+struct testEngine scatterEngine = {
+  ScatterGetBuffSize,
+  ScatterRunTest
+};
+
+#pragma weak ncclTestEngine=scatterEngine

diff --git a/src/sendrecv.cu b/src/sendrecv.cu
new file mode 100644
--- /dev/null
+++ b/src/sendrecv.cu
@@ -0,0 +1,127 @@
+/*************************************************************************
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+
+void print_header() {
+  PRINT("# %10s %12s %8s %6s            out-of-place                       in-place          \n", "", "", "", "");
+  PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
+      "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
+  PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
+      "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
+}
+
+void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
+  PRINT("%12li %12li %8s %6s", size, count, typeName, opName);
+}
+
+void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+  *sendcount = count;
+  *recvcount = count;
+  *sendInplaceOffset = 0;
+  *recvInplaceOffset = 0;
+  *paramcount = count;
+}
+
+testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
+  int nranks = args->nProcs*args->nThreads*args->nGpus;
+
+  for (int i=0; i<args->nGpus; i++) {
+    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    CUDACHECK(cudaSetDevice(gpuid));
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
+    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    int peer = (rank-1+nranks)%nranks;
+    TESTCHECK(InitData(args->expected[i], recvcount, type, rep, peer));
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+  // We don't support in-place sendrecv
+  args->reportErrors = in_place ? 0 : 1;
+  return testSuccess;
+}
+
+void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  double baseBw = (double)(count * typesize) / 1.0E9 / sec;
+
+  *algBw = baseBw;
+  double factor = 1;
+  *busBw = baseBw * factor;
+}
+
+testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+  int nRanks;
+  NCCLCHECK(ncclCommCount(comm, &nRanks));
+  int rank;
+  NCCLCHECK(ncclCommUserRank(comm, &rank));
+  int recvPeer = (rank-1+nRanks) % nRanks;
+  int sendPeer = (rank+1) % nRanks;
+
+  NCCLCHECK(ncclGroupStart());
+  NCCLCHECK(ncclSend(sendbuff, count, type, sendPeer, comm, stream));
+  NCCLCHECK(ncclRecv(recvbuff, count, type, recvPeer, comm, stream));
+  NCCLCHECK(ncclGroupEnd());
+  return testSuccess;
+}
+
+struct testColl sendRecvTest = {
+  "SendRecv",
+  SendRecvGetCollByteCount,
+  SendRecvInitData,
+  SendRecvGetBw,
+  SendRecvRunColl
+};
+
+void SendRecvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
+  SendRecvGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+}
+
+testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &sendRecvTest;
+  ncclDataType_t *run_types;
+  ncclRedOp_t *run_ops;
+  const char **run_typenames, **run_opnames;
+  int type_count, op_count;
+
+  if ((int)type != -1) {
+    type_count = 1;
+    run_types = &type;
+    run_typenames = &typeName;
+  } else {
+    type_count = test_typenum;
+    run_types = test_types;
+    run_typenames = test_typenames;
+  }
+
+  if ((int)op != -1) {
+    op_count = 1;
+    run_ops = &op;
+    run_opnames = &opName;
+  } else {
+    op_count = test_opnum;
+    run_ops = test_ops;
+    run_opnames = test_opnames;
+  }
+
+  for (int i=0; i<type_count; i++) {
+    for (int j=0; j<op_count; j++) {
+      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
+    }
+  }
+  return testSuccess;
+}
+
+struct testEngine sendRecvEngine = {
+  SendRecvGetBuffSize,
+  SendRecvRunTest
+};
+
+#pragma weak ncclTestEngine=sendRecvEngine
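
HyperCubeRunColl performs an allgather by recursive doubling: assuming the standard hypercube schedule, at step mask = 1, 2, 4, ... each rank exchanges its accumulated mask-sized block with partner rank ^ mask, so nRanks must be a power of two and the exchange completes in log2(nRanks) steps. This little table generator (illustrative, host-only) prints the peer and block schedule that such a loop walks:

    #include <cstdio>

    int main() {
      const int nranks = 8;  // hypercube pattern requires a power of two
      for (int mask = 1; mask < nranks; mask <<= 1)
        for (int rank = 0; rank < nranks; rank++)
          printf("step mask=%d: rank %d <-> rank %d (owns blocks [%d..%d])\n",
                 mask, rank, rank ^ mask,
                 (rank / mask) * mask, (rank / mask) * mask + mask - 1);
      return 0;
    }

After the final step every rank owns all nranks blocks, which is why HyperCubeGetBw counts count * typesize * (nranks - 1) bytes moved per rank, the same volume as a ring allgather.
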
From: David Addison
Date: Mon, 28 Jun 2021 18:23:12 -0700
Subject: [PATCH 6/7] Resync with changes in gitlab-master code

---
 src/common.cu | 81 +++++++++++++++++++++++----------------------------
 src/common.h  |  5 ++--
 2 files changed, 40 insertions(+), 46 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index c180294..7aad2c1 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -67,6 +67,8 @@ static int parallel_init = 0;
 static int blocking_coll = 0;
 static int cudaGraphLaunches = 0;
 
+#define NUM_BLOCKS 32
+
 double parsesize(char *value) {
   long long int units;
   double size;
@@ -137,9 +139,9 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
   const T* A = (const T*)A_;
   const T* B = (const T*)B_;
   __shared__ double temp[BSIZE];
-  int tid = threadIdx.x;
+  int tid = blockIdx.x*blockDim.x + threadIdx.x;
   double locmax = 0.0;
-  for(int i=tid; i<count; i+=BSIZE) {
+  for(size_t i=tid; i<count; i+=blockDim.x*gridDim.x) {
 
     double delta = fabs((double)(A[i] - B[i]));
     if( delta > locmax ) {
@@ -150,6 +152,7 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
     }
   }
 
+  tid = threadIdx.x;
   temp[tid] = locmax;
   for(int stride = BSIZE/2; stride > 1; stride>>=1) {
     __syncthreads();
@@ -158,38 +161,38 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
   }
   __syncthreads();
   if( threadIdx.x == 0)
-    *max = temp[0] > temp[1] ? temp[0] : temp[1];
+    max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1];
 }
-
-testResult_t CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax) {
+testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) {
   switch (type) {
 #if defined(__CUDA_BF16_TYPES_EXIST__)
     case ncclBfloat16:
-      deltaKern<__nv_bfloat16, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<__nv_bfloat16, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
 #endif
     case ncclHalf:
-      deltaKern<half, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<half, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
     case ncclFloat:
-      deltaKern<float, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<float, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
     case ncclDouble:
-      deltaKern<double, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<double, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
     case ncclChar:
 #if NCCL_MAJOR >= 2
     case ncclUint8:
 #endif
-      deltaKern<uint8_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<uint8_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
     case ncclInt:
 #if NCCL_MAJOR >= 2
     case ncclUint32:
 #endif
-      deltaKern<uint32_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<uint32_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
     case ncclInt64:
     case ncclUint64:
-      deltaKern<uint64_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<uint64_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
   }
   CUDACHECK(cudaDeviceSynchronize());
+  for (int i=1; i<NUM_BLOCKS; i++) if (devmax[i] > devmax[0]) devmax[0] = devmax[i];
   return testSuccess;
 }

   size_t totalnbytes = max(args->sendBytes, args->expectedBytes);
-  size_t shift = (totalnbytes * iter) % args->maxbytes;
-  if (shift + totalnbytes > args->maxbytes) shift = 0;
+  size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1;
+  size_t shift = totalnbytes * (iter % steps);
 
   if (args->nGpus > 1) NCCLCHECK(ncclGroupStart());
   for (int i = 0; i < args->nGpus; i++) {
@@ -475,6 +478,10 @@ testResult_t completeColl(struct threadArgs* args) {
 
 testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) {
   size_t count = args->nbytes / wordSize(type);
+  if (datacheck) {
+    // Initialize sendbuffs, recvbuffs and expected
+    TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place));
+  }
 
   // Sync
   TESTCHECK(startColl(args, type, op, root, in_place, 0));
@@ -598,10 +605,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   }
 
   double timeUsec = deltaSec*1.0E6;
-  char timeStr[10];
+  char timeStr[100];
   if (timeUsec > 10000.0) {
     sprintf(timeStr, "%7.0f", timeUsec);
-  } else if (timeUsec > 100.0) {
+  } else if (timeUsec >= 100.0) {
     sprintf(timeStr, "%7.1f", timeUsec);
   } else {
     sprintf(timeStr, "%7.2f", timeUsec);
@@ -812,31 +819,6 @@ int main(int argc, char* argv[]) {
         printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n");
 #endif
         break;
-      case 'h':
-        printf("USAGE: %s \n\t"
-        "[-t,--nthreads <num threads>] \n\t"
-        "[-g,--ngpus <gpus per thread>] \n\t"
-        "[-b,--minbytes <min size in bytes>] \n\t"
-        "[-e,--maxbytes <max size in bytes>] \n\t"
-        "[-i,--stepbytes <increment size>] \n\t"
-        "[-f,--stepfactor <increment factor>] \n\t"
-        "[-n,--iters <iteration count>] \n\t"
-        "[-m,--agg_iters <aggregated iteration count>] \n\t"
-        "[-w,--warmup_iters <warmup iteration count>] \n\t"
-        "[-p,--parallel_init <0/1>] \n\t"
-        "[-c,--check <0/1>] \n\t"
-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
-        "[-o,--op <sum/prod/min/max/avg/all>] \n\t"
-#else
-        "[-o,--op <sum/prod/min/max/all>] \n\t"
-#endif
-        "[-d,--datatype <nccltype/all>] \n\t"
-        "[-r,--root <root>] \n\t"
-        "[-z,--blocking <0/1>] \n\t"
-        "[-G,--cudagraph <num graph launches>] \n\t"
-        "[-h,--help]\n",
-        basename(argv[0]));
-        return 0;
       default:
         if (c != 'h') printf("invalid option '%c'\n", c);
         printf("USAGE: %s \n\t"
@@ -868,7 +850,8 @@ int main(int argc, char* argv[]) {
 #ifdef MPI_SUPPORT
   MPI_Init(&argc, &argv);
 #endif
-  return run();
+  TESTCHECK(run());
+  return 0;
 }
 
 testResult_t run() {
@@ -900,6 +883,7 @@ testResult_t run() {
 #define MAX_LINE 2048
   char line[MAX_LINE];
   int len = 0;
+  size_t maxMem = ~0;
   for (int i=0; i<nThreads*nGpus; i++) {
     int cudaDev = localRank*nThreads*nGpus+i;
     cudaDeviceProp prop;
     CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
+    maxMem = std::min(maxMem, prop.totalGlobalMem);
   }
 
+  // Cap maxBytes at what the smallest GPU can actually hold
+  size_t memMaxBytes = maxMem / (datacheck ? 3 : 2);
+  if (maxBytes > memMaxBytes) {
+    maxBytes = memMaxBytes;
+    if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes);
+  }
 
   ncclUniqueId ncclId;
   if (proc == 0) {
     NCCLCHECK(ncclGetUniqueId(&ncclId));
   }
@@ -963,7 +956,7 @@ testResult_t run() {
   int errors[nThreads];
   double bw[nThreads];
   double* delta;
-  CUDACHECK(cudaHostAlloc(&delta, sizeof(double)*nThreads, cudaHostAllocPortable | cudaHostAllocMapped));
+  CUDACHECK(cudaHostAlloc(&delta, sizeof(double)*nThreads*NUM_BLOCKS, cudaHostAllocPortable | cudaHostAllocMapped));
   int bw_count[nThreads];
   for (int t=0; t<nThreads; t++)
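
The resynced deltaKern trades the single 512-thread block for NUM_BLOCKS blocks with a grid-stride loop, so each block writes its own partial maximum and the host finishes the reduction — which is also why the devmax allocation grows to nThreads*NUM_BLOCKS doubles. A condensed, runnable sketch of that two-level max-reduction pattern (a hypothetical kernel, not the test's own code):

    #include <cmath>
    #include <cstdio>
    #include <cuda_runtime.h>

    #define NUM_BLOCKS 32
    #define BSIZE 256

    __global__ void maxDiff(const float* a, const float* b, size_t n, double* out) {
      __shared__ double tmp[BSIZE];
      double m = 0.0;
      // Grid-stride loop: every thread scans a strided slice of the input.
      for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
           i += (size_t)blockDim.x * gridDim.x) {
        double d = fabs((double)a[i] - (double)b[i]);
        if (d > m) m = d;
      }
      tmp[threadIdx.x] = m;
      // Shared-memory tree reduction within the block.
      for (int s = BSIZE / 2; s > 0; s >>= 1) {
        __syncthreads();
        if (threadIdx.x < s && tmp[threadIdx.x + s] > tmp[threadIdx.x])
          tmp[threadIdx.x] = tmp[threadIdx.x + s];
      }
      if (threadIdx.x == 0) out[blockIdx.x] = tmp[0];  // one partial per block
    }

    int main() {
      const size_t n = 1 << 20;
      float *a, *b;
      double* partial;
      cudaMallocManaged(&a, n * sizeof(float));
      cudaMallocManaged(&b, n * sizeof(float));
      cudaMallocManaged(&partial, NUM_BLOCKS * sizeof(double));
      for (size_t i = 0; i < n; i++) { a[i] = 1.0f; b[i] = (i == 7) ? 1.5f : 1.0f; }

      maxDiff<<<NUM_BLOCKS, BSIZE>>>(a, b, n, partial);
      cudaDeviceSynchronize();
      // Host folds the per-block partials, as CheckDelta now does.
      for (int i = 1; i < NUM_BLOCKS; i++)
        if (partial[i] > partial[0]) partial[0] = partial[i];
      printf("max delta = %f\n", partial[0]);  // 0.5
      return 0;
    }

Splitting the work across blocks matters for large buffers: one block caps the check at 512 threads, while a grid of 32 blocks keeps the whole GPU busy during validation.
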
From: David Addison
Date: Wed, 30 Jun 2021 19:36:07 -0700
Subject: [PATCH 7/7] Added new option to report average iteration time

---
 src/common.cu | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/src/common.cu b/src/common.cu
index 7aad2c1..d9f0368 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -66,6 +66,10 @@ static int ncclroot = 0;
 static int parallel_init = 0;
 static int blocking_coll = 0;
 static int cudaGraphLaunches = 0;
+#ifdef MPI_SUPPORT
+// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX)
+static int average = 1;
+#endif
 
 #define NUM_BLOCKS 32
 
@@ -533,6 +537,23 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
   deltaSec = deltaSec/(iters*agg_iters);
   if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches;
+#ifdef MPI_SUPPORT
+  switch (average) {
+    case 1:
+      // Calculate the average time across all ranks
+      MPI_Allreduce(MPI_IN_PLACE, &deltaSec, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+      deltaSec = deltaSec/(args->nProcs*args->nThreads*args->nGpus);
+      break;
+    case 2:
+      // Obtain the minimum time across all ranks
+      MPI_Allreduce(MPI_IN_PLACE, &deltaSec, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+      break;
+    case 3:
+      // Obtain the maximum time across all ranks
+      MPI_Allreduce(MPI_IN_PLACE, &deltaSec, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+      break;
+  }
+#endif
 
   if (cudaGraphLaunches >= 1) {
     //destroy cuda graph
@@ -752,12 +773,13 @@ int main(int argc, char* argv[]) {
     {"root", required_argument, 0, 'r'},
     {"blocking", required_argument, 0, 'z'},
     {"cudagraph", required_argument, 0, 'G'},
+    {"average", required_argument, 0, 'a'},
     {"help", no_argument, 0, 'h'}
   };
 
   while(1) {
     int c;
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex);
 
     if (c == -1)
       break;
@@ -819,6 +841,11 @@ int main(int argc, char* argv[]) {
         printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n");
 #endif
         break;
+#ifdef MPI_SUPPORT
+      case 'a':
+        average = (int)strtol(optarg, NULL, 0);
+        break;
+#endif
       default:
         if (c != 'h') printf("invalid option '%c'\n", c);
         printf("USAGE: %s \n\t"
@@ -842,6 +869,9 @@ int main(int argc, char* argv[]) {
         "[-r,--root <root>] \n\t"
         "[-z,--blocking <0/1>] \n\t"
         "[-G,--cudagraph <num graph launches>] \n\t"
+#ifdef MPI_SUPPORT
+        "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
+#endif
         "[-h,--help]\n",
         basename(argv[0]));
       return 0;
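
With ranks free-running, per-rank timings can disagree, so the -a option reduces deltaSec across ranks before it is reported. The mechanism is a scalar in-place MPI_Allreduce with SUM, MIN, or MAX, as in this standalone sketch (built with an MPI compiler wrapper; the per-rank value here is a hypothetical stand-in for a measured time):

    #include <cstdio>
    #include <mpi.h>

    int main(int argc, char** argv) {
      MPI_Init(&argc, &argv);
      int rank, nranks;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &nranks);

      double t = 1.0 + rank;  // stand-in for this rank's measured deltaSec
      double avg = t, mn = t, mx = t;
      MPI_Allreduce(MPI_IN_PLACE, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
      avg /= nranks;          // -a 1: mean across ranks
      MPI_Allreduce(MPI_IN_PLACE, &mn, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);  // -a 2
      MPI_Allreduce(MPI_IN_PLACE, &mx, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);  // -a 3
      if (rank == 0) printf("avg=%g min=%g max=%g\n", avg, mn, mx);
      MPI_Finalize();
      return 0;
    }

Defaulting to the mean (-a 1) makes multi-node reports stable, while MIN and MAX remain useful for spotting stragglers.
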