From e37545e4911c210558baba789941ea7bf59db00d Mon Sep 17 00:00:00 2001
From: David Addison
Date: Mon, 15 Mar 2021 14:44:06 -0700
Subject: [PATCH] Add support for new datatype: bfloat16

---
 src/common.cu | 41 +++++++++++++++++++++++++++++++++++++----
 src/common.h  |  3 +++
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index ff4e1fd..4589593 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -12,8 +12,16 @@
 #include "cuda.h"
 
 #if NCCL_MAJOR >= 2
-ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble};
-const char *test_typenames[ncclNumTypes] = {"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"};
+ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble,
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+  ncclBfloat16
+#endif
+};
+const char *test_typenames[ncclNumTypes] = {"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double",
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+  "bfloat16"
+#endif
+};
 #else
 ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64};
 const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"};
@@ -61,6 +69,9 @@ double parsesize(char *value) {
 double DeltaMaxValue(ncclDataType_t type) {
   switch(type) {
     case ncclHalf: return 1e-2;
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+    case ncclBfloat16: return 1e-2;
+#endif
     case ncclFloat: return 1e-5;
     case ncclDouble: return 1e-12;
     case ncclInt:
@@ -95,6 +106,12 @@ template<> __device__
 float toFloat(half a) {
   return __half2float(a);
 }
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+template<> __device__
+float toFloat(__nv_bfloat16 a) {
+  return __bfloat162float(a);
+}
+#endif
 
 template<typename T, int BSIZE> __global__
 void deltaKern(void* A_, void* B_, size_t count, double* max) {
@@ -128,6 +145,10 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
 
 testResult_t CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax) {
   switch (type) {
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+    case ncclBfloat16:
+      deltaKern<__nv_bfloat16, 512><<<1, 512>>>(results, expected, count, devmax); break;
+#endif
     case ncclHalf:
      deltaKern<half, 512><<<1, 512>>>(results, expected, count, devmax); break;
    case ncclFloat:
@@ -174,6 +195,12 @@ template<> __device__
 half testValue<half>(const size_t offset, const int rep, const int rank) {
   return __float2half(testValue<float>(offset, rep, rank));
 }
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+template<>
+__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, const int rep, const int rank) {
+  return __float2bfloat16(testValue<float>(offset, rep, rank));
+}
+#endif
 
 // Operations
 template<typename T>
@@ -210,7 +237,10 @@ __global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offse
 #define OPS(type) KERN(type, ncclOpSum), KERN(type, ncclOpProd), KERN(type, ncclOpMax), KERN(type, ncclOpMin)
 
 static void* const redInitDataKerns[ncclNumOps*ncclNumTypes] = {
-  OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double)
+  OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double),
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+  OPS(__nv_bfloat16)
+#endif
 };
 
 testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) {
@@ -236,7 +266,10 @@ static void* const initDataKerns[ncclNumTypes] = {
   (void*)InitDataKernel<uint64_t>,
   (void*)InitDataKernel< half>,
   (void*)InitDataKernel< float>,
-  (void*)InitDataKernel< double>
+  (void*)InitDataKernel< double>,
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+  (void*)InitDataKernel<__nv_bfloat16>,
+#endif
 };
 
 template<typename T>
diff --git a/src/common.h b/src/common.h
index 865ee25..c869254 100644
--- a/src/common.h
+++ b/src/common.h
@@ -213,6 +213,9 @@ static size_t wordSize(ncclDataType_t type) {
 #endif
       return 1;
     case ncclHalf:
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+    case ncclBfloat16:
+#endif
     //case ncclFloat16:
       return 2;
     case ncclInt:
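
Note (reviewer addition, not part of the patch): everything above hinges on
__CUDA_BF16_TYPES_EXIST__, which <cuda_bf16.h> defines on CUDA 11.0+ toolkits;
on older toolkits the guards compile all bfloat16 paths away and the test
tables simply keep their original nine types. Below is a minimal standalone
sketch of that guard and of the conversion intrinsics the patch relies on
(__float2bfloat16, __bfloat162float). The file name bf16_check.cu and the
roundTrip kernel are hypothetical, and the sketch assumes a CUDA 11.0+
toolchain so the header include succeeds:

    // bf16_check.cu -- hypothetical demo; build with: nvcc -o bf16_check bf16_check.cu
    #include <cstdio>
    #include <cmath>
    #include <cuda_runtime.h>
    #include <cuda_bf16.h>  // defines __CUDA_BF16_TYPES_EXIST__ (CUDA 11.0+)

    #if defined(__CUDA_BF16_TYPES_EXIST__)
    // Round-trip float -> bfloat16 -> float with the same intrinsics the
    // patch uses in toFloat() and testValue<__nv_bfloat16>().
    __global__ void roundTrip(const float* in, float* out, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = __bfloat162float(__float2bfloat16(in[i]));
    }
    #endif

    int main() {
    #if defined(__CUDA_BF16_TYPES_EXIST__)
      float h_in = 3.14159f, h_out = 0.0f;
      float *d_in, *d_out;
      cudaMalloc(&d_in, sizeof(float));
      cudaMalloc(&d_out, sizeof(float));
      cudaMemcpy(d_in, &h_in, sizeof(float), cudaMemcpyHostToDevice);
      roundTrip<<<1, 1>>>(d_in, d_out, 1);
      cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
      // bfloat16 keeps only 8 significand bits, so relative error up to about
      // 2^-8 is expected on a round trip -- which is why DeltaMaxValue() above
      // returns the same coarse 1e-2 bound for ncclBfloat16 as for ncclHalf.
      printf("in=%f out=%f |err|=%g\n", h_in, h_out, fabsf(h_in - h_out));
      cudaFree(d_in); cudaFree(d_out);
    #else
      printf("this CUDA toolkit has no bfloat16 types\n");
    #endif
      return 0;
    }

The conversion intrinsics have software fallbacks in the header, so no sm_80
device is required to build or run this; native __nv_bfloat16 arithmetic, by
contrast, needs sm_80 or newer.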