diff --git a/src/Makefile b/src/Makefile index b4bb42a..0a994b7 100644 --- a/src/Makefile +++ b/src/Makefile @@ -106,7 +106,12 @@ ${DST_DIR}/timer.o: timer.cc timer.h @mkdir -p ${DST_DIR} $(CXX) $(CXXFLAGS) -o $@ -c timer.cc -${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) +${DST_DIR}/ucommd.o: ucommd.cc ucommd.h + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(CXX) $(CXXFLAGS) -o $@ -c ucommd.cc + +${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o ${DST_DIR}/ucommd.o $(TEST_VERIFIABLE_OBJS) @printf "Linking %-35s > %s\n" $< $@ @mkdir -p ${DST_DIR} $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} -Xcompiler \"-Wl,-rpath,/usr/local/sihpc/lib\" diff --git a/src/common.cu b/src/common.cu index e1f8a85..c8f323f 100644 --- a/src/common.cu +++ b/src/common.cu @@ -14,6 +14,10 @@ #include "../verifiable/verifiable.h" +#include "ucommd.h" + +static Ucommd ucommd_; + int test_ncclVersion = 0; // init'd with ncclGetVersion() #if NCCL_MAJOR >= 2 @@ -64,14 +68,18 @@ static int nGpus = 1; static size_t minBytes = 32*1024*1024; static size_t maxBytes = 32*1024*1024; static size_t stepBytes = 1*1024*1024; -static size_t stepFactor = 1; +static size_t stepFactor = 2; static int datacheck = 1; static int warmup_iters = 5; static int iters = 20; static int agg_iters = 1; static int run_cycles = 1; static int ncclop = ncclSum; -static int nccltype = ncclFloat; +#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) +static int nccltype = ncclBfloat16; +#else +static int nccltype = ncclHalf; +#endif static int ncclroot = 0; static int parallel_init = 0; static int blocking_coll = 0; @@ -709,11 +717,15 @@ int main(int argc, char* argv[]) { } #endif + nGpus = ucommd_.getNGpusPerProc(); + minBytes = maxBytes = ucommd_.getBytes(); + timeout = ucommd_.getTimeoutSec(); + // Parse args double parsed; int longindex; static struct option longopts[] = { - {"nthreads", 
required_argument, 0, 't'}, + //{"nthreads", required_argument, 0, 't'}, {"ngpus", required_argument, 0, 'g'}, {"minbytes", required_argument, 0, 'b'}, {"maxbytes", required_argument, 0, 'e'}, @@ -741,15 +753,16 @@ int main(int argc, char* argv[]) { while(1) { int c; - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex); + //c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex); + c = getopt_long(argc, argv, "g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex); if (c == -1) break; switch(c) { - case 't': - nThreads = strtol(optarg, NULL, 0); - break; + //case 't': + // nThreads = strtol(optarg, NULL, 0); + // break; case 'g': nGpus = strtol(optarg, NULL, 0); break; @@ -846,7 +859,7 @@ int main(int argc, char* argv[]) { default: if (c != 'h') printf("invalid option '%c'\n", c); printf("USAGE: %s \n\t" - "[-t,--nthreads ] \n\t" + // "[-t,--nthreads ] \n\t" "[-g,--ngpus ] \n\t" "[-b,--minbytes ] \n\t" "[-e,--maxbytes ] \n\t" @@ -919,8 +932,10 @@ testResult_t run() { #endif is_main_thread = is_main_proc = (proc == 0) ? 
1 : 0; - PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d agg iters: %d validation: %d graph: %d\n", - nThreads, nGpus, minBytes, maxBytes, +//PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d agg iters: %d validation: %d graph: %d\n", +// nThreads, nGpus, minBytes, maxBytes, + PRINT("# nGpus(perProc) %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d agg iters: %d validation: %d graph: %d\n", + nGpus, minBytes, maxBytes, (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, agg_iters, datacheck, cudaGraphLaunches); if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); @@ -949,7 +964,9 @@ testResult_t run() { // Gather all output in rank order to root (0) MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, MPI_COMM_WORLD); if (proc == 0) { - for (int p = 0; p < totalProcs; p++) + //for (int p = 0; p < totalProcs; p++) + int stride = ucommd_.getLocalSize() > 0 ? ucommd_.getLocalSize() : 1; + for (int p = stride-1; p < totalProcs; p+=stride) PRINT("%s", lines+MAX_LINE*p); free(lines); } @@ -1123,11 +1140,14 @@ testResult_t run() { #endif envstr = getenv("NCCL_TESTS_MIN_BW"); - double check_avg_bw = envstr ? atof(envstr) : -1; +//double check_avg_bw = envstr ? atof(envstr) : -1; + double check_avg_bw = envstr ? atof(envstr) : + (!strcmp(threads[0].args.collTest->name, "AllReduce") && minBytes == maxBytes && minBytes >= ucommd_.getBytes()) ? ucommd_.getBw(nGpus) : -1; bw[0] /= bw_count[0]; PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK"); - PRINT("# Avg bus bandwidth : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK")); + PRINT("# Avg bus bandwidth : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw/**(0.9)*/ ? 
"FAILED" : "OK")); + if (bw[0] < check_avg_bw) PRINT("# Expected min bandwidth : %g\n", check_avg_bw); PRINT("#\n"); #ifdef MPI_SUPPORT MPI_Comm_free(&mpi_comm); @@ -1139,7 +1159,7 @@ testResult_t run() { // 'cuda-memcheck --leak-check full' requires this cudaDeviceReset(); - if (errors[0] || bw[0] < check_avg_bw*(0.9)) + if (errors[0] || bw[0] < check_avg_bw/**(0.9)*/) exit(EXIT_FAILURE); else exit(EXIT_SUCCESS); diff --git a/src/common.h b/src/common.h index e6762e1..5bfcd38 100644 --- a/src/common.h +++ b/src/common.h @@ -94,12 +94,6 @@ struct testColl { testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); }; -extern struct testColl allReduceTest; -extern struct testColl allGatherTest; -extern struct testColl reduceScatterTest; -extern struct testColl broadcastTest; -extern struct testColl reduceTest; -extern struct testColl alltoAllTest; struct testEngine { void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks); @@ -162,7 +156,14 @@ extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, voi #include static void getHostName(char* hostname, int maxlen) { - gethostname(hostname, maxlen); + const char* node_name = getenv("NODE_NAME"); + if (node_name && node_name[0]) { + strncpy(hostname, node_name, maxlen); + } else { + if (-1 == gethostname(hostname, maxlen)) { + strncpy(hostname, "unknown", 16); + } + } for (int i=0; i< maxlen; i++) { if (hostname[i] == '.') { hostname[i] = '\0'; diff --git a/src/ucommd.cc b/src/ucommd.cc new file mode 100644 index 0000000..d0c1bd6 --- /dev/null +++ b/src/ucommd.cc @@ -0,0 +1,250 @@ +/** + * Copyright (c) 2024, Scitix Tech PTE. LTD. All rights reserved. 
/**
 * Copyright (c) 2024, Scitix Tech PTE. LTD. All rights reserved.
 */

// ===========================================================================
// src/ucommd.h
// ===========================================================================

#ifndef __UCOMMD_H__
#define __UCOMMD_H__

#include <cstddef>
#include <string>
#include <vector>

/**
 * Probes the local node once, at construction time, and derives default
 * settings for the test harness from what it finds.
 *
 * Inputs that are read:
 *   - OMPI_COMM_WORLD_SIZE / OMPI_COMM_WORLD_LOCAL_SIZE (environment)
 *   - NODE_NAME (environment), falling back to gethostname()
 *   - /sys/bus/pci/drivers/nvidia/<dev>/{class,device}
 *   - /sys/class/infiniband/<dev>/{node_type,hca_type,ports/1/...}
 *
 * Every probe is best effort: a missing directory, file or variable leaves
 * the corresponding member at its default and never aborts.
 */
class Ucommd {
 public:
  Ucommd();
  ~Ucommd() = default;

  /** Value of OMPI_COMM_WORLD_LOCAL_SIZE, or -1 when it was not available. */
  int getLocalSize() const;

  /**
   * Default number of GPUs one process should drive: 1 when several
   * processes share the node or the job spans nodes, otherwise the number
   * of NVIDIA devices found in sysfs.
   */
  int getNGpusPerProc() const;

  /** Default timeout in seconds (constant 600). */
  int getTimeoutSec() const;

  /** Default message size in bytes, scaled with the job size. */
  size_t getBytes() const;

  /**
   * Expected minimum bandwidth, or -1 when no expectation can be derived.
   * NOTE(review): the unit is presumably GB/s, matching the harness'
   * "Avg bus bandwidth" figure - confirm against the caller.
   *
   * @param ngpus number of GPUs driven by this process.
   */
  int getBw(int ngpus = -1);

 private:
  int get_nvlink_bw() const;
  int get_ib_bw() const;

  void _check_multi_node_via_ompi();
  void _check_sys_nv_devices();
  void _check_sys_ib_devices();
  void _get_node_name();

  int world_size_ = -1;         // OMPI_COMM_WORLD_SIZE, -1 if unset
  int local_size_ = -1;         // OMPI_COMM_WORLD_LOCAL_SIZE, -1 if unset
  int nnodes_ = -1;             // world_size_ / local_size_, -1 if unknown
  bool is_multi_node_ = false;  // nnodes_ > 1
  std::string node_name_;       // label used in diagnostics
  std::vector<std::string> nvdevs_;  // sorted PCI addresses of NVIDIA devices
  std::vector<std::string> ibdevs_;  // sorted names of usable IB devices
};

#endif  // __UCOMMD_H__

// ===========================================================================
// src/ucommd.cc
// (when kept as a separate translation unit this part starts with
//  #include "ucommd.h")
// ===========================================================================

#include <dirent.h>
#include <unistd.h>

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <string>
#include <vector>

namespace {

const char kNvDriverDir[] = "/sys/bus/pci/drivers/nvidia";
const char kIbClassDir[] = "/sys/class/infiniband";

// Reads the first line of the file at `path` into `*line` (without the
// trailing newline). Returns false when the file cannot be opened or is
// empty. The stream is closed by its destructor.
bool read_first_line(const std::string& path, std::string* line) {
  line->clear();
  std::ifstream in(path);
  if (!in.is_open()) return false;
  return static_cast<bool>(std::getline(in, *line));
}

// True when the file at `path` is readable and its first character is
// `expected`.
bool first_char_is(const std::string& path, char expected) {
  std::string line;
  return read_first_line(path, &line) && !line.empty() && line[0] == expected;
}

// Decides whether InfiniBand device `ibdev` should be counted. The checks
// run in order and stop at the first failure, so the port diagnostics are
// printed only for devices that passed the node_type and hca_type filters.
bool is_usable_hca(const std::string& node_name, const std::string& ibdev) {
  const std::string base = std::string(kIbClassDir) + "/" + ibdev;
  std::string line;

  // node_type must start with '1', '2' or '3'.
  if (!read_first_line(base + "/node_type", &line) || line.empty() ||
      line[0] < '1' || line[0] > '3') {
    return false;
  }

  // Only these hca_type strings are accepted; the adapter models they map
  // to are not stated in this file.
  static const char* const kAcceptedHcaTypes[] = {
      "MT4123", "MT4124", "MT4125", "MT4129", "MT4131"};
  if (!read_first_line(base + "/hca_type", &line)) return false;
  bool accepted = false;
  for (const char* hca_type : kAcceptedHcaTypes) {
    if (line == hca_type) {
      accepted = true;
      break;
    }
  }
  if (!accepted) return false;

  if (!first_char_is(base + "/ports/1/state", '4')) {
    printf("[%s] %s: port not active or unable to get port state\n",
           node_name.c_str(), ibdev.c_str());
    return false;
  }

  if (!first_char_is(base + "/ports/1/phys_state", '5')) {
    printf("[%s] %s: phys link not up or unable to get phys state\n",
           node_name.c_str(), ibdev.c_str());
    return false;
  }

  return true;
}

}  // namespace

Ucommd::Ucommd() {
  // The node name must be resolved first: the InfiniBand probe prints it in
  // its diagnostics, and would otherwise print an empty name.
  _get_node_name();
  _check_multi_node_via_ompi();
  _check_sys_nv_devices();
  _check_sys_ib_devices();
}

// Derives world/local size and the node count from the Open MPI
// environment. Leaves the defaults in place when a variable is missing.
void Ucommd::_check_multi_node_via_ompi() {
  const char* world_size_env = std::getenv("OMPI_COMM_WORLD_SIZE");
  if (world_size_env == nullptr) return;
  world_size_ = static_cast<int>(std::strtol(world_size_env, nullptr, 10));

  const char* local_size_env = std::getenv("OMPI_COMM_WORLD_LOCAL_SIZE");
  if (local_size_env == nullptr) return;
  local_size_ = static_cast<int>(std::strtol(local_size_env, nullptr, 10));

  // strtol yields 0 for unparsable input; a zero (or negative) local size
  // cannot be used as a divisor, so treat the job as single-node.
  if (local_size_ <= 0) return;

  // assume homogeneous mpirun
  nnodes_ = world_size_ / local_size_;
  is_multi_node_ = (nnodes_ > 1);
}

// Collects the entries bound to the nvidia PCI driver whose class file
// reads 0x030200 or 0x030000.
// NOTE(review): these look like the PCI display-controller class codes
// (3D controller / VGA controller) - confirm.
void Ucommd::_check_sys_nv_devices() {
  DIR* dir = opendir(kNvDriverDir);
  if (dir == nullptr) return;

  while (const struct dirent* entry = readdir(dir)) {
    // Skips everything that does not start with '0' (".", "..", "bind",
    // "module", ...).
    if (entry->d_name[0] != '0') continue;

    const std::string nvdev(entry->d_name);
    std::string dev_class;
    if (read_first_line(std::string(kNvDriverDir) + "/" + nvdev + "/class",
                        &dev_class) &&
        (dev_class == "0x030200" || dev_class == "0x030000")) {
      nvdevs_.push_back(nvdev);
    }
  }
  closedir(dir);
  std::sort(nvdevs_.begin(), nvdevs_.end());
}

// Collects the InfiniBand devices that pass is_usable_hca().
void Ucommd::_check_sys_ib_devices() {
  DIR* dir = opendir(kIbClassDir);
  if (dir == nullptr) return;

  while (const struct dirent* entry = readdir(dir)) {
    if (std::strcmp(entry->d_name, ".") == 0 ||
        std::strcmp(entry->d_name, "..") == 0) {
      continue;
    }
    const std::string ibdev(entry->d_name);
    if (is_usable_hca(node_name_, ibdev)) {
      ibdevs_.push_back(ibdev);
    }
  }
  closedir(dir);
  std::sort(ibdevs_.begin(), ibdevs_.end());
}

// Uses $NODE_NAME when it is set and non-empty, otherwise the host name,
// otherwise the literal "unknown".
void Ucommd::_get_node_name() {
  const char* node_name_env = std::getenv("NODE_NAME");
  if (node_name_env != nullptr && node_name_env[0] != '\0') {
    node_name_.assign(node_name_env);
    return;
  }

  // One byte is held back so the buffer stays NUL-terminated even when
  // gethostname() truncates the name.
  char hostname[128] = {0};
  if (gethostname(hostname, sizeof(hostname) - 1) == 0) {
    node_name_.assign(hostname);
  } else {
    node_name_.assign("unknown");
  }
}

int Ucommd::getLocalSize() const {
  return local_size_;
}

int Ucommd::getNGpusPerProc() const {
  if (is_multi_node_ || local_size_ > 1) return 1;
  return static_cast<int>(nvdevs_.size());
}

size_t Ucommd::getBytes() const {
  // Single node: fixed 2^32 bytes.
  if (!is_multi_node_) return static_cast<size_t>(1) << 32;

  // Multi node: world_size * 2^k, with a smaller k for larger jobs and for
  // more processes per node.
  const size_t world = static_cast<size_t>(world_size_);
  if (world_size_ > 1024) return world << 24;
  if (local_size_ > 4) return world << 25;
  if (local_size_ > 1) return world << 26;
  return world << 27;
}

int Ucommd::getTimeoutSec() const {
  return 600;
}

int Ucommd::getBw(int ngpus) {
  if (is_multi_node_) return get_ib_bw();
  if (ngpus > 1 || local_size_ > 1) return get_nvlink_bw();
  return -1;
}

// Expected intra-node bandwidth, looked up from the PCI device id of the
// first NVIDIA device; -1 when the id is unknown or unreadable.
// The GPU models behind these ids are not stated in this file.
int Ucommd::get_nvlink_bw() const {
  if (nvdevs_.empty()) return -1;

  std::string device;
  if (!read_first_line(
          std::string(kNvDriverDir) + "/" + nvdevs_.front() + "/device",
          &device)) {
    return -1;
  }

  if (device == "0x2330") return 450 * 3 / 4;
  if (device == "0x20b0" || device == "0x20b2" || device == "0x20b3") {
    return 300 * 2 / 3;
  }
  if (device == "0x20f3" || device == "0x20bd") return 200 * 2 / 3;
  return -1;
}

// Expected inter-node bandwidth, derived from the port rate of the first
// usable InfiniBand device; -1 when no rate can be read.
int Ucommd::get_ib_bw() const {
  if (ibdevs_.empty()) return -1;

  std::string rate_line;
  read_first_line(
      std::string(kIbClassDir) + "/" + ibdevs_.front() + "/ports/1/rate",
      &rate_line);

  // The rate is the leading run of decimal digits of the line.
  int rate = 0;
  for (const char c : rate_line) {
    if (c < '0' || c > '9') break;
    rate = rate * 10 + (c - '0');
  }
  // No readable rate means "no expectation", not "expect 0".
  if (rate <= 0) return -1;

  // NOTE(review): presumably 75% of rate/8 - confirm the intended margin.
  int bw = rate * 3 / 32;

  // for DP AllReduce only ...
  const size_t nnics = ibdevs_.size();
  if (local_size_ == 2) {
    if (nnics > 1) bw *= 2;
  } else if (local_size_ == 4) {
    if (nnics > 3) {
      bw *= 4;
    } else if (nnics > 1) {
      bw *= 2;
    }
  } else if (local_size_ == 8) {
    bw *= static_cast<int>(nnics);
    const char* mask_env = std::getenv("NCCL_TESTS_SPLIT_MASK");
    if (mask_env != nullptr) {
      const long mask = std::strtol(mask_env, nullptr, 10);
      if (mask == 7 || mask == 3 || mask == 1) {
        bw /= static_cast<int>(mask + 1);
      }  // other masks: expectation left unscaled
    }
  }
  return bw;
}