mirror of
https://github.com/NVIDIA/nccl-tests.git
synced 2026-05-01 11:58:19 +08:00
add ucommd get_bw for checking the perf results (busbw).
* Check AllReduce only. * Disable option '-t', so nThreads is always 1. * Message size (min/max bytes), timeout, etc. are set automatically. * Support checking results when running in comm-split mode. Other changes: * Try to get the physical hostname from the env variable 'NODE_NAME'. * Check the IB port state and print a log message if the port is not up and active. * The default stepFactor is changed to '2' and the default datatype is changed to 'bf16'.
This commit is contained in:
parent
fd83f7ca84
commit
395e345502
@ -106,7 +106,12 @@ ${DST_DIR}/timer.o: timer.cc timer.h
|
||||
@mkdir -p ${DST_DIR}
|
||||
$(CXX) $(CXXFLAGS) -o $@ -c timer.cc
|
||||
|
||||
${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS)
|
||||
${DST_DIR}/ucommd.o: ucommd.cc ucommd.h
|
||||
@printf "Compiling %-35s > %s\n" $< $@
|
||||
@mkdir -p ${DST_DIR}
|
||||
$(CXX) $(CXXFLAGS) -o $@ -c ucommd.cc
|
||||
|
||||
${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o ${DST_DIR}/ucommd.o $(TEST_VERIFIABLE_OBJS)
|
||||
@printf "Linking %-35s > %s\n" $< $@
|
||||
@mkdir -p ${DST_DIR}
|
||||
$(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} -Xcompiler \"-Wl,-rpath,/usr/local/sihpc/lib\"
|
||||
|
||||
@ -14,6 +14,10 @@
|
||||
|
||||
#include "../verifiable/verifiable.h"
|
||||
|
||||
#include "ucommd.h"
|
||||
|
||||
static Ucommd ucommd_;
|
||||
|
||||
int test_ncclVersion = 0; // init'd with ncclGetVersion()
|
||||
|
||||
#if NCCL_MAJOR >= 2
|
||||
@ -64,14 +68,18 @@ static int nGpus = 1;
|
||||
static size_t minBytes = 32*1024*1024;
|
||||
static size_t maxBytes = 32*1024*1024;
|
||||
static size_t stepBytes = 1*1024*1024;
|
||||
static size_t stepFactor = 1;
|
||||
static size_t stepFactor = 2;
|
||||
static int datacheck = 1;
|
||||
static int warmup_iters = 5;
|
||||
static int iters = 20;
|
||||
static int agg_iters = 1;
|
||||
static int run_cycles = 1;
|
||||
static int ncclop = ncclSum;
|
||||
static int nccltype = ncclFloat;
|
||||
#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
|
||||
static int nccltype = ncclBfloat16;
|
||||
#else
|
||||
static int nccltype = ncclHalf;
|
||||
#endif
|
||||
static int ncclroot = 0;
|
||||
static int parallel_init = 0;
|
||||
static int blocking_coll = 0;
|
||||
@ -709,11 +717,15 @@ int main(int argc, char* argv[]) {
|
||||
}
|
||||
#endif
|
||||
|
||||
nGpus = ucommd_.getNGpusPerProc();
|
||||
minBytes = maxBytes = ucommd_.getBytes();
|
||||
timeout = ucommd_.getTimeoutSec();
|
||||
|
||||
// Parse args
|
||||
double parsed;
|
||||
int longindex;
|
||||
static struct option longopts[] = {
|
||||
{"nthreads", required_argument, 0, 't'},
|
||||
//{"nthreads", required_argument, 0, 't'},
|
||||
{"ngpus", required_argument, 0, 'g'},
|
||||
{"minbytes", required_argument, 0, 'b'},
|
||||
{"maxbytes", required_argument, 0, 'e'},
|
||||
@ -741,15 +753,16 @@ int main(int argc, char* argv[]) {
|
||||
|
||||
while(1) {
|
||||
int c;
|
||||
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex);
|
||||
//c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex);
|
||||
c = getopt_long(argc, argv, "g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex);
|
||||
|
||||
if (c == -1)
|
||||
break;
|
||||
|
||||
switch(c) {
|
||||
case 't':
|
||||
nThreads = strtol(optarg, NULL, 0);
|
||||
break;
|
||||
//case 't':
|
||||
// nThreads = strtol(optarg, NULL, 0);
|
||||
// break;
|
||||
case 'g':
|
||||
nGpus = strtol(optarg, NULL, 0);
|
||||
break;
|
||||
@ -846,7 +859,7 @@ int main(int argc, char* argv[]) {
|
||||
default:
|
||||
if (c != 'h') printf("invalid option '%c'\n", c);
|
||||
printf("USAGE: %s \n\t"
|
||||
"[-t,--nthreads <num threads>] \n\t"
|
||||
// "[-t,--nthreads <num threads>] \n\t"
|
||||
"[-g,--ngpus <gpus per thread>] \n\t"
|
||||
"[-b,--minbytes <min size in bytes>] \n\t"
|
||||
"[-e,--maxbytes <max size in bytes>] \n\t"
|
||||
@ -919,8 +932,10 @@ testResult_t run() {
|
||||
#endif
|
||||
is_main_thread = is_main_proc = (proc == 0) ? 1 : 0;
|
||||
|
||||
PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d agg iters: %d validation: %d graph: %d\n",
|
||||
nThreads, nGpus, minBytes, maxBytes,
|
||||
//PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d agg iters: %d validation: %d graph: %d\n",
|
||||
// nThreads, nGpus, minBytes, maxBytes,
|
||||
PRINT("# nGpus(perProc) %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d agg iters: %d validation: %d graph: %d\n",
|
||||
nGpus, minBytes, maxBytes,
|
||||
(stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes",
|
||||
warmup_iters, iters, agg_iters, datacheck, cudaGraphLaunches);
|
||||
if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n");
|
||||
@ -949,7 +964,9 @@ testResult_t run() {
|
||||
// Gather all output in rank order to root (0)
|
||||
MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, MPI_COMM_WORLD);
|
||||
if (proc == 0) {
|
||||
for (int p = 0; p < totalProcs; p++)
|
||||
//for (int p = 0; p < totalProcs; p++)
|
||||
int stride = ucommd_.getLocalSize() > 0 ? ucommd_.getLocalSize() : 1;
|
||||
for (int p = stride-1; p < totalProcs; p+=stride)
|
||||
PRINT("%s", lines+MAX_LINE*p);
|
||||
free(lines);
|
||||
}
|
||||
@ -1123,11 +1140,14 @@ testResult_t run() {
|
||||
#endif
|
||||
|
||||
envstr = getenv("NCCL_TESTS_MIN_BW");
|
||||
double check_avg_bw = envstr ? atof(envstr) : -1;
|
||||
//double check_avg_bw = envstr ? atof(envstr) : -1;
|
||||
double check_avg_bw = envstr ? atof(envstr) :
|
||||
(!strcmp(threads[0].args.collTest->name, "AllReduce") && minBytes == maxBytes && minBytes >= ucommd_.getBytes()) ? ucommd_.getBw(nGpus) : -1;
|
||||
bw[0] /= bw_count[0];
|
||||
|
||||
PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK");
|
||||
PRINT("# Avg bus bandwidth : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
|
||||
PRINT("# Avg bus bandwidth : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw/**(0.9)*/ ? "FAILED" : "OK"));
|
||||
if (bw[0] < check_avg_bw) PRINT("# Expected min bandwidth : %g\n", check_avg_bw);
|
||||
PRINT("#\n");
|
||||
#ifdef MPI_SUPPORT
|
||||
MPI_Comm_free(&mpi_comm);
|
||||
@ -1139,7 +1159,7 @@ testResult_t run() {
|
||||
// 'cuda-memcheck --leak-check full' requires this
|
||||
cudaDeviceReset();
|
||||
|
||||
if (errors[0] || bw[0] < check_avg_bw*(0.9))
|
||||
if (errors[0] || bw[0] < check_avg_bw/**(0.9)*/)
|
||||
exit(EXIT_FAILURE);
|
||||
else
|
||||
exit(EXIT_SUCCESS);
|
||||
|
||||
15
src/common.h
15
src/common.h
@ -94,12 +94,6 @@ struct testColl {
|
||||
testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
|
||||
};
|
||||
extern struct testColl allReduceTest;
|
||||
extern struct testColl allGatherTest;
|
||||
extern struct testColl reduceScatterTest;
|
||||
extern struct testColl broadcastTest;
|
||||
extern struct testColl reduceTest;
|
||||
extern struct testColl alltoAllTest;
|
||||
|
||||
struct testEngine {
|
||||
void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks);
|
||||
@ -162,7 +156,14 @@ extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, voi
|
||||
#include <unistd.h>
|
||||
|
||||
static void getHostName(char* hostname, int maxlen) {
|
||||
gethostname(hostname, maxlen);
|
||||
const char* node_name = getenv("NODE_NAME");
|
||||
if (node_name && node_name[0]) {
|
||||
strncpy(hostname, node_name, maxlen);
|
||||
} else {
|
||||
if (-1 == gethostname(hostname, maxlen)) {
|
||||
strncpy(hostname, "unknown", 16);
|
||||
}
|
||||
}
|
||||
for (int i=0; i< maxlen; i++) {
|
||||
if (hostname[i] == '.') {
|
||||
hostname[i] = '\0';
|
||||
|
||||
250
src/ucommd.cc
Normal file
250
src/ucommd.cc
Normal file
@ -0,0 +1,250 @@
|
||||
/**
|
||||
* Copyright (c) 2024, Scitix Tech PTE. LTD. All rights reserved.
|
||||
*/
|
||||
|
||||
#include <unistd.h>
|
||||
#include <dirent.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <fstream>
|
||||
|
||||
#include "ucommd.h"
|
||||
|
||||
/**
 * Probes the runtime environment once, at construction time.
 *
 * Collects the node name, the Open MPI world/local sizes, the NVIDIA PCI
 * devices and the InfiniBand devices that the getters later rely on.
 */
Ucommd::Ucommd() {
  // Resolve the node name first: _check_sys_ib_devices() prefixes its
  // warnings with node_name_, which would otherwise still be empty.
  _get_node_name();
  _check_multi_node_via_ompi();
  _check_sys_nv_devices();
  _check_sys_ib_devices();
}
|
||||
|
||||
/**
 * Nothing to release by hand: nvdevs_ and ibdevs_ are std::vector members
 * and free their storage in their own destructors.
 */
Ucommd::~Ucommd() = default;
|
||||
|
||||
void Ucommd::_check_multi_node_via_ompi() {
|
||||
const auto world_size_env = std::getenv("OMPI_COMM_WORLD_SIZE");
|
||||
if (world_size_env == nullptr) return;
|
||||
world_size_ = std::strtol(world_size_env, nullptr, 10);
|
||||
const auto local_size_env = std::getenv("OMPI_COMM_WORLD_LOCAL_SIZE");
|
||||
if (local_size_env == nullptr) return;
|
||||
local_size_ = std::strtol(local_size_env, nullptr, 10);
|
||||
|
||||
// assume homogeneous mpirun
|
||||
nnodes_ = world_size_ / local_size_;
|
||||
is_multi_node_ = (nnodes_ > 1);
|
||||
}
|
||||
|
||||
void Ucommd::_check_sys_nv_devices() {
|
||||
DIR* dir;
|
||||
dir = opendir("/sys/bus/pci/drivers/nvidia");
|
||||
if (dir) {
|
||||
struct dirent *entry;
|
||||
while ((entry = readdir(dir))) {
|
||||
if (entry->d_name[0] != '0') continue;
|
||||
const auto nvdev = std::string(entry->d_name);
|
||||
auto dev_class = std::ifstream(
|
||||
std::string("/sys/bus/pci/drivers/nvidia/") + nvdev + "/class");
|
||||
if (dev_class.is_open()) {
|
||||
char dclass[16] = {0};
|
||||
dev_class.getline(dclass, 16);
|
||||
if (dev_class.good() &&
|
||||
(std::string("0x030200").compare(dclass) == 0 ||
|
||||
std::string("0x030000").compare(dclass) == 0)) {
|
||||
nvdevs_.push_back(nvdev);
|
||||
}
|
||||
dev_class.close();
|
||||
}
|
||||
}
|
||||
closedir(dir);
|
||||
std::sort(nvdevs_.begin(), nvdevs_.end());
|
||||
}
|
||||
}
|
||||
|
||||
void Ucommd::_check_sys_ib_devices() {
|
||||
DIR* dir;
|
||||
dir = opendir("/sys/class/infiniband");
|
||||
if (dir) {
|
||||
struct dirent *entry;
|
||||
while ((entry = readdir(dir))) {
|
||||
if ((strcmp(entry->d_name, ".") == 0) ||
|
||||
(strcmp(entry->d_name, "..") == 0)) {
|
||||
continue;
|
||||
}
|
||||
const auto ibdev = std::string(entry->d_name);
|
||||
if ([&ibdev] {
|
||||
bool is_ib = false;
|
||||
auto node_type = std::ifstream(
|
||||
std::string("/sys/class/infiniband/") + ibdev + "/node_type");
|
||||
if (node_type.is_open()) {
|
||||
char ntype = node_type.get();
|
||||
if (node_type.good()) is_ib = '1' <= ntype && ntype <= '3';
|
||||
node_type.close();
|
||||
}
|
||||
return is_ib;
|
||||
}() &&
|
||||
[&ibdev] {
|
||||
bool is_cx6 = false;
|
||||
auto hca_type = std::ifstream(
|
||||
std::string("/sys/class/infiniband/") + ibdev + "/hca_type");
|
||||
if (hca_type.is_open()) {
|
||||
char htype[8] = {0};
|
||||
hca_type.getline(htype, 8);
|
||||
if (hca_type.good()) {
|
||||
is_cx6 = std::string("MT4123").compare(htype) == 0 ||
|
||||
std::string("MT4125").compare(htype) == 0 ||
|
||||
std::string("MT4129").compare(htype) == 0 ||
|
||||
std::string("MT4131").compare(htype) == 0 ||
|
||||
std::string("MT4124").compare(htype) == 0;
|
||||
}
|
||||
hca_type.close();
|
||||
}
|
||||
return is_cx6;
|
||||
}() &&
|
||||
[&ibdev, this] {
|
||||
bool port_active = false;
|
||||
auto port_state = std::ifstream(
|
||||
std::string("/sys/class/infiniband/") + ibdev + "/ports/1/state");
|
||||
if (port_state.is_open()) {
|
||||
char state = port_state.get();
|
||||
if (port_state.good()) {
|
||||
port_active = state == '4';
|
||||
}
|
||||
port_state.close();
|
||||
}
|
||||
if (!port_active) {
|
||||
printf("[%s] %s: port not active or unable to get port state\n",
|
||||
node_name_.c_str(), ibdev.c_str());
|
||||
}
|
||||
return port_active;
|
||||
}() &&
|
||||
[&ibdev, this] {
|
||||
bool link_up = false;
|
||||
auto phys_state = std::ifstream(
|
||||
std::string("/sys/class/infiniband/") + ibdev + "/ports/1/phys_state");
|
||||
if (phys_state.is_open()) {
|
||||
char state = phys_state.get();
|
||||
if (phys_state.good()) {
|
||||
link_up = state == '5';
|
||||
}
|
||||
phys_state.close();
|
||||
}
|
||||
if (!link_up) {
|
||||
printf("[%s] %s: phys link not up or unable to get phys state\n",
|
||||
node_name_.c_str(), ibdev.c_str());
|
||||
}
|
||||
return link_up;
|
||||
}()) {
|
||||
ibdevs_.push_back(ibdev);
|
||||
}
|
||||
}
|
||||
closedir(dir);
|
||||
std::sort(ibdevs_.begin(), ibdevs_.end());
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Fills node_name_.
 *
 * A non-empty NODE_NAME environment variable wins; otherwise the result of
 * gethostname() is used, and "unknown" when that call fails.
 */
void Ucommd::_get_node_name() {
  const char* env_name = std::getenv("NODE_NAME");
  if (env_name != nullptr && env_name[0] != '\0') {
    node_name_ = env_name;
    return;
  }

  char host[128] = {0};
  const bool resolved = (gethostname(host, 128) == 0);
  node_name_ = resolved ? host : "unknown";
}
|
||||
|
||||
int Ucommd::getLocalSize() const {
|
||||
return local_size_;
|
||||
}
|
||||
|
||||
int Ucommd::getNGpusPerProc() const {
|
||||
return (is_multi_node_ || local_size_ > 1) ? 1 : (int)nvdevs_.size();
|
||||
}
|
||||
|
||||
size_t Ucommd::getBytes() const {
|
||||
return !is_multi_node_ ? 1UL << 32 :
|
||||
world_size_ > 1024 ? ((size_t)world_size_) << 24 :
|
||||
local_size_ > 4 ? ((size_t)world_size_) << 25 :
|
||||
local_size_ > 1 ? ((size_t)world_size_) << 26 :
|
||||
((size_t)world_size_) << 27;
|
||||
}
|
||||
|
||||
/**
 * @return the fixed timeout, in seconds, handed to the test driver.
 */
int Ucommd::getTimeoutSec() const {
  const int timeout_sec = 600;
  return timeout_sec;
}
|
||||
|
||||
int Ucommd::getBw(int ngpus) {
|
||||
return is_multi_node_ ? get_ib_bw() :
|
||||
(ngpus > 1 || local_size_ > 1) ? get_nvlink_bw() : -1;
|
||||
}
|
||||
|
||||
int Ucommd::get_nvlink_bw() {
|
||||
int bw = -1;
|
||||
if (!nvdevs_.empty()) {
|
||||
auto dev_id = std::ifstream(
|
||||
std::string("/sys/bus/pci/drivers/nvidia/") + nvdevs_.at(0) + "/device");
|
||||
if (dev_id.is_open()) {
|
||||
char device[16] = {0};
|
||||
dev_id.getline(device, 16);
|
||||
if (dev_id.good()) {
|
||||
if (std::string("0x2330").compare(device) == 0) {
|
||||
bw = 450 * 3 / 4;
|
||||
} else
|
||||
if (std::string("0x20b0").compare(device) == 0 ||
|
||||
std::string("0x20b2").compare(device) == 0 ||
|
||||
std::string("0x20b3").compare(device) == 0) {
|
||||
bw = 300 * 2 / 3;
|
||||
} else
|
||||
if (std::string("0x20f3").compare(device) == 0 ||
|
||||
std::string("0x20bd").compare(device) == 0) {
|
||||
bw = 200 * 2 / 3;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return bw;
|
||||
}
|
||||
|
||||
int Ucommd::get_ib_bw() {
|
||||
int bw = -1;
|
||||
if (!ibdevs_.empty()) {
|
||||
int rate = 0;
|
||||
auto port_rate = std::ifstream(
|
||||
std::string("/sys/class/infiniband/") + ibdevs_.at(0) + "/ports/1/rate");
|
||||
if (port_rate.is_open()) {
|
||||
char c;
|
||||
while ((c = port_rate.get()) && ('0' <= c && c <= '9')) {
|
||||
rate = rate * 10 + c - '0';
|
||||
}
|
||||
port_rate.close();
|
||||
}
|
||||
bw = rate * 3 / 32;
|
||||
|
||||
// for DP AllReduce only ...
|
||||
auto nnics = ibdevs_.size();
|
||||
if (local_size_ == 2) {
|
||||
bw = nnics > 1 ? bw * 2 : bw;
|
||||
} else
|
||||
if (local_size_ == 4) {
|
||||
bw = nnics > 3 ? bw * 4 : nnics > 1 ? bw * 2 : bw;
|
||||
} else
|
||||
if (local_size_ == 8) {
|
||||
bw *= nnics;
|
||||
const char* mask_env = getenv("NCCL_TESTS_SPLIT_MASK");
|
||||
if (mask_env) {
|
||||
auto mask = std::strtol(mask_env, nullptr, 10);
|
||||
if (mask == 7 || mask == 3 || mask == 1) {
|
||||
bw /= (mask+1);
|
||||
} // else ???
|
||||
}
|
||||
}
|
||||
}
|
||||
return bw;
|
||||
}
|
||||
45
src/ucommd.h
Normal file
45
src/ucommd.h
Normal file
@ -0,0 +1,45 @@
|
||||
/**
|
||||
* Copyright (c) 2024, Scitix Tech PTE. LTD. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef __UCOMMD_H__
|
||||
#define __UCOMMD_H__
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
/**
 * Snapshot of the local runtime environment (Open MPI topology, NVIDIA and
 * InfiniBand devices, node name) plus the benchmark parameters and expected
 * bandwidth derived from it.
 */
class Ucommd {
 public:
  /// Probes the environment and fills the private state below.
  Ucommd();
  ~Ucommd();

  /// Processes per node as reported by Open MPI, or -1 when unknown.
  int getLocalSize() const;

  /// Number of GPUs each process should use.
  int getNGpusPerProc() const;
  /// Timeout, in seconds, for the test run.
  int getTimeoutSec() const;
  /// Message size, in bytes, for the test run.
  size_t getBytes() const;

  /// Expected bandwidth for the current topology; -1 when it is unknown.
  int getBw(int ngpus = -1);

 private:
  // Bandwidth estimators used by getBw().
  int get_nvlink_bw();
  int get_ib_bw();

  // One-shot probes invoked from the constructor.
  void _check_multi_node_via_ompi();
  void _check_sys_nv_devices();
  void _check_sys_ib_devices();
  void _get_node_name();

  int world_size_ = -1;         // total number of processes, -1 if unknown
  int local_size_ = -1;         // processes on this node, -1 if unknown
  int nnodes_ = -1;             // world_size_ / local_size_, -1 if unknown
  bool is_multi_node_ = false;  // true when nnodes_ > 1
  std::string node_name_;       // NODE_NAME or the hostname
  std::vector<std::string> nvdevs_;  // sorted NVIDIA PCI device names
  std::vector<std::string> ibdevs_;  // sorted usable InfiniBand device names
};
|
||||
|
||||
#endif
|
||||
Loading…
Reference in New Issue
Block a user