diff --git a/src/Makefile b/src/Makefile
index b4bb42a..0a994b7 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -106,7 +106,12 @@ ${DST_DIR}/timer.o: timer.cc timer.h
 	@mkdir -p ${DST_DIR}
 	$(CXX) $(CXXFLAGS) -o $@ -c timer.cc
 
-${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS)
+${DST_DIR}/ucommd.o: ucommd.cc ucommd.h # compiled with $(CXX); the %_perf pattern rule lists this object as a prerequisite
+	@printf "Compiling  %-35s > %s\n" $< $@
+	@mkdir -p ${DST_DIR}
+	$(CXX) $(CXXFLAGS) -o $@ -c ucommd.cc
+
+${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o ${DST_DIR}/ucommd.o $(TEST_VERIFIABLE_OBJS)
 	@printf "Linking  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
 	$(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} -Xcompiler \"-Wl,-rpath,/usr/local/sihpc/lib\"
diff --git a/src/common.cu b/src/common.cu
index e1f8a85..c8f323f 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -14,6 +14,10 @@
 
 #include "../verifiable/verifiable.h"
 
+#include "ucommd.h"
+
+static Ucommd ucommd_;
+
 int test_ncclVersion = 0; // init'd with ncclGetVersion()
 
 #if NCCL_MAJOR >= 2
@@ -64,14 +68,18 @@ static int nGpus = 1;
 static size_t minBytes = 32*1024*1024;
 static size_t maxBytes = 32*1024*1024;
 static size_t stepBytes = 1*1024*1024;
-static size_t stepFactor = 1;
+static size_t stepFactor = 2;
 static int datacheck = 1;
 static int warmup_iters = 5;
 static int iters = 20;
 static int agg_iters = 1;
 static int run_cycles = 1;
 static int ncclop = ncclSum;
-static int nccltype = ncclFloat;
+#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+static int nccltype = ncclBfloat16;
+#else
+static int nccltype = ncclHalf;
+#endif
 static int ncclroot = 0;
 static int parallel_init = 0;
 static int blocking_coll = 0;
@@ -709,11 +717,15 @@ int main(int argc, char* argv[]) {
     }
   #endif
 
+  nGpus = ucommd_.getNGpusPerProc();
+  minBytes = maxBytes = ucommd_.getBytes();
+  timeout = ucommd_.getTimeoutSec();
+
   // Parse args
   double parsed;
   int longindex;
   static struct option longopts[] = {
-    {"nthreads", required_argument, 0, 't'},
+  //{"nthreads", required_argument, 0, 't'},
     {"ngpus", required_argument, 0, 'g'},
     {"minbytes", required_argument, 0, 'b'},
     {"maxbytes", required_argument, 0, 'e'},
@@ -741,15 +753,16 @@ int main(int argc, char* argv[]) {
 
   while(1) {
     int c;
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex);
+  //c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex);
+    c = getopt_long(argc, argv, "g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex);
 
     if (c == -1)
       break;
 
     switch(c) {
-      case 't':
-        nThreads = strtol(optarg, NULL, 0);
-        break;
+    //case 't':
+    //  nThreads = strtol(optarg, NULL, 0);
+    //  break;
       case 'g':
         nGpus = strtol(optarg, NULL, 0);
         break;
@@ -846,7 +859,7 @@ int main(int argc, char* argv[]) {
       default:
         if (c != 'h') printf("invalid option '%c'\n", c);
         printf("USAGE: %s \n\t"
-            "[-t,--nthreads <num threads>] \n\t"
+        //  "[-t,--nthreads <num threads>] \n\t"
             "[-g,--ngpus <gpus per thread>] \n\t"
             "[-b,--minbytes <min size in bytes>] \n\t"
             "[-e,--maxbytes <max size in bytes>] \n\t"
@@ -919,8 +932,10 @@ testResult_t run() {
 #endif
   is_main_thread = is_main_proc = (proc == 0) ? 1 : 0;
 
-  PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d agg iters: %d validation: %d graph: %d\n",
-        nThreads, nGpus, minBytes, maxBytes,
+//PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d agg iters: %d validation: %d graph: %d\n",
+//      nThreads, nGpus, minBytes, maxBytes,
+  PRINT("# nGpus(perProc) %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d agg iters: %d validation: %d graph: %d\n",
+        nGpus, minBytes, maxBytes,
         (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes",
         warmup_iters, iters, agg_iters, datacheck, cudaGraphLaunches);
   if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n");
@@ -949,7 +964,9 @@ testResult_t run() {
   // Gather all output in rank order to root (0)
   MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, MPI_COMM_WORLD);
   if (proc == 0) {
-    for (int p = 0; p < totalProcs; p++)
+  //for (int p = 0; p < totalProcs; p++)
+    int stride = ucommd_.getLocalSize() > 0 ? ucommd_.getLocalSize() : 1;
+    for (int p = stride-1; p < totalProcs; p+=stride)
       PRINT("%s", lines+MAX_LINE*p);
     free(lines);
   }
@@ -1123,11 +1140,14 @@ testResult_t run() {
 #endif
 
   envstr = getenv("NCCL_TESTS_MIN_BW");
-  double check_avg_bw = envstr ? atof(envstr) : -1;
+//double check_avg_bw = envstr ? atof(envstr) : -1;
+  double check_avg_bw = envstr ? atof(envstr) :
+      (!strcmp(threads[0].args.collTest->name, "AllReduce") && minBytes == maxBytes && minBytes >= ucommd_.getBytes()) ? ucommd_.getBw(nGpus) : -1;
   bw[0] /= bw_count[0];
 
   PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK");
-  PRINT("# Avg bus bandwidth    : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
+  PRINT("# Avg bus bandwidth    : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw/**(0.9)*/ ? "FAILED" : "OK"));
+  if (bw[0] < check_avg_bw) PRINT("# Expected min bandwidth : %g\n", check_avg_bw);
   PRINT("#\n");
 #ifdef MPI_SUPPORT
   MPI_Comm_free(&mpi_comm);
@@ -1139,7 +1159,7 @@ testResult_t run() {
   // 'cuda-memcheck --leak-check full' requires this
   cudaDeviceReset();
 
-  if (errors[0] || bw[0] < check_avg_bw*(0.9))
+  if (errors[0] || bw[0] < check_avg_bw/**(0.9)*/)
     exit(EXIT_FAILURE);
   else
     exit(EXIT_SUCCESS);
diff --git a/src/common.h b/src/common.h
index e6762e1..5bfcd38 100644
--- a/src/common.h
+++ b/src/common.h
@@ -94,12 +94,6 @@ struct testColl {
   testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
       ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
 };
-extern struct testColl allReduceTest;
-extern struct testColl allGatherTest;
-extern struct testColl reduceScatterTest;
-extern struct testColl broadcastTest;
-extern struct testColl reduceTest;
-extern struct testColl alltoAllTest;
 
 struct testEngine {
   void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks);
@@ -162,7 +156,14 @@ extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, voi
 #include <unistd.h>
 
 static void getHostName(char* hostname, int maxlen) {
-  gethostname(hostname, maxlen);
+  const char* node_name = getenv("NODE_NAME");
+  if (node_name && node_name[0]) {
+    strncpy(hostname, node_name, maxlen);
+  } else {
+    if (-1 == gethostname(hostname, maxlen)) {
+      strncpy(hostname, "unknown", 16);
+    }
+  }
   for (int i=0; i< maxlen; i++) {
     if (hostname[i] == '.') {
       hostname[i] = '\0';
diff --git a/src/ucommd.cc b/src/ucommd.cc
new file mode 100644
index 0000000..d0c1bd6
--- /dev/null
+++ b/src/ucommd.cc
@@ -0,0 +1,250 @@
+/**
+ * Copyright (c) 2024, Scitix Tech PTE. LTD. All rights reserved.
+ */
+
+#include <unistd.h>
+#include <dirent.h>
+#include <string.h>
+
+#include <algorithm>
+#include <vector>
+#include <string>
+#include <fstream>
+
+#include "ucommd.h"
+
+Ucommd::Ucommd() {  // Probe the launch environment and sysfs once, at construction.
+  _get_node_name();  // must run first: _check_sys_ib_devices() prints node_name_ in its diagnostics
+  _check_multi_node_via_ompi();
+  _check_sys_nv_devices();
+  _check_sys_ib_devices();
+}
+
+Ucommd::~Ucommd() {  // Empties both device lists.
+  nvdevs_.clear();  // redundant but harmless: std::vector members release their storage on destruction anyway
+  ibdevs_.clear();
+}
+
+void Ucommd::_check_multi_node_via_ompi() {  // Derive world/local size and node count from Open MPI's environment.
+  const auto world_size_env = std::getenv("OMPI_COMM_WORLD_SIZE");
+  if (world_size_env == nullptr) return;  // not launched via mpirun: keep the -1 defaults
+  world_size_ = static_cast<int>(std::strtol(world_size_env, nullptr, 10));
+  const auto local_size_env = std::getenv("OMPI_COMM_WORLD_LOCAL_SIZE");
+  if (local_size_env == nullptr) return;
+  local_size_ = static_cast<int>(std::strtol(local_size_env, nullptr, 10));
+  if (world_size_ <= 0 || local_size_ <= 0) return;  // strtol gives 0 for empty/non-numeric text; never divide by it
+  // assume homogeneous mpirun
+  nnodes_ = world_size_ / local_size_;
+  is_multi_node_ = (nnodes_ > 1);
+}
+
+void Ucommd::_check_sys_nv_devices() {  // Fill nvdevs_ (sorted) with entries under the nvidia PCI driver whose "class" file reads 0x030200 or 0x030000.
+  DIR* dir;
+  dir = opendir("/sys/bus/pci/drivers/nvidia");
+  if (dir) {  // directory missing or unreadable -> nvdevs_ stays empty
+    struct dirent *entry;
+    while ((entry = readdir(dir))) {
+      if (entry->d_name[0] != '0') continue;  // keep names starting with '0' (presumably PCI addresses like 0000:3b:00.0); NOTE(review): other PCI domains would be skipped
+      const auto nvdev = std::string(entry->d_name);
+      auto dev_class = std::ifstream(
+          std::string("/sys/bus/pci/drivers/nvidia/") + nvdev + "/class");
+      if (dev_class.is_open()) {
+        char dclass[16] = {0};
+        dev_class.getline(dclass, 16);
+        if (dev_class.good() &&  // good() is false when EOF is hit before '\n', so the line must be newline-terminated
+           (std::string("0x030200").compare(dclass) == 0 ||  // presumably PCI class "3D controller" -- confirm
+            std::string("0x030000").compare(dclass) == 0)) {  // presumably PCI class "VGA compatible controller" -- confirm
+          nvdevs_.push_back(nvdev);
+        }
+        dev_class.close();
+      }
+    }
+    closedir(dir);
+    std::sort(nvdevs_.begin(), nvdevs_.end());  // lexicographic; get_nvlink_bw() inspects nvdevs_.at(0)
+  }
+}
+
+void Ucommd::_check_sys_ib_devices() {  // Fill ibdevs_ (sorted) with /sys/class/infiniband entries that pass all four checks below.
+  DIR* dir;
+  dir = opendir("/sys/class/infiniband");
+  if (dir) {  // directory missing or unreadable -> ibdevs_ stays empty
+    struct dirent *entry;
+    while ((entry = readdir(dir))) {
+      if ((strcmp(entry->d_name, ".") == 0) ||
+          (strcmp(entry->d_name, "..") == 0)) {
+        continue;
+      }
+      const auto ibdev = std::string(entry->d_name);
+      if ([&ibdev] {  // check 1: first character of node_type is '1'..'3'; the && chain short-circuits, so later checks (and their printf) run only if earlier ones pass
+          bool is_ib = false;
+          auto node_type = std::ifstream(
+              std::string("/sys/class/infiniband/") + ibdev + "/node_type");
+          if (node_type.is_open()) {
+            char ntype = node_type.get();
+            if (node_type.good()) is_ib = '1' <= ntype && ntype <= '3';
+            node_type.close();
+          }
+          return is_ib;
+        }() &&
+        [&ibdev] {  // check 2: hca_type is one of the listed MT41xx IDs
+          bool is_cx6 = false;  // NOTE(review): five part IDs are accepted under this name -- confirm which HCA generations are intended
+          auto hca_type = std::ifstream(
+              std::string("/sys/class/infiniband/") + ibdev + "/hca_type");
+          if (hca_type.is_open()) {
+            char htype[8] = {0};
+            hca_type.getline(htype, 8);
+            if (hca_type.good()) {
+              is_cx6 = std::string("MT4123").compare(htype) == 0 ||
+                       std::string("MT4125").compare(htype) == 0 ||
+                       std::string("MT4129").compare(htype) == 0 ||
+                       std::string("MT4131").compare(htype) == 0 ||
+                       std::string("MT4124").compare(htype) == 0;
+            }
+            hca_type.close();
+          }
+          return is_cx6;
+        }() &&
+        [&ibdev, this] {  // check 3: ports/1/state starts with '4' (presumably "4: ACTIVE" -- confirm); only port 1 is inspected
+          bool port_active = false;
+          auto port_state = std::ifstream(
+              std::string("/sys/class/infiniband/") + ibdev + "/ports/1/state");
+          if (port_state.is_open()) {
+            char state = port_state.get();
+            if (port_state.good()) {
+              port_active = state == '4';
+            }
+            port_state.close();
+          }
+          if (!port_active) {
+            printf("[%s] %s: port not active or unable to get port state\n",
+                node_name_.c_str(), ibdev.c_str());  // prefix is empty unless _get_node_name() has already run
+          }
+          return port_active;
+        }() &&
+        [&ibdev, this] {  // check 4: ports/1/phys_state starts with '5' (presumably "5: LinkUp" -- confirm)
+          bool link_up = false;
+          auto phys_state = std::ifstream(
+              std::string("/sys/class/infiniband/") + ibdev + "/ports/1/phys_state");
+          if (phys_state.is_open()) {
+            char state = phys_state.get();
+            if (phys_state.good()) {
+              link_up = state == '5';
+            }
+            phys_state.close();
+          }
+          if (!link_up) {
+            printf("[%s] %s: phys link not up or unable to get phys state\n",
+                node_name_.c_str(), ibdev.c_str());  // prefix is empty unless _get_node_name() has already run
+          }
+          return link_up;
+        }()) {
+        ibdevs_.push_back(ibdev);
+      }
+    }
+    closedir(dir);
+    std::sort(ibdevs_.begin(), ibdevs_.end());  // lexicographic; get_ib_bw() reads the rate of ibdevs_.at(0)
+  }
+}
+
+void Ucommd::_get_node_name() {  // node_name_ <- $NODE_NAME if non-empty, else gethostname(), else "unknown"
+  const auto node_name_env = std::getenv("NODE_NAME");
+  if (node_name_env && node_name_env[0]) {
+    node_name_.assign(node_name_env);
+  } else {
+    char hostname[128] = {0};
+    if (!gethostname(hostname, sizeof(hostname) - 1)) {  // leave the zeroed last byte alone: a truncated name may come back unterminated
+      node_name_.assign(hostname);
+    } else {
+      node_name_.assign("unknown");
+    }
+  }
+}
+
+int Ucommd::getLocalSize() const {  // Value parsed from OMPI_COMM_WORLD_LOCAL_SIZE by _check_multi_node_via_ompi().
+  return local_size_;  // stays at its -1 default unless both OMPI_COMM_WORLD_SIZE and OMPI_COMM_WORLD_LOCAL_SIZE are set
+}
+
+int Ucommd::getNGpusPerProc() const {  // A lone single-node process drives every detected GPU; otherwise one GPU per process.
+  return (!is_multi_node_ && local_size_ <= 1) ? static_cast<int>(nvdevs_.size()) : 1;
+}
+
+size_t Ucommd::getBytes() const {  // Default message size: 4 GiB on one node, else a per-rank amount times world_size_.
+  if (!is_multi_node_) return 1UL << 32;
+  const size_t ranks = (size_t)world_size_;
+  if (world_size_ > 1024) return ranks << 24;  // 16 MiB per rank
+  if (local_size_ > 4) return ranks << 25;     // 32 MiB per rank
+  return local_size_ > 1 ? ranks << 26 : ranks << 27;  // 64 MiB or 128 MiB per rank
+}
+
+int Ucommd::getTimeoutSec() const {  // Fixed value; does not depend on any probed state.
+  return 600;  // the name implies seconds (i.e. 10 minutes)
+}
+
+int Ucommd::getBw(int ngpus) {  // Expected bandwidth floor: IB figure across nodes, NVLink figure within a node, -1 if neither applies.
+  if (is_multi_node_) return get_ib_bw();
+  return (ngpus > 1 || local_size_ > 1) ? get_nvlink_bw() : -1;
+}
+
+int Ucommd::get_nvlink_bw() {  // Map the PCI device ID of the first GPU in nvdevs_ to an expected bandwidth; -1 if unknown.
+  int bw = -1;
+  if (!nvdevs_.empty()) {
+    auto dev_id = std::ifstream(
+        std::string("/sys/bus/pci/drivers/nvidia/") + nvdevs_.at(0) + "/device");  // only the first GPU is inspected
+    if (dev_id.is_open()) {
+      char device[16] = {0};
+      dev_id.getline(device, 16);
+      if (dev_id.good()) {  // false when EOF is hit before '\n', so the line must be newline-terminated
+        if (std::string("0x2330").compare(device) == 0) {  // presumably an H100 SXM part -- confirm against the PCI ID database
+          bw = 450 * 3 / 4;  // integer arithmetic: 337
+        } else
+        if (std::string("0x20b0").compare(device) == 0 ||  // presumably A100 SXM variants -- confirm
+            std::string("0x20b2").compare(device) == 0 ||
+            std::string("0x20b3").compare(device) == 0) {
+          bw = 300 * 2 / 3;  // 200
+        } else
+        if (std::string("0x20f3").compare(device) == 0 ||  // presumably A800 SXM variants -- confirm
+            std::string("0x20bd").compare(device) == 0) {
+          bw = 200 * 2 / 3;  // integer arithmetic: 133
+        }
+      }
+    }
+  }
+  return bw;  // any other device ID leaves the -1 default
+}
+
+int Ucommd::get_ib_bw() {  // Expected bandwidth floor from the first IB device's port-1 rate; -1 if it cannot be determined.
+  int bw = -1;
+  if (!ibdevs_.empty()) {
+    int rate = 0;  // leading decimal integer of the rate file
+    auto port_rate = std::ifstream(
+        std::string("/sys/class/infiniband/") + ibdevs_.at(0) + "/ports/1/rate");
+    if (port_rate.is_open()) {
+      char c;
+      while ((c = port_rate.get()) && ('0' <= c && c <= '9')) {
+        rate = rate * 10 + c - '0';
+      }
+      port_rate.close();
+    }
+    // An unreadable rate used to yield bw == 0, a floor every run passes; report "unknown" instead.
+    if (rate <= 0) return -1;
+    bw = rate * 3 / 32;  // == (rate / 8) * 3/4; presumably Gb/s -> GB/s at 75% of line rate
+    auto nnics = ibdevs_.size();  // for DP AllReduce only ...
+    if (local_size_ == 2) {
+      bw = nnics > 1 ? bw * 2 : bw;
+    } else
+    if (local_size_ == 4) {
+      bw = nnics > 3 ? bw * 4 : nnics > 1 ? bw * 2 : bw;
+    } else
+    if (local_size_ == 8) {
+      bw *= nnics;
+      const char* mask_env = getenv("NCCL_TESTS_SPLIT_MASK");
+      if (mask_env) {
+        auto mask = std::strtol(mask_env, nullptr, 16);  // base 16 also accepts the "0x7" spelling; 7/3/1 parse the same as before
+        if (mask == 7 || mask == 3 || mask == 1) {
+          bw /= (mask+1);  // communicator split into mask+1 groups shares the NICs
+        } // else ???
+      }
+    }
+  }
+  return bw;
+}
diff --git a/src/ucommd.h b/src/ucommd.h
new file mode 100644
index 0000000..92dc26a
--- /dev/null
+++ b/src/ucommd.h
@@ -0,0 +1,45 @@
+/**
+ * Copyright (c) 2024, Scitix Tech PTE. LTD. All rights reserved.
+ */
+
+#ifndef __UCOMMD_H__
+#define __UCOMMD_H__
+
+#include <vector>
+#include <string>
+
+class Ucommd {  // Probes the MPI launch environment and sysfs at construction and derives test defaults from them.
+ public:
+  Ucommd();
+  ~Ucommd();
+
+ public:
+  int getLocalSize() const;  // local_size_ as parsed from OMPI_COMM_WORLD_LOCAL_SIZE, or -1
+
+  int getNGpusPerProc() const;  // 1 when multi-node or local_size_ > 1, else the number of detected GPUs
+  int getTimeoutSec() const;  // constant 600
+  size_t getBytes() const;  // default message size in bytes
+
+  int getBw(int ngpus = -1);  // expected bandwidth floor, or -1 when no expectation applies
+
+ private:
+  int get_nvlink_bw();  // looked up from the first GPU's PCI device ID
+  int get_ib_bw();  // derived from the first IB device's port-1 rate
+
+ private:
+  void _check_multi_node_via_ompi();  // sets world_size_, local_size_, nnodes_, is_multi_node_
+  void _check_sys_nv_devices();  // fills nvdevs_
+  void _check_sys_ib_devices();  // fills ibdevs_
+  void _get_node_name();  // sets node_name_
+
+ private:
+  int world_size_ = -1;  // OMPI_COMM_WORLD_SIZE; -1 if unset
+  int local_size_ = -1;  // OMPI_COMM_WORLD_LOCAL_SIZE; -1 if unset
+  int nnodes_ = -1;  // world_size_ / local_size_ (assumes the same rank count on every node)
+  bool is_multi_node_ = false;  // nnodes_ > 1
+  std::string node_name_;  // $NODE_NAME, hostname, or "unknown"
+  std::vector<std::string> nvdevs_;  // sorted entry names from /sys/bus/pci/drivers/nvidia
+  std::vector<std::string> ibdevs_;  // sorted entry names from /sys/class/infiniband
+};
+
+#endif