diff --git a/scripts/nccl_perf b/scripts/nccl_perf
new file mode 100755
index 0000000..b2193f3
--- /dev/null
+++ b/scripts/nccl_perf
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+# Run one nccl-tests benchmark binary from ../libexec/nccl-tests.
+#
+# Usage: nccl_perf [-l<collective>] [benchmark options...]
+#   -l<collective>  selects the binary, e.g. -lbroadcast or -lAllGather;
+#                   a name matching no arm below keeps the default all_reduce.
+#   Every other argument is forwarded unchanged to <collective>_perf.
+BASE_DIR=$(cd "$(dirname "$0")" && pwd) || exit 1
+TEST_DIR=$BASE_DIR/../libexec/nccl-tests
+COLL=all_reduce
+# Forwarded arguments are kept in an array so each one survives as a single
+# word; starting empty keeps an inherited OPTIONS variable off the command line.
+OPTIONS=()
+# Loop on the argument count so an empty argument does not end parsing early.
+while (( $# > 0 )); do
+  case $1 in
+    -l*)
+      case ${1:2} in
+        Broadcast|broadcast|Bcast|bcast) COLL=broadcast;;
+        Reduce|reduce) COLL=reduce;;
+        Gather|gather) COLL=gather;;
+        Scatter|scatter) COLL=scatter;;
+        AllToAll|alltoall) COLL=alltoall;;
+        AllGather|allgather) COLL=all_gather;;
+        ReduceScatter|reducescatter) COLL=reduce_scatter;;
+        HyperCube|Hypercube|hypercube) COLL=hypercube;;
+        SendRecv|sendrecv) COLL=sendrecv;;
+      esac;;
+    *) OPTIONS+=("$1");;
+  esac
+  shift 1
+done
+# Open MPI environment: permit running as root, use the self and tcp BTLs, and
+# exclude the ucx PML.
+export OMPI_ALLOW_RUN_AS_ROOT=1
+export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
+export OMPI_MCA_btl=self,tcp
+export OMPI_MCA_pml=^ucx
+"$TEST_DIR/${COLL}_perf" "${OPTIONS[@]}"
diff --git a/scripts/nccl_test b/scripts/nccl_test
new file mode 100755
index 0000000..0738c57
--- /dev/null
+++ b/scripts/nccl_test
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+# Per-process wrapper around a nccl-tests binary located next to this script.
+# Reads OMPI_COMM_WORLD_RANK/SIZE/LOCAL_SIZE (set by Open MPI's launcher); when
+# they are unset the script acts as rank 0 of a one-process job.
+#
+# Usage: nccl_test [-l<collective>] [benchmark options...]
+#   -l<collective>  selects the binary, e.g. -lbroadcast or -lAllGather;
+#                   a name matching no arm below keeps the default all_reduce.
+#   Every other argument is forwarded to <collective>_perf after "-f2".
+TEST_DIR=$(cd "$(dirname "$0")" && pwd) || exit 1
+COLL=all_reduce
+# Forwarded arguments are kept in an array so each one survives as a single
+# word; starting empty keeps an inherited OPTIONS variable off the command line.
+OPTIONS=()
+# Loop on the argument count so an empty argument does not end parsing early.
+while (( $# > 0 )); do
+  case $1 in
+    -l*)
+      case ${1:2} in
+        Broadcast|broadcast|Bcast|bcast) COLL=broadcast;;
+        Reduce|reduce) COLL=reduce;;
+        Gather|gather) COLL=gather;;
+        Scatter|scatter) COLL=scatter;;
+        AllToAll|alltoall) COLL=alltoall;;
+        AllGather|allgather) COLL=all_gather;;
+        ReduceScatter|reducescatter) COLL=reduce_scatter;;
+        HyperCube|Hypercube|hypercube) COLL=hypercube;;
+        SendRecv|sendrecv) COLL=sendrecv;;
+      esac;;
+    *) OPTIONS+=("$1");;
+  esac
+  shift 1
+done
+
+# Open MPI environment: permit running as root, use the self and tcp BTLs, and
+# exclude the ucx PML.
+export OMPI_ALLOW_RUN_AS_ROOT=1
+export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
+export OMPI_MCA_btl=self,tcp
+export OMPI_MCA_pml=^ucx
+
+# Default the rank/size variables so the numeric tests below do not fail with
+# "unary operator expected" when the script is started without mpirun.
+rank=${OMPI_COMM_WORLD_RANK:-0}
+world_size=${OMPI_COMM_WORLD_SIZE:-1}
+local_size=${OMPI_COMM_WORLD_LOCAL_SIZE:-$world_size}
+
+# Rank 0 defaults NCCL_DEBUG to INFO (unless already set) when the world is
+# larger than the local size - presumably a multi-node run.
+if (( rank == 0 && world_size > local_size )); then
+  export NCCL_DEBUG=${NCCL_DEBUG:-INFO}
+fi
+if (( rank == 0 )); then
+  opts="${OPTIONS[*]}"
+  echo "[$(hostname)] running nccl test $COLL${opts:+ $opts}, world_size=$world_size"
+fi
+"$TEST_DIR/${COLL}_perf" -f2 "${OPTIONS[@]}"
diff --git a/src/Makefile b/src/Makefile
index 393de8e..b4bb42a 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -19,7 +19,14 @@ CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
 
 # Better define NVCC_GENCODE in your environment to the minimal set
 # of archs to reduce compile time.
-ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
+ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 12; echo $$?),0)
+NVCC_GENCODE ?= -gencode=arch=compute_70,code=sm_70 \
+                -gencode=arch=compute_80,code=sm_80 \
+                -gencode=arch=compute_86,code=sm_86 \
+                -gencode=arch=compute_89,code=sm_89 \
+                -gencode=arch=compute_90,code=sm_90 \
+                -gencode=arch=compute_90,code=compute_90
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
 NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
                 -gencode=arch=compute_61,code=sm_61 \
                 -gencode=arch=compute_70,code=sm_70 \
@@ -33,6 +40,7 @@ NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
                 -gencode=arch=compute_70,code=sm_70 \
                 -gencode=arch=compute_70,code=compute_70
 endif
+$(info NVCC_GENCODE is ${NVCC_GENCODE})
 
 NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11
 CXXFLAGS   := -std=c++11
@@ -101,5 +109,5 @@ ${DST_DIR}/timer.o: timer.cc timer.h
 ${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS)
 	@printf "Linking %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
-	$(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS}
+	$(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} -Xcompiler \"-Wl,-rpath,/usr/local/sihpc/lib\"
 