tweak build and test scripts for SiCL.

* add '/usr/local/sihpc/lib' to rpath.

* print 'NVCC_GENCODE' in Makefile, and by default (with CUDA 12 or
  newer) generate binaries for Volta, Ampere, Ada, and Hopper.

* add test run wrapper scripts "nccl_perf" and "nccl_test".
This commit is contained in:
Zhongqi An 2024-11-14 11:30:42 +08:00
parent 8dfeab9eb9
commit fd83f7ca84
3 changed files with 73 additions and 2 deletions

27
scripts/nccl_perf Executable file
View File

@ -0,0 +1,27 @@
#!/usr/bin/env bash
# nccl_perf: launch one of the nccl-tests "<collective>_perf" binaries.
#
# Usage: nccl_perf [-l<Collective>] [options forwarded to the binary...]
#   -l<Collective>  select the binary to run (default: all_reduce), e.g.
#                   -lBroadcast, -lallgather, -lReduceScatter.
#   Every other argument is forwarded unchanged to the selected binary.
#
# The binaries are looked up in ../libexec/nccl-tests relative to this script.

# Quote every path component so an install location containing spaces works,
# and bail out early if the script directory cannot be resolved.
BASE_DIR=$(cd -- "$(dirname -- "$0")" && pwd) || exit 1
TEST_DIR=$BASE_DIR/../libexec/nccl-tests
COLL=all_reduce

# Forwarded options are kept in an array so each argument keeps its exact
# boundaries (no re-splitting or glob expansion). Starting from an empty
# array also stops an unrelated OPTIONS environment variable leaking in.
OPTIONS=()

# Loop on the argument count, not on "$1" being non-empty: an empty-string
# argument must not terminate parsing and drop the remaining arguments.
while [[ $# -gt 0 ]]; do
  case $1 in
    -l*)
      # ${1:2} is the collective name glued to the "-l" prefix.
      case ${1:2} in
        AllReduce|Allreduce|allreduce) COLL=all_reduce;;
        Broadcast|broadcast|Bcast|bcast) COLL=broadcast;;
        Reduce|reduce) COLL=reduce;;
        Gather|gather) COLL=gather;;
        Scatter|scatter) COLL=scatter;;
        AllToAll|alltoall) COLL=alltoall;;
        AllGather|allgather) COLL=all_gather;;
        ReduceScatter|reducescatter) COLL=reduce_scatter;;
        HyperCube|Hypercube|hypercube) COLL=hypercube;;
        SendRecv|sendrecv) COLL=sendrecv;;
        # Keep the current selection (as before) but no longer silently.
        *) echo "$(basename -- "$0"): unknown collective '${1:2}', using '$COLL'" >&2;;
      esac;;
    *) OPTIONS+=("$1");;
  esac
  shift 1
done

# Open MPI settings: permit running as root and select the self,tcp BTLs
# while excluding the UCX PML.
export OMPI_ALLOW_RUN_AS_ROOT=1
export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
export OMPI_MCA_btl=self,tcp
export OMPI_MCA_pml=^ucx

# exec replaces this shell, so the binary's exit status and any signals sent
# by the launcher reach it directly.
exec "$TEST_DIR/${COLL}_perf" "${OPTIONS[@]}"

36
scripts/nccl_test Executable file
View File

@ -0,0 +1,36 @@
#!/usr/bin/env bash
# nccl_test: run one of the "<collective>_perf" binaries located next to this
# script, always passing "-f2" ahead of the forwarded options.
#
# Usage: nccl_test [-l<Collective>] [options forwarded to the binary...]
#   -l<Collective>  select the binary to run (default: all_reduce), e.g.
#                   -lBroadcast, -lallgather, -lReduceScatter.
#   Every other argument is forwarded unchanged to the selected binary.
#
# Reads OMPI_COMM_WORLD_SIZE, OMPI_COMM_WORLD_LOCAL_SIZE and
# OMPI_COMM_WORLD_RANK from the environment when they are set.

# Quote every path component so an install location containing spaces works,
# and bail out early if the script directory cannot be resolved.
TEST_DIR=$(cd -- "$(dirname -- "$0")" && pwd) || exit 1
COLL=all_reduce

# Forwarded options are kept in an array so each argument keeps its exact
# boundaries (no re-splitting or glob expansion). Starting from an empty
# array also stops an unrelated OPTIONS environment variable leaking in.
OPTIONS=()

# Loop on the argument count, not on "$1" being non-empty: an empty-string
# argument must not terminate parsing and drop the remaining arguments.
while [[ $# -gt 0 ]]; do
  case $1 in
    -l*)
      # ${1:2} is the collective name glued to the "-l" prefix.
      case ${1:2} in
        AllReduce|Allreduce|allreduce) COLL=all_reduce;;
        Broadcast|broadcast|Bcast|bcast) COLL=broadcast;;
        Reduce|reduce) COLL=reduce;;
        Gather|gather) COLL=gather;;
        Scatter|scatter) COLL=scatter;;
        AllToAll|alltoall) COLL=alltoall;;
        AllGather|allgather) COLL=all_gather;;
        ReduceScatter|reducescatter) COLL=reduce_scatter;;
        HyperCube|Hypercube|hypercube) COLL=hypercube;;
        SendRecv|sendrecv) COLL=sendrecv;;
        # Keep the current selection (as before) but no longer silently.
        *) echo "$(basename -- "$0"): unknown collective '${1:2}', using '$COLL'" >&2;;
      esac;;
    *) OPTIONS+=("$1");;
  esac
  shift 1
done

# Open MPI settings: permit running as root and select the self,tcp BTLs
# while excluding the UCX PML.
export OMPI_ALLOW_RUN_AS_ROOT=1
export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
export OMPI_MCA_btl=self,tcp
export OMPI_MCA_pml=^ucx

# Fall back to a single-process layout when the Open MPI variables are not
# set, so the numeric tests below never see an empty operand (which made
# `[` print "unary operator expected").
world_size=${OMPI_COMM_WORLD_SIZE:-1}
local_size=${OMPI_COMM_WORLD_LOCAL_SIZE:-$world_size}
rank=${OMPI_COMM_WORLD_RANK:-0}

# When the world is larger than the local size, rank 0 defaults NCCL_DEBUG
# to INFO unless the caller has already chosen a level.
if [ "$world_size" -gt "$local_size" ] && [ "$rank" -eq 0 ]; then
  export NCCL_DEBUG=${NCCL_DEBUG:-"INFO"}
fi

# Only rank 0 announces the run, to avoid one banner per process.
if [ "$rank" -eq 0 ]; then
  echo "[$(hostname)] running nccl test $COLL${OPTIONS[*]:+ ${OPTIONS[*]}}, world_size=$world_size"
fi

# exec replaces this shell, so the binary's exit status and any signals sent
# by the launcher reach it directly.
exec "$TEST_DIR/${COLL}_perf" -f2 "${OPTIONS[@]}"

View File

@ -19,7 +19,14 @@ CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
# Better define NVCC_GENCODE in your environment to the minimal set
# of archs to reduce compile time.
ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 12; echo $$?),0)
NVCC_GENCODE ?= -gencode=arch=compute_70,code=sm_70 \
-gencode=arch=compute_80,code=sm_80 \
-gencode=arch=compute_86,code=sm_86 \
-gencode=arch=compute_89,code=sm_89 \
-gencode=arch=compute_90,code=sm_90 \
-gencode=arch=compute_90,code=compute_90
else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_61,code=sm_61 \
-gencode=arch=compute_70,code=sm_70 \
@ -33,6 +40,7 @@ NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
-gencode=arch=compute_70,code=sm_70 \
-gencode=arch=compute_70,code=compute_70
endif
$(info NVCC_GENCODE is ${NVCC_GENCODE})
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11
CXXFLAGS := -std=c++11
@ -101,5 +109,5 @@ ${DST_DIR}/timer.o: timer.cc timer.h
${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS)
@printf "Linking %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
$(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS}
$(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} -Xcompiler \"-Wl,-rpath,/usr/local/sihpc/lib\"