diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index dd3e20a..8794d12 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -17,12 +17,11 @@ jobs:
       fail-fast: false
       matrix:
         include:
+          - name: cuda12.9-ubuntu22.04
+            dockerfile: docker/Dockerfile.cuda12.9.ubuntu22.04
           - name: cuda13-ubuntu22.04
             dockerfile: docker/Dockerfile.cuda13.x.ubuntu22.04
-          # - name: cuda12-ubuntu20.04
-          #   dockerfile: docker/Dockerfile.cuda12.x.ubuntu20.04
-
 
     steps:
       - name: Checkout source
         uses: actions/checkout@v4
diff --git a/docker/Dockerfile.cuda12.9.ubuntu22.04 b/docker/Dockerfile.cuda12.9.ubuntu22.04
new file mode 100644
index 0000000..1d97fe4
--- /dev/null
+++ b/docker/Dockerfile.cuda12.9.ubuntu22.04
@@ -0,0 +1,153 @@
+###########################
+# Build-time configuration
+###########################
+
+# Base OS and CUDA versions
+ARG UBUNTU_VERSION=22.04
+ARG CUDA_VERSION=12.9.1
+ARG CUDART_VERSION=12.9.79
+ARG CUDART_MAJOR_VERSION=12
+
+# NCCL versions
+ARG NCCL_PACKAGE_VERSION=2.29.2-1+cuda12.9
+ARG NCCL_SO_VERSION=2.29.2
+
+# OpenMPI versions
+# - MPI_VERSION: full OpenMPI version
+# - MPI_SERIES: major.minor series used in download URL
+ARG MPI_VERSION=4.1.8
+ARG MPI_SERIES=4.1
+
+# Build date (override at build time)
+ARG BUILD_DATE=20260128
+
+###########################
+# Build Stage
+###########################
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS build
+
+# Re-declare build args for this stage (values are inherited)
+ARG UBUNTU_VERSION
+ARG CUDA_VERSION
+ARG CUDART_VERSION
+# Needed by the symlink step (7); an ARG declared before FROM is empty in a stage unless re-declared.
+ARG CUDART_MAJOR_VERSION
+ARG NCCL_PACKAGE_VERSION
+ARG NCCL_SO_VERSION
+ARG MPI_VERSION
+ARG MPI_SERIES
+ARG BUILD_DATE
+
+ENV DEBIAN_FRONTEND=noninteractive
+WORKDIR /workspace
+
+# -------------------------
+# 1. Base build dependencies
+# -------------------------
+RUN { apt-get -o Acquire::http::No-Cache=true update > build.log 2>&1 && \
+    apt-get install -y --no-install-recommends \
+        build-essential gcc g++ curl git wget ca-certificates \
+        make automake autoconf libtool pkg-config \
+        python3 python3-pip gzip xz-utils >> build.log 2>&1 && \
+    rm -rf /var/lib/apt/lists/* && rm -f build.log; } || (cat build.log && false)
+
+# -------------------------
+# 2. Install CUDA keyring and restore NVIDIA repository
+# -------------------------
+RUN { wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb > build.log 2>&1 && \
+    dpkg -i cuda-keyring_1.1-1_all.deb >> build.log 2>&1 && \
+    apt-get update >> build.log 2>&1 && rm -f build.log; } || (cat build.log && false)
+
+# -------------------------
+# 3. Install NCCL (pinned version)
+# -------------------------
+RUN { apt-get install -y --no-install-recommends \
+        libnccl2=${NCCL_PACKAGE_VERSION} \
+        libnccl-dev=${NCCL_PACKAGE_VERSION} > build.log 2>&1 && \
+    ldconfig >> build.log 2>&1 && \
+    rm -rf /var/lib/apt/lists/* && rm -f build.log; } || (cat build.log && false)
+
+# -------------------------
+# 4. Build OpenMPI from source
+# -------------------------
+RUN wget https://download.open-mpi.org/release/open-mpi/v${MPI_SERIES}/openmpi-${MPI_VERSION}.tar.gz > /dev/null 2>&1 && \
+    tar zxvf openmpi-${MPI_VERSION}.tar.gz > /dev/null 2>&1 && \
+    cd openmpi-${MPI_VERSION} && \
+    ./configure --prefix=/usr/local/sihpc --with-cuda=/usr/local/cuda > /dev/null 2>&1 && \
+    make -j$(nproc) > /dev/null 2>&1 && make install > /dev/null 2>&1 && \
+    rm -rf /workspace/openmpi-${MPI_VERSION} /workspace/openmpi-${MPI_VERSION}.tar.gz
+
+# -------------------------
+# 5. Build nccl-tests
+# -------------------------
+RUN git clone --depth 1 --single-branch -b sync/upstream-20251216 https://github.com/scitix/nccl-tests.git > /dev/null 2>&1 && \
+    cd nccl-tests && \
+    { make MPI=1 MPI_HOME=/usr/local/sihpc > build.log 2>&1 && rm -f build.log || (cat build.log && false); } && \
+    mkdir -p /usr/local/sihpc/libexec/nccl-tests && \
+    cp -rf build/*_perf /usr/local/sihpc/libexec/nccl-tests/ && \
+    mkdir -p /usr/local/sihpc/bin && \
+    cp scripts/nccl_perf /usr/local/sihpc/bin/nccl_perf && \
+    cp scripts/nccl_test /usr/local/sihpc/libexec/nccl-tests/nccl_test && \
+    cp scripts/env.sh /usr/local/sihpc/env.sh && \
+    cp scripts/install_sihpc /usr/local/sihpc/bin/install_sihpc && \
+    cp scripts/uninstall_sihpc /usr/local/sihpc/bin/uninstall_sihpc && \
+    rm -rf /workspace/nccl-tests
+
+# -------------------------
+# 6. Collect runtime libraries (strict selection)
+# -------------------------
+RUN { set -e && \
+    mkdir -p /usr/local/sihpc/lib > build.log 2>&1 && \
+    cp /usr/local/cuda/lib64/libcudart* /usr/local/sihpc/lib/ >> build.log 2>&1 && \
+    cp /usr/lib/x86_64-linux-gnu/libnccl.so* /usr/local/sihpc/lib/ >> build.log 2>&1 && \
+    rm -f build.log; } || (cat build.log && false)
+#    cp /lib/x86_64-linux-gnu/libltdl.so.7.3.1 /usr/local/sihpc/lib/ && \
+#    cp /usr/lib/x86_64-linux-gnu/libhwloc.so* /usr/local/sihpc/lib/ && \
+#    cp /usr/lib/x86_64-linux-gnu/libevent_core* /usr/local/sihpc/lib/ && \
+#    cp /usr/lib/x86_64-linux-gnu/libevent_pthreads* /usr/local/sihpc/lib/
+
+# -------------------------
+# 7. Fix library symlinks
+# -------------------------
+RUN cd /usr/local/sihpc/lib && \
+    rm -f libcudart.so libcudart.so.${CUDART_MAJOR_VERSION} && \
+    ln -sf libnccl.so.${NCCL_SO_VERSION} libnccl.so.2 && \
+    ln -sf libnccl.so.2 libnccl.so && \
+    ln -sf libcudart.so.${CUDART_VERSION} libcudart.so.${CUDART_MAJOR_VERSION} && \
+    ln -sf libcudart.so.${CUDART_MAJOR_VERSION} libcudart.so
+#    rm -f libevent_core-2.1.so.7 && \
+#    ln -sf libhwloc.so.15.1.0 libhwloc.so.15 && \
+#    ln -sf libhwloc.so.15.1.0 libhwloc.so && \
+#    ln -sf libevent_core-2.1.so.7.0.0 libevent_core-2.1.so.7 && \
+#    ln -sf libevent_core-2.1.so.7 libevent_core-2.1.so && \
+#    ln -sf libevent_pthreads-2.1.so.7.0.0 libevent_pthreads-2.1.so.7 && \
+#    ln -sf libevent_pthreads-2.1.so.7 libevent_pthreads-2.1.so && \
+#    ln -sf libltdl.so.7.3.1 libltdl.so.7 && \
+#    ln -sf libltdl.so.7 libltdl.so
+
+###########################
+# Package Stage
+###########################
+FROM ubuntu:20.04 AS package
+
+# Re-declare args for this stage (values are inherited)
+ARG UBUNTU_VERSION
+ARG NCCL_PACKAGE_VERSION
+ARG MPI_VERSION
+ARG BUILD_DATE
+
+# Expose versions/date as environment variables for runtime shell expansion
+ENV NCCL_PACKAGE_VERSION=${NCCL_PACKAGE_VERSION} \
+    MPI_VERSION=${MPI_VERSION} \
+    BUILD_DATE=${BUILD_DATE}
+
+COPY --from=build /usr/local/sihpc /usr/local/sihpc
+
+WORKDIR /
+RUN apt-get update && apt-get install -y --no-install-recommends makeself && \
+    SAFE_NCCL_PKG=$(printf '%s\n' "${NCCL_PACKAGE_VERSION}" | tr '+' '-') && \
+    PACKAGE_FILENAME="sicl-nccl${SAFE_NCCL_PKG}-ompi${MPI_VERSION}-ubuntu${UBUNTU_VERSION}-${BUILD_DATE}.run" && \
+    { makeself --gzip /usr/local/sihpc \
+        "${PACKAGE_FILENAME}" \
+        "SiHPC MPI + NCCL + NCCL-tests Portable Installer" \
+        ./bin/install_sihpc > build.log 2>&1 && rm -f build.log; } || (cat build.log && false)
\ No newline at end of file
diff --git a/docker/Dockerfile.cuda13.x.ubuntu22.04 b/docker/Dockerfile.cuda13.x.ubuntu22.04
index 82cb17d..80d6983 100644
--- a/docker/Dockerfile.cuda13.x.ubuntu22.04
+++ b/docker/Dockerfile.cuda13.x.ubuntu22.04
@@ -9,8 +9,8 @@ ARG CUDART_VERSION=13.1.80
 ARG CUDART_MAJOR_VERSION=13
 
 # NCCL versions
-ARG NCCL_PACKAGE_VERSION=2.28.9-1+cuda13.0
-ARG NCCL_SO_VERSION=2.28.9
+ARG NCCL_PACKAGE_VERSION=2.29.2-1+cuda12.9
+ARG NCCL_SO_VERSION=2.29.2
 
 # OpenMPI versions
 # - MPI_VERSION: full OpenMPI version
@@ -19,7 +19,7 @@ ARG MPI_VERSION=4.1.8
 ARG MPI_SERIES=4.1
 
 # Build date (override at build time)
-ARG BUILD_DATE=20251221
+ARG BUILD_DATE=20260128
 
 ###########################
 # Build Stage
diff --git a/scripts/nccl_test b/scripts/nccl_test
index 61e9019..62c793d 100755
--- a/scripts/nccl_test
+++ b/scripts/nccl_test
@@ -26,12 +26,12 @@ export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
 export OMPI_MCA_btl=self,tcp
 export OMPI_MCA_pml=^ucx
 
-if [ $OMPI_COMM_WORLD_SIZE -gt $OMPI_COMM_WORLD_LOCAL_SIZE ]; then
-    if [ $OMPI_COMM_WORLD_RANK -eq 0 ]; then
+if [ "${OMPI_COMM_WORLD_SIZE:-0}" -gt "${OMPI_COMM_WORLD_LOCAL_SIZE:-0}" ]; then
+    if [ "${OMPI_COMM_WORLD_RANK:-0}" -eq 0 ]; then
         export NCCL_DEBUG=${NCCL_DEBUG:-"INFO"}
     fi
 fi
-if [ $OMPI_COMM_WORLD_RANK -eq 0 ]; then
-    echo "[$(hostname)] running nccl test $COLL$OPTIONS, world_size=$OMPI_COMM_WORLD_SIZE"
+if [ "${OMPI_COMM_WORLD_RANK:-0}" -eq 0 ]; then
+    echo "[$(hostname)] running nccl test $COLL$OPTIONS, world_size=${OMPI_COMM_WORLD_SIZE:-0}"
 fi
 $TEST_DIR/${COLL}_perf -f2$OPTIONS