From cc230e61e1d2709252d314097c775f0a4e159ce2 Mon Sep 17 00:00:00 2001 From: xlliu Date: Sun, 21 Dec 2025 14:13:42 +0800 Subject: [PATCH] add github workflow --- .github/workflows/pre_check.yml | 29 ++++ .github/workflows/release.yml | 52 ++++++ docker/Dockerfile.cuda12.x.ubuntu20.04 | 153 ++++++++++++++++++ .../Dockerfile.cuda13.x.ubuntu22.04 | 38 +---- 4 files changed, 242 insertions(+), 30 deletions(-) create mode 100644 .github/workflows/pre_check.yml create mode 100644 .github/workflows/release.yml create mode 100644 docker/Dockerfile.cuda12.x.ubuntu20.04 rename Dockerfile => docker/Dockerfile.cuda13.x.ubuntu22.04 (83%) diff --git a/.github/workflows/pre_check.yml b/.github/workflows/pre_check.yml new file mode 100644 index 0000000..67bd7b4 --- /dev/null +++ b/.github/workflows/pre_check.yml @@ -0,0 +1,29 @@ +on: + pull_request: + workflow_dispatch: + +jobs: + build-only: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - uses: docker/setup-buildx-action@v3 + - name: Free disk space + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /usr/local/lib/android + sudo rm -rf /opt/ghc + sudo rm -rf /opt/hostedtoolcache + sudo docker system prune -af || true + df -h + - name: Build run package + run: | + docker buildx build \ + -f docker/Dockerfile.cuda13.x.ubuntu22.04 \ + --platform linux/amd64 \ + --target package \ + --output type=local,dest=dist/cuda13.x.ubuntu22.04 \ + . 
+ - name: List artifacts + run: | + ls -lh dist/cuda13.x.ubuntu22.04 \ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..4cf8622 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,52 @@ +name: release + +on: + push: + tags: + - "v*" + +permissions: + contents: write # permit uploading Release assets + +jobs: + build-and-release: + name: build-run (${{ matrix.name }}) + runs-on: ubuntu-22.04 + + strategy: + fail-fast: false + matrix: + include: + - name: cuda13-ubuntu22.04 + dockerfile: docker/Dockerfile.cuda13.x.ubuntu22.04 + + # - name: cuda12-ubuntu20.04 + # dockerfile: docker/Dockerfile.cuda12.x.ubuntu20.04 + + steps: + - name: Checkout source + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build run package + run: | + BUILD_DATE=$(date +%Y%m%d) + + docker buildx build \ + --platform linux/amd64 \ + --progress=plain \ + -f ${{ matrix.dockerfile }} \ + --build-arg BUILD_DATE=${BUILD_DATE} \ + --output type=local,dest=dist/${{ matrix.name }} \ + . 
+ + echo "Produced files:" + ls -lh dist/${{ matrix.name }} + + - name: Upload run to GitHub Release + uses: softprops/action-gh-release@v2 + with: + files: | + dist/${{ matrix.name }}/*.run diff --git a/docker/Dockerfile.cuda12.x.ubuntu20.04 b/docker/Dockerfile.cuda12.x.ubuntu20.04 new file mode 100644 index 0000000..efe1a62 --- /dev/null +++ b/docker/Dockerfile.cuda12.x.ubuntu20.04 @@ -0,0 +1,153 @@ +########################### +# Build-time configuration +########################### + +# Base OS and CUDA versions (UBUNTU_VERSION also selects the CUDA base image tag) +ARG UBUNTU_VERSION=20.04 +ARG CUDA_VERSION=12.8.1 +ARG CUDART_VERSION=12.8.90 +ARG CUDART_MAJOR_VERSION=12 + +# NCCL versions +ARG NCCL_PACKAGE_VERSION=2.27.7-1+cuda12.4 +ARG NCCL_SO_VERSION=2.27.7 + +# OpenMPI versions +# - MPI_VERSION: full OpenMPI version +# - MPI_SERIES: major.minor series used in download URL +ARG MPI_VERSION=4.1.8 +ARG MPI_SERIES=4.1 + +# Build date (override at build time) +ARG BUILD_DATE=20251221 + +########################### +# Build Stage +########################### +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS build + +# Re-declare build args for this stage (values are inherited) +ARG UBUNTU_VERSION +ARG CUDA_VERSION +ARG CUDART_VERSION +ARG NCCL_PACKAGE_VERSION +ARG NCCL_SO_VERSION +ARG MPI_VERSION +ARG MPI_SERIES +ARG BUILD_DATE + +ENV DEBIAN_FRONTEND=noninteractive +WORKDIR /workspace + +# ------------------------- +# 1. Base build dependencies +# ------------------------- +RUN mv /etc/apt/sources.list.d/cuda*.list /tmp/disabled-cuda.list || true && \ + apt-get -o Acquire::http::No-Cache=true update && \ + apt-get install -y --no-install-recommends \ + build-essential gcc g++ curl git wget ca-certificates \ + make automake autoconf libtool pkg-config \ + python3 python3-pip gzip xz-utils && \ + rm -rf /var/lib/apt/lists/* + +# ------------------------- +# 2. 
Install CUDA keyring and restore NVIDIA repository +# ------------------------- +RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb && \ + dpkg -i cuda-keyring_1.1-1_all.deb && \ + rm -f cuda-keyring_1.1-1_all.deb + +# ------------------------- +# 3. Install NCCL (pinned version; apt lists are refreshed in this same layer) +# ------------------------- +RUN apt-get update && (apt-mark unhold libnccl2 libnccl-dev || true) && \ + apt-get install -y --no-install-recommends \ + libnccl2=${NCCL_PACKAGE_VERSION} \ + libnccl-dev=${NCCL_PACKAGE_VERSION} && \ + apt-mark hold libnccl2 libnccl-dev && \ + ldconfig && \ + rm -rf /var/lib/apt/lists/* + +# ------------------------- +# 4. Build OpenMPI from source +# ------------------------- +RUN wget https://download.open-mpi.org/release/open-mpi/v${MPI_SERIES}/openmpi-${MPI_VERSION}.tar.gz && \ + tar zxvf openmpi-${MPI_VERSION}.tar.gz && \ + cd openmpi-${MPI_VERSION} && \ + ./configure --prefix=/usr/local/sihpc --with-cuda=/usr/local/cuda && \ + make -j$(nproc) && make install && \ + rm -rf /workspace/openmpi-${MPI_VERSION} /workspace/openmpi-${MPI_VERSION}.tar.gz + +# ------------------------- +# 5. Build nccl-tests +# ------------------------- +RUN git clone https://github.com/scitix/nccl-tests.git -b sicl && \ + cd nccl-tests && \ + make MPI=1 MPI_HOME=/usr/local/sihpc && \ + mkdir -p /usr/local/sihpc/libexec/nccl-tests && \ + cp -rf build/*_perf /usr/local/sihpc/libexec/nccl-tests/ && \ + cp scripts/nccl_perf /usr/local/sihpc/bin/nccl_perf && \ + cp scripts/nccl_test /usr/local/sihpc/libexec/nccl-tests/nccl_test && \ + cp scripts/env.sh /usr/local/sihpc/env.sh && \ + cp scripts/install_sihpc /usr/local/sihpc/bin/install_sihpc && \ + cp scripts/uninstall_sihpc /usr/local/sihpc/bin/uninstall_sihpc && \ + rm -rf /workspace/nccl-tests + +# ------------------------- +# 6. 
Collect runtime libraries (strict selection) +# ------------------------- +RUN set -eux && \ + mkdir -p /usr/local/sihpc/lib && \ + cp /usr/local/cuda/lib64/libcudart* /usr/local/sihpc/lib/ && \ + cp /usr/lib/x86_64-linux-gnu/libnccl.so* /usr/local/sihpc/lib/ + # cp /lib/x86_64-linux-gnu/libltdl.so.7.3.1 /usr/local/sihpc/lib/ && \ + # cp /usr/lib/x86_64-linux-gnu/libhwloc.so* /usr/local/sihpc/lib/ && \ + # cp /usr/lib/x86_64-linux-gnu/libevent_core* /usr/local/sihpc/lib/ && \ + # cp /usr/lib/x86_64-linux-gnu/libevent_pthreads* /usr/local/sihpc/lib/ + +# ------------------------- +# 7. Fix library symlinks (targets follow the NCCL_SO_VERSION / CUDART_VERSION build args) +# ------------------------- +RUN cd /usr/local/sihpc/lib && \ + rm -f libcudart.so libcudart.so.12 && \ + ln -sf libnccl.so.${NCCL_SO_VERSION} libnccl.so.2 && \ + ln -sf libnccl.so.2 libnccl.so && \ + ln -sf libcudart.so.${CUDART_VERSION} libcudart.so.12 && \ + ln -sf libcudart.so.12 libcudart.so + # rm -f libevent_core-2.1.so.7 && \ + # ln -sf libhwloc.so.15.1.0 libhwloc.so.15 && \ + # ln -sf libhwloc.so.15.1.0 libhwloc.so && \ + # ln -sf libevent_core-2.1.so.7.0.0 libevent_core-2.1.so.7 && \ + # ln -sf libevent_core-2.1.so.7 libevent_core-2.1.so && \ + # ln -sf libevent_pthreads-2.1.so.7.0.0 libevent_pthreads-2.1.so.7 && \ + # ln -sf libevent_pthreads-2.1.so.7 libevent_pthreads-2.1.so && \ + # ln -sf libltdl.so.7.3.1 libltdl.so.7 && \ + # ln -sf libltdl.so.7 libltdl.so + +########################### +# Package Stage +########################### +FROM ubuntu:${UBUNTU_VERSION} AS package + +# Re-declare args for this stage (values are inherited) +ARG UBUNTU_VERSION +ARG NCCL_PACKAGE_VERSION +ARG MPI_VERSION +ARG BUILD_DATE + +# Expose versions/date as environment variables for runtime shell expansion +ENV NCCL_PACKAGE_VERSION=${NCCL_PACKAGE_VERSION} \ + MPI_VERSION=${MPI_VERSION} \ + BUILD_DATE=${BUILD_DATE} + +COPY --from=build /usr/local/sihpc /usr/local/sihpc + +WORKDIR / +RUN apt-get update && apt-get install -y --no-install-recommends makeself && \ + chmod +x 
/usr/local/sihpc/bin/install_sihpc && \ + SAFE_NCCL_PKG=$(printf '%s\n' "${NCCL_PACKAGE_VERSION}" | tr '+' '-') && \ + PACKAGE_FILENAME="sicl-nccl${SAFE_NCCL_PKG}-ompi${MPI_VERSION}-ubuntu${UBUNTU_VERSION}-${BUILD_DATE}.run" && \ + makeself --gzip /usr/local/sihpc \ + "${PACKAGE_FILENAME}" \ + "SiHPC MPI + NCCL + NCCL-tests Portable Installer" \ + ./bin/install_sihpc diff --git a/Dockerfile b/docker/Dockerfile.cuda13.x.ubuntu22.04 similarity index 83% rename from Dockerfile rename to docker/Dockerfile.cuda13.x.ubuntu22.04 index f71d72d..1ca8a1c 100644 --- a/Dockerfile +++ b/docker/Dockerfile.cuda13.x.ubuntu22.04 @@ -77,8 +77,7 @@ RUN wget https://download.open-mpi.org/release/open-mpi/v${MPI_SERIES}/openmpi-$ # ------------------------- # 5. Build nccl-tests # ------------------------- -RUN cd /tmp && \ - git clone https://github.com/scitix/nccl-tests.git -b sync/upstream-20251216 && \ +RUN git clone https://github.com/scitix/nccl-tests.git -b sync/upstream-20251216 && \ cd nccl-tests && \ make MPI=1 MPI_HOME=/usr/local/sihpc && \ mkdir -p /usr/local/sihpc/libexec/nccl-tests && \ @@ -97,10 +96,10 @@ RUN set -eux && \ mkdir -p /usr/local/sihpc/lib && \ cp /usr/local/cuda/lib64/libcudart* /usr/local/sihpc/lib/ && \ cp /usr/lib/x86_64-linux-gnu/libnccl.so* /usr/local/sihpc/lib/ -# cp /lib/x86_64-linux-gnu/libltdl.so.7.3.1 /usr/local/sihpc/lib/; \ -# cp /usr/lib/x86_64-linux-gnu/libhwloc.so* /usr/local/sihpc/lib/; \ -# cp /usr/lib/x86_64-linux-gnu/libevent_core* /usr/local/sihpc/lib/; \ -# cp /usr/lib/x86_64-linux-gnu/libevent_pthreads* /usr/local/sihpc/lib/; \ +# cp /lib/x86_64-linux-gnu/libltdl.so.7.3.1 /usr/local/sihpc/lib/ && \ +# cp /usr/lib/x86_64-linux-gnu/libhwloc.so* /usr/local/sihpc/lib/ && \ +# cp /usr/lib/x86_64-linux-gnu/libevent_core* /usr/local/sihpc/lib/ && \ +# cp /usr/lib/x86_64-linux-gnu/libevent_pthreads* /usr/local/sihpc/lib/ # ------------------------- # 7. 
Fix library symlinks @@ -139,18 +138,13 @@ ENV NCCL_PACKAGE_VERSION=${NCCL_PACKAGE_VERSION} \ COPY --from=build /usr/local/sihpc /usr/local/sihpc -WORKDIR /dist +# Build the installer in / of this stage; CI selects the stage with --target package. +WORKDIR / RUN apt-get update && apt-get install -y --no-install-recommends makeself && \ chmod +x /usr/local/sihpc/bin/install_sihpc && \ - SAFE_NCCL_PKG="${NCCL_PACKAGE_VERSION//+/-}" && \ + SAFE_NCCL_PKG=$(printf '%s\n' "${NCCL_PACKAGE_VERSION}" | tr '+' '-') && \ PACKAGE_FILENAME="sicl-nccl${SAFE_NCCL_PKG}-ompi${MPI_VERSION}-ubuntu${UBUNTU_VERSION}-${BUILD_DATE}.run" && \ makeself --gzip /usr/local/sihpc \ "${PACKAGE_FILENAME}" \ "SiHPC MPI + NCCL + NCCL-tests Portable Installer" \ - ./bin/install_sihpc && \ - rm -rf /usr/local/sihpc && \ - apt-get autoremove -y && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -CMD ["bash", "-c", "SAFE_NCCL_PKG=${NCCL_PACKAGE_VERSION//+/-}; FILE=\"sicl-nccl${SAFE_NCCL_PKG}-ompi${MPI_VERSION}-ubuntu${UBUNTU_VERSION}-${BUILD_DATE}.run\"; ls -lh \"/build/$FILE\" && echo 'Build complete.'"] \ No newline at end of file + ./bin/install_sihpc \ No newline at end of file