diff --git a/.github/workflows/pre-check.yml b/.github/workflows/pre-check.yml
new file mode 100644
index 0000000..f293c61
--- /dev/null
+++ b/.github/workflows/pre-check.yml
@@ -0,0 +1,34 @@
+# Builds the .run installer on pull requests and manual runs; nothing is published.
+name: Pre-check build
+
+on:
+  pull_request:
+  workflow_dispatch:
+
+jobs:
+  build-only:
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v4
+      - uses: docker/setup-buildx-action@v3
+      # Delete preinstalled tool directories and unused Docker data, then
+      # print the remaining disk space.
+      - name: Free disk space
+        run: |
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /opt/hostedtoolcache
+          sudo docker system prune -af || true
+          df -h
+      - name: Build run package
+        run: |
+          docker buildx build \
+            -f docker/Dockerfile.cuda12.x.ubuntu20.04 \
+            --platform linux/amd64 \
+            --target export \
+            --output type=local,dest=dist \
+            .
+      - name: List artifacts
+        run: |
+          ls -lh dist
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..86b42f8
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,50 @@
+name: Release run installer
+
+on:
+  push:
+    tags:
+      - "v*"
+
+permissions:
+  contents: write
+
+jobs:
+  build-release:
+    runs-on: ubuntu-22.04
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Setup Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      # Same cleanup as the pre-check workflow, which builds the same Dockerfile.
+      - name: Free disk space
+        run: |
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /opt/hostedtoolcache
+          sudo docker system prune -af || true
+          df -h
+
+      - name: Build .run installer (Docker)
+        run: |
+          docker buildx build \
+            -f docker/Dockerfile.cuda12.x.ubuntu20.04 \
+            --platform linux/amd64 \
+            --target export \
+            --build-arg BUILD_DATE="$(date +%Y%m%d)" \
+            --output type=local,dest=dist \
+            .
+
+      - name: List artifacts
+        run: ls -lh dist
+
+      - name: Upload to GitHub Release
+        uses: softprops/action-gh-release@v2
+        with:
+          # Fail the job instead of publishing a release without an installer.
+          fail_on_unmatched_files: true
+          files: |
+            dist/*.run
diff --git a/docker/Dockerfile.cuda12.x.ubuntu20.04 b/docker/Dockerfile.cuda12.x.ubuntu20.04
new file mode 100644
index 0000000..efe1a62
--- /dev/null
+++ b/docker/Dockerfile.cuda12.x.ubuntu20.04
@@ -0,0 +1,171 @@
+###########################
+# Build-time configuration
+###########################
+
+# Base OS and CUDA versions
+ARG UBUNTU_VERSION=20.04
+ARG CUDA_VERSION=12.8.1
+ARG CUDART_VERSION=12.8.90
+ARG CUDART_MAJOR_VERSION=12
+
+# NCCL versions
+ARG NCCL_PACKAGE_VERSION=2.27.7-1+cuda12.4
+ARG NCCL_SO_VERSION=2.27.7
+
+# OpenMPI versions
+# - MPI_VERSION: full OpenMPI version
+# - MPI_SERIES: major.minor series used in download URL
+ARG MPI_VERSION=4.1.8
+ARG MPI_SERIES=4.1
+
+# Build date (override at build time)
+ARG BUILD_DATE=20251221
+
+###########################
+# Build Stage
+###########################
+# The base image tag follows UBUNTU_VERSION, the same value that is written
+# into the installer filename in the package stage.
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS build
+
+# Re-declare build args for this stage (values are inherited)
+ARG UBUNTU_VERSION
+ARG CUDA_VERSION
+ARG CUDART_VERSION
+ARG CUDART_MAJOR_VERSION
+ARG NCCL_PACKAGE_VERSION
+ARG NCCL_SO_VERSION
+ARG MPI_VERSION
+ARG MPI_SERIES
+ARG BUILD_DATE
+
+ENV DEBIAN_FRONTEND=noninteractive
+WORKDIR /workspace
+
+# -------------------------
+# 1. Base build dependencies
+# -------------------------
+RUN mv /etc/apt/sources.list.d/cuda*.list /tmp/disabled-cuda.list || true && \
+    apt-get -o Acquire::http::No-Cache=true update && \
+    apt-get install -y --no-install-recommends \
+        build-essential gcc g++ curl git wget ca-certificates \
+        make automake autoconf libtool pkg-config \
+        python3 python3-pip gzip xz-utils && \
+    rm -rf /var/lib/apt/lists/*
+
+# -------------------------
+# 2. Install CUDA keyring, restore NVIDIA repository, install pinned NCCL
+# -------------------------
+# `apt-get update` and `apt-get install` run in the same layer, so a cached
+# layer can never pair an old package index with a new NCCL pin.
+# The `unhold` is grouped in parentheses so that its `|| true` cannot mask a
+# failure of the commands chained before it.
+RUN UBUNTU_TAG=$(printf '%s' "${UBUNTU_VERSION}" | tr -d '.') && \
+    wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_TAG}/x86_64/cuda-keyring_1.1-1_all.deb" && \
+    dpkg -i cuda-keyring_1.1-1_all.deb && \
+    rm -f cuda-keyring_1.1-1_all.deb && \
+    apt-get update && \
+    (apt-mark unhold libnccl2 libnccl-dev || true) && \
+    apt-get install -y --no-install-recommends \
+        libnccl2=${NCCL_PACKAGE_VERSION} \
+        libnccl-dev=${NCCL_PACKAGE_VERSION} && \
+    apt-mark hold libnccl2 libnccl-dev && \
+    ldconfig && \
+    rm -rf /var/lib/apt/lists/*
+
+# -------------------------
+# 3. Build OpenMPI from source
+# -------------------------
+RUN wget https://download.open-mpi.org/release/open-mpi/v${MPI_SERIES}/openmpi-${MPI_VERSION}.tar.gz && \
+    tar -xzf openmpi-${MPI_VERSION}.tar.gz && \
+    cd openmpi-${MPI_VERSION} && \
+    ./configure --prefix=/usr/local/sihpc --with-cuda=/usr/local/cuda && \
+    make -j"$(nproc)" && make install && \
+    rm -rf /workspace/openmpi-${MPI_VERSION} /workspace/openmpi-${MPI_VERSION}.tar.gz
+
+# -------------------------
+# 4. Build nccl-tests
+# -------------------------
+RUN git clone https://github.com/scitix/nccl-tests.git -b sicl && \
+    cd nccl-tests && \
+    make MPI=1 MPI_HOME=/usr/local/sihpc && \
+    mkdir -p /usr/local/sihpc/libexec/nccl-tests && \
+    cp -rf build/*_perf /usr/local/sihpc/libexec/nccl-tests/ && \
+    cp scripts/nccl_perf /usr/local/sihpc/bin/nccl_perf && \
+    cp scripts/nccl_test /usr/local/sihpc/libexec/nccl-tests/nccl_test && \
+    cp scripts/env.sh /usr/local/sihpc/env.sh && \
+    cp scripts/install_sihpc /usr/local/sihpc/bin/install_sihpc && \
+    cp scripts/uninstall_sihpc /usr/local/sihpc/bin/uninstall_sihpc && \
+    rm -rf /workspace/nccl-tests
+
+# -------------------------
+# 5. Collect runtime libraries (strict selection)
+# -------------------------
+RUN set -eux && \
+    mkdir -p /usr/local/sihpc/lib && \
+    cp /usr/local/cuda/lib64/libcudart* /usr/local/sihpc/lib/ && \
+    cp /usr/lib/x86_64-linux-gnu/libnccl.so* /usr/local/sihpc/lib/
+    # cp /lib/x86_64-linux-gnu/libltdl.so.7.3.1 /usr/local/sihpc/lib/ && \
+    # cp /usr/lib/x86_64-linux-gnu/libhwloc.so* /usr/local/sihpc/lib/ && \
+    # cp /usr/lib/x86_64-linux-gnu/libevent_core* /usr/local/sihpc/lib/ && \
+    # cp /usr/lib/x86_64-linux-gnu/libevent_pthreads* /usr/local/sihpc/lib/
+
+# -------------------------
+# 6. Fix library symlinks
+# -------------------------
+# Versions come from the build args instead of literals, and each symlink
+# target is checked first, so a version bump that does not match the copied
+# libraries fails the build rather than producing dangling links.
+RUN set -eu && \
+    cd /usr/local/sihpc/lib && \
+    NCCL_SO_MAJOR="${NCCL_SO_VERSION%%.*}" && \
+    test -f "libnccl.so.${NCCL_SO_VERSION}" && \
+    test -f "libcudart.so.${CUDART_VERSION}" && \
+    rm -f libcudart.so "libcudart.so.${CUDART_MAJOR_VERSION}" && \
+    ln -sf "libnccl.so.${NCCL_SO_VERSION}" "libnccl.so.${NCCL_SO_MAJOR}" && \
+    ln -sf "libnccl.so.${NCCL_SO_MAJOR}" libnccl.so && \
+    ln -sf "libcudart.so.${CUDART_VERSION}" "libcudart.so.${CUDART_MAJOR_VERSION}" && \
+    ln -sf "libcudart.so.${CUDART_MAJOR_VERSION}" libcudart.so
+    # rm -f libevent_core-2.1.so.7 && \
+    # ln -sf libhwloc.so.15.1.0 libhwloc.so.15 && \
+    # ln -sf libhwloc.so.15.1.0 libhwloc.so && \
+    # ln -sf libevent_core-2.1.so.7.0.0 libevent_core-2.1.so.7 && \
+    # ln -sf libevent_core-2.1.so.7 libevent_core-2.1.so && \
+    # ln -sf libevent_pthreads-2.1.so.7.0.0 libevent_pthreads-2.1.so.7 && \
+    # ln -sf libevent_pthreads-2.1.so.7 libevent_pthreads-2.1.so && \
+    # ln -sf libltdl.so.7.3.1 libltdl.so.7 && \
+    # ln -sf libltdl.so.7 libltdl.so
+
+###########################
+# Package Stage
+###########################
+FROM ubuntu:${UBUNTU_VERSION} AS package
+
+# Re-declare args for this stage (values are inherited). ARG values are already
+# visible to RUN as shell variables, so no ENV is needed to build the filename.
+ARG UBUNTU_VERSION
+ARG NCCL_PACKAGE_VERSION
+ARG MPI_VERSION
+ARG BUILD_DATE
+
+COPY --from=build /usr/local/sihpc /usr/local/sihpc
+
+WORKDIR /
+RUN apt-get update && apt-get install -y --no-install-recommends makeself && \
+    rm -rf /var/lib/apt/lists/* && \
+    chmod +x /usr/local/sihpc/bin/install_sihpc && \
+    SAFE_NCCL_PKG=$(printf '%s\n' "${NCCL_PACKAGE_VERSION}" | tr '+' '-') && \
+    PACKAGE_FILENAME="sicl-nccl${SAFE_NCCL_PKG}-ompi${MPI_VERSION}-ubuntu${UBUNTU_VERSION}-${BUILD_DATE}.run" && \
+    makeself --gzip /usr/local/sihpc \
+        "${PACKAGE_FILENAME}" \
+        "SiHPC MPI + NCCL + NCCL-tests Portable Installer" \
+        ./bin/install_sihpc
+
+###########################
+# Export Stage
+###########################
+# Final (default) stage: contains only the installer, so
+# `--output type=local,dest=dist` writes just the .run file rather than the
+# whole root filesystem of the package stage.
+FROM scratch AS export
+COPY --from=package /*.run /