mirror of
https://github.com/NVIDIA/nccl-tests.git
synced 2026-04-23 16:08:20 +08:00
add Dockerfile for cuda129
This commit is contained in:
parent
587d57c5ae
commit
613b80389c
5
.github/workflows/release.yml
vendored
5
.github/workflows/release.yml
vendored
@ -17,12 +17,11 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- name: cuda12.9-ubuntu22.04
|
||||
dockerfile: docker/Dockerfile.cuda12.9.ubuntu22.04
|
||||
- name: cuda13-ubuntu22.04
|
||||
dockerfile: docker/Dockerfile.cuda13.x.ubuntu22.04
|
||||
|
||||
# - name: cuda12-ubuntu20.04
|
||||
# dockerfile: docker/Dockerfile.cuda12.x.ubuntu20.04
|
||||
|
||||
steps:
|
||||
- name: Checkout source
|
||||
uses: actions/checkout@v4
|
||||
|
||||
151
docker/Dockerfile.cuda12.9.ubuntu22.04
Normal file
151
docker/Dockerfile.cuda12.9.ubuntu22.04
Normal file
@ -0,0 +1,151 @@
|
||||
# ==========================================================
# Global build arguments
#
# Everything below is declared ahead of the first FROM, so a
# stage that wants one of these values inside its own
# instructions has to re-declare the ARG by name.
# ==========================================================

# --- Base image selection: Ubuntu release and CUDA toolkit release ---
ARG UBUNTU_VERSION=22.04
ARG CUDA_VERSION=12.9.1

# --- CUDA runtime library: full version and major version ---
ARG CUDART_VERSION=12.9.79
ARG CUDART_MAJOR_VERSION=12

# --- NCCL: apt package version and the matching shared-object version ---
ARG NCCL_PACKAGE_VERSION=2.29.2-1+cuda12.9
ARG NCCL_SO_VERSION=2.29.2

# --- OpenMPI ---
#   MPI_VERSION : complete OpenMPI release number
#   MPI_SERIES  : major.minor series component of the download URL
ARG MPI_VERSION=4.1.8
ARG MPI_SERIES=4.1

# --- Build date stamp; override with --build-arg BUILD_DATE=<yyyymmdd> ---
ARG BUILD_DATE=20260128
|
||||
|
||||
# ==========================================================
# Stage "build": everything is compiled on the CUDA devel image
# ==========================================================
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS build

# Bring the global ARGs into this stage's scope. No value is given here,
# so each one keeps the default (or --build-arg override) from the top
# of the file.

# Base image / CUDA runtime
ARG UBUNTU_VERSION
ARG CUDA_VERSION
ARG CUDART_VERSION

# NCCL
ARG NCCL_PACKAGE_VERSION
ARG NCCL_SO_VERSION

# OpenMPI
ARG MPI_VERSION
ARG MPI_SERIES

# Build date stamp
ARG BUILD_DATE

# Keep apt/dpkg from prompting during the package installs below.
ENV DEBIAN_FRONTEND=noninteractive

# Relative paths in the RUN steps of this stage (downloads, source trees,
# build.log) resolve against this directory.
WORKDIR /workspace
|
||||
|
||||
# ----------------------------------------------------------
# Step 1 - Base build dependencies
# ----------------------------------------------------------
# Command output is collected in build.log instead of the build output.
# If any command in the braces fails, the log is printed and the step
# fails; on success the apt package lists and the log are removed.
RUN { apt-get -o Acquire::http::No-Cache=true update > build.log 2>&1 && \
      apt-get install -y --no-install-recommends \
          autoconf \
          automake \
          build-essential \
          ca-certificates \
          curl \
          g++ \
          gcc \
          git \
          gzip \
          libtool \
          make \
          pkg-config \
          python3 \
          python3-pip \
          wget \
          xz-utils \
          >> build.log 2>&1 && \
      rm -rf /var/lib/apt/lists/* && \
      rm -f build.log; \
    } || (cat build.log && false)
|
||||
|
||||
# -------------------------
# 2. Install CUDA keyring and restore NVIDIA repository
# 3. Install NCCL (pinned version)
# -------------------------
# Both steps share one RUN on purpose: "apt-get update" must live in the same
# layer as the "apt-get install" that depends on it. Kept apart, a change to
# NCCL_PACKAGE_VERSION re-runs only the install against the package lists
# cached in the earlier layer, and a newly published NCCL version may not be
# found.
#
# Output goes to build.log; the log is printed only if a command fails.
# On success the downloaded keyring package, the apt lists and the log are
# all removed.
RUN { wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb > build.log 2>&1 && \
      dpkg -i cuda-keyring_1.1-1_all.deb >> build.log 2>&1 && \
      rm -f cuda-keyring_1.1-1_all.deb && \
      apt-get update >> build.log 2>&1 && \
      apt-get install -y --no-install-recommends \
          "libnccl2=${NCCL_PACKAGE_VERSION}" \
          "libnccl-dev=${NCCL_PACKAGE_VERSION}" >> build.log 2>&1 && \
      ldconfig >> build.log 2>&1 && \
      rm -rf /var/lib/apt/lists/* && rm -f build.log; } || (cat build.log && false)
|
||||
|
||||
# -------------------------
# 4. Build OpenMPI from source
# -------------------------
# Downloads the release tarball selected by MPI_SERIES / MPI_VERSION, then
# configures it with CUDA support and installs it under /usr/local/sihpc.
#
# Output is captured in build.log (same convention as the other steps) rather
# than discarded, so a failed download, configure or make prints its log
# instead of failing silently. The subshell keeps the "cd" local, so the
# relative build.log path always refers to the file in the stage's working
# directory.
RUN { wget https://download.open-mpi.org/release/open-mpi/v${MPI_SERIES}/openmpi-${MPI_VERSION}.tar.gz > build.log 2>&1 && \
      tar zxf openmpi-${MPI_VERSION}.tar.gz >> build.log 2>&1 && \
      ( cd openmpi-${MPI_VERSION} && \
        ./configure --prefix=/usr/local/sihpc --with-cuda=/usr/local/cuda && \
        make -j"$(nproc)" && \
        make install ) >> build.log 2>&1 && \
      rm -rf /workspace/openmpi-${MPI_VERSION} /workspace/openmpi-${MPI_VERSION}.tar.gz && \
      rm -f build.log; } || (cat build.log && false)
|
||||
|
||||
# ----------------------------------------------------------
# Step 5 - Build nccl-tests
# ----------------------------------------------------------
# Shallow-clones the sync/upstream-20251216 branch, builds it with MPI
# support against the OpenMPI prefix /usr/local/sihpc, and copies the
# *_perf binaries and the helper scripts into that prefix. The make log is
# printed only when the build fails. The source tree is removed afterwards.
RUN git clone --depth 1 --single-branch --branch sync/upstream-20251216 \
        https://github.com/scitix/nccl-tests.git > /dev/null 2>&1 && \
    cd nccl-tests && \
    { make MPI=1 MPI_HOME=/usr/local/sihpc > build.log 2>&1 && rm -f build.log || (cat build.log && false); } && \
    mkdir -p /usr/local/sihpc/libexec/nccl-tests /usr/local/sihpc/bin && \
    cp -rf build/*_perf /usr/local/sihpc/libexec/nccl-tests/ && \
    cp scripts/nccl_test /usr/local/sihpc/libexec/nccl-tests/nccl_test && \
    cp scripts/nccl_perf scripts/install_sihpc scripts/uninstall_sihpc /usr/local/sihpc/bin/ && \
    cp scripts/env.sh /usr/local/sihpc/env.sh && \
    rm -rf /workspace/nccl-tests
|
||||
|
||||
# ----------------------------------------------------------
# Step 6 - Collect runtime libraries (strict selection)
# ----------------------------------------------------------
# Copies the CUDA runtime (libcudart*) and NCCL (libnccl.so*) libraries into
# /usr/local/sihpc/lib. Output is collected in build.log, which is printed
# only if one of the commands fails.
RUN { set -e && \
      mkdir -p /usr/local/sihpc/lib                                      >  build.log 2>&1 && \
      cp /usr/local/cuda/lib64/libcudart*     /usr/local/sihpc/lib/      >> build.log 2>&1 && \
      cp /usr/lib/x86_64-linux-gnu/libnccl.so* /usr/local/sihpc/lib/     >> build.log 2>&1 && \
      rm -f build.log; \
    } || (cat build.log && false)

# Currently disabled - additional libraries that are not collected:
#   cp /lib/x86_64-linux-gnu/libltdl.so.7.3.1 /usr/local/sihpc/lib/ && \
#   cp /usr/lib/x86_64-linux-gnu/libhwloc.so* /usr/local/sihpc/lib/ && \
#   cp /usr/lib/x86_64-linux-gnu/libevent_core* /usr/local/sihpc/lib/ && \
#   cp /usr/lib/x86_64-linux-gnu/libevent_pthreads* /usr/local/sihpc/lib/
|
||||
|
||||
# -------------------------
# 7. Fix library symlinks
# -------------------------
# Rebuilds the usual soname chains inside /usr/local/sihpc/lib:
#   libnccl.so   -> libnccl.so.2       -> libnccl.so.${NCCL_SO_VERSION}
#   libcudart.so -> libcudart.so.MAJOR -> libcudart.so.${CUDART_VERSION}
#
# CUDART_MAJOR_VERSION is defined before the first FROM and is therefore NOT
# visible inside this stage unless it is re-declared. Without the ARG line
# below it expands to an empty string: the commands then operate on a stray
# "libcudart.so." name and libcudart.so.<major> is never turned into a
# symlink.
ARG CUDART_MAJOR_VERSION

RUN cd /usr/local/sihpc/lib && \
    rm -f libcudart.so libcudart.so.${CUDART_MAJOR_VERSION} && \
    ln -sf libnccl.so.${NCCL_SO_VERSION} libnccl.so.2 && \
    ln -sf libnccl.so.2 libnccl.so && \
    ln -sf libcudart.so.${CUDART_VERSION} libcudart.so.${CUDART_MAJOR_VERSION} && \
    ln -sf libcudart.so.${CUDART_MAJOR_VERSION} libcudart.so

# Currently disabled - symlinks for libraries that are not collected:
#    rm -f libevent_core-2.1.so.7 && \
#    ln -sf libhwloc.so.15.1.0 libhwloc.so.15 && \
#    ln -sf libhwloc.so.15.1.0 libhwloc.so && \
#    ln -sf libevent_core-2.1.so.7.0.0 libevent_core-2.1.so.7 && \
#    ln -sf libevent_core-2.1.so.7 libevent_core-2.1.so && \
#    ln -sf libevent_pthreads-2.1.so.7.0.0 libevent_pthreads-2.1.so.7 && \
#    ln -sf libevent_pthreads-2.1.so.7 libevent_pthreads-2.1.so && \
#    ln -sf libltdl.so.7.3.1 libltdl.so.7 && \
#    ln -sf libltdl.so.7 libltdl.so
|
||||
|
||||
# ==========================================================
# Stage "package": wrap /usr/local/sihpc into a makeself installer
# ==========================================================
# NOTE(review): this base is ubuntu:20.04, while UBUNTU_VERSION (which is
# embedded in the installer file name below) defaults to 22.04 - confirm the
# mismatch is intentional.
FROM ubuntu:20.04 AS package

# Bring the global ARGs needed for the installer file name into this stage.
ARG UBUNTU_VERSION
ARG NCCL_PACKAGE_VERSION
ARG MPI_VERSION
ARG BUILD_DATE

# Also publish the NCCL / OpenMPI versions and the build date as environment
# variables of this image.
ENV NCCL_PACKAGE_VERSION=${NCCL_PACKAGE_VERSION} \
    MPI_VERSION=${MPI_VERSION} \
    BUILD_DATE=${BUILD_DATE}

# Take the complete install prefix produced by the build stage.
COPY --from=build /usr/local/sihpc /usr/local/sihpc

WORKDIR /

# Install makeself, derive the installer file name ('+' in the NCCL package
# version is replaced by '-'), and create a gzip-compressed self-extracting
# archive of /usr/local/sihpc whose startup script is ./bin/install_sihpc.
# makeself output is collected in build.log and printed only on failure.
RUN apt-get update && apt-get install -y --no-install-recommends makeself && \
    nccl_tag=$(printf '%s\n' "${NCCL_PACKAGE_VERSION}" | tr '+' '-') && \
    installer_name="sicl-nccl${nccl_tag}-ompi${MPI_VERSION}-ubuntu${UBUNTU_VERSION}-${BUILD_DATE}.run" && \
    { makeself --gzip \
          /usr/local/sihpc \
          "${installer_name}" \
          "SiHPC MPI + NCCL + NCCL-tests Portable Installer" \
          ./bin/install_sihpc > build.log 2>&1 && \
      rm -f build.log; \
    } || (cat build.log && false)
|
||||
@ -9,8 +9,8 @@ ARG CUDART_VERSION=13.1.80
|
||||
ARG CUDART_MAJOR_VERSION=13
|
||||
|
||||
# NCCL versions
|
||||
ARG NCCL_PACKAGE_VERSION=2.28.9-1+cuda13.0
|
||||
ARG NCCL_SO_VERSION=2.28.9
|
||||
ARG NCCL_PACKAGE_VERSION=2.29.2-1+cuda12.9
|
||||
ARG NCCL_SO_VERSION=2.29.2
|
||||
|
||||
# OpenMPI versions
|
||||
# - MPI_VERSION: full OpenMPI version
|
||||
@ -19,7 +19,7 @@ ARG MPI_VERSION=4.1.8
|
||||
ARG MPI_SERIES=4.1
|
||||
|
||||
# Build date (override at build time)
|
||||
ARG BUILD_DATE=20251221
|
||||
ARG BUILD_DATE=20260128
|
||||
|
||||
###########################
|
||||
# Build Stage
|
||||
|
||||
@ -26,12 +26,12 @@ export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
|
||||
export OMPI_MCA_btl=self,tcp
|
||||
export OMPI_MCA_pml=^ucx
|
||||
|
||||
if [ $OMPI_COMM_WORLD_SIZE -gt $OMPI_COMM_WORLD_LOCAL_SIZE ]; then
|
||||
if [ $OMPI_COMM_WORLD_RANK -eq 0 ]; then
|
||||
if [ "${OMPI_COMM_WORLD_SIZE:-0}" -gt "${OMPI_COMM_WORLD_LOCAL_SIZE:-0}" ]; then
|
||||
if [ "${OMPI_COMM_WORLD_RANK:-0}" -eq 0 ]; then
|
||||
export NCCL_DEBUG=${NCCL_DEBUG:-"INFO"}
|
||||
fi
|
||||
fi
|
||||
if [ $OMPI_COMM_WORLD_RANK -eq 0 ]; then
|
||||
echo "[$(hostname)] running nccl test $COLL$OPTIONS, world_size=$OMPI_COMM_WORLD_SIZE"
|
||||
if [ "${OMPI_COMM_WORLD_RANK:-0}" -eq 0 ]; then
|
||||
echo "[$(hostname)] running nccl test $COLL$OPTIONS, world_size=${OMPI_COMM_WORLD_SIZE:-0}"
|
||||
fi
|
||||
$TEST_DIR/${COLL}_perf -f2$OPTIONS
|
||||
|
||||
Loading…
Reference in New Issue
Block a user