TensorRT-LLM/docker/common/install_deep_ep.sh

#!/bin/bash
set -euxo pipefail
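# Build a patched NVSHMEM and install DeepEP (pinned to the commit below) on top of it.
# GITHUB_MIRROR may be set to redirect GitHub downloads through a mirror.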
GITHUB_URL=${GITHUB_MIRROR:-https://github.com}
DEEP_EP_COMMIT=2b266cf6452134f993ab0fcb3ef2d5de7683c561
if [ "$(. /etc/os-release && echo $ID)" == "rocky" ]; then
echo "Skipping DeepEP installation in the Rocky distribution."
exit 0
fi
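# Locate the directory holding libmlx5.so.1; the NVSHMEM build needs a plain
# libmlx5.so name to link against, so a temporary symlink is created there below.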
libmlx5_dir=$(dirname $(ldconfig -p | grep libmlx5.so.1 | head -n1 | awk '{print $NF}'))
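# Let nvcc compile for multiple GPU architectures in parallel within each invocation.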
export NVCC_APPEND_FLAGS="--threads 4"
# Custom NVSHMEM
curl -fsSL https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz | tar xz
pushd nvshmem_src
curl -fsSL $GITHUB_URL/deepseek-ai/DeepEP/raw/$DEEP_EP_COMMIT/third-party/nvshmem.patch | patch -p1
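# Presumably bumps the transport plugin version so this custom build's transport
# libraries do not collide with those of a stock NVSHMEM 3.x installation.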
sed "s/TRANSPORT_VERSION_MAJOR 3/TRANSPORT_VERSION_MAJOR 103/" -i src/CMakeLists.txt
ln -s libmlx5.so.1 "$libmlx5_dir/libmlx5.so"
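# Configure a minimal NVSHMEM: IBGDA and GDRCopy enabled; MPI/UCX/NCCL/PMIx/SHMEM
# support plus tests and examples disabled. Targets Hopper (90) and Blackwell
# (100, 120) real architectures and installs under /opt/custom_nvshmem.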
cmake -S . -B build \
    -DCMAKE_INSTALL_PREFIX=/opt/custom_nvshmem \
    -DGDRCOPY_HOME=/usr/include \
    -DNVSHMEM_SHMEM_SUPPORT=0 \
    -DNVSHMEM_UCX_SUPPORT=0 \
    -DNVSHMEM_USE_NCCL=0 \
    -DNVSHMEM_MPI_SUPPORT=0 \
    -DNVSHMEM_IBGDA_SUPPORT=1 \
    -DNVSHMEM_PMIX_SUPPORT=0 \
    -DNVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
    -DNVSHMEM_USE_GDRCOPY=1 \
    -DCMAKE_CUDA_ARCHITECTURES="90-real;100-real;120-real" \
    -DNVSHMEM_BUILD_TESTS=0 \
    -DNVSHMEM_BUILD_EXAMPLES=0
cmake --build build -j "$(nproc)"
make -C build install
popd
# DeepEP
curl -fsSL $GITHUB_URL/deepseek-ai/DeepEP/archive/$DEEP_EP_COMMIT.tar.gz | tar xz
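# Build and install DeepEP against the custom NVSHMEM; TORCH_CUDA_ARCH_LIST mirrors
# the CUDA architectures used for the NVSHMEM build above.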
TORCH_CUDA_ARCH_LIST="9.0;10.0;12.0" NVSHMEM_DIR=/opt/custom_nvshmem pip install -v --no-cache-dir ./DeepEP-$DEEP_EP_COMMIT
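# Optional sanity check (illustrative, not part of the original flow):
#   python3 -c "import deep_ep; print(deep_ep.__file__)"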
# Clean up
rm -r nvshmem_src
rm "$libmlx5_dir/libmlx5.so"
rm -r DeepEP-$DEEP_EP_COMMIT