mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
Refactor: move DeepEP from Docker images to wheel building (#5534)
Signed-off-by: Tailing Yuan <yuantailing@gmail.com>
This commit is contained in:
parent
1260e2f33f
commit
85b4a6808d
3
.gitattributes
vendored
3
.gitattributes
vendored
@ -1,7 +1,8 @@
|
||||
*.a filter=lfs diff=lfs merge=lfs -text
|
||||
*.dll filter=lfs diff=lfs merge=lfs -text
|
||||
*.lib filter=lfs diff=lfs merge=lfs -text
|
||||
*.so filter=lfs diff=lfs merge=lfs -text
|
||||
*.dll filter=lfs diff=lfs merge=lfs -text
|
||||
*.txz filter=lfs diff=lfs merge=lfs -text
|
||||
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||
triton_backend/tools/gpt/input_data.json filter=lfs diff=lfs merge=lfs -text
|
||||
*cubin.cpp filter=lfs diff=lfs merge=lfs -text
|
||||
|
||||
4
.gitignore
vendored
4
.gitignore
vendored
@ -40,6 +40,9 @@ tensorrt_llm/libs
|
||||
tensorrt_llm/bindings.*.so
|
||||
tensorrt_llm/bindings.pyi
|
||||
tensorrt_llm/bindings/**/*.pyi
|
||||
tensorrt_llm/deep_ep/
|
||||
tensorrt_llm/deep_ep_cpp_tllm.*.so
|
||||
tensorrt_llm/deep_ep_cpp_tllm.pyi
|
||||
*docs/cpp_docs*
|
||||
*docs/source/_cpp_gen*
|
||||
docs/source/**/*.rst
|
||||
@ -55,6 +58,7 @@ llm-test-workspace/
|
||||
*.safetensors
|
||||
*/tllm_debug/**
|
||||
*.patch
|
||||
!cpp/tensorrt_llm/deep_ep/*.patch
|
||||
|
||||
# Generated files
|
||||
cpp/include/tensorrt_llm/executor/version.h
|
||||
|
||||
@ -27,6 +27,7 @@ repos:
|
||||
args: [--allow-multiple-documents]
|
||||
exclude: ".*/gitlab/.*.yml"
|
||||
- id: trailing-whitespace
|
||||
exclude: '\.patch$'
|
||||
- id: check-toml
|
||||
- id: mixed-line-ending
|
||||
args: [--fix=lf]
|
||||
|
||||
@ -297,4 +297,8 @@ if(BUILD_PYBIND)
|
||||
add_subdirectory(pybind)
|
||||
endif()
|
||||
|
||||
if(BUILD_DEEP_EP)
|
||||
add_subdirectory(deep_ep)
|
||||
endif()
|
||||
|
||||
add_subdirectory(plugins)
|
||||
|
||||
207
cpp/tensorrt_llm/deep_ep/CMakeLists.txt
Normal file
207
cpp/tensorrt_llm/deep_ep/CMakeLists.txt
Normal file
@ -0,0 +1,207 @@
|
||||
set(DEEP_EP_COMMIT c381dadf43a85062f6a8947592017ee513abc70b)
|
||||
set(NVSHMEM_URL_HASH
|
||||
SHA256=eb2c8fb3b7084c2db86bd9fd905387909f1dfd483e7b45f7b3c3d5fcf5374b5a)
|
||||
|
||||
add_custom_target(deep_ep)
|
||||
|
||||
# CUDA architectures
|
||||
# ==================
|
||||
|
||||
# Keep only the CUDA architectures with compute capability >= 9.0.
#
# Every entry of CMAKE_CUDA_ARCHITECTURES must have the form
# <major><minor>[a|f][-real|-virtual] (for example "90", "100a" or
# "120-real"); any other spelling aborts the configure step. The optional a/f
# letter is matched but not captured, so it is not carried over into
# DEEP_EP_CUDA_ARCHITECTURES ("90a-real" is stored as "90-real").
# NOTE(review): dropping the a/f letter looks deliberate -- confirm.
set(DEEP_EP_CUDA_ARCHITECTURES "")
foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES)
  # The pattern is anchored with ^...$, so it can match at most once: a plain
  # REGEX MATCH is sufficient to populate CMAKE_MATCH_<n>. The input is quoted
  # so that an empty list element reaches the format check below instead of
  # producing a string() call with a missing argument.
  string(REGEX MATCH "^([1-9][0-9]*)([0-9])[af]?(-real|-virtual)?$" MATCHES
               "${CUDA_ARCH}")
  if(NOT CMAKE_MATCH_0)
    message(FATAL_ERROR "Invalid CUDA arch format: \"${CUDA_ARCH}\"")
  endif()
  set(CUDA_ARCH_MAJOR "${CMAKE_MATCH_1}")
  set(CUDA_ARCH_MINOR "${CMAKE_MATCH_2}")
  # Quoted on purpose: group 3 is empty when there is no -real/-virtual
  # suffix, and set() with an unquoted empty expansion unsets the variable
  # instead of assigning an empty string.
  set(CUDA_ARCH_POSTFIX "${CMAKE_MATCH_3}")
  if(CUDA_ARCH_MAJOR GREATER_EQUAL 9)
    list(APPEND DEEP_EP_CUDA_ARCHITECTURES
         "${CUDA_ARCH_MAJOR}${CUDA_ARCH_MINOR}${CUDA_ARCH_POSTFIX}")
  endif()
endforeach()
|
||||
|
||||
# Skip build if there is no suitable CUDA arch
|
||||
if(WIN32)
|
||||
set(DEEP_EP_CUDA_ARCHITECTURES "")
|
||||
endif()
|
||||
message(
|
||||
STATUS "deep_ep DEEP_EP_CUDA_ARCHITECTURES: ${DEEP_EP_CUDA_ARCHITECTURES}")
|
||||
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/cuda_architectures.txt
|
||||
"${DEEP_EP_CUDA_ARCHITECTURES}")
|
||||
if(NOT DEEP_EP_CUDA_ARCHITECTURES)
|
||||
return()
|
||||
endif()
|
||||
|
||||
# Prepare files
|
||||
# =============
|
||||
|
||||
# Download DeepEP
|
||||
include(FetchContent)
|
||||
if(DEFINED ENV{GITHUB_MIRROR} AND NOT "$ENV{GITHUB_MIRROR}" STREQUAL "")
|
||||
set(GITHUB_URL "$ENV{GITHUB_MIRROR}")
|
||||
else()
|
||||
set(GITHUB_URL "https://github.com")
|
||||
endif()
|
||||
set(DEEP_EP_URL
|
||||
"${GITHUB_URL}/deepseek-ai/DeepEP/archive/${DEEP_EP_COMMIT}.tar.gz")
|
||||
message(STATUS "deep_ep DEEP_EP_URL: ${DEEP_EP_URL}")
|
||||
FetchContent_Declare(deep_ep_download URL ${DEEP_EP_URL})
|
||||
FetchContent_MakeAvailable(deep_ep_download)
|
||||
set(DEEP_EP_SOURCE_DIR ${deep_ep_download_SOURCE_DIR})
|
||||
|
||||
# Copy and update python files
|
||||
set(DEEP_EP_PYTHON_DEST ${CMAKE_CURRENT_BINARY_DIR}/python/deep_ep)
|
||||
file(REMOVE_RECURSE ${DEEP_EP_PYTHON_DEST})
|
||||
file(MAKE_DIRECTORY ${DEEP_EP_PYTHON_DEST})
|
||||
configure_file(${DEEP_EP_SOURCE_DIR}/LICENSE ${DEEP_EP_PYTHON_DEST}/LICENSE
|
||||
COPYONLY)
|
||||
set(_files __init__.py buffer.py utils.py)
|
||||
foreach(_f IN LISTS _files)
|
||||
set(_src "${DEEP_EP_SOURCE_DIR}/deep_ep/${_f}")
|
||||
set(_dst "${DEEP_EP_PYTHON_DEST}/${_f}")
|
||||
file(READ "${_src}" _content)
|
||||
string(REPLACE "deep_ep_cpp" "tensorrt_llm.deep_ep_cpp_tllm" _content
|
||||
"${_content}")
|
||||
string(
|
||||
PREPEND
|
||||
_content
|
||||
"# Adapted from https://github.com/deepseek-ai/DeepEP/blob/${DEEP_EP_COMMIT}/deep_ep/${_f}\n"
|
||||
)
|
||||
file(WRITE "${_dst}" "${_content}")
|
||||
set_property(
|
||||
DIRECTORY
|
||||
APPEND
|
||||
PROPERTY CMAKE_CONFIGURE_DEPENDS ${_src})
|
||||
endforeach()
|
||||
|
||||
# Delete stale nvshmem on patch update
|
||||
set(NVSHMEM_STAMP_FILE ${CMAKE_CURRENT_BINARY_DIR}/nvshmem_stamp.txt)
|
||||
file(SHA256 ${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch NVSHMEM_PATCH_HASH)
|
||||
file(SHA256 ${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch
|
||||
NVSHMEM_PATCH_2_HASH)
|
||||
set(NVSHMEM_STAMP_CONTENT "${NVSHMEM_URL_HASH}")
|
||||
string(APPEND NVSHMEM_STAMP_CONTENT " PATCH_COMMAND v1")
|
||||
string(APPEND NVSHMEM_STAMP_CONTENT " ${NVSHMEM_PATCH_HASH}")
|
||||
string(APPEND NVSHMEM_STAMP_CONTENT " 103")
|
||||
string(APPEND NVSHMEM_STAMP_CONTENT " ${NVSHMEM_PATCH_2_HASH}")
|
||||
set(OLD_NVSHMEM_STAMP_CONTENT "")
|
||||
if(EXISTS ${NVSHMEM_STAMP_FILE})
|
||||
file(READ ${NVSHMEM_STAMP_FILE} OLD_NVSHMEM_STAMP_CONTENT)
|
||||
endif()
|
||||
if(NOT OLD_NVSHMEM_STAMP_CONTENT STREQUAL NVSHMEM_STAMP_CONTENT)
|
||||
file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/nvshmem_project-prefix)
|
||||
file(WRITE ${NVSHMEM_STAMP_FILE} "${NVSHMEM_STAMP_CONTENT}")
|
||||
endif()
|
||||
set_property(
|
||||
DIRECTORY APPEND
|
||||
PROPERTY CMAKE_CONFIGURE_DEPENDS
|
||||
${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch)
|
||||
|
||||
# Add NVSHMEM
|
||||
# ===========
|
||||
|
||||
# NVSHMEM only works with GCC. Building NVSHMEM with Clang results in
|
||||
# compilation errors. Using NVSHMEM with Clang results in slow builds and device
|
||||
# link issues.
|
||||
if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
|
||||
set(CMAKE_C_COMPILER gcc)
|
||||
set(CMAKE_CXX_COMPILER g++)
|
||||
set(CMAKE_CUDA_HOST_COMPILER g++)
|
||||
endif()
|
||||
|
||||
# Add nvshmem external project
|
||||
include(ExternalProject)
|
||||
ExternalProject_Add(
|
||||
nvshmem_project
|
||||
URL file://${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_src_3.2.5-1.txz
|
||||
URL_HASH ${NVSHMEM_URL_HASH}
|
||||
PATCH_COMMAND patch -p1 --forward --batch -i
|
||||
${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch
|
||||
COMMAND sed "s/TRANSPORT_VERSION_MAJOR 3/TRANSPORT_VERSION_MAJOR 103/" -i
|
||||
src/CMakeLists.txt
|
||||
COMMAND patch -p1 --forward --batch -i
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch
|
||||
CMAKE_CACHE_ARGS
|
||||
-DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
|
||||
-DCMAKE_C_COMPILER_LAUNCHER:STRING=${CMAKE_C_COMPILER_LAUNCHER}
|
||||
-DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
|
||||
-DCMAKE_CXX_COMPILER_LAUNCHER:STRING=${CMAKE_CXX_COMPILER_LAUNCHER}
|
||||
-DCMAKE_CUDA_ARCHITECTURES:STRING=${DEEP_EP_CUDA_ARCHITECTURES}
|
||||
-DCMAKE_CUDA_HOST_COMPILER:STRING=${CMAKE_CUDA_HOST_COMPILER}
|
||||
-DCMAKE_CUDA_COMPILER_LAUNCHER:STRING=${CMAKE_CUDA_COMPILER_LAUNCHER}
|
||||
-DNVSHMEM_BUILD_EXAMPLES:BOOL=0
|
||||
-DNVSHMEM_BUILD_PACKAGES:BOOL=0
|
||||
-DNVSHMEM_BUILD_TESTS:BOOL=0
|
||||
-DNVSHMEM_IBGDA_SUPPORT:BOOL=1
|
||||
-DNVSHMEM_IBRC_SUPPORT:BOOL=0
|
||||
-DNVSHMEM_MPI_SUPPORT:BOOL=0
|
||||
-DNVSHMEM_PMIX_SUPPORT:BOOL=0
|
||||
-DNVSHMEM_SHMEM_SUPPORT:BOOL=0
|
||||
-DNVSHMEM_TIMEOUT_DEVICE_POLLING:BOOL=0
|
||||
-DNVSHMEM_UCX_SUPPORT:BOOL=0
|
||||
-DNVSHMEM_USE_GDRCOPY:BOOL=0
|
||||
-DNVSHMEM_USE_NCCL:BOOL=0
|
||||
INSTALL_COMMAND ""
|
||||
BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build
|
||||
BUILD_BYPRODUCTS
|
||||
${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/lib/libnvshmem.a)
|
||||
add_library(nvshmem_project::nvshmem STATIC IMPORTED)
|
||||
add_dependencies(nvshmem_project::nvshmem nvshmem_project)
|
||||
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/include)
|
||||
set_target_properties(
|
||||
nvshmem_project::nvshmem
|
||||
PROPERTIES IMPORTED_LOCATION
|
||||
${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/lib/libnvshmem.a
|
||||
INTERFACE_INCLUDE_DIRECTORIES
|
||||
${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/include)
|
||||
|
||||
# Add DeepEP cpp
|
||||
# ==============
|
||||
|
||||
# Let CMake generate `fatbinData` for CUDA separable compilation. Both FALSE and
|
||||
# TRUE work, but TRUE generates `code=lto_90a` rather than `code=sm_90a` for
|
||||
# arch `90a-real`.
|
||||
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION FALSE)
|
||||
|
||||
# Find torch_python
|
||||
find_library(TORCH_PYTHON_LIB torch_python REQUIRED
|
||||
HINTS ${TORCH_INSTALL_PREFIX}/lib)
|
||||
|
||||
# Add deep_ep_cpp_tllm
|
||||
file(GLOB_RECURSE SRC_CPP ${DEEP_EP_SOURCE_DIR}/csrc/*.cpp)
|
||||
file(GLOB_RECURSE SRC_CU ${DEEP_EP_SOURCE_DIR}/csrc/*.cu)
|
||||
pybind11_add_module(deep_ep_cpp_tllm ${SRC_CPP} ${SRC_CU})
|
||||
# Language standards, CUDA settings and runtime search path of the
# deep_ep_cpp_tllm extension module.
#
# DEEP_EP_CUDA_ARCHITECTURES is a ;-separated list and must be passed quoted:
# set_target_properties() takes strict <property> <value> pairs, so an
# unquoted list with more than one architecture would be split into extra
# arguments and shift every following pair.
set_target_properties(
  deep_ep_cpp_tllm
  PROPERTIES CXX_STANDARD_REQUIRED ON
             CUDA_STANDARD_REQUIRED ON
             CXX_STANDARD 17
             CUDA_STANDARD 17
             CUDA_SEPARABLE_COMPILATION ON
             CUDA_ARCHITECTURES "${DEEP_EP_CUDA_ARCHITECTURES}"
             LINK_DEPENDS
             "${CMAKE_CURRENT_SOURCE_DIR}/deep_ep_cpp_tllm.version"
             INSTALL_RPATH "$ORIGIN/libs/nvshmem;${TORCH_INSTALL_PREFIX}/lib"
             BUILD_WITH_INSTALL_RPATH TRUE)
|
||||
target_compile_options(
|
||||
deep_ep_cpp_tllm
|
||||
PRIVATE ${TORCH_CXX_FLAGS} -O3 $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-O3>
|
||||
$<$<COMPILE_LANGUAGE:CUDA>:--ptxas-options=--register-usage-level=10>)
|
||||
target_compile_definitions(
|
||||
deep_ep_cpp_tllm PRIVATE DISABLE_AGGRESSIVE_PTX_INSTRS
|
||||
TORCH_EXTENSION_NAME=deep_ep_cpp_tllm)
|
||||
target_link_libraries(
|
||||
deep_ep_cpp_tllm PRIVATE nvshmem_project::nvshmem ${TORCH_LIBRARIES}
|
||||
${TORCH_PYTHON_LIB})
|
||||
target_link_options(
|
||||
deep_ep_cpp_tllm PRIVATE
|
||||
-Wl,--version-script,${CMAKE_CURRENT_SOURCE_DIR}/deep_ep_cpp_tllm.version
|
||||
-Wl,--no-undefined-version)
|
||||
|
||||
# Set targets
|
||||
# ===========
|
||||
add_dependencies(deep_ep deep_ep_cpp_tllm nvshmem_project)
|
||||
8
cpp/tensorrt_llm/deep_ep/README.md
Normal file
8
cpp/tensorrt_llm/deep_ep/README.md
Normal file
@ -0,0 +1,8 @@
|
||||
How to generate `nvshmem_fast_build.patch`?
|
||||
|
||||
1. Build the project without applying the `nvshmem_fast_build.patch`.
|
||||
2. Link NVSHMEM to DeepEP with one NVSHMEM object file omitted.
|
||||
3. Repeat step 2 until no more object files can be omitted.
|
||||
4. Remove the unused files from NVSHMEM's `CMakeLists.txt`, and save the differences as `nvshmem_fast_build.patch`.
|
||||
|
||||
The script `strip_nvshmem_helper.py` automatically performs steps 2 and 3.
|
||||
4
cpp/tensorrt_llm/deep_ep/deep_ep_cpp_tllm.version
Normal file
4
cpp/tensorrt_llm/deep_ep/deep_ep_cpp_tllm.version
Normal file
@ -0,0 +1,4 @@
|
||||
{
|
||||
global: PyInit_deep_ep_cpp_tllm;
|
||||
local: *;
|
||||
};
|
||||
66
cpp/tensorrt_llm/deep_ep/nvshmem_fast_build.patch
Normal file
66
cpp/tensorrt_llm/deep_ep/nvshmem_fast_build.patch
Normal file
@ -0,0 +1,66 @@
|
||||
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
|
||||
index cba899bba..c27337601 100644
|
||||
--- a/src/CMakeLists.txt
|
||||
+++ b/src/CMakeLists.txt
|
||||
@@ -264,48 +264,20 @@ set(NVSHMEM_HOST_SOURCES_NOMAXREGCOUNT
|
||||
host/comm/rma.cu
|
||||
host/stream/comm/quiet_on_stream.cu
|
||||
host/stream/comm/cuda_interface_sync.cu
|
||||
- host/stream/coll/alltoall/alltoall.cu
|
||||
host/stream/coll/barrier/barrier.cu
|
||||
- host/stream/coll/broadcast/broadcast.cu
|
||||
- host/stream/coll/fcollect/fcollect.cu
|
||||
- host/stream/coll/rdxn/reduce_and.cu
|
||||
- host/stream/coll/rdxn/reduce_or.cu
|
||||
- host/stream/coll/rdxn/reduce_xor.cu
|
||||
- host/stream/coll/rdxn/reduce_min.cu
|
||||
host/stream/coll/rdxn/reduce_max.cu
|
||||
- host/stream/coll/rdxn/reduce_prod.cu
|
||||
- host/stream/coll/rdxn/reduce_sum.cu
|
||||
host/stream/coll/rdxn/reduce_team.cu
|
||||
- host/stream/coll/reducescatter/reducescatter_and.cu
|
||||
- host/stream/coll/reducescatter/reducescatter_or.cu
|
||||
- host/stream/coll/reducescatter/reducescatter_xor.cu
|
||||
- host/stream/coll/reducescatter/reducescatter_min.cu
|
||||
- host/stream/coll/reducescatter/reducescatter_max.cu
|
||||
- host/stream/coll/reducescatter/reducescatter_prod.cu
|
||||
- host/stream/coll/reducescatter/reducescatter_sum.cu
|
||||
)
|
||||
|
||||
set(NVSHMEM_HOST_SOURCES
|
||||
host/bootstrap/bootstrap.cpp
|
||||
host/bootstrap/bootstrap_loader.cpp
|
||||
host/coll/cpu_coll.cpp
|
||||
- host/coll/alltoall/alltoall.cpp
|
||||
- host/coll/alltoall/alltoall_on_stream.cpp
|
||||
host/coll/barrier/barrier.cpp
|
||||
host/coll/barrier/barrier_on_stream.cpp
|
||||
- host/coll/broadcast/broadcast.cpp
|
||||
- host/coll/broadcast/broadcast_on_stream.cpp
|
||||
- host/coll/fcollect/fcollect.cpp
|
||||
- host/coll/fcollect/fcollect_on_stream.cpp
|
||||
- host/coll/rdxn/rdxn.cpp
|
||||
- host/coll/rdxn/rdxn_on_stream.cpp
|
||||
- host/coll/reducescatter/reducescatter.cpp
|
||||
- host/coll/reducescatter/reducescatter_on_stream.cpp
|
||||
host/comm/putget.cpp
|
||||
- host/comm/fence.cpp
|
||||
host/comm/quiet.cpp
|
||||
host/comm/sync.cpp
|
||||
- host/comm/amo.cpp
|
||||
host/proxy/proxy.cpp
|
||||
host/transport/transport.cpp
|
||||
host/transport/p2p/p2p.cpp
|
||||
@@ -1006,3 +978,12 @@ set(CPACK_RPM_PACKAGE_REQUIRES_PREUN "/sbin/ldconfig")
|
||||
|
||||
include(CPack)
|
||||
# End Installation definitions
|
||||
+
|
||||
+set_target_properties(
|
||||
+ git_commit
|
||||
+ nvshmem_device_project
|
||||
+ nvshmem_bootstrap_pmi
|
||||
+ nvshmem_bootstrap_pmi2
|
||||
+ nvshmem_host
|
||||
+ nvshmem-info
|
||||
+ PROPERTIES EXCLUDE_FROM_ALL TRUE)
|
||||
3
cpp/tensorrt_llm/deep_ep/nvshmem_src_3.2.5-1.txz
Normal file
3
cpp/tensorrt_llm/deep_ep/nvshmem_src_3.2.5-1.txz
Normal file
@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:eb2c8fb3b7084c2db86bd9fd905387909f1dfd483e7b45f7b3c3d5fcf5374b5a
|
||||
size 618175
|
||||
61
cpp/tensorrt_llm/deep_ep/strip_nvshmem_helper.py
Normal file
61
cpp/tensorrt_llm/deep_ep/strip_nvshmem_helper.py
Normal file
@ -0,0 +1,61 @@
|
||||
# A helper script to detect unused NVSHMEM object files.
|
||||
#
|
||||
# The script links NVSHMEM to DeepEP with one object file removed at a time and
|
||||
# checks whether there are any undefined symbols. See README.md for details.
|
||||
# This script is not tested or QA'ed, so you may need to update this script if
|
||||
# the project structure changes or compilation options change.
|
||||
import pathlib
|
||||
import re
|
||||
import subprocess
|
||||
|
||||
# Repository root: four directory levels above this script.
project_dir = pathlib.Path(__file__).parent.parent.parent.parent

# Directory holding the DeepEP object files.
# Run `find cpp/build | grep kernels/internode_ll.cu.o$` to locate it.
deep_ep_obj_dir = project_dir / "cpp/build/tensorrt_llm/deep_ep/CMakeFiles/deep_ep_cpp_tllm.dir/__/__/_deps/deep_ep_download-src/csrc"
assert deep_ep_obj_dir.is_dir()

# Directory holding the NVSHMEM object files.
# Run `find cpp/build | grep host/bootstrap/bootstrap.cpp.o$` to locate it.
# It must be `nvshmem.dir`, not `nvshmem_host.dir`.
nvshmem_obj_dir = project_dir / "cpp/build/tensorrt_llm/deep_ep/nvshmem-build/src/CMakeFiles/nvshmem.dir"
assert nvshmem_obj_dir.is_dir()

# Turn the ";"-separated architecture list written by the CMake configure step
# into one `-gencode` argument per architecture.
cuda_architectures = (
    project_dir /
    "cpp/build/tensorrt_llm/deep_ep/cuda_architectures.txt").read_text()
pattern = re.compile(r'^([1-9][0-9]*[0-9][af]?)(-real|-virtual)?$')
gencode_args = []
for cuda_arch in cuda_architectures.split(";"):
    matches = pattern.match(cuda_arch)
    assert matches is not None, f"Invalid cuda arch \"{cuda_arch}\""
    sm_version, postfix = matches.group(1), matches.group(2)
    if postfix == "-real":
        # Device code only.
        code = f"[sm_{sm_version}]"
    elif postfix == "-virtual":
        # PTX only.
        code = f"[compute_{sm_version}]"
    else:
        # No suffix: both PTX and device code.
        code = f"[compute_{sm_version},sm_{sm_version}]"
    gencode_args.append(f"-gencode=arch=compute_{sm_version},code={code}")

# Scratch directory for the temporary archive and the throw-away link output.
temp_dir = project_dir / "cpp/build/tensorrt_llm/deep_ep/strip_nvshmem_helper"
temp_dir.mkdir(exist_ok=True)
ranlib = temp_dir / "liba.a"
if ranlib.exists():
    # Start from a clean state: `ar` would add to a leftover archive.
    ranlib.unlink()

deep_ep_obj_list = sorted(deep_ep_obj_dir.glob("kernels/**/*.o"))
nvshmem_obj_set = set(nvshmem_obj_dir.glob("**/*.o"))
for exclude_obj in sorted(nvshmem_obj_set):
    # Archive every NVSHMEM object except the candidate.
    remaining_objs = nvshmem_obj_set - {exclude_obj}
    subprocess.check_call(["ar", "rcs", ranlib, *remaining_objs])

    # Link DeepEP against the reduced archive; `--no-undefined` makes the link
    # fail if the omitted object provided a symbol that is still needed.
    link_cmd = ["/usr/local/cuda/bin/nvcc"]
    link_cmd.extend(gencode_args)
    link_cmd.extend(["-Xlinker", "--no-undefined", "-shared"])
    link_cmd.extend(deep_ep_obj_list)
    link_cmd.extend([ranlib, "-o", temp_dir / "a.out"])
    res = subprocess.call(link_cmd)

    # "-" means the link succeeded, i.e. the object file can be omitted;
    # "+" means it is required. The trailing ".o" is cut from the printed path.
    if res == 0:
        marker = "-"
    else:
        marker = "+"
    print(marker, str(exclude_obj.relative_to(nvshmem_obj_dir))[:-2])

    # Remove the archive because `ar` appends to an existing archive.
    ranlib.unlink()
|
||||
@ -16,7 +16,6 @@ LABEL com.nvidia.ai-terms="https://www.nvidia.com/en-us/agreements/enterprise-so
|
||||
ENV BASH_ENV=${BASH_ENV:-/etc/bash.bashrc}
|
||||
ENV ENV=${ENV:-/etc/shinit_v2}
|
||||
ARG GITHUB_MIRROR=""
|
||||
ENV GITHUB_MIRROR=$GITHUB_MIRROR
|
||||
RUN echo "Using GitHub mirror: $GITHUB_MIRROR"
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
|
||||
@ -28,13 +27,13 @@ FROM base AS devel
|
||||
ARG PYTHON_VERSION="3.12.3"
|
||||
RUN echo "Using Python version: $PYTHON_VERSION"
|
||||
COPY docker/common/install_base.sh install_base.sh
|
||||
RUN bash ./install_base.sh $PYTHON_VERSION && rm install_base.sh
|
||||
RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_base.sh $PYTHON_VERSION && rm install_base.sh
|
||||
|
||||
COPY docker/common/install_cmake.sh install_cmake.sh
|
||||
RUN bash ./install_cmake.sh && rm install_cmake.sh
|
||||
RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_cmake.sh && rm install_cmake.sh
|
||||
|
||||
COPY docker/common/install_ccache.sh install_ccache.sh
|
||||
RUN bash ./install_ccache.sh && rm install_ccache.sh
|
||||
RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_ccache.sh && rm install_ccache.sh
|
||||
|
||||
# Only take effect when the base image is Rocky Linux 8 with old CUDA version.
|
||||
COPY docker/common/install_cuda_toolkit.sh install_cuda_toolkit.sh
|
||||
@ -61,7 +60,7 @@ RUN bash ./install_polygraphy.sh && rm install_polygraphy.sh
|
||||
|
||||
# Install mpi4py
|
||||
COPY docker/common/install_mpi4py.sh install_mpi4py.sh
|
||||
RUN bash ./install_mpi4py.sh && rm install_mpi4py.sh
|
||||
RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_mpi4py.sh && rm install_mpi4py.sh
|
||||
|
||||
# Install PyTorch
|
||||
ARG TORCH_INSTALL_TYPE="skip"
|
||||
@ -72,10 +71,6 @@ RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh
|
||||
RUN pip3 uninstall -y opencv && rm -rf /usr/local/lib/python3*/dist-packages/cv2/
|
||||
RUN pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir
|
||||
|
||||
# Install DeepEP
|
||||
COPY docker/common/install_deep_ep.sh install_deep_ep.sh
|
||||
RUN bash ./install_deep_ep.sh && rm install_deep_ep.sh
|
||||
|
||||
# WARs against security issues inherited from pytorch:25.04
|
||||
# * https://github.com/advisories/GHSA-vqfr-h8mv-ghfj
|
||||
# * https://github.com/advisories/GHSA-7cx3-6m66-7c5m
|
||||
@ -120,9 +115,10 @@ COPY .gitmodules setup.py requirements.txt requirements-dev.txt constraints.txt
|
||||
RUN mkdir -p /root/.cache/pip /root/.cache/ccache
|
||||
ENV CCACHE_DIR=/root/.cache/ccache
|
||||
# Build the TRT-LLM wheel
|
||||
ARG GITHUB_MIRROR=""
|
||||
ARG BUILD_WHEEL_ARGS="--clean --python_bindings --benchmarks"
|
||||
RUN --mount=type=cache,target=/root/.cache/pip --mount=type=cache,target=${CCACHE_DIR} \
|
||||
python3 scripts/build_wheel.py ${BUILD_WHEEL_ARGS}
|
||||
GITHUB_MIRROR=$GITHUB_MIRROR python3 scripts/build_wheel.py ${BUILD_WHEEL_ARGS}
|
||||
|
||||
FROM ${DEVEL_IMAGE} AS release
|
||||
|
||||
|
||||
@ -53,6 +53,7 @@ init_ubuntu() {
|
||||
llvm \
|
||||
libclang-rt-dev \
|
||||
libffi-dev \
|
||||
libibverbs-dev \
|
||||
libnuma1 \
|
||||
libnuma-dev \
|
||||
python3-dev \
|
||||
@ -115,17 +116,19 @@ install_gcctoolset_rockylinux() {
|
||||
# https://catalog.ngc.nvidia.com/orgs/nvidia/containers/cuda
|
||||
echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' >> "${ENV}"
|
||||
dnf install \
|
||||
vim \
|
||||
wget \
|
||||
git-lfs \
|
||||
gcc-toolset-11 \
|
||||
libffi-devel \
|
||||
-y
|
||||
patch \
|
||||
vim \
|
||||
wget \
|
||||
git-lfs \
|
||||
gcc-toolset-11 \
|
||||
libffi-devel \
|
||||
-y
|
||||
dnf install \
|
||||
openmpi \
|
||||
openmpi-devel \
|
||||
pigz \
|
||||
-y
|
||||
openmpi \
|
||||
openmpi-devel \
|
||||
pigz \
|
||||
rdma-core-devel \
|
||||
-y
|
||||
echo "source scl_source enable gcc-toolset-11" >> "${ENV}"
|
||||
echo 'export PATH=/usr/lib64/openmpi/bin:$PATH' >> "${ENV}"
|
||||
}
|
||||
|
||||
@ -1,47 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -euxo pipefail
|
||||
|
||||
GITHUB_URL=${GITHUB_MIRROR:-https://github.com}
|
||||
DEEP_EP_COMMIT=2b266cf6452134f993ab0fcb3ef2d5de7683c561
|
||||
|
||||
if [ "$(. /etc/os-release && echo $ID)" == "rocky" ]; then
|
||||
echo "Skipping DeepEP installation in the Rocky distribution."
|
||||
exit 0
|
||||
fi
|
||||
libmlx5_dir=$(dirname $(ldconfig -p | grep libmlx5.so.1 | head -n1 | awk '{print $NF}'))
|
||||
|
||||
export NVCC_APPEND_FLAGS="--threads 4"
|
||||
|
||||
# Custom NVSHMEM
|
||||
curl -fsSL https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz | tar xz
|
||||
pushd nvshmem_src
|
||||
curl -fsSL $GITHUB_URL/deepseek-ai/DeepEP/raw/$DEEP_EP_COMMIT/third-party/nvshmem.patch | patch -p1
|
||||
sed "s/TRANSPORT_VERSION_MAJOR 3/TRANSPORT_VERSION_MAJOR 103/" -i src/CMakeLists.txt
|
||||
ln -s libmlx5.so.1 "$libmlx5_dir/libmlx5.so"
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_INSTALL_PREFIX=/opt/custom_nvshmem \
|
||||
-DGDRCOPY_HOME=/usr/include \
|
||||
-DNVSHMEM_SHMEM_SUPPORT=0 \
|
||||
-DNVSHMEM_UCX_SUPPORT=0 \
|
||||
-DNVSHMEM_USE_NCCL=0 \
|
||||
-DNVSHMEM_MPI_SUPPORT=0 \
|
||||
-DNVSHMEM_IBGDA_SUPPORT=1 \
|
||||
-DNVSHMEM_PMIX_SUPPORT=0 \
|
||||
-DNVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
|
||||
-DNVSHMEM_USE_GDRCOPY=1 \
|
||||
-DCMAKE_CUDA_ARCHITECTURES="90-real;100-real;120-real" \
|
||||
-DNVSHMEM_BUILD_TESTS=0 \
|
||||
-DNVSHMEM_BUILD_EXAMPLES=0
|
||||
cmake --build build -j`nproc`
|
||||
make -C build install
|
||||
popd
|
||||
|
||||
# DeepEP
|
||||
curl -fsSL $GITHUB_URL/deepseek-ai/DeepEP/archive/$DEEP_EP_COMMIT.tar.gz | tar xz
|
||||
TORCH_CUDA_ARCH_LIST="9.0;10.0;12.0" NVSHMEM_DIR=/opt/custom_nvshmem pip install -v --no-cache-dir ./DeepEP-$DEEP_EP_COMMIT
|
||||
|
||||
# Clean up
|
||||
rm -r nvshmem_src
|
||||
rm "$libmlx5_dir/libmlx5.so"
|
||||
rm -r DeepEP-$DEEP_EP_COMMIT
|
||||
@ -592,6 +592,7 @@ pipeline {
|
||||
//Workspace normally is: /home/jenkins/agent/workspace/LLM/L0_MergeRequest@tmp/
|
||||
HF_HOME="${env.WORKSPACE_TMP}/.cache/huggingface"
|
||||
CCACHE_DIR="${CCACHE_DIR}"
|
||||
GITHUB_MIRROR="https://urm.nvidia.com/artifactory/github-go-remote"
|
||||
PIP_INDEX_URL="https://urm.nvidia.com/artifactory/api/pypi/pypi-remote/simple"
|
||||
// Force datasets into offline mode to prevent CI jobs from downloading HF datasets, which causes test failures
|
||||
HF_DATASETS_OFFLINE=1
|
||||
|
||||
@ -2220,6 +2220,7 @@ pipeline {
|
||||
//Workspace normally is: /home/jenkins/agent/workspace/LLM/L0_MergeRequest@tmp/
|
||||
HF_HOME="${env.WORKSPACE_TMP}/.cache/huggingface"
|
||||
CCACHE_DIR="${CCACHE_DIR}"
|
||||
GITHUB_MIRROR="https://urm.nvidia.com/artifactory/github-go-remote"
|
||||
PIP_INDEX_URL="https://urm.nvidia.com/artifactory/api/pypi/pypi-remote/simple"
|
||||
// Force datasets into offline mode to prevent CI jobs from downloading HF datasets, which causes test failures
|
||||
HF_DATASETS_OFFLINE=1
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
|
||||
import java.lang.InterruptedException
|
||||
|
||||
DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202506271620-5539"
|
||||
DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202507071100-5534"
|
||||
|
||||
def createKubernetesPodConfig(image, arch = "amd64")
|
||||
{
|
||||
|
||||
@ -5,7 +5,7 @@
|
||||
# for reuse in Dev Containers configuration.
|
||||
# Also, the file needs to be parseable by 'sh' for reuse by docker/Makefile.
|
||||
LLM_DOCKER_IMAGE_URI=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm
|
||||
LLM_DOCKER_IMAGE_TAG_SUFFIX=-trt10.11.0.33-skip-tritondevel-202506271620-5539
|
||||
LLM_DOCKER_IMAGE_TAG_SUFFIX=-trt10.11.0.33-skip-tritondevel-202507071100-5534
|
||||
LLM_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:pytorch-25.05-py3-x86_64-ubuntu24.04${LLM_DOCKER_IMAGE_TAG_SUFFIX}
|
||||
LLM_SBSA_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:pytorch-25.05-py3-aarch64-ubuntu24.04${LLM_DOCKER_IMAGE_TAG_SUFFIX}
|
||||
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=${LLM_DOCKER_IMAGE_URI}:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py310${LLM_DOCKER_IMAGE_TAG_SUFFIX}
|
||||
|
||||
@ -31,6 +31,7 @@
|
||||
],
|
||||
"skip": {
|
||||
"<filename>": "<description>",
|
||||
"tensorrt_llm/deep_ep/strip_nvshmem_helper.py": "py",
|
||||
"tensorrt_llm/kernels/selectiveScan/selectiveScan.h": "external",
|
||||
"tensorrt_llm/kernels/cutlass_kernels/python/generate_kernels.py": "py",
|
||||
"tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm90.h": "dual license",
|
||||
|
||||
@ -412,10 +412,12 @@ def main(*,
|
||||
if cpp_only:
|
||||
build_pyt = "OFF"
|
||||
build_pybind = "OFF"
|
||||
build_deep_ep = "OFF"
|
||||
else:
|
||||
targets.extend(["bindings", "th_common"])
|
||||
targets.extend(["th_common", "bindings", "deep_ep"])
|
||||
build_pyt = "ON"
|
||||
build_pybind = "ON"
|
||||
build_deep_ep = "ON"
|
||||
|
||||
if benchmarks:
|
||||
targets.append("benchmarks")
|
||||
@ -454,7 +456,7 @@ def main(*,
|
||||
)
|
||||
cmake_def_args = " ".join(cmake_def_args)
|
||||
cmake_configure_command = (
|
||||
f'cmake -DCMAKE_BUILD_TYPE="{build_type}" -DBUILD_PYT="{build_pyt}" -DBUILD_PYBIND="{build_pybind}"'
|
||||
f'cmake -DCMAKE_BUILD_TYPE="{build_type}" -DBUILD_PYT="{build_pyt}" -DBUILD_PYBIND="{build_pybind}" -DBUILD_DEEP_EP="{build_deep_ep}"'
|
||||
f' -DNVTX_DISABLE="{disable_nvtx}" -DBUILD_MICRO_BENCHMARKS={build_micro_benchmarks}'
|
||||
f' -DBUILD_WHEEL_TARGETS="{";".join(targets)}"'
|
||||
f' -DPython_EXECUTABLE={venv_python} -DPython3_EXECUTABLE={venv_python}'
|
||||
@ -594,6 +596,13 @@ def main(*,
|
||||
"tensorrt_llm/kernels/decoderMaskedMultiheadAttention/libdecoder_attention_1.so",
|
||||
lib_dir / "libdecoder_attention_1.so")
|
||||
|
||||
deep_ep_dir = pkg_dir / "deep_ep"
|
||||
if deep_ep_dir.is_symlink():
|
||||
deep_ep_dir.unlink()
|
||||
elif deep_ep_dir.is_dir():
|
||||
clear_folder(deep_ep_dir)
|
||||
deep_ep_dir.rmdir()
|
||||
|
||||
bin_dir = pkg_dir / "bin"
|
||||
if bin_dir.exists():
|
||||
clear_folder(bin_dir)
|
||||
@ -605,19 +614,41 @@ def main(*,
|
||||
|
||||
if not cpp_only:
|
||||
|
||||
def get_pybind_lib():
|
||||
pybind_build_dir = (build_dir / "tensorrt_llm" / "pybind")
|
||||
def get_pybind_lib(subdirectory, name):
|
||||
pybind_build_dir = (build_dir / "tensorrt_llm" / subdirectory)
|
||||
if on_windows:
|
||||
pybind_lib = list(pybind_build_dir.glob("bindings.*.pyd"))
|
||||
pybind_lib = list(pybind_build_dir.glob(f"{name}.*.pyd"))
|
||||
else:
|
||||
pybind_lib = list(pybind_build_dir.glob("bindings.*.so"))
|
||||
pybind_lib = list(pybind_build_dir.glob(f"{name}.*.so"))
|
||||
|
||||
assert len(
|
||||
pybind_lib
|
||||
) == 1, f"Exactly one pybind library should be present: {pybind_lib}"
|
||||
return pybind_lib[0]
|
||||
|
||||
install_file(get_pybind_lib(), pkg_dir)
|
||||
install_file(get_pybind_lib("pybind", "bindings"), pkg_dir)
|
||||
|
||||
with (build_dir / "tensorrt_llm" / "deep_ep" /
|
||||
"cuda_architectures.txt").open() as f:
|
||||
deep_ep_cuda_architectures = f.read().strip().strip(";")
|
||||
if deep_ep_cuda_architectures:
|
||||
install_file(get_pybind_lib("deep_ep", "deep_ep_cpp_tllm"), pkg_dir)
|
||||
install_tree(build_dir / "tensorrt_llm" / "deep_ep" / "python" /
|
||||
"deep_ep",
|
||||
deep_ep_dir,
|
||||
dirs_exist_ok=True)
|
||||
(lib_dir / "nvshmem").mkdir(exist_ok=True)
|
||||
install_file(
|
||||
build_dir / "tensorrt_llm/deep_ep/nvshmem-build/License.txt",
|
||||
lib_dir / "nvshmem")
|
||||
install_file(
|
||||
build_dir /
|
||||
"tensorrt_llm/deep_ep/nvshmem-build/src/lib/nvshmem_bootstrap_uid.so.3",
|
||||
lib_dir / "nvshmem")
|
||||
install_file(
|
||||
build_dir /
|
||||
"tensorrt_llm/deep_ep/nvshmem-build/src/lib/nvshmem_transport_ibgda.so.103",
|
||||
lib_dir / "nvshmem")
|
||||
if not skip_stubs:
|
||||
with working_directory(project_dir):
|
||||
build_run(f"\"{venv_python}\" -m pip install pybind11-stubgen")
|
||||
@ -651,14 +682,13 @@ def main(*,
|
||||
if 'LD_LIBRARY_PATH' in env_ld:
|
||||
new_library_path += f":{env_ld['LD_LIBRARY_PATH']}"
|
||||
env_ld["LD_LIBRARY_PATH"] = new_library_path
|
||||
try:
|
||||
build_run(
|
||||
f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code",
|
||||
env=env_ld)
|
||||
if deep_ep_cuda_architectures:
|
||||
build_run(
|
||||
f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code",
|
||||
f"\"{venv_python}\" -m pybind11_stubgen -o . deep_ep_cpp_tllm --exit-code",
|
||||
env=env_ld)
|
||||
except CalledProcessError as ex:
|
||||
print(f"Failed to build pybind11 stubgen: {ex}",
|
||||
file=sys.stderr)
|
||||
exit(1)
|
||||
|
||||
if not skip_building_wheel:
|
||||
if dist_dir is None:
|
||||
|
||||
5
setup.py
5
setup.py
@ -104,7 +104,10 @@ else:
|
||||
'libs/libnvinfer_plugin_tensorrt_llm.so',
|
||||
'libs/libtensorrt_llm_ucx_wrapper.so', 'libs/libdecoder_attention_0.so',
|
||||
'libs/libtensorrt_llm_nixl_wrapper.so',
|
||||
'libs/libdecoder_attention_1.so', 'bindings.*.so', "include/**/*"
|
||||
'libs/libdecoder_attention_1.so', 'libs/nvshmem/License.txt',
|
||||
'libs/nvshmem/nvshmem_bootstrap_uid.so.3',
|
||||
'libs/nvshmem/nvshmem_transport_ibgda.so.103', 'bindings.*.so',
|
||||
'deep_ep/LICENSE', 'deep_ep_cpp_tllm.*.so', "include/**/*"
|
||||
]
|
||||
|
||||
package_data += [
|
||||
|
||||
@ -5,11 +5,11 @@ from typing import List, Tuple, Union
|
||||
|
||||
import torch
|
||||
|
||||
from tensorrt_llm._utils import local_mpi_size, mpi_comm
|
||||
from tensorrt_llm._utils import mpi_comm
|
||||
from tensorrt_llm.mapping import Mapping
|
||||
|
||||
try:
|
||||
from deep_ep import Buffer
|
||||
from tensorrt_llm.deep_ep import Buffer
|
||||
deep_ep_installed = True
|
||||
except ModuleNotFoundError:
|
||||
deep_ep_installed = False
|
||||
@ -54,7 +54,6 @@ class VariableLengthBuffer:
|
||||
self.buffer = Buffer(None,
|
||||
num_nvl_bytes,
|
||||
num_rdma_bytes,
|
||||
num_nvl_peers=local_mpi_size(),
|
||||
comm=self.comm)
|
||||
|
||||
def dispatch(self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
|
||||
|
||||
Loading…
Reference in New Issue
Block a user