mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
[None][fix] Clean up linking to CUDA stub libraries in build_wheel.py (#6823)
Signed-off-by: Linda-Stadter <57756729+Linda-Stadter@users.noreply.github.com> Signed-off-by: Martin Marciniszyn Mehringer <11665257+MartinMarciniszyn@users.noreply.github.com> Co-authored-by: Linda-Stadter <57756729+Linda-Stadter@users.noreply.github.com>
This commit is contained in:
parent
1ce23545fc
commit
425dad01fd
@ -43,6 +43,7 @@ target_link_libraries(
|
||||
${Python3_LIBRARIES}
|
||||
${TORCH_LIBRARIES}
|
||||
torch_python
|
||||
CUDA::cuda_driver
|
||||
${CUDA_NVML_LIB}
|
||||
th_common)
|
||||
target_compile_definitions(
|
||||
@ -54,6 +55,6 @@ if(NOT WIN32)
|
||||
${TRTLLM_NB_MODULE}
|
||||
PROPERTIES
|
||||
LINK_FLAGS
|
||||
"-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' -Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/stubs' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
|
||||
"-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
|
||||
)
|
||||
endif()
|
||||
|
||||
@ -44,6 +44,7 @@ target_link_libraries(
|
||||
${Python3_LIBRARIES}
|
||||
${TORCH_LIBRARIES}
|
||||
torch_python
|
||||
CUDA::cuda_driver
|
||||
${CUDA_NVML_LIB}
|
||||
th_common)
|
||||
target_compile_definitions(
|
||||
@ -55,6 +56,6 @@ if(NOT WIN32)
|
||||
${TRTLLM_PYBIND_MODULE}
|
||||
PROPERTIES
|
||||
LINK_FLAGS
|
||||
"-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' -Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/stubs' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
|
||||
"-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
|
||||
)
|
||||
endif()
|
||||
|
||||
@ -71,8 +71,9 @@ RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh
|
||||
ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999"
|
||||
|
||||
# Install OpenCV with FFMPEG support
|
||||
RUN pip3 uninstall -y opencv && rm -rf /usr/local/lib/python3*/dist-packages/cv2/
|
||||
RUN pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir
|
||||
RUN pip3 uninstall -y opencv && \
|
||||
rm -rf /usr/local/lib/python3*/dist-packages/cv2/ && \
|
||||
pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir
|
||||
|
||||
# WARs against security issues inherited from pytorch:25.06
|
||||
# * https://github.com/advisories/GHSA-8qvm-5x2c-j2w7
|
||||
|
||||
@ -16,8 +16,10 @@
|
||||
|
||||
import os
|
||||
import platform
|
||||
import re
|
||||
import sys
|
||||
import sysconfig
|
||||
import tempfile
|
||||
import warnings
|
||||
from argparse import ArgumentParser
|
||||
from contextlib import contextmanager
|
||||
@ -27,7 +29,7 @@ from pathlib import Path
|
||||
from shutil import copy, copytree, rmtree
|
||||
from subprocess import DEVNULL, CalledProcessError, check_output, run
|
||||
from textwrap import dedent
|
||||
from typing import List
|
||||
from typing import Sequence
|
||||
|
||||
try:
|
||||
from packaging.requirements import Requirement
|
||||
@ -120,7 +122,8 @@ def create_venv(project_dir: Path):
|
||||
return venv_prefix
|
||||
|
||||
|
||||
def setup_venv(project_dir: Path, requirements_file: Path, no_venv: bool):
|
||||
def setup_venv(project_dir: Path, requirements_file: Path,
|
||||
no_venv: bool) -> tuple[Path, Path]:
|
||||
"""Creates/updates a venv and installs requirements.
|
||||
|
||||
Args:
|
||||
@ -279,6 +282,139 @@ def generate_fmha_cu(project_dir, venv_python):
|
||||
os.chdir(project_dir)
|
||||
|
||||
|
||||
def create_cuda_stub_links(cuda_stub_dir: str, missing_libs: list[str]) -> str:
    """
    Creates versioned symbolic links to CUDA stub libraries in a temporary directory.

    For every entry of `missing_libs` that ends in a numeric version suffix
    (e.g. `libcuda.so.1`), the suffix is stripped to obtain the name of the
    unversioned stub (`libcuda.so`). If that stub exists in `cuda_stub_dir`,
    a symbolic link carrying the versioned name is created in a freshly
    created temporary directory. Entries without a trailing version suffix
    are ignored; entries whose stub does not exist are skipped with a warning.

    Args:
        cuda_stub_dir (str): Path to the directory containing CUDA stubs.
        missing_libs: Versioned names of the missing libraries.

    Returns:
        str: Path to the temporary directory where links were created. The
            directory is not removed by this function; the caller owns it.

    Raises:
        RuntimeError: If `cuda_stub_dir` does not exist, or if a symbolic
            link cannot be created (the temporary directory is removed first).
    """
    cuda_stub_path = Path(cuda_stub_dir)
    if not cuda_stub_path.exists():
        raise RuntimeError(
            f"CUDA stub directory '{cuda_stub_dir}' does not exist.")

    # Only a *trailing* run of ".<digits>" groups is a version suffix.
    # Anchoring at the end keeps digits that belong to the library name
    # itself, e.g. "libpython3.12.so.1.0" -> "libpython3.12.so" rather than
    # "libpython3.so".
    version_suffix = re.compile(r'(?:\.\d+)+$')

    # Create a temporary directory for the symbolic links
    temp_dir = tempfile.mkdtemp(prefix="cuda_stub_links_")
    temp_dir_path = Path(temp_dir)

    for missing_lib in missing_libs:
        if not version_suffix.search(missing_lib):
            # Unversioned name: nothing to link.
            continue

        so = cuda_stub_path / version_suffix.sub('', missing_lib)
        so_versioned = temp_dir_path / missing_lib

        if so_versioned.is_symlink():
            # Same library listed twice; the link is already in place.
            continue

        # Check if the library exists in the original directory
        if not so.exists():
            warnings.warn(
                f"Warning: Source library '{so}' does not exist and was skipped."
            )
            continue

        try:
            # Create the symbolic link in the temporary directory
            so_versioned.symlink_to(so)
        except OSError as e:
            # Clean up the temporary directory on error
            rmtree(temp_dir)
            raise RuntimeError(
                f"Failed to create symbolic link for '{missing_lib}' in temporary directory '{temp_dir}': {e}"
            ) from e

    # Return the path to the temporary directory where the links were created
    return str(temp_dir_path)
|
||||
|
||||
|
||||
def check_missing_libs(so_prefix: str) -> list[str]:
    """
    Lists the shared libraries that `ldd` reports as "not found".

    Runs `ldd {so_prefix}.cpython*.so` through `build_run` with captured
    text output and collects the first whitespace-separated token of every
    output line containing "not found" (the library name preceding
    "=> not found").

    Args:
        so_prefix (str): Prefix of the extension module file name; the
            `.cpython*.so` suffix is appended to it.
            NOTE(review): presumably `build_run` executes the command through
            a shell so that the `*` glob is expanded - confirm.

    Returns:
        list[str]: Unresolved library names, de-duplicated, in order of
            first appearance.
    """
    ldd_result = build_run(f"ldd {so_prefix}.cpython*.so",
                           capture_output=True,
                           text=True)

    unresolved = (entry.split()[0]
                  for entry in ldd_result.stdout.splitlines()
                  if "not found" in entry)

    # dict keys keep insertion order, giving an order-preserving de-duplication.
    return list(dict.fromkeys(unresolved))
|
||||
|
||||
|
||||
def generate_python_stubs_linux(binding_type: str, venv_python: Path,
                                deep_ep: bool):
    """
    Generates Python stub files for the compiled extension modules on Linux.

    Installs the stub generators with pip, checks via `check_missing_libs`
    whether the `bindings` module has unresolved shared libraries and, if so
    and the CUDA stub directory exists, prepends a temporary directory of
    versioned links to the CUDA stubs (plus the stub directory itself) to
    `LD_LIBRARY_PATH` for the stub generator processes only. The temporary
    link directory is always removed afterwards.

    Args:
        binding_type (str): "nanobind" selects `nanobind.stubgen` for the
            `bindings` module; any other value selects `pybind11_stubgen`.
        venv_python (Path): Python interpreter used to run pip and the stub
            generators.
        deep_ep (bool): Whether to also generate stubs for `deep_ep_cpp_tllm`.

    NOTE(review): the nesting of the `deep_gemm_cpp_tllm` / `deep_ep_cpp_tllm`
    steps is assumed to be independent of `binding_type` (pybind11-stubgen is
    installed unconditionally) - confirm against the upstream file.
    """
    use_nanobind = binding_type == "nanobind"
    if use_nanobind:
        build_run(f"\"{venv_python}\" -m pip install nanobind")
    build_run(f"\"{venv_python}\" -m pip install pybind11-stubgen")

    stub_env = os.environ.copy()
    cuda_root = (stub_env.get("CUDA_HOME") or stub_env.get("CUDA_PATH")
                 or "/usr/local/cuda")
    unresolved_libs = check_missing_libs("bindings")
    cuda_stub_dir = f"{cuda_root}/lib64/stubs"

    link_dir = None
    if unresolved_libs and Path(cuda_stub_dir).exists():
        # Create symbolic links for the CUDA stubs
        link_dir = create_cuda_stub_links(cuda_stub_dir, unresolved_libs)
        search_dirs = (link_dir, cuda_stub_dir,
                       stub_env.get("LD_LIBRARY_PATH"))
        stub_env["LD_LIBRARY_PATH"] = ":".join(
            entry for entry in search_dirs if entry)

    # Assemble the stub generator invocations in execution order.
    if use_nanobind:
        stubgen_cmds = [
            f"\"{venv_python}\" -m nanobind.stubgen -m bindings -O ."
        ]
    else:
        stubgen_cmds = [
            f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code"
        ]
    stubgen_cmds.append(
        f"\"{venv_python}\" -m pybind11_stubgen -o . deep_gemm_cpp_tllm --exit-code"
    )
    if deep_ep:
        stubgen_cmds.append(
            f"\"{venv_python}\" -m pybind11_stubgen -o . deep_ep_cpp_tllm --exit-code"
        )

    try:
        for stubgen_cmd in stubgen_cmds:
            build_run(stubgen_cmd, env=stub_env)
    finally:
        # The temporary link directory is only needed while stubgen runs.
        if link_dir:
            rmtree(link_dir)
|
||||
|
||||
|
||||
def generate_python_stubs_windows(binding_type: str, venv_python: Path,
                                  pkg_dir: Path, lib_dir: Path):
    """
    Generates Python stub files for the `bindings` module on Windows.

    Installs pybind11-stubgen, writes a small launcher script (`stubgen.py`)
    into `pkg_dir` that imports torch and tensorrt and registers `lib_dir`
    as a DLL directory before invoking `pybind11_stubgen.main`, runs it, and
    removes the launcher again - also when the run fails.

    Args:
        binding_type (str): Binding flavour; "nanobind" is not supported on
            Windows and terminates the process with exit status 1.
        venv_python (Path): Python interpreter used to run pip and the
            launcher script.
        pkg_dir (Path): Directory the launcher script is written to.
            NOTE(review): the launcher is invoked by its bare file name, so
            the current working directory is presumably `pkg_dir` - confirm
            against the caller.
        lib_dir (Path): Directory passed to `os.add_dll_directory` by the
            launcher script.

    Raises:
        SystemExit: With code 1 when `binding_type` is "nanobind".
    """
    if binding_type == "nanobind":
        print("Windows not yet supported for nanobind stubs")
        # sys.exit rather than the site-provided `exit` helper, which is
        # absent when the interpreter runs without the `site` module.
        sys.exit(1)

    build_run(f"\"{venv_python}\" -m pip install pybind11-stubgen")
    stubgen = "stubgen.py"
    stubgen_contents = """
    # Loading torch, trt before bindings is required to avoid import errors on windows.
    # isort: off
    import torch
    import tensorrt as trt
    # isort: on
    import os
    import platform

    from pybind11_stubgen import main

    if __name__ == "__main__":
        # Load dlls from `libs` directory before launching bindings.
        if platform.system() == "Windows":
            os.add_dll_directory(r\"{lib_dir}\")
        main()
    """.format(lib_dir=lib_dir)

    stubgen_path = pkg_dir / stubgen
    try:
        stubgen_path.write_text(dedent(stubgen_contents))
        build_run(f"\"{venv_python}\" {stubgen} -o . bindings")
    finally:
        # Never leave the temporary launcher inside the package directory,
        # even if writing it or running the stub generator failed.
        stubgen_path.unlink(missing_ok=True)
|
||||
|
||||
|
||||
def main(*,
|
||||
build_type: str = "Release",
|
||||
generator: str = "",
|
||||
@ -286,7 +422,7 @@ def main(*,
|
||||
dist_dir: Path = None,
|
||||
cuda_architectures: str = None,
|
||||
job_count: int = None,
|
||||
extra_cmake_vars: List[str] = list(),
|
||||
extra_cmake_vars: Sequence[str] = tuple(),
|
||||
extra_make_targets: str = "",
|
||||
trt_root: str = '/usr/local/tensorrt',
|
||||
nccl_root: str = None,
|
||||
@ -361,7 +497,7 @@ def main(*,
|
||||
|
||||
if on_windows:
|
||||
# Windows does not support multi-device currently.
|
||||
extra_cmake_vars.extend(["ENABLE_MULTI_DEVICE=0"])
|
||||
extra_cmake_vars = list(extra_cmake_vars) + ["ENABLE_MULTI_DEVICE=0"]
|
||||
|
||||
# The Ninja CMake generator is used for our Windows build
|
||||
# (Easier than MSBuild to make compatible with our Docker image)
|
||||
@ -703,81 +839,14 @@ def main(*,
|
||||
dirs_exist_ok=True)
|
||||
|
||||
if not skip_stubs:
|
||||
with working_directory(project_dir):
|
||||
if binding_type == "nanobind":
|
||||
build_run(f"\"{venv_python}\" -m pip install nanobind")
|
||||
else:
|
||||
build_run(
|
||||
f"\"{venv_python}\" -m pip install pybind11-stubgen")
|
||||
with working_directory(pkg_dir):
|
||||
if on_windows:
|
||||
if binding_type == "nanobind":
|
||||
print("Windows not yet supported for nanobind stubs")
|
||||
exit(1)
|
||||
else:
|
||||
stubgen = "stubgen.py"
|
||||
stubgen_contents = """
|
||||
# Loading torch, trt before bindings is required to avoid import errors on windows.
|
||||
# isort: off
|
||||
import torch
|
||||
import tensorrt as trt
|
||||
# isort: on
|
||||
import os
|
||||
import platform
|
||||
|
||||
from pybind11_stubgen import main
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Load dlls from `libs` directory before launching bindings.
|
||||
if platform.system() == "Windows":
|
||||
os.add_dll_directory(r\"{lib_dir}\")
|
||||
main()
|
||||
""".format(lib_dir=lib_dir)
|
||||
(pkg_dir / stubgen).write_text(dedent(stubgen_contents))
|
||||
build_run(f"\"{venv_python}\" {stubgen} -o . bindings")
|
||||
(pkg_dir / stubgen).unlink()
|
||||
else:
|
||||
env_ld = os.environ.copy()
|
||||
|
||||
new_library_path = "/usr/local/cuda/compat:/usr/local/cuda/compat/lib:/usr/local/cuda/compat/lib.real"
|
||||
if 'LD_LIBRARY_PATH' in env_ld:
|
||||
new_library_path += f":{env_ld['LD_LIBRARY_PATH']}"
|
||||
|
||||
result = build_run("find /usr -name *libnvidia-ml.so*",
|
||||
capture_output=True,
|
||||
text=True)
|
||||
assert result.returncode == 0, f"Failed to run find *libnvidia-ml.so*: {result.stderr}"
|
||||
|
||||
# Build containers only contain stub version of libnvidia-ml.so and not the real version.
|
||||
# If real version not in system, we need to create symbolic link to stub version to prevent import errors.
|
||||
if "libnvidia-ml.so.1" not in result.stdout:
|
||||
if "libnvidia-ml.so" in result.stdout:
|
||||
line = result.stdout.splitlines()[0]
|
||||
path = os.path.dirname(line)
|
||||
new_library_path += f":{path}"
|
||||
build_run(f"ln -s {line} {path}/libnvidia-ml.so.1")
|
||||
else:
|
||||
print(
|
||||
f"Failed to find libnvidia-ml.so: {result.stderr}",
|
||||
file=sys.stderr)
|
||||
exit(1)
|
||||
|
||||
env_ld["LD_LIBRARY_PATH"] = new_library_path
|
||||
if binding_type == "nanobind":
|
||||
build_run(
|
||||
f"\"{venv_python}\" -m nanobind.stubgen -m bindings -O .",
|
||||
env=env_ld)
|
||||
else:
|
||||
build_run(
|
||||
f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code",
|
||||
env=env_ld)
|
||||
if deep_ep_cuda_architectures:
|
||||
build_run(
|
||||
f"\"{venv_python}\" -m pybind11_stubgen -o . deep_ep_cpp_tllm --exit-code",
|
||||
env=env_ld)
|
||||
build_run(
|
||||
f"\"{venv_python}\" -m pybind11_stubgen -o . deep_gemm_cpp_tllm --exit-code",
|
||||
env=env_ld)
|
||||
generate_python_stubs_windows(binding_type, venv_python,
|
||||
pkg_dir, lib_dir)
|
||||
else: # on linux
|
||||
generate_python_stubs_linux(
|
||||
binding_type, venv_python,
|
||||
bool(deep_ep_cuda_architectures))
|
||||
|
||||
if not skip_building_wheel:
|
||||
if dist_dir is None:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user