Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import platform
import sys
from argparse import ArgumentParser
from contextlib import contextmanager
from functools import partial
from multiprocessing import cpu_count
from pathlib import Path
from shutil import copy, rmtree
from subprocess import CalledProcessError, check_output, run
from textwrap import dedent
from typing import List


@contextmanager
def working_directory(path):
    """Changes working directory and returns to previous on exit."""
    prev_cwd = Path.cwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(prev_cwd)


def get_project_dir():
    return Path(__file__).parent.resolve().parent


def get_source_dir():
    return get_project_dir() / "cpp"


def get_build_dir(build_dir, build_type):
    if build_dir is None:
        build_dir = get_source_dir() / ("build" if build_type == "Release" else
                                        f"build_{build_type}")
    else:
        build_dir = Path(build_dir)
    return build_dir


def clear_folder(folder_path):
    for item in os.listdir(folder_path):
        item_path = os.path.join(folder_path, item)
        if os.path.isdir(item_path):
            rmtree(item_path)
        else:
            os.remove(item_path)


def main(*,
         build_type: str = "Release",
         build_dir: Path = None,
         dist_dir: Path = None,
         cuda_architectures: str = None,
         job_count: int = None,
         extra_cmake_vars: List[str] = list(),
         extra_make_targets: str = "",
         trt_root: str = None,
         nccl_root: str = None,
         clean: bool = False,
         clean_wheel: bool = False,
         configure_cmake: bool = False,
         use_ccache: bool = False,
         fast_build: bool = False,
         cpp_only: bool = False,
         install: bool = False,
         skip_building_wheel: bool = False,
         python_bindings: bool = True,
         benchmarks: bool = False,
         micro_benchmarks: bool = False,
         nvtx: bool = False,
         skip_stubs: bool = False):
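    """Build the TensorRT-LLM wheel.

    A brief summary of the steps below: install the development requirements,
    check that the ``tensorrt`` pip package is present, configure and build the
    C++ tree with CMake, copy the built libraries and executables into the
    ``tensorrt_llm`` Python package, optionally generate pybind11 stubs, and
    finally build (and optionally pip-install) the wheel.
    """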

    if clean:
        clean_wheel = True

    project_dir = get_project_dir()
    os.chdir(project_dir)
    build_run = partial(run, shell=True, check=True)

    # Get all submodules and check that their folders exist. If they don't,
    # invoke git submodule update.
    with open(project_dir / ".gitmodules", "r") as submodules_f:
        submodules = [
            l.split("=")[1].strip() for l in submodules_f.readlines()
            if "path = " in l
        ]
    if any(not (project_dir / submodule / ".git").exists()
           for submodule in submodules):
        build_run('git submodule update --init --recursive')

    on_windows = platform.system() == "Windows"
    requirements_filename = "requirements-dev-windows.txt" if on_windows else "requirements-dev.txt"
    build_run(f"\"{sys.executable}\" -m pip install -r {requirements_filename}")
    # Ensure TRT is installed on Windows to prevent surprises.
    reqs = check_output([sys.executable, "-m", "pip", "freeze"])
    installed_packages = [r.decode().split("==")[0] for r in reqs.split()]
    if "tensorrt" not in installed_packages:
        error_msg = "TensorRT was not installed properly."
        if on_windows:
            error_msg += (
                " Please download the TensorRT zip file manually,"
                " install it and relaunch build_wheel.py."
                " See https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html#installing-zip for more details."
            )
        else:
            error_msg += " Please run `pip install tensorrt` manually and relaunch build_wheel.py."
        raise RuntimeError(error_msg)

    if cuda_architectures is not None:
        if "70-real" in cuda_architectures:
            raise RuntimeError("Volta architecture support is deprecated.")

    cmake_cuda_architectures = (
        f'"-DCMAKE_CUDA_ARCHITECTURES={cuda_architectures}"'
        if cuda_architectures is not None else "")

    cmake_def_args = []
    cmake_generator = ""

    hardware_arch = platform.machine()

    if on_windows:
        # Windows does not support multi-device currently.
        extra_cmake_vars.extend(["ENABLE_MULTI_DEVICE=0"])

        # The Ninja CMake generator is used for our Windows build
        # (easier than MSBuild to make compatible with our Docker image).
        cmake_generator = "-GNinja"

    if job_count is None:
        job_count = cpu_count()

    if len(extra_cmake_vars):
        # For backwards compatibility, semicolon-separated values are also expanded.
        # However, it is best to pass the flag multiple times because of issues
        # with spaces on the CLI.
        expanded_args = []
        for var in extra_cmake_vars:
            expanded_args += var.split(";")

        extra_cmake_vars = ["\"-D{}\"".format(var) for var in expanded_args]
        # Don't include duplicate definitions.
        cmake_def_args.extend(set(extra_cmake_vars))

    if trt_root is not None:
        trt_root = trt_root.replace("\\", "/")
        trt_lib_dir_candidates = (
            f"{trt_root}/targets/{hardware_arch}-linux-gnu/lib",
            f"{trt_root}/lib")
        try:
            trt_lib_dir = next(
                filter(lambda x: Path(x).exists(), trt_lib_dir_candidates))
        except StopIteration:
            trt_lib_dir = trt_lib_dir_candidates[0]
        cmake_def_args.append(f"-DTRT_LIB_DIR={trt_lib_dir}")
        cmake_def_args.append(f"-DTRT_INCLUDE_DIR={trt_root}/include")

    if nccl_root is not None:
        cmake_def_args.append(f"-DNCCL_LIB_DIR={nccl_root}/lib")
        cmake_def_args.append(f"-DNCCL_INCLUDE_DIR={nccl_root}/include")

    build_dir = get_build_dir(build_dir, build_type)
    first_build = not build_dir.exists()

    if clean and build_dir.exists():
        clear_folder(build_dir)  # Keep the folder in case it is mounted.
    build_dir.mkdir(parents=True, exist_ok=True)

    if use_ccache:
        cmake_def_args.append(
            f"-DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache"
        )

    if fast_build:
        cmake_def_args.append(f"-DFAST_BUILD=ON")

    build_pyt = "OFF" if cpp_only else "ON"
    th_common_lib = "" if cpp_only else "th_common"
    build_pybind = "OFF" if cpp_only else "ON"
    bindings_lib = "" if cpp_only else "bindings"
    benchmarks_lib = "benchmarks" if benchmarks else ""
    build_micro_benchmarks = "ON" if micro_benchmarks else "OFF"
    micro_benchmarks_lib = "micro_benchmarks" if micro_benchmarks else ""
    disable_nvtx = "OFF" if nvtx else "ON"
    executor_worker = "" if on_windows else "executorWorker "

    source_dir = get_source_dir()
    with working_directory(build_dir):
        cmake_def_args = " ".join(cmake_def_args)
        if clean or first_build or configure_cmake:
            build_run(
                f'cmake -DCMAKE_BUILD_TYPE="{build_type}" -DBUILD_PYT="{build_pyt}" -DBUILD_PYBIND="{build_pybind}"'
                f' -DNVTX_DISABLE="{disable_nvtx}" -DBUILD_MICRO_BENCHMARKS={build_micro_benchmarks}'
                f' {cmake_cuda_architectures} {cmake_def_args} {cmake_generator} -S "{source_dir}"'
            )
        build_run(
            f'cmake --build . --config {build_type} --parallel {job_count} '
            f'--target tensorrt_llm nvinfer_plugin_tensorrt_llm {th_common_lib} {bindings_lib} {benchmarks_lib} '
            f'{micro_benchmarks_lib} {executor_worker} {" ".join(extra_make_targets)}'
        )

    if cpp_only:
        assert not install, "Installing is not supported for cpp_only builds"
        return
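
    # Copy the freshly built native libraries into the package's `libs` folder
    # so that they ship inside the Python package.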
    pkg_dir = project_dir / "tensorrt_llm"
    assert pkg_dir.is_dir(), f"{pkg_dir} is not a directory"
    lib_dir = pkg_dir / "libs"
    if lib_dir.exists():
        clear_folder(lib_dir)
    lib_dir.mkdir(parents=True, exist_ok=True)
    if on_windows:
        copy(build_dir / "tensorrt_llm/tensorrt_llm.dll",
             lib_dir / "tensorrt_llm.dll")
        copy(build_dir / f"tensorrt_llm/thop/th_common.dll",
             lib_dir / "th_common.dll")
        copy(
            build_dir / f"tensorrt_llm/plugins/nvinfer_plugin_tensorrt_llm.dll",
            lib_dir / "nvinfer_plugin_tensorrt_llm.dll")
        copy(
            build_dir /
            "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/tensorrt_llm_nvrtc_wrapper.dll",
            lib_dir / "tensorrt_llm_nvrtc_wrapper.dll")
    else:
        copy(build_dir / "tensorrt_llm/libtensorrt_llm.so",
             lib_dir / "libtensorrt_llm.so")
        copy(build_dir / "tensorrt_llm/thop/libth_common.so",
             lib_dir / "libth_common.so")
        copy(
            build_dir /
            "tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so",
            lib_dir / "libnvinfer_plugin_tensorrt_llm.so")
        copy(
            build_dir /
            "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/libtensorrt_llm_nvrtc_wrapper.so",
            lib_dir / "libtensorrt_llm_nvrtc_wrapper.so")
        copy(
            build_dir /
            "tensorrt_llm/batch_manager/libtensorrt_llm_ucx_wrapper.so",
            lib_dir / "libtensorrt_llm_ucx_wrapper.so")
        copy(
            build_dir /
            "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/libdecoder_attention_0.so",
            lib_dir / "libdecoder_attention_0.so")
        copy(
            build_dir /
            "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/libdecoder_attention_1.so",
            lib_dir / "libdecoder_attention_1.so")

    bin_dir = pkg_dir / "bin"
    if bin_dir.exists():
        clear_folder(bin_dir)
    bin_dir.mkdir(parents=True, exist_ok=True)

    if not on_windows:
        copy(build_dir / "tensorrt_llm/executor_worker/executorWorker",
             bin_dir / "executorWorker")

    if not cpp_only:

        def get_pybind_lib():
            pybind_build_dir = (build_dir / "tensorrt_llm" / "pybind")
            if on_windows:
                pybind_lib = list(pybind_build_dir.glob("bindings.*.pyd"))
            else:
                pybind_lib = list(pybind_build_dir.glob("bindings.*.so"))

            assert len(
                pybind_lib
            ) == 1, f"Exactly one pybind library should be present: {pybind_lib}"
            return pybind_lib[0]

        copy(get_pybind_lib(), pkg_dir)
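        # Generate .pyi type stubs for the compiled bindings module with
        # pybind11-stubgen (skipped when --skip-stubs is passed).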
        if not skip_stubs:
            with working_directory(project_dir):
                build_run(
                    f"\"{sys.executable}\" -m pip install pybind11-stubgen")
            with working_directory(pkg_dir):
                if on_windows:
                    stubgen = "stubgen.py"
                    stubgen_contents = """
                    # Loading torch, trt before bindings is required to avoid import errors on windows.
                    # isort: off
                    import torch
                    import tensorrt as trt
                    # isort: on
                    import os
                    import platform

                    from pybind11_stubgen import main

                    if __name__ == "__main__":
                        # Load dlls from `libs` directory before launching bindings.
                        if platform.system() == "Windows":
                            os.add_dll_directory(r\"{lib_dir}\")
                        main()
                    """.format(lib_dir=lib_dir)
                    (pkg_dir / stubgen).write_text(dedent(stubgen_contents))
                    build_run(f"\"{sys.executable}\" {stubgen} -o . bindings")
                    (pkg_dir / stubgen).unlink()
                else:
                    env_ld = os.environ.copy()

                    new_library_path = "/usr/local/cuda/compat/lib.real"
                    if 'LD_LIBRARY_PATH' in env_ld:
                        new_library_path += f":{env_ld['LD_LIBRARY_PATH']}"
                    env_ld["LD_LIBRARY_PATH"] = new_library_path
                    try:
                        build_run(
                            f"\"{sys.executable}\" -m pybind11_stubgen -o . bindings --exit-code",
                            env=env_ld)
                    except CalledProcessError as ex:
                        print(f"Failed to run pybind11-stubgen: {ex}",
                              file=sys.stderr)
                        exit(1)

    if not skip_building_wheel:
        if dist_dir is None:
            dist_dir = project_dir / "build"
        else:
            dist_dir = Path(dist_dir)

        if not dist_dir.exists():
            dist_dir.mkdir(parents=True)

        if clean_wheel:
            # For an incremental build, the python build module adds
            # new files but does not remove deleted files.
            #
            # This breaks the Windows CI/CD pipeline when building
            # and validating python changes in the whl.
            clear_folder(dist_dir)

        build_run(
            f'\"{sys.executable}\" -m build {project_dir} --skip-dependency-check --no-isolation --wheel --outdir "{dist_dir}"'
        )

    if install:
        build_run(f"\"{sys.executable}\" -m pip install -e .[devel]")


def add_arguments(parser: ArgumentParser):
    parser.add_argument("--build_type",
                        "-b",
                        default="Release",
                        choices=["Release", "RelWithDebInfo", "Debug"])
    parser.add_argument("--cuda_architectures", "-a")
    parser.add_argument("--install", "-i", action="store_true")
    parser.add_argument("--clean", "-c", action="store_true")
    parser.add_argument("--clean_wheel",
                        action="store_true",
                        help="Clear dist_dir folder before creating the wheel")
    parser.add_argument("--configure_cmake",
                        action="store_true",
                        help="Always configure cmake before building")
    parser.add_argument("--use_ccache",
                        "-ccache",
                        default=False,
                        action="store_true",
                        help="Use ccache compiler driver")
    parser.add_argument(
        "--fast_build",
        "-f",
        default=False,
        action="store_true",
        help=
        "Skip compiling some kernels to accelerate compilation -- for development only"
    )
    parser.add_argument("--job_count",
                        "-j",
                        const=cpu_count(),
                        nargs="?",
                        help="Parallel job count")
    parser.add_argument(
        "--cpp_only",
        "-l",
        action="store_true",
        help="Only build the C++ library without Python dependencies")
    parser.add_argument(
        "--extra-cmake-vars",
        "-D",
        action="append",
        help=
        "Extra cmake variable definition which can be specified multiple times, example: -D \"key1=value1\" -D \"key2=value2\"",
        default=[])
    parser.add_argument(
        "--extra-make-targets",
        help="A list of additional make targets, example: \"target_1 target_2\"",
        nargs="+",
        default=[])
    parser.add_argument("--trt_root",
                        help="Directory to find TensorRT headers/libs")
    parser.add_argument("--nccl_root",
                        help="Directory to find NCCL headers/libs")
    parser.add_argument("--build_dir",
                        type=Path,
                        help="Directory where cpp sources are built")
    parser.add_argument("--dist_dir",
                        type=Path,
                        help="Directory where python wheels are built")
    parser.add_argument(
        "--skip_building_wheel",
        "-s",
        action="store_true",
        help=
        "Do not build the *.whl files (they are only needed for distribution).")
    parser.add_argument(
        "--python_bindings",
        "-p",
        action="store_true",
        help="(deprecated) Build the python bindings for the C++ runtime.")
    parser.add_argument("--benchmarks",
                        action="store_true",
                        help="Build the benchmarks for the C++ runtime.")
    parser.add_argument("--micro_benchmarks",
                        action="store_true",
                        help="Build the micro benchmarks for C++ components.")
    parser.add_argument("--nvtx",
                        action="store_true",
                        help="Enable NVTX features.")
    parser.add_argument("--skip-stubs",
                        action="store_true",
                        help="Skip building python stubs")


if __name__ == "__main__":
    parser = ArgumentParser()
    add_arguments(parser)
    args = parser.parse_args()
    main(**vars(args))
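
# Example invocation (a sketch of a typical Linux development build; adjust the
# flags, in particular --cuda_architectures, to your environment):
#
#   python3 build_wheel.py --clean --use_ccache --cuda_architectures "90-real"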