Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import platform
import sys
from argparse import ArgumentParser
from contextlib import contextmanager
from functools import partial
from multiprocessing import cpu_count
from pathlib import Path
from shutil import copy, rmtree
from subprocess import CalledProcessError, check_output, run
from textwrap import dedent
from typing import List


@contextmanager
def working_directory(path):
    """Changes working directory and returns to previous on exit."""
    prev_cwd = Path.cwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(prev_cwd)


def get_project_dir():
    return Path(__file__).parent.resolve().parent


def get_source_dir():
    return get_project_dir() / "cpp"


def get_build_dir(build_dir, build_type):
    if build_dir is None:
        build_dir = get_source_dir() / ("build" if build_type == "Release" else
                                        f"build_{build_type}")
    else:
        build_dir = Path(build_dir)
    return build_dir


def clear_folder(folder_path):
    for item in os.listdir(folder_path):
        item_path = os.path.join(folder_path, item)
        if os.path.isdir(item_path):
            rmtree(item_path)
        else:
            os.remove(item_path)


def main(*,
         build_type: str = "Release",
         build_dir: Path = None,
         dist_dir: Path = None,
         cuda_architectures: str = None,
         job_count: int = None,
         extra_cmake_vars: List[str] = list(),
         extra_make_targets: str = "",
         trt_root: str = None,
         nccl_root: str = None,
         clean: bool = False,
         clean_wheel: bool = False,
         configure_cmake: bool = False,
         use_ccache: bool = False,
         fast_build: bool = False,
         cpp_only: bool = False,
         install: bool = False,
         skip_building_wheel: bool = False,
         python_bindings: bool = True,
         benchmarks: bool = False,
         micro_benchmarks: bool = False,
         nvtx: bool = False,
         skip_stubs: bool = False):
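    """Build the TensorRT-LLM wheel.

    A brief summary of the steps below: install the development requirements,
    check that the ``tensorrt`` pip package is present, configure and build the
    C++ tree with CMake, copy the built libraries and executables into the
    ``tensorrt_llm`` Python package, optionally generate pybind11 stubs, and
    finally build (and optionally pip-install) the wheel.
    """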

    if clean:
        clean_wheel = True

    project_dir = get_project_dir()
    os.chdir(project_dir)
    build_run = partial(run, shell=True, check=True)

    # Get all submodules and check that their folders exist. If they don't,
    # invoke git submodule update.
    with open(project_dir / ".gitmodules", "r") as submodules_f:
        submodules = [
            l.split("=")[1].strip() for l in submodules_f.readlines()
            if "path = " in l
        ]
    if any(not (project_dir / submodule / ".git").exists()
           for submodule in submodules):
        build_run('git submodule update --init --recursive')

    on_windows = platform.system() == "Windows"
    requirements_filename = "requirements-dev-windows.txt" if on_windows else "requirements-dev.txt"
    build_run(f"\"{sys.executable}\" -m pip install -r {requirements_filename}")
    # Ensure TRT is installed on Windows to prevent surprises.
    reqs = check_output([sys.executable, "-m", "pip", "freeze"])
    installed_packages = [r.decode().split("==")[0] for r in reqs.split()]
    if "tensorrt" not in installed_packages:
        error_msg = "TensorRT was not installed properly."
        if on_windows:
            error_msg += (
                " Please download the TensorRT zip file manually,"
                " install it and relaunch build_wheel.py."
                " See https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html#installing-zip for more details."
            )
        else:
            error_msg += " Please run `pip install tensorrt` manually and relaunch build_wheel.py."
        raise RuntimeError(error_msg)

    if cuda_architectures is not None:
        if "70-real" in cuda_architectures:
            raise RuntimeError("Volta architecture support is deprecated.")

    cmake_cuda_architectures = (
        f'"-DCMAKE_CUDA_ARCHITECTURES={cuda_architectures}"'
        if cuda_architectures is not None else "")

    cmake_def_args = []
    cmake_generator = ""

    hardware_arch = platform.machine()

    if on_windows:
        # Windows does not support multi-device currently.
        extra_cmake_vars.extend(["ENABLE_MULTI_DEVICE=0"])

        # The Ninja CMake generator is used for our Windows build
        # (easier than MSBuild to make compatible with our Docker image).
        cmake_generator = "-GNinja"

    if job_count is None:
        job_count = cpu_count()

    if len(extra_cmake_vars):
        # For backwards compatibility, semicolon-separated values are also expanded.
        # However, it is best to pass the flag multiple times because of issues
        # with spaces on the CLI.
        expanded_args = []
        for var in extra_cmake_vars:
            expanded_args += var.split(";")

        extra_cmake_vars = ["\"-D{}\"".format(var) for var in expanded_args]
        # Don't include duplicate definitions.
        cmake_def_args.extend(set(extra_cmake_vars))

    if trt_root is not None:
        trt_root = trt_root.replace("\\", "/")
        trt_lib_dir_candidates = (
            f"{trt_root}/targets/{hardware_arch}-linux-gnu/lib",
            f"{trt_root}/lib")
        try:
            trt_lib_dir = next(
                filter(lambda x: Path(x).exists(), trt_lib_dir_candidates))
        except StopIteration:
            trt_lib_dir = trt_lib_dir_candidates[0]
        cmake_def_args.append(f"-DTRT_LIB_DIR={trt_lib_dir}")
        cmake_def_args.append(f"-DTRT_INCLUDE_DIR={trt_root}/include")

    if nccl_root is not None:
        cmake_def_args.append(f"-DNCCL_LIB_DIR={nccl_root}/lib")
        cmake_def_args.append(f"-DNCCL_INCLUDE_DIR={nccl_root}/include")

    build_dir = get_build_dir(build_dir, build_type)
    first_build = not build_dir.exists()

    if clean and build_dir.exists():
        clear_folder(build_dir)  # Keep the folder in case it is mounted.
    build_dir.mkdir(parents=True, exist_ok=True)

    if use_ccache:
        cmake_def_args.append(
            f"-DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache"
        )

    if fast_build:
        cmake_def_args.append(f"-DFAST_BUILD=ON")

    build_pyt = "OFF" if cpp_only else "ON"
    th_common_lib = "" if cpp_only else "th_common"
    build_pybind = "OFF" if cpp_only else "ON"
    bindings_lib = "" if cpp_only else "bindings"
    benchmarks_lib = "benchmarks" if benchmarks else ""
    build_micro_benchmarks = "ON" if micro_benchmarks else "OFF"
    micro_benchmarks_lib = "micro_benchmarks" if micro_benchmarks else ""
    disable_nvtx = "OFF" if nvtx else "ON"
    executor_worker = "" if on_windows else "executorWorker "

    source_dir = get_source_dir()
    with working_directory(build_dir):
        cmake_def_args = " ".join(cmake_def_args)
        if clean or first_build or configure_cmake:
            build_run(
                f'cmake -DCMAKE_BUILD_TYPE="{build_type}" -DBUILD_PYT="{build_pyt}" -DBUILD_PYBIND="{build_pybind}"'
                f' -DNVTX_DISABLE="{disable_nvtx}" -DBUILD_MICRO_BENCHMARKS={build_micro_benchmarks}'
                f' {cmake_cuda_architectures} {cmake_def_args} {cmake_generator} -S "{source_dir}"'
            )
        build_run(
            f'cmake --build . --config {build_type} --parallel {job_count} '
            f'--target tensorrt_llm nvinfer_plugin_tensorrt_llm {th_common_lib} {bindings_lib} {benchmarks_lib} '
            f'{micro_benchmarks_lib} {executor_worker} {" ".join(extra_make_targets)}'
        )

    if cpp_only:
        assert not install, "Installing is not supported for cpp_only builds"
        return
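
    # Copy the freshly built native libraries into the package's `libs` folder
    # so that they ship inside the Python package.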
    pkg_dir = project_dir / "tensorrt_llm"
    assert pkg_dir.is_dir(), f"{pkg_dir} is not a directory"
    lib_dir = pkg_dir / "libs"
    if lib_dir.exists():
        clear_folder(lib_dir)
    lib_dir.mkdir(parents=True, exist_ok=True)
    if on_windows:
        copy(build_dir / "tensorrt_llm/tensorrt_llm.dll",
             lib_dir / "tensorrt_llm.dll")
        copy(build_dir / f"tensorrt_llm/thop/th_common.dll",
             lib_dir / "th_common.dll")
        copy(
            build_dir / f"tensorrt_llm/plugins/nvinfer_plugin_tensorrt_llm.dll",
            lib_dir / "nvinfer_plugin_tensorrt_llm.dll")
        copy(
            build_dir /
            "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/tensorrt_llm_nvrtc_wrapper.dll",
            lib_dir / "tensorrt_llm_nvrtc_wrapper.dll")
    else:
        copy(build_dir / "tensorrt_llm/libtensorrt_llm.so",
             lib_dir / "libtensorrt_llm.so")
        copy(build_dir / "tensorrt_llm/thop/libth_common.so",
             lib_dir / "libth_common.so")
        copy(
            build_dir /
            "tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so",
            lib_dir / "libnvinfer_plugin_tensorrt_llm.so")
        copy(
            build_dir /
            "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/libtensorrt_llm_nvrtc_wrapper.so",
            lib_dir / "libtensorrt_llm_nvrtc_wrapper.so")
        copy(
            build_dir /
            "tensorrt_llm/batch_manager/libtensorrt_llm_ucx_wrapper.so",
            lib_dir / "libtensorrt_llm_ucx_wrapper.so")
        copy(
            build_dir /
            "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/libdecoder_attention_0.so",
            lib_dir / "libdecoder_attention_0.so")
        copy(
            build_dir /
            "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/libdecoder_attention_1.so",
            lib_dir / "libdecoder_attention_1.so")

    bin_dir = pkg_dir / "bin"
    if bin_dir.exists():
        clear_folder(bin_dir)
    bin_dir.mkdir(parents=True, exist_ok=True)

    if not on_windows:
        copy(build_dir / "tensorrt_llm/executor_worker/executorWorker",
             bin_dir / "executorWorker")

    if not cpp_only:

        def get_pybind_lib():
            pybind_build_dir = (build_dir / "tensorrt_llm" / "pybind")
            if on_windows:
                pybind_lib = list(pybind_build_dir.glob("bindings.*.pyd"))
            else:
                pybind_lib = list(pybind_build_dir.glob("bindings.*.so"))

            assert len(
                pybind_lib
            ) == 1, f"Exactly one pybind library should be present: {pybind_lib}"
            return pybind_lib[0]

        copy(get_pybind_lib(), pkg_dir)
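        # Generate .pyi type stubs for the compiled bindings module with
        # pybind11-stubgen (skipped when --skip-stubs is passed).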
        if not skip_stubs:
            with working_directory(project_dir):
                build_run(
                    f"\"{sys.executable}\" -m pip install pybind11-stubgen")
            with working_directory(pkg_dir):
                if on_windows:
                    stubgen = "stubgen.py"
                    stubgen_contents = """
                    # Loading torch, trt before bindings is required to avoid import errors on windows.
                    # isort: off
                    import torch
                    import tensorrt as trt
                    # isort: on
                    import os
                    import platform

                    from pybind11_stubgen import main

                    if __name__ == "__main__":
                        # Load dlls from `libs` directory before launching bindings.
                        if platform.system() == "Windows":
                            os.add_dll_directory(r\"{lib_dir}\")
                        main()
                    """.format(lib_dir=lib_dir)
                    (pkg_dir / stubgen).write_text(dedent(stubgen_contents))
                    build_run(f"\"{sys.executable}\" {stubgen} -o . bindings")
                    (pkg_dir / stubgen).unlink()
                else:
                    env_ld = os.environ.copy()

                    new_library_path = "/usr/local/cuda/compat/lib.real"
                    if 'LD_LIBRARY_PATH' in env_ld:
                        new_library_path += f":{env_ld['LD_LIBRARY_PATH']}"
                    env_ld["LD_LIBRARY_PATH"] = new_library_path
                    try:
                        build_run(
                            f"\"{sys.executable}\" -m pybind11_stubgen -o . bindings --exit-code",
                            env=env_ld)
                    except CalledProcessError as ex:
                        print(f"Failed to run pybind11-stubgen: {ex}",
                              file=sys.stderr)
                        exit(1)

    if not skip_building_wheel:
        if dist_dir is None:
            dist_dir = project_dir / "build"
        else:
            dist_dir = Path(dist_dir)

        if not dist_dir.exists():
            dist_dir.mkdir(parents=True)

        if clean_wheel:
            # For an incremental build, the python build module adds
            # new files but does not remove deleted files.
            #
            # This breaks the Windows CI/CD pipeline when building
            # and validating python changes in the whl.
            clear_folder(dist_dir)

        build_run(
            f'\"{sys.executable}\" -m build {project_dir} --skip-dependency-check --no-isolation --wheel --outdir "{dist_dir}"'
        )

    if install:
        build_run(f"\"{sys.executable}\" -m pip install -e .[devel]")


def add_arguments(parser: ArgumentParser):
    parser.add_argument("--build_type",
                        "-b",
                        default="Release",
                        choices=["Release", "RelWithDebInfo", "Debug"])
    parser.add_argument("--cuda_architectures", "-a")
    parser.add_argument("--install", "-i", action="store_true")
    parser.add_argument("--clean", "-c", action="store_true")
    parser.add_argument("--clean_wheel",
                        action="store_true",
                        help="Clear dist_dir folder before creating the wheel")
    parser.add_argument("--configure_cmake",
                        action="store_true",
                        help="Always configure cmake before building")
    parser.add_argument("--use_ccache",
                        "-ccache",
                        default=False,
                        action="store_true",
                        help="Use ccache compiler driver")
    parser.add_argument(
        "--fast_build",
        "-f",
        default=False,
        action="store_true",
        help=
        "Skip compiling some kernels to accelerate compilation -- for development only"
    )
    parser.add_argument("--job_count",
                        "-j",
                        const=cpu_count(),
                        nargs="?",
                        help="Parallel job count")
    parser.add_argument(
        "--cpp_only",
        "-l",
        action="store_true",
        help="Only build the C++ library without Python dependencies")
    parser.add_argument(
        "--extra-cmake-vars",
        "-D",
        action="append",
        help=
        "Extra cmake variable definition which can be specified multiple times, example: -D \"key1=value1\" -D \"key2=value2\"",
        default=[])
    parser.add_argument(
        "--extra-make-targets",
        help="A list of additional make targets, example: \"target_1 target_2\"",
        nargs="+",
        default=[])
    parser.add_argument("--trt_root",
                        help="Directory to find TensorRT headers/libs")
    parser.add_argument("--nccl_root",
                        help="Directory to find NCCL headers/libs")
    parser.add_argument("--build_dir",
                        type=Path,
                        help="Directory where cpp sources are built")
    parser.add_argument("--dist_dir",
                        type=Path,
                        help="Directory where python wheels are built")
    parser.add_argument(
        "--skip_building_wheel",
        "-s",
        action="store_true",
        help=
        "Do not build the *.whl files (they are only needed for distribution).")
    parser.add_argument(
        "--python_bindings",
        "-p",
        action="store_true",
        help="(deprecated) Build the python bindings for the C++ runtime.")
    parser.add_argument("--benchmarks",
                        action="store_true",
                        help="Build the benchmarks for the C++ runtime.")
    parser.add_argument("--micro_benchmarks",
                        action="store_true",
                        help="Build the micro benchmarks for C++ components.")
    parser.add_argument("--nvtx",
                        action="store_true",
                        help="Enable NVTX features.")
    parser.add_argument("--skip-stubs",
                        action="store_true",
                        help="Skip building python stubs")


if __name__ == "__main__":
    parser = ArgumentParser()
    add_arguments(parser)
    args = parser.parse_args()
    main(**vars(args))
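
# Example invocation (a sketch of a typical Linux development build; adjust the
# flags, in particular --cuda_architectures, to your environment):
#
#   python3 build_wheel.py --clean --use_ccache --cuda_architectures "90-real"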