#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
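
# Example (illustrative) invocations, assuming this script lives one level
# below the repository root (e.g., as scripts/build_wheel.py):
#   python3 scripts/build_wheel.py --clean --use_ccache
#   python3 scripts/build_wheel.py -a "90-real" --benchmarks --install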

import os
import platform
import re
import shutil
import sys
import sysconfig
import tempfile
import warnings
from argparse import ArgumentParser
from contextlib import contextmanager
from functools import partial
from multiprocessing import cpu_count
from pathlib import Path
from shutil import copy, copytree, rmtree
from subprocess import DEVNULL, CalledProcessError, check_output, run
from textwrap import dedent
from typing import Sequence

try:
    from packaging.requirements import Requirement
except (ImportError, ModuleNotFoundError):
    from pip._vendor.packaging.requirements import Requirement

build_run = partial(run, shell=True, check=True)
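# `build_run` executes its command through the shell and, with check=True,
# raises subprocess.CalledProcessError on a non-zero exit code, so any failed
# build step aborts the script immediately.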


@contextmanager
def working_directory(path):
    """Changes working directory and returns to previous on exit."""
    prev_cwd = Path.cwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(prev_cwd)


def get_project_dir():
    return Path(__file__).parent.resolve().parent


def get_source_dir():
    return get_project_dir() / "cpp"


def get_build_dir(build_dir, build_type):
    if build_dir is None:
        build_dir = get_source_dir() / ("build" if build_type == "Release" else
                                        f"build_{build_type}")
    else:
        build_dir = Path(build_dir).resolve()
    return build_dir


def clear_folder(folder_path):
    for item in os.listdir(folder_path):
        item_path = os.path.join(folder_path, item)
        try:
            if os.path.isdir(item_path) and not os.path.islink(item_path):
                rmtree(item_path)
            else:
                os.remove(item_path)
        except (OSError, IOError) as e:
            print(f"Failed to remove {item_path}: {e}", file=sys.stderr)


def sysconfig_scheme(override_vars=None):
    # Backported 'venv' scheme from Python 3.11+
    if os.name == 'nt':
        scheme = {
            'purelib': '{base}/Lib/site-packages',
            'scripts': '{base}/Scripts',
        }
    else:
        scheme = {
            'purelib': '{base}/lib/python{py_version_short}/site-packages',
            'scripts': '{base}/bin',
        }

    vars_ = sysconfig.get_config_vars()
    if override_vars:
        vars_.update(override_vars)
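    # Illustrative result on Linux with base="/work/.venv-3.10":
    #   {"purelib": "/work/.venv-3.10/lib/python3.10/site-packages",
    #    "scripts": "/work/.venv-3.10/bin"}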
    return {key: value.format(**vars_) for key, value in scheme.items()}


def create_venv(project_dir: Path):
    py_major = sys.version_info.major
    py_minor = sys.version_info.minor
    venv_prefix = project_dir / f".venv-{py_major}.{py_minor}"
    print(
        f"-- Using virtual environment at: {venv_prefix} (Python {py_major}.{py_minor})"
    )

    # Ensure a compatible virtualenv version is installed (>=20.29.1, <22.0)
    print("-- Ensuring virtualenv version >=20.29.1,<22.0 is installed...")
    build_run(f'"{sys.executable}" -m pip install "virtualenv>=20.29.1,<22.0"')

    # Create the venv if it doesn't exist
    if not venv_prefix.exists():
        print(f"-- Creating virtual environment in {venv_prefix}...")
        build_run(
            f'"{sys.executable}" -m virtualenv --system-site-packages "{venv_prefix}"'
        )
    else:
        print("-- Virtual environment already exists.")

    return venv_prefix


def setup_venv(project_dir: Path, requirements_file: Path,
               no_venv: bool) -> tuple[Path, Path]:
    """Creates/updates a venv and installs requirements.

    Args:
        project_dir: The root directory of the project.
        requirements_file: Path to the requirements file.
        no_venv: Use the current Python environment as is.

    Returns:
        Tuple[Path, Path]: Paths to the python and conan executables in the venv.
    """
    if no_venv or sys.prefix != sys.base_prefix:
        reason = "Explicitly requested by user" if no_venv else "Already inside virtual environment"
        print(f"-- {reason}, using environment {sys.prefix} as is.")
        venv_prefix = Path(sys.prefix)
    else:
        venv_prefix = create_venv(project_dir)

    scheme = sysconfig_scheme({'base': venv_prefix})
    # Determine venv executable paths
    scripts_dir = Path(scheme["scripts"])
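    # Map the current interpreter's path relative to its prefix into the venv
    # prefix (e.g., "<venv>/bin/python3"); removeprefix plus the [1:] slice
    # drops the leading path separator.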
    venv_python = venv_prefix / sys.executable.removeprefix(sys.prefix)[1:]

    if os.environ.get("NVIDIA_PYTORCH_VERSION"):
        # Ensure PyPI PyTorch is not installed in the venv
        purelib_dir = Path(scheme["purelib"])
        pytorch_package_dir = purelib_dir / "torch"
        if str(venv_prefix) != sys.base_prefix and pytorch_package_dir.exists():
            warnings.warn(
                f"Using the NVIDIA PyTorch container with PyPI distributed PyTorch may lead to compatibility issues.\n"
                f"If you encounter any problems, please delete the environment at `{venv_prefix}` so that "
                f"`build_wheel.py` can recreate a virtual environment using the container-provided PyTorch installation."
            )
            print("^^^^^^^^^^ IMPORTANT WARNING ^^^^^^^^^^", file=sys.stderr)
            input("Press Ctrl+C to stop, or Enter to continue...\n")

        # Ensure the inherited PyTorch version is compatible
        try:
            info = check_output(
                [str(venv_python), "-m", "pip", "show", "torch"])
        except CalledProcessError:
            raise RuntimeError(
                "NVIDIA PyTorch container detected, but cannot find a PyTorch installation. "
                "The environment is corrupted. Please recreate your container.")
        version_installed = next(
            line.removeprefix("Version: ")
            for line in info.decode().splitlines()
            if line.startswith("Version: "))
        version_required = None
        try:
            with open(requirements_file) as fp:
                for line in fp:
                    if line.startswith("torch"):
                        version_required = Requirement(line)
                        break
        except FileNotFoundError:
            pass

        if version_required is not None:
            if version_installed not in version_required.specifier:
                raise RuntimeError(
                    f"Incompatible NVIDIA PyTorch container detected. "
                    f"The container provides PyTorch version {version_installed}, "
                    f"but the current revision requires {version_required}. "
                    f"Please recreate your container using the image specified in jenkins/current_image_tags.properties. "
                    f"NOTE: Please don't try to install PyTorch using pip. "
                    f"Using the NVIDIA PyTorch container with PyPI distributed PyTorch may lead to compatibility issues."
                )

    # Install/update requirements
    print(
        f"-- Installing requirements from {requirements_file} into {venv_prefix}..."
    )
    build_run(f'"{venv_python}" -m pip install -r "{requirements_file}"')

    venv_conan = setup_conan(scripts_dir, venv_python)

    return venv_python, venv_conan


def setup_conan(scripts_dir, venv_python):
    build_run(f'"{venv_python}" -m pip install conan==2.14.0')
    # Determine the path to the conan executable within the venv
    venv_conan = scripts_dir / "conan"
    if not venv_conan.exists():
        # Attempt to find it using shutil.which as a fallback, in case it's already installed in the system
        try:
            result = build_run(
                f'''{venv_python} -c "import shutil; print(shutil.which('conan') or '')" ''',
                capture_output=True,
                text=True)
            conan_path_str = result.stdout.strip()

            if conan_path_str:
                venv_conan = Path(conan_path_str)
                print(
                    f"-- Found conan executable via PATH search at: {venv_conan}"
                )
            else:
                raise RuntimeError(
                    f"Failed to locate conan executable in virtual environment {scripts_dir} or system PATH."
                )

        except CalledProcessError as e:
            print(f"Fallback search command output: {e.stdout}",
                  file=sys.stderr)
            print(f"Fallback search command error: {e.stderr}", file=sys.stderr)
            raise RuntimeError(
                f"Failed to locate conan executable in virtual environment {scripts_dir} or system PATH."
            )
    else:
        print(f"-- Found conan executable at: {venv_conan}")

    # Create the default profile
    build_run(f'"{venv_conan}" profile detect -f')

    # Add (or update) the TensorRT-LLM conan remote; --force makes this idempotent
    build_run(
        f'"{venv_conan}" remote add --force TensorRT-LLM https://edge.urm.nvidia.com/artifactory/api/conan/sw-tensorrt-llm-conan',
        stdout=DEVNULL,
        stderr=DEVNULL)

    return venv_conan


def generate_fmha_cu(project_dir, venv_python):
    fmha_v2_cu_dir = project_dir / "cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmha_v2_cu"
    fmha_v2_cu_dir.mkdir(parents=True, exist_ok=True)

    fmha_v2_dir = project_dir / "cpp/kernels/fmha_v2"
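
    # Toggles consumed by the fmha_v2 generator invoked below: the ENABLE_SM*
    # flags gate which GPU architectures get kernels generated, and
    # GENERATE_CU_TRTLLM requests .cu source output consumed by the
    # TensorRT-LLM build.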
    env = os.environ.copy()
    env.update({
        "TORCH_CUDA_ARCH_LIST": "9.0",
        "ENABLE_SM89_QMMA": "1",
        "ENABLE_HMMA_FP32": "1",
        "GENERATE_CUBIN": "1",
        "SCHEDULING_MODE": "1",
        "ENABLE_SM100": "1",
        "ENABLE_SM120": "1",
        "GENERATE_CU_TRTLLM": "true"
    })

    shutil.rmtree(fmha_v2_dir / "generated", ignore_errors=True)
    shutil.rmtree(fmha_v2_dir / "temp", ignore_errors=True)
    shutil.rmtree(fmha_v2_dir / "obj", ignore_errors=True)
    build_run("python3 setup.py", env=env, cwd=fmha_v2_dir)

    # Only touch generated source files if the content has been updated
    def move_if_updated(src, dst):
        with open(src, "rb") as f:
            new_content = f.read()
        try:
            with open(dst, "rb") as f:
                old_content = f.read()
        except FileNotFoundError:
            old_content = None

        if old_content != new_content:
            shutil.move(src, dst)

    # Copy the generated header file when the cu path is active and cubins are deleted.
    cubin_dir = project_dir / "cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin"
    move_if_updated(fmha_v2_dir / "generated/fmha_cubin.h",
                    cubin_dir / "fmha_cubin.h")

    # Copy the generated source file (fmha_cubin.cpp) to the same directory as the header
    cpp_src = fmha_v2_dir / "generated/fmha_cubin.cpp"
    if cpp_src.exists():
        move_if_updated(cpp_src, cubin_dir / "fmha_cubin.cpp")

    generated_files = set()
    for cu_file in (fmha_v2_dir / "generated").glob("*sm*.cu"):
        dst_file = fmha_v2_cu_dir / os.path.basename(cu_file)
        move_if_updated(cu_file, dst_file)
        generated_files.add(str(dst_file.resolve()))

    # Remove extra files
    for root, _, files in os.walk(fmha_v2_cu_dir):
        for file in files:
            file_path = os.path.realpath(os.path.join(root, file))
            if file_path not in generated_files:
                os.remove(file_path)


def create_cuda_stub_links(cuda_stub_dir: str, missing_libs: list[str]) -> str:
    """
    Creates symbolic links for CUDA stub libraries in a temporary directory.

    Args:
        cuda_stub_dir (str): Path to the directory containing CUDA stubs.
        missing_libs: Versioned names of the missing libraries.

    Returns:
        str: Path to the temporary directory where links were created.
    """
    cuda_stub_path = Path(cuda_stub_dir)
    if not cuda_stub_path.exists():
        raise RuntimeError(
            f"CUDA stub directory '{cuda_stub_dir}' does not exist.")

    # Create a temporary directory for the symbolic links
    temp_dir = tempfile.mkdtemp(prefix="cuda_stub_links_")
    temp_dir_path = Path(temp_dir)
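
    # ldd reports versioned SONAMEs (e.g., "libcuda.so.1"), while the CUDA stub
    # directory ships unversioned files (e.g., "libcuda.so"), so versioned
    # symlinks pointing at the stubs are created to satisfy the dynamic linker.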
    version_pattern = r'\.\d+'
    for missing_lib in filter(lambda x: re.search(version_pattern, x),
                              missing_libs):
        # `so` is `missing_lib` with its trailing '.' and digits (the version
        # suffix) removed, resolved inside the stub directory
        so = cuda_stub_path / re.sub(version_pattern, '', missing_lib)
        so_versioned = temp_dir_path / missing_lib

        # Check if the library exists in the original directory
        if so.exists():
            try:
                # Create the symbolic link in the temporary directory
                so_versioned.symlink_to(so)
            except OSError as e:
                # Clean up the temporary directory on error
                rmtree(temp_dir)
                raise RuntimeError(
                    f"Failed to create symbolic link for '{missing_lib}' in temporary directory '{temp_dir}': {e}"
                )
        else:
            warnings.warn(
                f"Source library '{so}' does not exist and was skipped.")

    # Return the path to the temporary directory where the links were created
    return str(temp_dir_path)


def check_missing_libs(lib_name: str) -> list[str]:
    result = build_run(f"ldd {lib_name}", capture_output=True, text=True)
    missing = []
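    # Missing dependencies appear in ldd output as lines like:
    #   libcuda.so.1 => not found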
    for line in result.stdout.splitlines():
        if "not found" in line:
            # Extract the library name before "=> not found"
            missing_name = line.split()[0]
            if missing_name not in missing:
                missing.append(missing_name)
    return missing


def generate_python_stubs_linux(binding_type: str, venv_python: Path,
                                deep_ep: bool, flash_mla: bool,
                                transfer_agent_binding: bool,
                                binding_lib_name: str):
    is_nanobind = binding_type == "nanobind"
    if is_nanobind:
        build_run(f"\"{venv_python}\" -m pip install nanobind")
    build_run(f"\"{venv_python}\" -m pip install pybind11-stubgen")

    env_stub_gen = os.environ.copy()
    cuda_home_dir = env_stub_gen.get("CUDA_HOME") or env_stub_gen.get(
        "CUDA_PATH") or "/usr/local/cuda"
    missing_libs = check_missing_libs(binding_lib_name)
    cuda_stub_dir = f"{cuda_home_dir}/lib64/stubs"

    if missing_libs and Path(cuda_stub_dir).exists():
        # Create symbolic links for the CUDA stubs
        link_dir = create_cuda_stub_links(cuda_stub_dir, missing_libs)
        ld_library_path = env_stub_gen.get("LD_LIBRARY_PATH")
        env_stub_gen["LD_LIBRARY_PATH"] = ":".join(
            filter(None, [link_dir, cuda_stub_dir, ld_library_path]))
    else:
        link_dir = None

    try:
        if is_nanobind:
            build_run(
                f"\"{venv_python}\" -m nanobind.stubgen -m bindings -r -O .",
                env=env_stub_gen)
        else:
            build_run(
                f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code",
                env=env_stub_gen)
        build_run(
            f"\"{venv_python}\" -m pybind11_stubgen -o . deep_gemm_cpp_tllm --exit-code",
            env=env_stub_gen)
        if flash_mla:
            build_run(
                f"\"{venv_python}\" -m pybind11_stubgen -o . flash_mla_cpp_tllm --exit-code",
                env=env_stub_gen)
        if deep_ep:
            build_run(
                f"\"{venv_python}\" -m pybind11_stubgen -o . deep_ep_cpp_tllm --exit-code",
                env=env_stub_gen)
        if transfer_agent_binding:
            # Generate stubs for tensorrt_llm_transfer_agent_binding
            if is_nanobind:
                build_run(
                    f"\"{venv_python}\" -m nanobind.stubgen -m tensorrt_llm_transfer_agent_binding -O .",
                    env=env_stub_gen)
            else:
                build_run(
                    f"\"{venv_python}\" -m pybind11_stubgen -o . tensorrt_llm_transfer_agent_binding --exit-code",
                    env=env_stub_gen)
    finally:
        if link_dir:
            rmtree(link_dir)


def generate_python_stubs_windows(binding_type: str, venv_python: Path,
                                  pkg_dir: Path, lib_dir: Path):
    if binding_type == "nanobind":
        print("Windows is not yet supported for nanobind stubs")
        exit(1)
    else:
        build_run(f"\"{venv_python}\" -m pip install pybind11-stubgen")
        stubgen = "stubgen.py"
        stubgen_contents = """
        # Loading torch and trt before bindings is required to avoid import errors on Windows.
        # isort: off
        import torch
        import tensorrt as trt
        # isort: on
        import os
        import platform

        from pybind11_stubgen import main

        if __name__ == "__main__":
            # Load dlls from the `libs` directory before launching bindings.
            if platform.system() == "Windows":
                os.add_dll_directory(r"{lib_dir}")
            main()
        """.format(lib_dir=lib_dir)
        (pkg_dir / stubgen).write_text(dedent(stubgen_contents))
        build_run(f"\"{venv_python}\" {stubgen} -o . bindings")
        (pkg_dir / stubgen).unlink()


def build_kv_cache_manager_v2(project_dir, venv_python, use_mypyc=False):
    print("-- Building kv_cache_manager_v2...")
    kv_cache_mgr_dir = project_dir / "tensorrt_llm/runtime/kv_cache_manager_v2"
    runtime_dir = project_dir / "tensorrt_llm/runtime"

    # Clean up any existing mypyc artifacts in the runtime directory to prevent
    # stale inclusion when switching from --mypyc to a standard build
    if not use_mypyc:
        for so_file in runtime_dir.glob("*__mypyc*.so"):
            print(f"Removing stale mypyc artifact: {so_file}")
            so_file.unlink()

        # Also clean up any .so files inside kv_cache_manager_v2
        for so_file in kv_cache_mgr_dir.rglob("*.so"):
            print(f"Removing stale artifact: {so_file}")
            so_file.unlink()

    # Build rawref
    print("-- Building kv_cache_manager_v2 rawref extension...")
    rawref_dir = kv_cache_mgr_dir / "rawref"
    build_run(f'"{venv_python}" setup.py build_ext --inplace', cwd=rawref_dir)

    if use_mypyc:
        # Build mypyc
        print("-- Building kv_cache_manager_v2 mypyc extensions...")
        # setup_mypyc.py is in kv_cache_manager_v2 but is executed from the runtime dir
        setup_mypyc = kv_cache_mgr_dir / "setup_mypyc.py"
        build_run(f'"{venv_python}" "{setup_mypyc}" build_ext --inplace',
                  cwd=runtime_dir)

        # Verify that the shared library was generated
        if not list(runtime_dir.glob("*__mypyc*.so")):
            raise RuntimeError(
                "Failed to build kv_cache_manager_v2: no shared library was generated."
            )


def main(*,
         build_type: str = "Release",
         generator: str = "",
         build_dir: Path = None,
         dist_dir: Path = None,
         cuda_architectures: str = None,
         job_count: int = None,
         extra_cmake_vars: Sequence[str] = tuple(),
         extra_make_targets: Sequence[str] = tuple(),
         trt_root: str = '/usr/local/tensorrt',
         nccl_root: str = None,
         nixl_root: str = None,
         mooncake_root: str = None,
         internal_cutlass_kernels_root: str = None,
         clean: bool = False,
         clean_wheel: bool = False,
         configure_cmake: bool = False,
         use_ccache: bool = False,
         fast_build: bool = False,
         cpp_only: bool = False,
         install: bool = False,
         skip_building_wheel: bool = False,
         linking_install_binary: bool = False,
         binding_type: str = "nanobind",
         benchmarks: bool = False,
         micro_benchmarks: bool = False,
         nvtx: bool = False,
         skip_stubs: bool = False,
         generate_fmha: bool = False,
         no_venv: bool = False,
         nvrtc_dynamic_linking: bool = False,
         mypyc: bool = False):

    if clean:
        clean_wheel = True

    project_dir = get_project_dir()
    os.chdir(project_dir)

    # Get all submodules and check that their folders exist. If not,
    # invoke git submodule update.
    with open(project_dir / ".gitmodules", "r") as submodules_f:
        submodules = [
            l.split("=")[1].strip() for l in submodules_f.readlines()
            if "path = " in l
        ]
    if any(not (project_dir / submodule / ".git").exists()
           for submodule in submodules):
        build_run('git submodule update --init --recursive')
    on_windows = platform.system() == "Windows"
    requirements_filename = "requirements-dev-windows.txt" if on_windows else "requirements-dev.txt"

    # Set up the venv and install requirements
    venv_python, venv_conan = setup_venv(project_dir,
                                         project_dir / requirements_filename,
                                         no_venv)

    # Ensure base TRT is installed (check inside the venv)
    try:
        check_output([str(venv_python), "-m", "pip", "show", "tensorrt"])
    except CalledProcessError:
        error_msg = "TensorRT was not installed properly."
        if on_windows:
            error_msg += (
                " Please download the TensorRT zip file manually,"
                " install it and relaunch build_wheel.py."
                " See https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html#installing-zip for more details."
            )
        else:
            error_msg += f" Please install tensorrt into the venv using \"`{venv_python}` -m pip install tensorrt\" and relaunch build_wheel.py"
        raise RuntimeError(error_msg)

    if cuda_architectures is not None:
        if "70-real" in cuda_architectures:
            raise RuntimeError(
                "The Volta architecture (70-real) is no longer supported.")

    cuda_architectures = cuda_architectures or 'all'
    cmake_cuda_architectures = f'"-DCMAKE_CUDA_ARCHITECTURES={cuda_architectures}"'

    cmake_def_args = []
    cmake_generator = ""

    if on_windows:
        # Windows does not currently support multi-device.
        extra_cmake_vars = list(extra_cmake_vars) + ["ENABLE_MULTI_DEVICE=0"]

        # The Ninja CMake generator is used for our Windows build
        # (easier than MSBuild to make compatible with our Docker image)

    if generator:
        cmake_generator = "-G" + generator

    if job_count is None:
        job_count = cpu_count()

    if len(extra_cmake_vars):
        # For backwards compatibility, we also support semicolon expansion within each value.
        # However, it is best to pass the flag multiple times due to issues with spaces in the CLI.
        expanded_args = []
        for var in extra_cmake_vars:
            expanded_args += var.split(";")

        extra_cmake_vars = ["\"-D{}\"".format(var) for var in expanded_args]
        # Don't include duplicate conditions
        cmake_def_args.extend(set(extra_cmake_vars))

    if trt_root is not None:
        cmake_def_args.append(f"-DTensorRT_ROOT={trt_root}")

    if nccl_root is not None:
        cmake_def_args.append(f"-DNCCL_ROOT={nccl_root}")

    if nixl_root is not None:
        cmake_def_args.append(f"-DNIXL_ROOT={nixl_root}")

    if mooncake_root is not None:
        if on_windows:
            raise RuntimeError("Mooncake is not supported on Windows.")
        cmake_def_args.append(f"-DMOONCAKE_ROOT={mooncake_root}")

    build_dir = get_build_dir(build_dir, build_type)
    first_build = not Path(build_dir, "CMakeFiles").exists()

    if clean and build_dir.exists():
        clear_folder(build_dir)  # Keep the folder in case it is mounted.
    build_dir.mkdir(parents=True, exist_ok=True)

    def get_binding_type_from_cache():
        cmake_cache_file = build_dir / "CMakeCache.txt"
        if not cmake_cache_file.exists():
            return None

        with open(cmake_cache_file, 'r') as f:
            for line in f:
                if line.startswith("BINDING_TYPE:STRING="):
                    cached_binding_type = line.split("=", 1)[1].strip()
                    if cached_binding_type in ['pybind', 'nanobind']:
                        return cached_binding_type
        return None

    cached_binding_type = get_binding_type_from_cache()
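
    # Switching --binding_type between pybind and nanobind invalidates the
    # previous binding artifacts, so remove them and force a CMake reconfigure.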
    if not first_build and cached_binding_type != binding_type:
        # Clean up previous binding build artifacts
        nanobind_dir = build_dir / "tensorrt_llm" / "nanobind"
        if nanobind_dir.exists():
            rmtree(nanobind_dir)
        nanobind_stub_dir = project_dir / "tensorrt_llm" / "bindings"
        if nanobind_stub_dir.exists():
            rmtree(nanobind_stub_dir)

        pybind_dir = build_dir / "tensorrt_llm" / "pybind"
        if pybind_dir.exists():
            rmtree(pybind_dir)
        pybind_stub_dir = project_dir / "tensorrt_llm" / "bindings"
        if pybind_stub_dir.exists():
            rmtree(pybind_stub_dir)

        configure_cmake = True

    if use_ccache:
        cmake_def_args.append(
            "-DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache"
        )

    if fast_build:
        cmake_def_args.append("-DFAST_BUILD=ON")

    if nvrtc_dynamic_linking:
        cmake_def_args.append("-DNVRTC_DYNAMIC_LINKING=ON")

    targets = ["tensorrt_llm", "nvinfer_plugin_tensorrt_llm"]

    if cpp_only:
        build_pyt = "OFF"
        build_deep_ep = "OFF"
        build_deep_gemm = "OFF"
        build_flash_mla = "OFF"
    else:
        targets.extend([
            "th_common", "bindings", "deep_ep", "deep_gemm", "pg_utils",
            "flash_mla"
        ])
        build_pyt = "ON"
        build_deep_ep = "ON"
        build_deep_gemm = "ON"
        build_flash_mla = "ON"

    if benchmarks:
        targets.append("benchmarks")

    if micro_benchmarks:
        targets.append("micro_benchmarks")
        build_micro_benchmarks = "ON"
    else:
        build_micro_benchmarks = "OFF"

    disable_nvtx = "OFF" if nvtx else "ON"

    if not on_windows:
        targets.append("executorWorker")

    source_dir = get_source_dir()

    fmha_v2_cu_dir = project_dir / "cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmha_v2_cu"
    if clean or generate_fmha or not fmha_v2_cu_dir.exists():
        generate_fmha_cu(project_dir, venv_python)

    with working_directory(build_dir):
        if clean or first_build or configure_cmake:
            build_run(
                f"\"{venv_conan}\" install --build=missing --remote=TensorRT-LLM --output-folder={build_dir}/conan -s 'build_type={build_type}' {source_dir}"
            )
            cmake_def_args.append(
                f"-DCMAKE_TOOLCHAIN_FILE={build_dir}/conan/conan_toolchain.cmake"
            )
            if internal_cutlass_kernels_root:
                cmake_def_args.append(
                    f"-DINTERNAL_CUTLASS_KERNELS_PATH={internal_cutlass_kernels_root}"
                )
            cmake_def_args = " ".join(cmake_def_args)
            cmake_configure_command = (
                f'cmake -DCMAKE_BUILD_TYPE="{build_type}" -DBUILD_PYT="{build_pyt}" -DBINDING_TYPE="{binding_type}" -DBUILD_DEEP_EP="{build_deep_ep}" -DBUILD_DEEP_GEMM="{build_deep_gemm}" -DBUILD_FLASH_MLA="{build_flash_mla}"'
                f' -DNVTX_DISABLE="{disable_nvtx}" -DBUILD_MICRO_BENCHMARKS={build_micro_benchmarks}'
                f' -DBUILD_WHEEL_TARGETS="{";".join(targets)}"'
                f' -DPython_EXECUTABLE={venv_python} -DPython3_EXECUTABLE={venv_python}'
                f' {cmake_cuda_architectures} {cmake_def_args} {cmake_generator} -S "{source_dir}"'
            )
            print("CMake Configure command: ")
            print(cmake_configure_command)
            build_run(cmake_configure_command)

        cmake_build_command = (
            f'cmake --build . --config {build_type} --parallel {job_count} '
            f'--target build_wheel_targets {" ".join(extra_make_targets)}')
        print("CMake Build command: ")
        print(cmake_build_command)
        build_run(cmake_build_command)

    if cpp_only:
        assert not install, "Installing is not supported for cpp_only builds"
        return
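
    # Everything below stages the built artifacts into the Python package tree
    # (tensorrt_llm/) so that the wheel, or an editable install, ships the
    # native libraries, headers, and helper binaries alongside the Python
    # sources.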

    pkg_dir = project_dir / "tensorrt_llm"
    assert pkg_dir.is_dir(), f"{pkg_dir} is not a directory"
    lib_dir = pkg_dir / "libs"
    include_dir = pkg_dir / "include"
    if lib_dir.exists():
        clear_folder(lib_dir)
    if include_dir.exists():
        clear_folder(include_dir)

    cache_dir = os.getenv("TRTLLM_DG_CACHE_DIR")
    if cache_dir is not None:
        cache_dir = Path(cache_dir)
    elif on_windows:
        if os.getenv("APPDATA") is not None:
            cache_dir = Path(os.getenv("APPDATA")) / "tensorrt_llm"
        else:
            cache_dir = Path(os.getenv("TEMP")) / "tensorrt_llm"
    else:
        if os.getenv("HOME") is not None:
            cache_dir = Path(os.getenv("HOME")) / ".tensorrt_llm"
        else:
            cache_dir = Path(os.getenv("TEMP", "/tmp")) / "tensorrt_llm"
    if cache_dir.exists():
        clear_folder(cache_dir)

    install_file = copy
    install_tree = copytree
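    # With --skip_building_wheel and --linking_install_binary, stage artifacts
    # as symlinks into the build tree instead of copies, so incremental C++
    # rebuilds are picked up without re-running the install step.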
    if skip_building_wheel and linking_install_binary:

        def symlink_remove_dst(src, dst):
            src = os.path.abspath(src)
            dst = os.path.abspath(dst)
            if os.path.isdir(dst):
                dst = os.path.join(dst, os.path.basename(src))
            if os.path.exists(dst):
                os.remove(dst)
            os.symlink(src, dst)

        install_file = symlink_remove_dst

        def symlink_remove_dst_tree(src, dst, dirs_exist_ok=True):
            src = os.path.abspath(src)
            dst = os.path.abspath(dst)
            if dirs_exist_ok and os.path.exists(dst):
                os.remove(dst)
            os.symlink(src, dst)

        install_tree = symlink_remove_dst_tree

    lib_dir.mkdir(parents=True, exist_ok=True)
    include_dir.mkdir(parents=True, exist_ok=True)
    install_tree(get_source_dir() / "include" / "tensorrt_llm" / "deep_gemm",
                 include_dir / "deep_gemm",
                 dirs_exist_ok=True)
    required_cuda_headers = [
        "cuda_fp16.h", "cuda_fp16.hpp", "cuda_bf16.h", "cuda_bf16.hpp",
        "cuda_fp8.h", "cuda_fp8.hpp"
    ]
    if os.getenv("CUDA_HOME") is not None:
        cuda_include_dir = Path(os.getenv("CUDA_HOME")) / "include"
    elif os.getenv("CUDA_PATH") is not None:
        cuda_include_dir = Path(os.getenv("CUDA_PATH")) / "include"
    elif not on_windows:
        cuda_include_dir = Path("/usr/local/cuda/include")
    else:
        cuda_include_dir = None

    if cuda_include_dir is None or not cuda_include_dir.exists():
        print(
            "CUDA_HOME or CUDA_PATH should be set to enable DeepGEMM JIT compilation"
        )
    else:
        cuda_include_target_dir = include_dir / "cuda" / "include"
        cuda_include_target_dir.mkdir(parents=True, exist_ok=True)
        for header in required_cuda_headers:
            install_file(cuda_include_dir / header,
                         cuda_include_target_dir / header)

    if on_windows:
        install_file(build_dir / "tensorrt_llm/tensorrt_llm.dll",
                     lib_dir / "tensorrt_llm.dll")
        install_file(build_dir / "tensorrt_llm/thop/th_common.dll",
                     lib_dir / "th_common.dll")
        install_file(
            build_dir / "tensorrt_llm/plugins/nvinfer_plugin_tensorrt_llm.dll",
            lib_dir / "nvinfer_plugin_tensorrt_llm.dll")
    else:
        install_file(build_dir / "tensorrt_llm/libtensorrt_llm.so",
                     lib_dir / "libtensorrt_llm.so")
        install_file(build_dir / "tensorrt_llm/thop/libth_common.so",
                     lib_dir / "libth_common.so")
        install_file(
            build_dir /
            "tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so",
            lib_dir / "libnvinfer_plugin_tensorrt_llm.so")
        if os.path.exists(
                build_dir /
                "tensorrt_llm/executor/cache_transmission/ucx_utils/libtensorrt_llm_ucx_wrapper.so"
        ):
            install_file(
                build_dir /
                "tensorrt_llm/executor/cache_transmission/ucx_utils/libtensorrt_llm_ucx_wrapper.so",
                lib_dir / "libtensorrt_llm_ucx_wrapper.so")
            build_run(
                f'patchelf --set-rpath \'$ORIGIN/ucx/\' {lib_dir / "libtensorrt_llm_ucx_wrapper.so"}'
            )
            if os.path.exists("/usr/local/ucx"):
                ucx_dir = lib_dir / "ucx"
                if ucx_dir.exists():
                    clear_folder(ucx_dir)
                install_tree("/usr/local/ucx/lib", ucx_dir, dirs_exist_ok=True)
                build_run(
                    f"find {ucx_dir} -type f -name '*.so*' -exec patchelf --set-rpath \'$ORIGIN:$ORIGIN/ucx:$ORIGIN/../\' {{}} \\;"
                )
        # NIXL wrapper and libraries
        nixl_utils_dir = build_dir / "tensorrt_llm/executor/cache_transmission/nixl_utils"
        if os.path.exists(nixl_utils_dir / "libtensorrt_llm_nixl_wrapper.so"):
            install_file(nixl_utils_dir / "libtensorrt_llm_nixl_wrapper.so",
                         lib_dir / "libtensorrt_llm_nixl_wrapper.so")
            build_run(
                f'patchelf --set-rpath \'$ORIGIN/nixl/\' {lib_dir / "libtensorrt_llm_nixl_wrapper.so"}'
            )
            # Copy NIXL libraries
            if os.path.exists("/opt/nvidia/nvda_nixl"):
                nixl_dir = lib_dir / "nixl"
                if nixl_dir.exists():
                    clear_folder(nixl_dir)
                nixl_lib_path = "/opt/nvidia/nvda_nixl/lib/x86_64-linux-gnu"
                if not os.path.exists(nixl_lib_path):
                    nixl_lib_path = "/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu"
                if not os.path.exists(nixl_lib_path):
                    nixl_lib_path = "/opt/nvidia/nvda_nixl/lib64"
                install_tree(nixl_lib_path, nixl_dir, dirs_exist_ok=True)
                build_run(
                    f"find {nixl_dir} -type f -name '*.so*' -exec patchelf --set-rpath \'$ORIGIN:$ORIGIN/plugins:$ORIGIN/../:$ORIGIN/../ucx/:$ORIGIN/../../ucx/\' {{}} \\;"
                )
        # Install the tensorrt_llm_transfer_agent_binding Python module
        # (standalone agent bindings). It is built when either NIXL or
        # Mooncake is enabled, and installed to tensorrt_llm/ (the same level
        # as bindings.so).
        agent_binding_so = list(
            nixl_utils_dir.glob("tensorrt_llm_transfer_agent_binding*.so"))
        if agent_binding_so:
            trtllm_dir = project_dir / "tensorrt_llm"
            install_file(agent_binding_so[0],
                         trtllm_dir / agent_binding_so[0].name)
        if os.path.exists(
                build_dir /
                "tensorrt_llm/executor/cache_transmission/mooncake_utils/libtensorrt_llm_mooncake_wrapper.so"
        ):
            install_file(
                build_dir /
                "tensorrt_llm/executor/cache_transmission/mooncake_utils/libtensorrt_llm_mooncake_wrapper.so",
                lib_dir / "libtensorrt_llm_mooncake_wrapper.so")
        install_file(
            build_dir /
            "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/libdecoder_attention_0.so",
            lib_dir / "libdecoder_attention_0.so")
        install_file(
            build_dir /
            "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/libdecoder_attention_1.so",
            lib_dir / "libdecoder_attention_1.so")
        install_file(build_dir / "tensorrt_llm/runtime/utils/libpg_utils.so",
                     lib_dir / "libpg_utils.so")

    deep_ep_dir = pkg_dir / "deep_ep"
    if deep_ep_dir.is_symlink():
        deep_ep_dir.unlink()
    elif deep_ep_dir.is_dir():
        clear_folder(deep_ep_dir)
        deep_ep_dir.rmdir()

    # Handle deep_gemm installation
    deep_gemm_dir = pkg_dir / "deep_gemm"
    if deep_gemm_dir.is_symlink():
        deep_gemm_dir.unlink()
    elif deep_gemm_dir.is_dir():
        clear_folder(deep_gemm_dir)
        deep_gemm_dir.rmdir()

    bin_dir = pkg_dir / "bin"
    if bin_dir.exists():
        clear_folder(bin_dir)
    bin_dir.mkdir(parents=True, exist_ok=True)

    if not on_windows:
        install_file(build_dir / "tensorrt_llm/executor_worker/executorWorker",
                     bin_dir / "executorWorker")

    scripts_dir = pkg_dir / "scripts"
    if scripts_dir.exists():
        clear_folder(scripts_dir)
    scripts_dir.mkdir(parents=True, exist_ok=True)

    if not on_windows:
        install_file(project_dir / "docker/common/install_tensorrt.sh",
                     scripts_dir / "install_tensorrt.sh")

    if not cpp_only:

        def get_binding_lib(subdirectory, name):
            binding_build_dir = (build_dir / "tensorrt_llm" / subdirectory)
            if on_windows:
                binding_lib = list(binding_build_dir.glob(f"{name}.*.pyd"))
            else:
                binding_lib = list(binding_build_dir.glob(f"{name}.*.so"))

            assert len(
                binding_lib
            ) == 1, f"Exactly one binding library should be present: {binding_lib}"
            return binding_lib[0]
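
        # The built extension is named with the CPython ABI tag,
        # e.g. (illustrative): bindings.cpython-310-x86_64-linux-gnu.so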
        binding_lib_dir = get_binding_lib(binding_type, "bindings")
        binding_lib_file_name = binding_lib_dir.name
        install_file(binding_lib_dir, pkg_dir)

        with (build_dir / "tensorrt_llm" / "deep_ep" /
              "cuda_architectures.txt").open() as f:
            deep_ep_cuda_architectures = f.read().strip().strip(";")
        if deep_ep_cuda_architectures:
            install_file(get_binding_lib("deep_ep", "deep_ep_cpp_tllm"),
                         pkg_dir)
            install_tree(build_dir / "tensorrt_llm" / "deep_ep" / "python" /
                         "deep_ep",
                         deep_ep_dir,
                         dirs_exist_ok=True)
            (lib_dir / "nvshmem").mkdir(exist_ok=True)
            install_file(
                build_dir / "tensorrt_llm/deep_ep/nvshmem-build/License.txt",
                lib_dir / "nvshmem")
            install_file(
                build_dir /
                "tensorrt_llm/deep_ep/nvshmem-build/src/lib/nvshmem_bootstrap_uid.so.3",
                lib_dir / "nvshmem")
            install_file(
                build_dir /
                "tensorrt_llm/deep_ep/nvshmem-build/src/lib/nvshmem_transport_ibgda.so.103",
                lib_dir / "nvshmem")

        install_file(get_binding_lib("deep_gemm", "deep_gemm_cpp_tllm"),
                     pkg_dir)
        install_tree(build_dir / "tensorrt_llm" / "deep_gemm" / "python" /
                     "deep_gemm",
                     deep_gemm_dir,
                     dirs_exist_ok=True)

        with (build_dir / "tensorrt_llm" / "flash_mla" /
              "cuda_architectures.txt").open() as f:
            flash_mla_cuda_architectures = f.read().strip().strip(";")
        if flash_mla_cuda_architectures:
            install_file(get_binding_lib("flash_mla", "flash_mla_cpp_tllm"),
                         pkg_dir)
            install_tree(build_dir / "tensorrt_llm" / "flash_mla" / "python" /
                         "flash_mla",
                         pkg_dir / "flash_mla",
                         dirs_exist_ok=True)

        if not skip_stubs:
            with working_directory(pkg_dir):
                if on_windows:
                    generate_python_stubs_windows(binding_type, venv_python,
                                                  pkg_dir, lib_dir)
                else:  # on linux
                    generate_python_stubs_linux(
                        binding_type, venv_python,
                        bool(deep_ep_cuda_architectures),
                        bool(flash_mla_cuda_architectures),
                        nixl_root is not None or mooncake_root is not None,
                        binding_lib_file_name)

    build_kv_cache_manager_v2(project_dir, venv_python, use_mypyc=mypyc)

    if not skip_building_wheel:
        if dist_dir is None:
            dist_dir = project_dir / "build"
        else:
            dist_dir = Path(dist_dir)

        if not dist_dir.exists():
            dist_dir.mkdir(parents=True)

        if clean_wheel:
            # For an incremental build, the python build module adds
            # the new files but does not remove the deleted files.
            #
            # This breaks the Windows CI/CD pipeline when building
            # and validating python changes in the whl.
            clear_folder(dist_dir)

        extra_wheel_build_args = os.getenv("EXTRA_WHEEL_BUILD_ARGS", "")
        env = os.environ.copy()
        env["TRTLLM_ENABLE_MYPYC"] = "1" if mypyc else "0"
        build_run(
            f'\"{venv_python}\" -m build {project_dir} --skip-dependency-check {extra_wheel_build_args} --no-isolation --wheel --outdir "{dist_dir}"',
            env=env)

    if install:
        build_run(f"\"{sys.executable}\" -m pip install -e .[devel]")


def add_arguments(parser: ArgumentParser):
    parser.add_argument(
        "--build_type",
        "-b",
        default="Release",
        choices=["Release", "RelWithDebInfo", "Debug"],
        help="Build type, will be passed to cmake `CMAKE_BUILD_TYPE` variable")
    parser.add_argument(
        "--generator",
        "-G",
        default="",
        help="CMake generator to use (e.g., 'Ninja', 'Unix Makefiles')")
    parser.add_argument(
        "--cuda_architectures",
        "-a",
        help=
        "CUDA architectures to build for, will be passed to cmake `CUDA_ARCHITECTURES` variable. Example: `--cuda_architectures=90-real;100-real`"
    )
    parser.add_argument("--install",
                        "-i",
                        action="store_true",
                        help="Install the built python package after building")
    parser.add_argument("--clean",
                        "-c",
                        action="store_true",
                        help="Clean the build directory before building")
    parser.add_argument(
        "--clean_wheel",
        action="store_true",
        help=
        "Clear dist_dir folder when creating wheel. Will be set to `true` if `--clean` is set"
    )
    parser.add_argument("--configure_cmake",
                        action="store_true",
                        help="Always configure cmake before building")
    parser.add_argument("--use_ccache",
                        "-ccache",
                        default=False,
                        action="store_true",
                        help="Use ccache compiler driver for faster rebuilds")
    parser.add_argument(
        "--fast_build",
        "-f",
        default=False,
        action="store_true",
        help=
        "Skip compiling some kernels to accelerate compilation -- for development only"
    )
    parser.add_argument(
        "--job_count",
        "-j",
        const=cpu_count(),
        nargs="?",
        help=
        "Number of parallel jobs for compilation (default: number of CPU cores)"
    )
    parser.add_argument(
        "--cpp_only",
        "-l",
        action="store_true",
        help="Only build the C++ library without Python dependencies")
    parser.add_argument(
        "--extra-cmake-vars",
        "-D",
        action="append",
        help=
        "Extra cmake variable definitions which can be specified multiple times. Example: -D \"key1=value1\" -D \"key2=value2\"",
        default=[])
    parser.add_argument(
        "--extra-make-targets",
        help="Additional make targets to build. Example: \"target_1 target_2\"",
        nargs="+",
        default=[])
    parser.add_argument(
        "--trt_root",
        default="/usr/local/tensorrt",
        help="Directory containing TensorRT headers and libraries")
    parser.add_argument("--nccl_root",
                        help="Directory containing NCCL headers and libraries")
    parser.add_argument("--nixl_root",
                        help="Directory containing NIXL headers and libraries")
    parser.add_argument(
        "--mooncake_root",
        help=
        "Directory containing Mooncake transfer engine headers and libraries")
    parser.add_argument(
        "--internal-cutlass-kernels-root",
        default="",
        help=
        "Directory containing internal_cutlass_kernels sources. If specified, the internal_cutlass_kernels and NVRTC wrapper libraries will be built from source."
    )
    parser.add_argument(
        "--build_dir",
        type=Path,
        help=
        "Directory where C++ sources are built (default: cpp/build or cpp/build_<build_type>)"
    )
    parser.add_argument(
        "--dist_dir",
        type=Path,
        help="Directory where Python wheels are built (default: build/)")
    parser.add_argument(
        "--skip_building_wheel",
        "-s",
        action="store_true",
        help=
        "Skip building the *.whl files (they are only needed for distribution)")
    parser.add_argument(
        "--linking_install_binary",
        action="store_true",
        help=
        "Install the built binary by creating symbolic links instead of copying files"
    )
    parser.add_argument("--binding_type",
                        choices=["pybind", "nanobind"],
                        default="nanobind",
                        help="Which binding library to use: pybind or nanobind")
    parser.add_argument("--benchmarks",
                        action="store_true",
                        help="Build the benchmarks for the C++ runtime")
    parser.add_argument("--micro_benchmarks",
                        action="store_true",
                        help="Build the micro benchmarks for C++ components")
    parser.add_argument("--nvtx",
                        action="store_true",
                        help="Enable NVTX profiling features")
    parser.add_argument("--skip-stubs",
                        action="store_true",
                        help="Skip building Python type stubs")
    parser.add_argument("--generate_fmha",
                        action="store_true",
                        help="Generate the FMHA CUDA files")
    parser.add_argument(
        "--no-venv",
        action="store_true",
        help=
        "Use the current Python interpreter without creating a virtual environment"
    )
    parser.add_argument(
        "--nvrtc_dynamic_linking",
        action="store_true",
        help="Link against dynamic NVRTC libraries instead of static ones")
    parser.add_argument("--mypyc",
                        action="store_true",
                        help="Compile kv_cache_manager_v2 with mypyc")


if __name__ == "__main__":
    parser = ArgumentParser()
    add_arguments(parser)
    args = parser.parse_args()
    main(**vars(args))