[None][chore] Removing pybind11 bindings and references (#10550)

Signed-off-by: Linda-Stadter <57756729+Linda-Stadter@users.noreply.github.com>
This commit is contained in:
Linda 2026-01-26 14:19:12 +01:00 committed by GitHub
parent ce37e27066
commit ce556290c9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 32 additions and 428 deletions

2
.github/CODEOWNERS vendored
View File

@ -185,8 +185,6 @@ docs/source/performance/perf-benchmarking.md @NVIDIA/trtllm-bench-reviewers
/tensorrt_llm/_torch/pyexecutor/resource_manager.py @NVIDIA/trt-llm-kv-cache-manager-devs
/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.h @NVIDIA/trt-llm-kv-cache-manager-devs
/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp @NVIDIA/trt-llm-kv-cache-manager-devs
/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.h @NVIDIA/trt-llm-kv-cache-manager-devs
/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp @NVIDIA/trt-llm-kv-cache-manager-devs
# The rule below requires that any PR modifying public APIs must be approved by at least one member
# of the NVIDIA/trt-llm-committed-api-review-committee or NVIDIA/trt-llm-noncommitted-api-review-committee team.

View File

@ -55,8 +55,8 @@ graph TB
Sampling[Sampling]
BatchManager[Batch Manager]
KVCache[KV Cache Manager]
PyScheduler --> |Pybind|Shared_Scheduler
PyDecoder --> |Pybind|Shared_Decoder
PyScheduler --> |Nanobind|Shared_Scheduler
PyDecoder --> |Nanobind|Shared_Decoder
Executor --> Shared_Decoder
Shared_Decoder --> Sampling
Executor --> Shared_Scheduler[Scheduler]

View File

@ -83,11 +83,6 @@ endif()
add_compile_definitions("TLLM_GEN_EXPORT_INTERFACE")
add_compile_definitions("TLLM_ENABLE_CUDA")
set(BINDING_TYPE
"nanobind"
CACHE STRING
"Binding type of Python bindings for C++ runtime and batch manager")
set(INTERNAL_CUTLASS_KERNELS_PATH
""
CACHE
@ -246,16 +241,15 @@ get_filename_component(TRT_LLM_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} PATH)
set(3RDPARTY_DIR ${TRT_LLM_ROOT_DIR}/3rdparty)
add_subdirectory(${3RDPARTY_DIR} 3rdparty)
if(BINDING_TYPE STREQUAL "pybind"
OR BUILD_DEEP_EP
OR BUILD_DEEP_GEMM)
if(BUILD_DEEP_EP
OR BUILD_DEEP_GEMM
OR BUILD_FLASH_MLA)
FetchContent_MakeAvailable(pybind11)
include_directories(${CMAKE_BINARY_DIR}/_deps/pybind11-src/include)
endif()
if(BINDING_TYPE STREQUAL "nanobind")
FetchContent_MakeAvailable(nanobind)
include_directories(${CMAKE_BINARY_DIR}/_deps/nanobind-src/include)
endif()
FetchContent_MakeAvailable(nanobind)
include_directories(${CMAKE_BINARY_DIR}/_deps/nanobind-src/include)
FetchContent_MakeAvailable(cutlass cxxopts flashmla json xgrammar)

View File

@ -293,13 +293,7 @@ if(BUILD_PYT)
add_subdirectory(thop)
endif()
if(BINDING_TYPE STREQUAL "pybind")
add_subdirectory(pybind)
endif()
if(BINDING_TYPE STREQUAL "nanobind")
add_subdirectory(nanobind)
endif()
add_subdirectory(nanobind)
if(BUILD_DEEP_EP)
add_subdirectory(deep_ep)

View File

@ -65,23 +65,10 @@ if(NIXL_ENABLED OR MOONCAKE_ENABLED)
# Collect binding source files
set(AGENT_BINDING_SOURCES "")
if(BINDING_TYPE STREQUAL "pybind")
list(APPEND AGENT_BINDING_SOURCES agentBindingsPybind.cpp)
else()
list(APPEND AGENT_BINDING_SOURCES agentBindingsNanobind.cpp)
endif()
list(APPEND AGENT_BINDING_SOURCES agentBindings.cpp)
if(BINDING_TYPE STREQUAL "pybind")
# Use pybind11 (already fetched via FetchContent)
pybind11_add_module(${TRANSFER_AGENT_BINDING_TARGET}
${AGENT_BINDING_SOURCES})
message(STATUS "Building tensorrt_llm_transfer_agent_binding with pybind11")
else()
# Default to nanobind (already fetched via FetchContent)
nanobind_add_module(${TRANSFER_AGENT_BINDING_TARGET}
${AGENT_BINDING_SOURCES})
message(STATUS "Building tensorrt_llm_transfer_agent_binding with nanobind")
endif()
nanobind_add_module(${TRANSFER_AGENT_BINDING_TARGET} ${AGENT_BINDING_SOURCES})
message(STATUS "Building tensorrt_llm_transfer_agent_binding with nanobind")
target_compile_options(${TRANSFER_AGENT_BINDING_TARGET} PRIVATE -Wno-error)

View File

@ -1,250 +0,0 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tensorrt_llm/executor/transferAgent.h"
#ifdef ENABLE_NIXL
#include "transferAgent.h"
#endif
#ifdef ENABLE_MOONCAKE
#include "../mooncake_utils/transferAgent.h"
#endif
#include <pybind11/functional.h>
#include <pybind11/operators.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
namespace py = pybind11;
namespace kvc = tensorrt_llm::executor::kv_cache;
// Python module exposing the TensorRT-LLM KV-cache transfer-agent API
// (enums, memory/transfer descriptor types, agent configuration, and the
// concrete agent implementations) via pybind11. The NIXL and MOONCAKE
// sections are compiled in only when ENABLE_NIXL / ENABLE_MOONCAKE are
// defined at build time; the NIXL_ENABLED / MOONCAKE_ENABLED module
// attributes let Python code query which backends are available.
PYBIND11_MODULE(tensorrt_llm_transfer_agent_binding, m)
{
    m.doc() = "TensorRT-LLM Transfer Agent Python bindings (pybind11)";
    // MemoryType enum
    py::enum_<kvc::MemoryType>(m, "MemoryType")
        .value("DRAM", kvc::MemoryType::kDRAM)
        .value("VRAM", kvc::MemoryType::kVRAM)
        .value("BLK", kvc::MemoryType::kBLK)
        .value("OBJ", kvc::MemoryType::kOBJ)
        .value("FILE", kvc::MemoryType::kFILE);
    // TransferOp enum
    py::enum_<kvc::TransferOp>(m, "TransferOp")
        .value("READ", kvc::TransferOp::kREAD)
        .value("WRITE", kvc::TransferOp::kWRITE);
    // TransferState enum
    py::enum_<kvc::TransferState>(m, "TransferState")
        .value("IN_PROGRESS", kvc::TransferState::kIN_PROGRESS)
        .value("SUCCESS", kvc::TransferState::kSUCCESS)
        .value("FAILURE", kvc::TransferState::kFAILURE);
    // MemoryDesc class: (addr, len, device_id) triple describing one memory region.
    py::class_<kvc::MemoryDesc>(m, "MemoryDesc")
        .def(py::init<uintptr_t, size_t, uint32_t>(), py::arg("addr"), py::arg("len"), py::arg("device_id"))
        .def_property_readonly("addr", &kvc::MemoryDesc::getAddr)
        .def_property_readonly("len", &kvc::MemoryDesc::getLen)
        .def_property_readonly("device_id", &kvc::MemoryDesc::getDeviceId);
    // MemoryDescs class: a typed collection of MemoryDesc entries.
    py::class_<kvc::MemoryDescs>(m, "MemoryDescs")
        .def(py::init<kvc::MemoryType, std::vector<kvc::MemoryDesc>>(), py::arg("type"), py::arg("descs"))
        // Batch constructor from list of tuples: [(ptr, size, device_id), ...]
        .def(py::init(
                 [](kvc::MemoryType type, std::vector<std::tuple<uintptr_t, size_t, uint32_t>> const& tuples)
                 {
                     std::vector<kvc::MemoryDesc> descs;
                     descs.reserve(tuples.size());
                     for (auto const& [addr, len, deviceId] : tuples)
                     {
                         descs.emplace_back(addr, len, deviceId);
                     }
                     return kvc::MemoryDescs(type, std::move(descs));
                 }),
            py::arg("type"), py::arg("tuples"))
        .def_property_readonly("type", &kvc::MemoryDescs::getType)
        .def_property_readonly("descs", &kvc::MemoryDescs::getDescs);
    // AgentDesc class: opaque, backend-specific agent descriptor.
    py::class_<kvc::AgentDesc>(m, "AgentDesc")
        // bytes constructor: copies via explicit length so payloads with
        // embedded NUL bytes round-trip intact.
        .def(py::init(
                 [](py::bytes data)
                 {
                     std::string str(PyBytes_AsString(data.ptr()), PyBytes_Size(data.ptr()));
                     return kvc::AgentDesc{std::move(str)};
                 }),
            py::arg("backend_agent_desc"))
        .def(py::init<std::string>(), py::arg("backend_agent_desc"))
        .def_property_readonly("backend_agent_desc",
            [](kvc::AgentDesc const& self)
            {
                // Returned as bytes (not str) to preserve arbitrary binary data.
                auto const& desc = self.getBackendAgentDesc();
                return py::bytes(desc.data(), desc.size());
            });
    // TransferRequest class: one READ/WRITE request against a named remote agent.
    py::class_<kvc::TransferRequest>(m, "TransferRequest")
        .def(py::init<kvc::TransferOp, kvc::TransferDescs, kvc::TransferDescs, std::string const&,
                 std::optional<kvc::SyncMessage>>(),
            py::arg("op"), py::arg("src_descs"), py::arg("dst_descs"), py::arg("remote_name"),
            py::arg("sync_message") = std::nullopt)
        .def_property_readonly("op", &kvc::TransferRequest::getOp)
        .def_property_readonly("src_descs", &kvc::TransferRequest::getSrcDescs)
        .def_property_readonly("dst_descs", &kvc::TransferRequest::getDstDescs)
        .def_property_readonly("remote_name", &kvc::TransferRequest::getRemoteName)
        .def_property_readonly("sync_message", &kvc::TransferRequest::getSyncMessage);
    // TransferStatus base class
    py::class_<kvc::TransferStatus>(m, "TransferStatus")
        .def("is_completed", &kvc::TransferStatus::isCompleted)
        // timeout_ms = -1 presumably means "wait indefinitely" — confirm against
        // the TransferStatus::wait declaration.
        .def("wait", &kvc::TransferStatus::wait, py::arg("timeout_ms") = -1);
    // BaseAgentConfig struct
    py::class_<kvc::BaseAgentConfig>(m, "BaseAgentConfig")
        .def(py::init<>())
        // Keyword-friendly constructor; only `name` is required, the rest default
        // to the values shown below.
        .def(py::init(
                 [](std::string name, bool use_prog_thread, bool multi_thread, bool use_listen_thread,
                     bool enable_telemetry, std::unordered_map<std::string, std::string> backend_params)
                 {
                     return kvc::BaseAgentConfig{std::move(name), use_prog_thread, multi_thread, use_listen_thread,
                         enable_telemetry, std::move(backend_params)};
                 }),
            py::arg("name"), py::arg("use_prog_thread") = true, py::arg("multi_thread") = false,
            py::arg("use_listen_thread") = false, py::arg("enable_telemetry") = false,
            py::arg("backend_params") = std::unordered_map<std::string, std::string>{})
        .def_readwrite("name", &kvc::BaseAgentConfig::mName)
        .def_readwrite("use_prog_thread", &kvc::BaseAgentConfig::useProgThread)
        .def_readwrite("multi_thread", &kvc::BaseAgentConfig::multiThread)
        .def_readwrite("use_listen_thread", &kvc::BaseAgentConfig::useListenThread)
        .def_readwrite("enable_telemetry", &kvc::BaseAgentConfig::enableTelemetry)
        .def_readwrite("backend_params", &kvc::BaseAgentConfig::backendParams);
    // BaseTransferAgent class (abstract base) — no constructor is bound;
    // instances come from the concrete agents below or make_transfer_agent().
    py::class_<kvc::BaseTransferAgent>(m, "BaseTransferAgent")
        .def("register_memory", &kvc::BaseTransferAgent::registerMemory, py::arg("descs"))
        .def("deregister_memory", &kvc::BaseTransferAgent::deregisterMemory, py::arg("descs"))
        .def("load_remote_agent",
            py::overload_cast<std::string const&, kvc::AgentDesc const&>(&kvc::BaseTransferAgent::loadRemoteAgent),
            py::arg("name"), py::arg("agent_desc"))
        .def("load_remote_agent_by_connection",
            py::overload_cast<std::string const&, kvc::ConnectionInfoType const&>(
                &kvc::BaseTransferAgent::loadRemoteAgent),
            py::arg("name"), py::arg("connection_info"))
        .def("get_local_agent_desc", &kvc::BaseTransferAgent::getLocalAgentDesc)
        .def("invalidate_remote_agent", &kvc::BaseTransferAgent::invalidateRemoteAgent, py::arg("name"))
        // release() + take_ownership: the returned TransferStatus is owned by
        // the Python side from here on.
        .def(
            "submit_transfer_requests",
            [](kvc::BaseTransferAgent& self, kvc::TransferRequest const& request)
            { return self.submitTransferRequests(request).release(); },
            py::arg("request"), py::return_value_policy::take_ownership)
        .def(
            "notify_sync_message", &kvc::BaseTransferAgent::notifySyncMessage, py::arg("name"), py::arg("sync_message"))
        .def("get_notified_sync_messages", &kvc::BaseTransferAgent::getNotifiedSyncMessages)
        .def("get_local_connection_info", &kvc::BaseTransferAgent::getLocalConnectionInfo)
        .def("check_remote_descs", &kvc::BaseTransferAgent::checkRemoteDescs, py::arg("name"), py::arg("memory_descs"));
#ifdef ENABLE_NIXL
    // NixlTransferStatus class - release GIL for blocking operations
    py::class_<kvc::NixlTransferStatus, kvc::TransferStatus>(m, "NixlTransferStatus")
        .def("is_completed", &kvc::NixlTransferStatus::isCompleted, py::call_guard<py::gil_scoped_release>())
        .def("wait", &kvc::NixlTransferStatus::wait, py::arg("timeout_ms") = -1,
            py::call_guard<py::gil_scoped_release>());
    // NixlTransferAgent class — re-binds the base methods on the derived type;
    // submit/is_completed/wait additionally drop the GIL.
    py::class_<kvc::NixlTransferAgent, kvc::BaseTransferAgent>(m, "NixlTransferAgent")
        .def(py::init<kvc::BaseAgentConfig const&>(), py::arg("config"))
        .def("register_memory", &kvc::NixlTransferAgent::registerMemory, py::arg("descs"))
        .def("deregister_memory", &kvc::NixlTransferAgent::deregisterMemory, py::arg("descs"))
        .def("load_remote_agent",
            py::overload_cast<std::string const&, kvc::AgentDesc const&>(&kvc::NixlTransferAgent::loadRemoteAgent),
            py::arg("name"), py::arg("agent_desc"))
        .def("load_remote_agent_by_connection",
            py::overload_cast<std::string const&, kvc::ConnectionInfoType const&>(
                &kvc::NixlTransferAgent::loadRemoteAgent),
            py::arg("name"), py::arg("connection_info"))
        .def("get_local_agent_desc", &kvc::NixlTransferAgent::getLocalAgentDesc)
        .def("get_local_connection_info", &kvc::NixlTransferAgent::getLocalConnectionInfo)
        .def("invalidate_remote_agent", &kvc::NixlTransferAgent::invalidateRemoteAgent, py::arg("name"))
        .def(
            "submit_transfer_requests",
            [](kvc::NixlTransferAgent& self, kvc::TransferRequest const& request)
            { return self.submitTransferRequests(request).release(); },
            py::arg("request"), py::return_value_policy::take_ownership, py::call_guard<py::gil_scoped_release>())
        .def(
            "notify_sync_message", &kvc::NixlTransferAgent::notifySyncMessage, py::arg("name"), py::arg("sync_message"))
        .def("get_notified_sync_messages", &kvc::NixlTransferAgent::getNotifiedSyncMessages)
        .def("check_remote_descs", &kvc::NixlTransferAgent::checkRemoteDescs, py::arg("name"), py::arg("memory_descs"));
#endif
#ifdef ENABLE_MOONCAKE
    // MooncakeTransferStatus class - release GIL for blocking operations
    py::class_<kvc::MooncakeTransferStatus, kvc::TransferStatus>(m, "MooncakeTransferStatus")
        .def("is_completed", &kvc::MooncakeTransferStatus::isCompleted, py::call_guard<py::gil_scoped_release>())
        .def("wait", &kvc::MooncakeTransferStatus::wait, py::arg("timeout_ms") = -1,
            py::call_guard<py::gil_scoped_release>());
    // MooncakeTransferAgent class — mirrors the NIXL binding above.
    py::class_<kvc::MooncakeTransferAgent, kvc::BaseTransferAgent>(m, "MooncakeTransferAgent")
        .def(py::init<kvc::BaseAgentConfig const&>(), py::arg("config"))
        .def("register_memory", &kvc::MooncakeTransferAgent::registerMemory, py::arg("descs"))
        .def("deregister_memory", &kvc::MooncakeTransferAgent::deregisterMemory, py::arg("descs"))
        .def("load_remote_agent",
            py::overload_cast<std::string const&, kvc::AgentDesc const&>(&kvc::MooncakeTransferAgent::loadRemoteAgent),
            py::arg("name"), py::arg("agent_desc"))
        .def("load_remote_agent_by_connection",
            py::overload_cast<std::string const&, kvc::ConnectionInfoType const&>(
                &kvc::MooncakeTransferAgent::loadRemoteAgent),
            py::arg("name"), py::arg("connection_info"))
        .def("get_local_agent_desc", &kvc::MooncakeTransferAgent::getLocalAgentDesc)
        .def("get_local_connection_info", &kvc::MooncakeTransferAgent::getLocalConnectionInfo)
        .def("invalidate_remote_agent", &kvc::MooncakeTransferAgent::invalidateRemoteAgent, py::arg("name"))
        .def(
            "submit_transfer_requests",
            [](kvc::MooncakeTransferAgent& self, kvc::TransferRequest const& request)
            { return self.submitTransferRequests(request).release(); },
            py::arg("request"), py::return_value_policy::take_ownership, py::call_guard<py::gil_scoped_release>())
        .def("notify_sync_message", &kvc::MooncakeTransferAgent::notifySyncMessage, py::arg("name"),
            py::arg("sync_message"))
        .def("get_notified_sync_messages", &kvc::MooncakeTransferAgent::getNotifiedSyncMessages)
        .def("check_remote_descs", &kvc::MooncakeTransferAgent::checkRemoteDescs, py::arg("name"),
            py::arg("memory_descs"));
#endif
    // Factory function to create transfer agent by backend name (uses dynamic loading)
    m.def(
        "make_transfer_agent",
        [](std::string const& backend, kvc::BaseAgentConfig const& config) -> kvc::BaseTransferAgent*
        { return kvc::makeTransferAgent(backend, &config).release(); },
        py::arg("backend"), py::arg("config"), py::return_value_policy::take_ownership,
        "Create a transfer agent by backend name ('nixl' or 'mooncake'). Uses dynamic loading.");
    // Expose which backends are available
#ifdef ENABLE_NIXL
    m.attr("NIXL_ENABLED") = true;
#else
    m.attr("NIXL_ENABLED") = false;
#endif
#ifdef ENABLE_MOONCAKE
    m.attr("MOONCAKE_ENABLED") = true;
#else
    m.attr("MOONCAKE_ENABLED") = false;
#endif
}

View File

@ -81,7 +81,6 @@ tr::SamplingConfig makeSamplingConfig(std::vector<tr::SamplingConfig> const& con
NB_MODULE(TRTLLM_NB_MODULE, m)
{
m.doc() = "TensorRT LLM Python bindings for C++ runtime";
m.attr("binding_type") = "nanobind";
nb::set_leak_warnings(false);
// Create MpiComm binding first since it's used in the executor bindings

View File

@ -141,7 +141,7 @@ Two C++ examples are provided that show how to use the Executor API and can be
## Python Bindings for the Executor API
Python bindings for the Executor API are also available to use the Executor API from Python. The Python bindings are defined in [bindings.cpp](source:cpp/tensorrt_llm/pybind/executor/bindings.cpp) and once built, are available in package `tensorrt_llm.bindings.executor`. Running `help('tensorrt_llm.bindings.executor')` in a Python interpreter will provide an overview of the classes available.
Python bindings for the Executor API are also available to use the Executor API from Python. The Python bindings are defined in [bindings.cpp](source:cpp/tensorrt_llm/nanobind/executor/bindings.cpp) and once built, are available in package `tensorrt_llm.bindings.executor`. Running `help('tensorrt_llm.bindings.executor')` in a Python interpreter will provide an overview of the classes available.
In addition, three Python examples are provided to demonstrate how to use the Python bindings to the Executor API for single and multi-GPU models. They can be found in [`examples/bindings`](source:examples/bindings).

View File

@ -49,11 +49,6 @@ def CONFIG_LINUX_AARCH64 = "linux_aarch64"
@Field
def CONFIG_LINUX_AARCH64_LLVM = "linux_aarch64_LLVM"
@Field
def CONFIG_LINUX_X86_64_PYBIND = "linux_x86_64_Pybind"
@Field
def CONFIG_LINUX_AARCH64_PYBIND = "linux_aarch64_Pybind"
@Field
def BUILD_CONFIGS = [
@ -64,11 +59,6 @@ def BUILD_CONFIGS = [
(TARNAME) : "TensorRT-LLM.tar.gz",
(WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;103-real;120-real",
],
(CONFIG_LINUX_X86_64_PYBIND) : [
(WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake --micro_benchmarks",
(TARNAME) : "pybind-TensorRT-LLM.tar.gz",
(WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;103-real;120-real",
],
(CONFIG_LINUX_X86_64_SINGLE_DEVICE) : [
(WHEEL_EXTRA_ARGS) : "--extra-cmake-vars ENABLE_MULTI_DEVICE=0 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars ENABLE_UCX=0 --micro_benchmarks",
(TARNAME) : "single-device-TensorRT-LLM.tar.gz",
@ -85,12 +75,6 @@ def BUILD_CONFIGS = [
(WHEEL_ARCHS): "90-real;100-real;103-real;120-real",
(BUILD_JOBS_FOR_CONFIG): "8", // TODO: Remove after fix the build OOM issue on SBSA
],
(CONFIG_LINUX_AARCH64_PYBIND): [
(WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake",
(TARNAME) : "pybind-TensorRT-LLM-GH200.tar.gz",
(WHEEL_ARCHS): "90-real;100-real;103-real;120-real",
(BUILD_JOBS_FOR_CONFIG): "8", // TODO: Remove after fix the build OOM issue on SBSA
],
(CONFIG_LINUX_AARCH64_LLVM) : [
(WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CUDA_HOST_COMPILER=clang -DCMAKE_LINKER_TYPE=LLD",
(TARNAME) : "llvm-TensorRT-LLM-GH200.tar.gz",
@ -549,8 +533,6 @@ def launchStages(pipeline, cpu_arch, enableFailFast, globalVars)
pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64 : CONFIG_LINUX_X86_64_VANILLA),
"Build TRT-LLM LLVM": [LLM_DOCKER_IMAGE] + prepareLLMBuild(
pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64_LLVM : CONFIG_LINUX_X86_64_LLVM),
"Build TRT-LLM Pybind": [LLM_DOCKER_IMAGE] + prepareLLMBuild(
pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64_PYBIND : CONFIG_LINUX_X86_64_PYBIND),
]
if (cpu_arch == X86_64_TRIPLE) {

View File

@ -694,7 +694,7 @@ def getMultiGpuFileChanged(pipeline, testFilter, globalVars)
"cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.cpp",
"cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h",
"cpp/tensorrt_llm/plugins/ncclPlugin/",
"cpp/tensorrt_llm/pybind/",
"cpp/tensorrt_llm/nanobind/",
"cpp/tensorrt_llm/runtime/ipcUtils.cpp",
"cpp/tensorrt_llm/runtime/ncclCommunicator.cpp",
"cpp/tensorrt_llm/runtime/utils/mpiUtils.cpp",

View File

@ -65,9 +65,6 @@ def LLVM_CONFIG = "LLVM"
@Field
def LINUX_AARCH64_CONFIG = "linux_aarch64"
@Field
def PYBIND_CONFIG = "Pybind"
@Field
def BUILD_CONFIGS = [
// Vanilla TARNAME is used for packaging in runLLMPackage
@ -75,7 +72,6 @@ def BUILD_CONFIGS = [
(SINGLE_DEVICE_CONFIG) : [(TARNAME) : "single-device-TensorRT-LLM.tar.gz"],
(LLVM_CONFIG) : [(TARNAME) : "llvm-TensorRT-LLM.tar.gz"],
(LINUX_AARCH64_CONFIG) : [(TARNAME) : "TensorRT-LLM-GH200.tar.gz"],
(PYBIND_CONFIG) : [(TARNAME) : "pybind-TensorRT-LLM.tar.gz"],
]
// TODO: Move common variables to an unified location
@ -3185,7 +3181,6 @@ def launchTestJobs(pipeline, testFilter)
"A10-TensorRT-3": ["a10", "l0_a10", 3, 5],
"A10-TensorRT-4": ["a10", "l0_a10", 4, 5],
"A10-TensorRT-5": ["a10", "l0_a10", 5, 5],
"A10-Pybind": ["a10", "l0_a10_pybind", 1, 1],
"A30-Triton-1": ["a30", "l0_a30", 1, 1],
"A30-PyTorch-1": ["a30", "l0_a30", 1, 2],
"A30-PyTorch-2": ["a30", "l0_a30", 2, 2],
@ -3282,9 +3277,6 @@ def launchTestJobs(pipeline, testFilter)
if (key.contains("llvm")) {
config = LLVM_CONFIG
}
if (key.contains("Pybind")) {
config = PYBIND_CONFIG
}
runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3])
}]]}
fullSet = parallelJobs.keySet()

View File

@ -29,7 +29,6 @@ from multiprocessing import cpu_count
from pathlib import Path
from shutil import copy, copytree, rmtree
from subprocess import DEVNULL, CalledProcessError, check_output, run
from textwrap import dedent
from typing import Sequence
try:
@ -368,13 +367,10 @@ def check_missing_libs(lib_name: str) -> list[str]:
return missing
def generate_python_stubs_linux(binding_type: str, venv_python: Path,
deep_ep: bool, flash_mla: bool,
transfer_agent_binding: bool,
def generate_python_stubs_linux(venv_python: Path, deep_ep: bool,
flash_mla: bool, transfer_agent_binding: bool,
binding_lib_name: str):
is_nanobind = binding_type == "nanobind"
if is_nanobind:
build_run(f"\"{venv_python}\" -m pip install nanobind")
build_run(f"\"{venv_python}\" -m pip install nanobind")
build_run(f"\"{venv_python}\" -m pip install pybind11-stubgen")
env_stub_gen = os.environ.copy()
@ -393,14 +389,8 @@ def generate_python_stubs_linux(binding_type: str, venv_python: Path,
link_dir = None
try:
if is_nanobind:
build_run(
f"\"{venv_python}\" -m nanobind.stubgen -m bindings -r -O .",
env=env_stub_gen)
else:
build_run(
f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code",
env=env_stub_gen)
build_run(f"\"{venv_python}\" -m nanobind.stubgen -m bindings -r -O .",
env=env_stub_gen)
build_run(
f"\"{venv_python}\" -m pybind11_stubgen -o . deep_gemm_cpp_tllm --exit-code",
env=env_stub_gen)
@ -414,47 +404,21 @@ def generate_python_stubs_linux(binding_type: str, venv_python: Path,
env=env_stub_gen)
if transfer_agent_binding:
# Generate stubs for tensorrt_llm_transfer_agent_binding
if is_nanobind:
build_run(
f"\"{venv_python}\" -m nanobind.stubgen -m tensorrt_llm_transfer_agent_binding -O .",
env=env_stub_gen)
else:
build_run(
f"\"{venv_python}\" -m pybind11_stubgen -o . tensorrt_llm_transfer_agent_binding --exit-code",
env=env_stub_gen)
build_run(
f"\"{venv_python}\" -m nanobind.stubgen -m tensorrt_llm_transfer_agent_binding -O .",
env=env_stub_gen)
finally:
if link_dir:
rmtree(link_dir)
def generate_python_stubs_windows(binding_type: str, venv_python: Path,
pkg_dir: Path, lib_dir: Path):
if binding_type == "nanobind":
print("Windows not yet supported for nanobind stubs")
exit(1)
else:
build_run(f"\"{venv_python}\" -m pip install pybind11-stubgen")
stubgen = "stubgen.py"
stubgen_contents = """
# Loading torch, trt before bindings is required to avoid import errors on windows.
# isort: off
import torch
import tensorrt as trt
# isort: on
import os
import platform
def generate_python_stubs_windows(venv_python: Path, pkg_dir: Path,
lib_dir: Path):
from pybind11_stubgen import main
if __name__ == "__main__":
# Load dlls from `libs` directory before launching bindings.
if platform.system() == "Windows":
os.add_dll_directory(r\"{lib_dir}\")
main()
""".format(lib_dir=lib_dir)
(pkg_dir / stubgen).write_text(dedent(stubgen_contents))
build_run(f"\"{venv_python}\" {stubgen} -o . bindings")
(pkg_dir / stubgen).unlink()
print("Windows not supported for nanobind stubs")
exit(1)
def build_kv_cache_manager_v2(project_dir, venv_python, use_mypyc=False):
@ -517,7 +481,6 @@ def main(*,
install: bool = False,
skip_building_wheel: bool = False,
linking_install_binary: bool = False,
binding_type: str = "nanobind",
benchmarks: bool = False,
micro_benchmarks: bool = False,
nvtx: bool = False,
@ -621,39 +584,6 @@ def main(*,
clear_folder(build_dir) # Keep the folder in case it is mounted.
build_dir.mkdir(parents=True, exist_ok=True)
def get_binding_type_from_cache():
cmake_cache_file = build_dir / "CMakeCache.txt"
if not cmake_cache_file.exists():
return None
with open(cmake_cache_file, 'r') as f:
for line in f:
if line.startswith("BINDING_TYPE:STRING="):
cashed_binding_type = line.split("=", 1)[1].strip()
if cashed_binding_type in ['pybind', 'nanobind']:
return cashed_binding_type
return None
cached_binding_type = get_binding_type_from_cache()
if not first_build and cached_binding_type != binding_type:
# Clean up of previous binding build artifacts
nanobind_dir = build_dir / "tensorrt_llm" / "nanobind"
if nanobind_dir.exists():
rmtree(nanobind_dir)
nanobind_stub_dir = project_dir / "tensorrt_llm" / "bindings"
if nanobind_stub_dir.exists():
rmtree(nanobind_stub_dir)
pybind_dir = build_dir / "tensorrt_llm" / "pybind"
if pybind_dir.exists():
rmtree(pybind_dir)
pybind_stub_dir = project_dir / "tensorrt_llm" / "bindings"
if pybind_stub_dir.exists():
rmtree(pybind_stub_dir)
configure_cmake = True
if use_ccache:
cmake_def_args.append(
f"-DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache"
@ -716,7 +646,7 @@ def main(*,
)
cmake_def_args = " ".join(cmake_def_args)
cmake_configure_command = (
f'cmake -DCMAKE_BUILD_TYPE="{build_type}" -DBUILD_PYT="{build_pyt}" -DBINDING_TYPE="{binding_type}" -DBUILD_DEEP_EP="{build_deep_ep}" -DBUILD_DEEP_GEMM="{build_deep_gemm}" -DBUILD_FLASH_MLA="{build_flash_mla}"'
f'cmake -DCMAKE_BUILD_TYPE="{build_type}" -DBUILD_PYT="{build_pyt}" -DBUILD_DEEP_EP="{build_deep_ep}" -DBUILD_DEEP_GEMM="{build_deep_gemm}" -DBUILD_FLASH_MLA="{build_flash_mla}"'
f' -DNVTX_DISABLE="{disable_nvtx}" -DBUILD_MICRO_BENCHMARKS={build_micro_benchmarks}'
f' -DBUILD_WHEEL_TARGETS="{";".join(targets)}"'
f' -DPython_EXECUTABLE={venv_python} -DPython3_EXECUTABLE={venv_python}'
@ -947,7 +877,7 @@ def main(*,
) == 1, f"Exactly one binding library should be present: {binding_lib}"
return binding_lib[0]
binding_lib_dir = get_binding_lib(binding_type, "bindings")
binding_lib_dir = get_binding_lib("nanobind", "bindings")
binding_lib_file_name = binding_lib_dir.name
install_file(binding_lib_dir, pkg_dir)
@ -995,12 +925,10 @@ def main(*,
if not skip_stubs:
with working_directory(pkg_dir):
if on_windows:
generate_python_stubs_windows(binding_type, venv_python,
pkg_dir, lib_dir)
generate_python_stubs_windows(venv_python, pkg_dir, lib_dir)
else: # on linux
generate_python_stubs_linux(
binding_type, venv_python,
bool(deep_ep_cuda_architectures),
venv_python, bool(deep_ep_cuda_architectures),
bool(flash_mla_cuda_architectures),
nixl_root is not None or mooncake_root is not None,
binding_lib_file_name)
@ -1155,10 +1083,6 @@ def add_arguments(parser: ArgumentParser):
help=
"Install the built binary by creating symbolic links instead of copying files"
)
parser.add_argument("--binding_type",
choices=["pybind", "nanobind"],
default="nanobind",
help="Which binding library to use: pybind or nanobind")
parser.add_argument("--benchmarks",
action="store_true",
help="Build the benchmarks for the C++ runtime")

View File

@ -279,19 +279,3 @@ l0_a10:
backend: fmha
tests:
- test_fmha.py::test_fmha TIMEOUT (90)
l0_a10_pybind:
- condition:
ranges:
system_gpu_count:
gte: 1
lte: 1
wildcards:
gpu:
- '*a10*'
linux_distribution_name: ubuntu*
terms:
stage: pre_merge
tests:
- unittest/bindings
- test_e2e.py::test_openai_chat_example[trt]
- test_e2e.py::test_openai_chat_example[pytorch] TIMEOUT (90)