Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-02-04 18:21:52 +08:00)

[None][chore] Removing pybind11 bindings and references (#10550)

Signed-off-by: Linda-Stadter <57756729+Linda-Stadter@users.noreply.github.com>

Commit: ce556290c9
Parent: ce37e27066
.github/CODEOWNERS (vendored): 2 changes
@@ -185,8 +185,6 @@ docs/source/performance/perf-benchmarking.md @NVIDIA/trtllm-bench-reviewers
 /tensorrt_llm/_torch/pyexecutor/resource_manager.py @NVIDIA/trt-llm-kv-cache-manager-devs
 /cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.h @NVIDIA/trt-llm-kv-cache-manager-devs
 /cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp @NVIDIA/trt-llm-kv-cache-manager-devs
-/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.h @NVIDIA/trt-llm-kv-cache-manager-devs
-/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp @NVIDIA/trt-llm-kv-cache-manager-devs

 # The rule below requires that any PR modifying public APIs must be approved by at least one member
 # of the NVIDIA/trt-llm-committed-api-review-committee or NVIDIA/trt-llm-noncommitted-api-review-committee team.
.github/tava_architecture_diagram.md (vendored): 4 changes
@@ -55,8 +55,8 @@ graph TB
        Sampling[Sampling]
        BatchManager[Batch Manager]
        KVCache[KV Cache Manager]
-    PyScheduler --> |Pybind|Shared_Scheduler
-    PyDecoder --> |Pybind|Shared_Decoder
+    PyScheduler --> |Nanobind|Shared_Scheduler
+    PyDecoder --> |Nanobind|Shared_Decoder
     Executor --> Shared_Decoder
     Shared_Decoder --> Sampling
     Executor --> Shared_Scheduler[Scheduler]
@@ -83,11 +83,6 @@ endif()
 add_compile_definitions("TLLM_GEN_EXPORT_INTERFACE")
 add_compile_definitions("TLLM_ENABLE_CUDA")

-set(BINDING_TYPE
-    "nanobind"
-    CACHE STRING
-    "Binding type of Python bindings for C++ runtime and batch manager")
-
 set(INTERNAL_CUTLASS_KERNELS_PATH
     ""
     CACHE
@@ -246,16 +241,15 @@ get_filename_component(TRT_LLM_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} PATH)
 set(3RDPARTY_DIR ${TRT_LLM_ROOT_DIR}/3rdparty)
 add_subdirectory(${3RDPARTY_DIR} 3rdparty)

-if(BINDING_TYPE STREQUAL "pybind"
-   OR BUILD_DEEP_EP
-   OR BUILD_DEEP_GEMM)
+if(BUILD_DEEP_EP
+   OR BUILD_DEEP_GEMM
+   OR BUILD_FLASH_MLA)
   FetchContent_MakeAvailable(pybind11)
   include_directories(${CMAKE_BINARY_DIR}/_deps/pybind11-src/include)
 endif()
-if(BINDING_TYPE STREQUAL "nanobind")
-  FetchContent_MakeAvailable(nanobind)
-  include_directories(${CMAKE_BINARY_DIR}/_deps/nanobind-src/include)
-endif()
+
+FetchContent_MakeAvailable(nanobind)
+include_directories(${CMAKE_BINARY_DIR}/_deps/nanobind-src/include)

 FetchContent_MakeAvailable(cutlass cxxopts flashmla json xgrammar)
@@ -293,13 +293,7 @@ if(BUILD_PYT)
   add_subdirectory(thop)
 endif()

-if(BINDING_TYPE STREQUAL "pybind")
-  add_subdirectory(pybind)
-endif()
-
-if(BINDING_TYPE STREQUAL "nanobind")
-  add_subdirectory(nanobind)
-endif()
+add_subdirectory(nanobind)

 if(BUILD_DEEP_EP)
   add_subdirectory(deep_ep)
@@ -65,23 +65,10 @@ if(NIXL_ENABLED OR MOONCAKE_ENABLED)

   # Collect binding source files
   set(AGENT_BINDING_SOURCES "")
-  if(BINDING_TYPE STREQUAL "pybind")
-    list(APPEND AGENT_BINDING_SOURCES agentBindingsPybind.cpp)
-  else()
     list(APPEND AGENT_BINDING_SOURCES agentBindingsNanobind.cpp)
-  endif()
   list(APPEND AGENT_BINDING_SOURCES agentBindings.cpp)

-  if(BINDING_TYPE STREQUAL "pybind")
-    # Use pybind11 (already fetched via FetchContent)
-    pybind11_add_module(${TRANSFER_AGENT_BINDING_TARGET}
-                        ${AGENT_BINDING_SOURCES})
-    message(STATUS "Building tensorrt_llm_transfer_agent_binding with pybind11")
-  else()
-    # Default to nanobind (already fetched via FetchContent)
-    nanobind_add_module(${TRANSFER_AGENT_BINDING_TARGET}
-                        ${AGENT_BINDING_SOURCES})
-    message(STATUS "Building tensorrt_llm_transfer_agent_binding with nanobind")
-  endif()
+  nanobind_add_module(${TRANSFER_AGENT_BINDING_TARGET} ${AGENT_BINDING_SOURCES})
+  message(STATUS "Building tensorrt_llm_transfer_agent_binding with nanobind")

   target_compile_options(${TRANSFER_AGENT_BINDING_TARGET} PRIVATE -Wno-error)
@@ -1,250 +0,0 @@
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "tensorrt_llm/executor/transferAgent.h"

#ifdef ENABLE_NIXL
#include "transferAgent.h"
#endif

#ifdef ENABLE_MOONCAKE
#include "../mooncake_utils/transferAgent.h"
#endif

#include <pybind11/functional.h>
#include <pybind11/operators.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

namespace py = pybind11;
namespace kvc = tensorrt_llm::executor::kv_cache;

PYBIND11_MODULE(tensorrt_llm_transfer_agent_binding, m)
{
    m.doc() = "TensorRT-LLM Transfer Agent Python bindings (pybind11)";

    // MemoryType enum
    py::enum_<kvc::MemoryType>(m, "MemoryType")
        .value("DRAM", kvc::MemoryType::kDRAM)
        .value("VRAM", kvc::MemoryType::kVRAM)
        .value("BLK", kvc::MemoryType::kBLK)
        .value("OBJ", kvc::MemoryType::kOBJ)
        .value("FILE", kvc::MemoryType::kFILE);

    // TransferOp enum
    py::enum_<kvc::TransferOp>(m, "TransferOp")
        .value("READ", kvc::TransferOp::kREAD)
        .value("WRITE", kvc::TransferOp::kWRITE);

    // TransferState enum
    py::enum_<kvc::TransferState>(m, "TransferState")
        .value("IN_PROGRESS", kvc::TransferState::kIN_PROGRESS)
        .value("SUCCESS", kvc::TransferState::kSUCCESS)
        .value("FAILURE", kvc::TransferState::kFAILURE);

    // MemoryDesc class
    py::class_<kvc::MemoryDesc>(m, "MemoryDesc")
        .def(py::init<uintptr_t, size_t, uint32_t>(), py::arg("addr"), py::arg("len"), py::arg("device_id"))
        .def_property_readonly("addr", &kvc::MemoryDesc::getAddr)
        .def_property_readonly("len", &kvc::MemoryDesc::getLen)
        .def_property_readonly("device_id", &kvc::MemoryDesc::getDeviceId);

    // MemoryDescs class
    py::class_<kvc::MemoryDescs>(m, "MemoryDescs")
        .def(py::init<kvc::MemoryType, std::vector<kvc::MemoryDesc>>(), py::arg("type"), py::arg("descs"))
        // Batch constructor from list of tuples: [(ptr, size, device_id), ...]
        .def(py::init(
                 [](kvc::MemoryType type, std::vector<std::tuple<uintptr_t, size_t, uint32_t>> const& tuples)
                 {
                     std::vector<kvc::MemoryDesc> descs;
                     descs.reserve(tuples.size());
                     for (auto const& [addr, len, deviceId] : tuples)
                     {
                         descs.emplace_back(addr, len, deviceId);
                     }
                     return kvc::MemoryDescs(type, std::move(descs));
                 }),
            py::arg("type"), py::arg("tuples"))
        .def_property_readonly("type", &kvc::MemoryDescs::getType)
        .def_property_readonly("descs", &kvc::MemoryDescs::getDescs);

    // AgentDesc class
    py::class_<kvc::AgentDesc>(m, "AgentDesc")
        .def(py::init(
                 [](py::bytes data)
                 {
                     std::string str(PyBytes_AsString(data.ptr()), PyBytes_Size(data.ptr()));
                     return kvc::AgentDesc{std::move(str)};
                 }),
            py::arg("backend_agent_desc"))
        .def(py::init<std::string>(), py::arg("backend_agent_desc"))
        .def_property_readonly("backend_agent_desc",
            [](kvc::AgentDesc const& self)
            {
                auto const& desc = self.getBackendAgentDesc();
                return py::bytes(desc.data(), desc.size());
            });

    // TransferRequest class
    py::class_<kvc::TransferRequest>(m, "TransferRequest")
        .def(py::init<kvc::TransferOp, kvc::TransferDescs, kvc::TransferDescs, std::string const&,
                 std::optional<kvc::SyncMessage>>(),
            py::arg("op"), py::arg("src_descs"), py::arg("dst_descs"), py::arg("remote_name"),
            py::arg("sync_message") = std::nullopt)
        .def_property_readonly("op", &kvc::TransferRequest::getOp)
        .def_property_readonly("src_descs", &kvc::TransferRequest::getSrcDescs)
        .def_property_readonly("dst_descs", &kvc::TransferRequest::getDstDescs)
        .def_property_readonly("remote_name", &kvc::TransferRequest::getRemoteName)
        .def_property_readonly("sync_message", &kvc::TransferRequest::getSyncMessage);

    // TransferStatus base class
    py::class_<kvc::TransferStatus>(m, "TransferStatus")
        .def("is_completed", &kvc::TransferStatus::isCompleted)
        .def("wait", &kvc::TransferStatus::wait, py::arg("timeout_ms") = -1);

    // BaseAgentConfig struct
    py::class_<kvc::BaseAgentConfig>(m, "BaseAgentConfig")
        .def(py::init<>())
        .def(py::init(
                 [](std::string name, bool use_prog_thread, bool multi_thread, bool use_listen_thread,
                     bool enable_telemetry, std::unordered_map<std::string, std::string> backend_params)
                 {
                     return kvc::BaseAgentConfig{std::move(name), use_prog_thread, multi_thread, use_listen_thread,
                         enable_telemetry, std::move(backend_params)};
                 }),
            py::arg("name"), py::arg("use_prog_thread") = true, py::arg("multi_thread") = false,
            py::arg("use_listen_thread") = false, py::arg("enable_telemetry") = false,
            py::arg("backend_params") = std::unordered_map<std::string, std::string>{})
        .def_readwrite("name", &kvc::BaseAgentConfig::mName)
        .def_readwrite("use_prog_thread", &kvc::BaseAgentConfig::useProgThread)
        .def_readwrite("multi_thread", &kvc::BaseAgentConfig::multiThread)
        .def_readwrite("use_listen_thread", &kvc::BaseAgentConfig::useListenThread)
        .def_readwrite("enable_telemetry", &kvc::BaseAgentConfig::enableTelemetry)
        .def_readwrite("backend_params", &kvc::BaseAgentConfig::backendParams);

    // BaseTransferAgent class (abstract base)
    py::class_<kvc::BaseTransferAgent>(m, "BaseTransferAgent")
        .def("register_memory", &kvc::BaseTransferAgent::registerMemory, py::arg("descs"))
        .def("deregister_memory", &kvc::BaseTransferAgent::deregisterMemory, py::arg("descs"))
        .def("load_remote_agent",
            py::overload_cast<std::string const&, kvc::AgentDesc const&>(&kvc::BaseTransferAgent::loadRemoteAgent),
            py::arg("name"), py::arg("agent_desc"))
        .def("load_remote_agent_by_connection",
            py::overload_cast<std::string const&, kvc::ConnectionInfoType const&>(
                &kvc::BaseTransferAgent::loadRemoteAgent),
            py::arg("name"), py::arg("connection_info"))
        .def("get_local_agent_desc", &kvc::BaseTransferAgent::getLocalAgentDesc)
        .def("invalidate_remote_agent", &kvc::BaseTransferAgent::invalidateRemoteAgent, py::arg("name"))
        .def(
            "submit_transfer_requests",
            [](kvc::BaseTransferAgent& self, kvc::TransferRequest const& request)
            { return self.submitTransferRequests(request).release(); },
            py::arg("request"), py::return_value_policy::take_ownership)
        .def(
            "notify_sync_message", &kvc::BaseTransferAgent::notifySyncMessage, py::arg("name"), py::arg("sync_message"))
        .def("get_notified_sync_messages", &kvc::BaseTransferAgent::getNotifiedSyncMessages)
        .def("get_local_connection_info", &kvc::BaseTransferAgent::getLocalConnectionInfo)
        .def("check_remote_descs", &kvc::BaseTransferAgent::checkRemoteDescs, py::arg("name"), py::arg("memory_descs"));

#ifdef ENABLE_NIXL
    // NixlTransferStatus class - release GIL for blocking operations
    py::class_<kvc::NixlTransferStatus, kvc::TransferStatus>(m, "NixlTransferStatus")
        .def("is_completed", &kvc::NixlTransferStatus::isCompleted, py::call_guard<py::gil_scoped_release>())
        .def("wait", &kvc::NixlTransferStatus::wait, py::arg("timeout_ms") = -1,
            py::call_guard<py::gil_scoped_release>());

    // NixlTransferAgent class
    py::class_<kvc::NixlTransferAgent, kvc::BaseTransferAgent>(m, "NixlTransferAgent")
        .def(py::init<kvc::BaseAgentConfig const&>(), py::arg("config"))
        .def("register_memory", &kvc::NixlTransferAgent::registerMemory, py::arg("descs"))
        .def("deregister_memory", &kvc::NixlTransferAgent::deregisterMemory, py::arg("descs"))
        .def("load_remote_agent",
            py::overload_cast<std::string const&, kvc::AgentDesc const&>(&kvc::NixlTransferAgent::loadRemoteAgent),
            py::arg("name"), py::arg("agent_desc"))
        .def("load_remote_agent_by_connection",
            py::overload_cast<std::string const&, kvc::ConnectionInfoType const&>(
                &kvc::NixlTransferAgent::loadRemoteAgent),
            py::arg("name"), py::arg("connection_info"))
        .def("get_local_agent_desc", &kvc::NixlTransferAgent::getLocalAgentDesc)
        .def("get_local_connection_info", &kvc::NixlTransferAgent::getLocalConnectionInfo)
        .def("invalidate_remote_agent", &kvc::NixlTransferAgent::invalidateRemoteAgent, py::arg("name"))
        .def(
            "submit_transfer_requests",
            [](kvc::NixlTransferAgent& self, kvc::TransferRequest const& request)
            { return self.submitTransferRequests(request).release(); },
            py::arg("request"), py::return_value_policy::take_ownership, py::call_guard<py::gil_scoped_release>())
        .def(
            "notify_sync_message", &kvc::NixlTransferAgent::notifySyncMessage, py::arg("name"), py::arg("sync_message"))
        .def("get_notified_sync_messages", &kvc::NixlTransferAgent::getNotifiedSyncMessages)
        .def("check_remote_descs", &kvc::NixlTransferAgent::checkRemoteDescs, py::arg("name"), py::arg("memory_descs"));
#endif

#ifdef ENABLE_MOONCAKE
    // MooncakeTransferStatus class - release GIL for blocking operations
    py::class_<kvc::MooncakeTransferStatus, kvc::TransferStatus>(m, "MooncakeTransferStatus")
        .def("is_completed", &kvc::MooncakeTransferStatus::isCompleted, py::call_guard<py::gil_scoped_release>())
        .def("wait", &kvc::MooncakeTransferStatus::wait, py::arg("timeout_ms") = -1,
            py::call_guard<py::gil_scoped_release>());

    // MooncakeTransferAgent class
    py::class_<kvc::MooncakeTransferAgent, kvc::BaseTransferAgent>(m, "MooncakeTransferAgent")
        .def(py::init<kvc::BaseAgentConfig const&>(), py::arg("config"))
        .def("register_memory", &kvc::MooncakeTransferAgent::registerMemory, py::arg("descs"))
        .def("deregister_memory", &kvc::MooncakeTransferAgent::deregisterMemory, py::arg("descs"))
        .def("load_remote_agent",
            py::overload_cast<std::string const&, kvc::AgentDesc const&>(&kvc::MooncakeTransferAgent::loadRemoteAgent),
            py::arg("name"), py::arg("agent_desc"))
        .def("load_remote_agent_by_connection",
            py::overload_cast<std::string const&, kvc::ConnectionInfoType const&>(
                &kvc::MooncakeTransferAgent::loadRemoteAgent),
            py::arg("name"), py::arg("connection_info"))
        .def("get_local_agent_desc", &kvc::MooncakeTransferAgent::getLocalAgentDesc)
        .def("get_local_connection_info", &kvc::MooncakeTransferAgent::getLocalConnectionInfo)
        .def("invalidate_remote_agent", &kvc::MooncakeTransferAgent::invalidateRemoteAgent, py::arg("name"))
        .def(
            "submit_transfer_requests",
            [](kvc::MooncakeTransferAgent& self, kvc::TransferRequest const& request)
            { return self.submitTransferRequests(request).release(); },
            py::arg("request"), py::return_value_policy::take_ownership, py::call_guard<py::gil_scoped_release>())
        .def("notify_sync_message", &kvc::MooncakeTransferAgent::notifySyncMessage, py::arg("name"),
            py::arg("sync_message"))
        .def("get_notified_sync_messages", &kvc::MooncakeTransferAgent::getNotifiedSyncMessages)
        .def("check_remote_descs", &kvc::MooncakeTransferAgent::checkRemoteDescs, py::arg("name"),
            py::arg("memory_descs"));
#endif

    // Factory function to create transfer agent by backend name (uses dynamic loading)
    m.def(
        "make_transfer_agent",
        [](std::string const& backend, kvc::BaseAgentConfig const& config) -> kvc::BaseTransferAgent*
        { return kvc::makeTransferAgent(backend, &config).release(); },
        py::arg("backend"), py::arg("config"), py::return_value_policy::take_ownership,
        "Create a transfer agent by backend name ('nixl' or 'mooncake'). Uses dynamic loading.");

    // Expose which backends are available
#ifdef ENABLE_NIXL
    m.attr("NIXL_ENABLED") = true;
#else
    m.attr("NIXL_ENABLED") = false;
#endif

#ifdef ENABLE_MOONCAKE
    m.attr("MOONCAKE_ENABLED") = true;
#else
    m.attr("MOONCAKE_ENABLED") = false;
#endif
}
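The pybind11 module deleted above is superseded by the nanobind implementation behind the same `tensorrt_llm_transfer_agent_binding` module name. As a hedged illustration only, the sketch below exercises the names registered above (`BaseAgentConfig`, `make_transfer_agent`, `MemoryType`, `MemoryDescs`, `register_memory`); it assumes the nanobind build keeps the same Python-facing API, that the module is on the import path, and that the NIXL backend is actually installed. The buffer is a plain host allocation used purely for illustration.

# Hedged usage sketch of the transfer-agent binding module; names follow the
# bindings shown above, availability of the backend is an assumption.
import ctypes
import tensorrt_llm_transfer_agent_binding as tab

if tab.NIXL_ENABLED:
    config = tab.BaseAgentConfig(name="agent0")      # defaults: use_prog_thread=True, ...
    agent = tab.make_transfer_agent("nixl", config)  # backend resolved via dynamic loading

    # Describe a real 4 KiB host buffer as (addr, len, device_id) tuples.
    buf = (ctypes.c_byte * 4096)()
    descs = tab.MemoryDescs(tab.MemoryType.DRAM, [(ctypes.addressof(buf), 4096, 0)])
    agent.register_memory(descs)

    # The serialized agent descriptor can be exchanged with a remote peer.
    print(agent.get_local_agent_desc().backend_agent_desc)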
@@ -81,7 +81,6 @@ tr::SamplingConfig makeSamplingConfig(std::vector<tr::SamplingConfig> const& con
 NB_MODULE(TRTLLM_NB_MODULE, m)
 {
     m.doc() = "TensorRT LLM Python bindings for C++ runtime";
-    m.attr("binding_type") = "nanobind";
     nb::set_leak_warnings(false);

     // Create MpiComm binding first since it's used in the executor bindings
@@ -141,7 +141,7 @@ Two C++ examples are provided that shows how to use the Executor API and can be

 ## Python Bindings for the Executor API

-Python bindings for the Executor API are also available to use the Executor API from Python. The Python bindings are defined in [bindings.cpp](source:cpp/tensorrt_llm/pybind/executor/bindings.cpp) and once built, are available in package `tensorrt_llm.bindings.executor`. Running `help('tensorrt_llm.bindings.executor')` in a Python interpreter will provide an overview of the classes available.
+Python bindings for the Executor API are also available to use the Executor API from Python. The Python bindings are defined in [bindings.cpp](source:cpp/tensorrt_llm/nanobind/executor/bindings.cpp) and once built, are available in package `tensorrt_llm.bindings.executor`. Running `help('tensorrt_llm.bindings.executor')` in a Python interpreter will provide an overview of the classes available.

 In addition, three Python examples are provided to demonstrate how to use the Python bindings to the Executor API for single and multi-GPU models. They can be found in [`examples/bindings`](source:examples/bindings).
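To make the documentation change above concrete, here is a short, hedged sketch of inspecting the bindings from Python; it assumes a built TensorRT-LLM wheel is installed so that the `tensorrt_llm.bindings.executor` package named in the paragraph can be imported.

# Assumes a built TensorRT-LLM wheel is installed.
import tensorrt_llm.bindings.executor as trtllm_executor

# Overview of the classes exposed by the binding module, as the docs suggest.
help("tensorrt_llm.bindings.executor")

# Or list the public names programmatically.
print([name for name in dir(trtllm_executor) if not name.startswith("_")])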
@@ -49,11 +49,6 @@ def CONFIG_LINUX_AARCH64 = "linux_aarch64"
 @Field
 def CONFIG_LINUX_AARCH64_LLVM = "linux_aarch64_LLVM"

-@Field
-def CONFIG_LINUX_X86_64_PYBIND = "linux_x86_64_Pybind"
-
-@Field
-def CONFIG_LINUX_AARCH64_PYBIND = "linux_aarch64_Pybind"
-
 @Field
 def BUILD_CONFIGS = [
@@ -64,11 +59,6 @@ def BUILD_CONFIGS = [
         (TARNAME) : "TensorRT-LLM.tar.gz",
         (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;103-real;120-real",
     ],
-    (CONFIG_LINUX_X86_64_PYBIND) : [
-        (WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake --micro_benchmarks",
-        (TARNAME) : "pybind-TensorRT-LLM.tar.gz",
-        (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;103-real;120-real",
-    ],
     (CONFIG_LINUX_X86_64_SINGLE_DEVICE) : [
         (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars ENABLE_MULTI_DEVICE=0 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars ENABLE_UCX=0 --micro_benchmarks",
         (TARNAME) : "single-device-TensorRT-LLM.tar.gz",
@@ -85,12 +75,6 @@ def BUILD_CONFIGS = [
         (WHEEL_ARCHS): "90-real;100-real;103-real;120-real",
         (BUILD_JOBS_FOR_CONFIG): "8", // TODO: Remove after fix the build OOM issue on SBSA
     ],
-    (CONFIG_LINUX_AARCH64_PYBIND): [
-        (WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake",
-        (TARNAME) : "pybind-TensorRT-LLM-GH200.tar.gz",
-        (WHEEL_ARCHS): "90-real;100-real;103-real;120-real",
-        (BUILD_JOBS_FOR_CONFIG): "8", // TODO: Remove after fix the build OOM issue on SBSA
-    ],
     (CONFIG_LINUX_AARCH64_LLVM) : [
         (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CUDA_HOST_COMPILER=clang -DCMAKE_LINKER_TYPE=LLD",
         (TARNAME) : "llvm-TensorRT-LLM-GH200.tar.gz",
@@ -549,8 +533,6 @@ def launchStages(pipeline, cpu_arch, enableFailFast, globalVars)
             pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64 : CONFIG_LINUX_X86_64_VANILLA),
         "Build TRT-LLM LLVM": [LLM_DOCKER_IMAGE] + prepareLLMBuild(
             pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64_LLVM : CONFIG_LINUX_X86_64_LLVM),
-        "Build TRT-LLM Pybind": [LLM_DOCKER_IMAGE] + prepareLLMBuild(
-            pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64_PYBIND : CONFIG_LINUX_X86_64_PYBIND),
     ]

     if (cpu_arch == X86_64_TRIPLE) {
@@ -694,7 +694,7 @@ def getMultiGpuFileChanged(pipeline, testFilter, globalVars)
         "cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.cpp",
         "cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h",
         "cpp/tensorrt_llm/plugins/ncclPlugin/",
-        "cpp/tensorrt_llm/pybind/",
+        "cpp/tensorrt_llm/nanobind/",
         "cpp/tensorrt_llm/runtime/ipcUtils.cpp",
         "cpp/tensorrt_llm/runtime/ncclCommunicator.cpp",
         "cpp/tensorrt_llm/runtime/utils/mpiUtils.cpp",
@@ -65,9 +65,6 @@ def LLVM_CONFIG = "LLVM"
 @Field
 def LINUX_AARCH64_CONFIG = "linux_aarch64"

-@Field
-def PYBIND_CONFIG = "Pybind"
-
 @Field
 def BUILD_CONFIGS = [
     // Vanilla TARNAME is used for packaging in runLLMPackage
@@ -75,7 +72,6 @@ def BUILD_CONFIGS = [
     (SINGLE_DEVICE_CONFIG) : [(TARNAME) : "single-device-TensorRT-LLM.tar.gz"],
     (LLVM_CONFIG) : [(TARNAME) : "llvm-TensorRT-LLM.tar.gz"],
     (LINUX_AARCH64_CONFIG) : [(TARNAME) : "TensorRT-LLM-GH200.tar.gz"],
-    (PYBIND_CONFIG) : [(TARNAME) : "pybind-TensorRT-LLM.tar.gz"],
 ]

 // TODO: Move common variables to an unified location
@@ -3185,7 +3181,6 @@ def launchTestJobs(pipeline, testFilter)
         "A10-TensorRT-3": ["a10", "l0_a10", 3, 5],
         "A10-TensorRT-4": ["a10", "l0_a10", 4, 5],
         "A10-TensorRT-5": ["a10", "l0_a10", 5, 5],
-        "A10-Pybind": ["a10", "l0_a10_pybind", 1, 1],
         "A30-Triton-1": ["a30", "l0_a30", 1, 1],
         "A30-PyTorch-1": ["a30", "l0_a30", 1, 2],
         "A30-PyTorch-2": ["a30", "l0_a30", 2, 2],
@@ -3282,9 +3277,6 @@ def launchTestJobs(pipeline, testFilter)
             if (key.contains("llvm")) {
                 config = LLVM_CONFIG
             }
-            if (key.contains("Pybind")) {
-                config = PYBIND_CONFIG
-            }
             runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3])
         }]]}
     fullSet = parallelJobs.keySet()
@@ -29,7 +29,6 @@ from multiprocessing import cpu_count
 from pathlib import Path
 from shutil import copy, copytree, rmtree
 from subprocess import DEVNULL, CalledProcessError, check_output, run
-from textwrap import dedent
 from typing import Sequence

 try:
@@ -368,13 +367,10 @@ def check_missing_libs(lib_name: str) -> list[str]:
     return missing


-def generate_python_stubs_linux(binding_type: str, venv_python: Path,
-                                deep_ep: bool, flash_mla: bool,
-                                transfer_agent_binding: bool,
+def generate_python_stubs_linux(venv_python: Path, deep_ep: bool,
+                                flash_mla: bool, transfer_agent_binding: bool,
                                 binding_lib_name: str):
-    is_nanobind = binding_type == "nanobind"
-    if is_nanobind:
-        build_run(f"\"{venv_python}\" -m pip install nanobind")
+    build_run(f"\"{venv_python}\" -m pip install nanobind")
     build_run(f"\"{venv_python}\" -m pip install pybind11-stubgen")

     env_stub_gen = os.environ.copy()
@@ -393,14 +389,8 @@ def generate_python_stubs_linux(binding_type: str, venv_python: Path,
     link_dir = None

     try:
-        if is_nanobind:
-            build_run(
-                f"\"{venv_python}\" -m nanobind.stubgen -m bindings -r -O .",
-                env=env_stub_gen)
-        else:
-            build_run(
-                f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code",
-                env=env_stub_gen)
+        build_run(f"\"{venv_python}\" -m nanobind.stubgen -m bindings -r -O .",
+                  env=env_stub_gen)
         build_run(
             f"\"{venv_python}\" -m pybind11_stubgen -o . deep_gemm_cpp_tllm --exit-code",
             env=env_stub_gen)
@@ -414,47 +404,21 @@ def generate_python_stubs_linux(binding_type: str, venv_python: Path,
                 env=env_stub_gen)
         if transfer_agent_binding:
             # Generate stubs for tensorrt_llm_transfer_agent_binding
-            if is_nanobind:
-                build_run(
-                    f"\"{venv_python}\" -m nanobind.stubgen -m tensorrt_llm_transfer_agent_binding -O .",
-                    env=env_stub_gen)
-            else:
-                build_run(
-                    f"\"{venv_python}\" -m pybind11_stubgen -o . tensorrt_llm_transfer_agent_binding --exit-code",
-                    env=env_stub_gen)
+            build_run(
+                f"\"{venv_python}\" -m nanobind.stubgen -m tensorrt_llm_transfer_agent_binding -O .",
+                env=env_stub_gen)

     finally:
         if link_dir:
             rmtree(link_dir)


-def generate_python_stubs_windows(binding_type: str, venv_python: Path,
-                                  pkg_dir: Path, lib_dir: Path):
-    if binding_type == "nanobind":
-        print("Windows not yet supported for nanobind stubs")
-        exit(1)
-    else:
-        build_run(f"\"{venv_python}\" -m pip install pybind11-stubgen")
-        stubgen = "stubgen.py"
-        stubgen_contents = """
-        # Loading torch, trt before bindings is required to avoid import errors on windows.
-        # isort: off
-        import torch
-        import tensorrt as trt
-        # isort: on
-        import os
-        import platform
-
-        from pybind11_stubgen import main
-
-        if __name__ == "__main__":
-            # Load dlls from `libs` directory before launching bindings.
-            if platform.system() == "Windows":
-                os.add_dll_directory(r\"{lib_dir}\")
-            main()
-        """.format(lib_dir=lib_dir)
-        (pkg_dir / stubgen).write_text(dedent(stubgen_contents))
-        build_run(f"\"{venv_python}\" {stubgen} -o . bindings")
-        (pkg_dir / stubgen).unlink()
+def generate_python_stubs_windows(venv_python: Path, pkg_dir: Path,
+                                  lib_dir: Path):
+    print("Windows not supported for nanobind stubs")
+    exit(1)


 def build_kv_cache_manager_v2(project_dir, venv_python, use_mypyc=False):
@@ -517,7 +481,6 @@ def main(*,
          install: bool = False,
          skip_building_wheel: bool = False,
          linking_install_binary: bool = False,
-         binding_type: str = "nanobind",
          benchmarks: bool = False,
          micro_benchmarks: bool = False,
          nvtx: bool = False,
@@ -621,39 +584,6 @@ def main(*,
     clear_folder(build_dir)  # Keep the folder in case it is mounted.
     build_dir.mkdir(parents=True, exist_ok=True)

-    def get_binding_type_from_cache():
-        cmake_cache_file = build_dir / "CMakeCache.txt"
-        if not cmake_cache_file.exists():
-            return None
-
-        with open(cmake_cache_file, 'r') as f:
-            for line in f:
-                if line.startswith("BINDING_TYPE:STRING="):
-                    cashed_binding_type = line.split("=", 1)[1].strip()
-                    if cashed_binding_type in ['pybind', 'nanobind']:
-                        return cashed_binding_type
-        return None
-
-    cached_binding_type = get_binding_type_from_cache()
-
-    if not first_build and cached_binding_type != binding_type:
-        # Clean up of previous binding build artifacts
-        nanobind_dir = build_dir / "tensorrt_llm" / "nanobind"
-        if nanobind_dir.exists():
-            rmtree(nanobind_dir)
-        nanobind_stub_dir = project_dir / "tensorrt_llm" / "bindings"
-        if nanobind_stub_dir.exists():
-            rmtree(nanobind_stub_dir)
-
-        pybind_dir = build_dir / "tensorrt_llm" / "pybind"
-        if pybind_dir.exists():
-            rmtree(pybind_dir)
-        pybind_stub_dir = project_dir / "tensorrt_llm" / "bindings"
-        if pybind_stub_dir.exists():
-            rmtree(pybind_stub_dir)
-
-        configure_cmake = True
-
     if use_ccache:
         cmake_def_args.append(
             f"-DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache"
@@ -716,7 +646,7 @@ def main(*,
     )
     cmake_def_args = " ".join(cmake_def_args)
     cmake_configure_command = (
-        f'cmake -DCMAKE_BUILD_TYPE="{build_type}" -DBUILD_PYT="{build_pyt}" -DBINDING_TYPE="{binding_type}" -DBUILD_DEEP_EP="{build_deep_ep}" -DBUILD_DEEP_GEMM="{build_deep_gemm}" -DBUILD_FLASH_MLA="{build_flash_mla}"'
+        f'cmake -DCMAKE_BUILD_TYPE="{build_type}" -DBUILD_PYT="{build_pyt}" -DBUILD_DEEP_EP="{build_deep_ep}" -DBUILD_DEEP_GEMM="{build_deep_gemm}" -DBUILD_FLASH_MLA="{build_flash_mla}"'
         f' -DNVTX_DISABLE="{disable_nvtx}" -DBUILD_MICRO_BENCHMARKS={build_micro_benchmarks}'
         f' -DBUILD_WHEEL_TARGETS="{";".join(targets)}"'
         f' -DPython_EXECUTABLE={venv_python} -DPython3_EXECUTABLE={venv_python}'
@@ -947,7 +877,7 @@ def main(*,
         ) == 1, f"Exactly one binding library should be present: {binding_lib}"
         return binding_lib[0]

-    binding_lib_dir = get_binding_lib(binding_type, "bindings")
+    binding_lib_dir = get_binding_lib("nanobind", "bindings")
     binding_lib_file_name = binding_lib_dir.name
     install_file(binding_lib_dir, pkg_dir)

@@ -995,12 +925,10 @@ def main(*,
     if not skip_stubs:
         with working_directory(pkg_dir):
             if on_windows:
-                generate_python_stubs_windows(binding_type, venv_python,
-                                              pkg_dir, lib_dir)
+                generate_python_stubs_windows(venv_python, pkg_dir, lib_dir)
             else:  # on linux
                 generate_python_stubs_linux(
-                    binding_type, venv_python,
-                    bool(deep_ep_cuda_architectures),
+                    venv_python, bool(deep_ep_cuda_architectures),
                     bool(flash_mla_cuda_architectures),
                     nixl_root is not None or mooncake_root is not None,
                     binding_lib_file_name)
@@ -1155,10 +1083,6 @@ def add_arguments(parser: ArgumentParser):
         help=
         "Install the built binary by creating symbolic links instead of copying files"
     )
-    parser.add_argument("--binding_type",
-                        choices=["pybind", "nanobind"],
-                        default="nanobind",
-                        help="Which binding library to use: pybind or nanobind")
     parser.add_argument("--benchmarks",
                         action="store_true",
                         help="Build the benchmarks for the C++ runtime")
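For context on the flag removed above, a hedged sketch of the practical effect on the wheel build; it assumes the argparse code shown here belongs to the build_wheel entry point invoked in the CI configs earlier in this commit (the script's path is not shown in this diff).

# Before this commit, the CI configs above selected the pybind11 backend explicitly:
#     python3 <build_wheel script> --binding_type pybind ...
# After it, nanobind is the only backend and the flag no longer exists:
#     python3 <build_wheel script> ...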
@@ -279,19 +279,3 @@ l0_a10:
       backend: fmha
   tests:
   - test_fmha.py::test_fmha TIMEOUT (90)
-l0_a10_pybind:
-- condition:
-    ranges:
-      system_gpu_count:
-        gte: 1
-        lte: 1
-    wildcards:
-      gpu:
-      - '*a10*'
-      linux_distribution_name: ubuntu*
-    terms:
-      stage: pre_merge
-  tests:
-  - unittest/bindings
-  - test_e2e.py::test_openai_chat_example[trt]
-  - test_e2e.py::test_openai_chat_example[pytorch] TIMEOUT (90)