From a139eae42566d8ecdefcf37f2a8d99a80cd9815d Mon Sep 17 00:00:00 2001 From: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> Date: Fri, 11 Apr 2025 15:07:50 +0800 Subject: [PATCH] chore: Stabilize ABI boundary for internal kernel library (#3117) chore: Stabilize ABI boundary for internal kernel library Signed-off-by: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> --- .gitattributes | 1 + cpp/CMakeLists.txt | 30 -- cpp/include/tensorrt_llm/common/assert.h | 2 +- cpp/include/tensorrt_llm/common/cudaUtils.h | 8 +- cpp/include/tensorrt_llm/common/stringUtils.h | 33 +- .../tensorrt_llm/common/tllmException.h | 4 +- .../tensorrt_llm/runtime/ipcNvlsMemory.h | 15 +- cpp/tensorrt_llm/CMakeLists.txt | 305 ++++-------------- cpp/tensorrt_llm/common/cudaDriverWrapper.h | 4 +- cpp/tensorrt_llm/common/stringUtils.cpp | 40 ++- cpp/tensorrt_llm/common/tllmException.cpp | 9 +- .../ucx_utils/CMakeLists.txt | 19 +- .../decoderXQAImplJIT/compileEngine.cpp | 3 +- .../libtensorrt_llm_nvrtc_wrapper.so | 3 - .../tensorrt_llm_nvrtc_wrapper.tar.xz | 3 + .../aarch64-linux-gnu/version.txt | 4 +- .../libtensorrt_llm_nvrtc_wrapper.so | 3 - .../tensorrt_llm_nvrtc_wrapper.tar.xz | 3 + .../nvrtcWrapper/x86_64-linux-gnu/version.txt | 4 +- ...orrt_llm_internal_cutlass_kernels_static.a | 3 - ...nternal_cutlass_kernels_static.pre_cxx11.a | 3 - ...llm_internal_cutlass_kernels_static.tar.xz | 3 + .../aarch64-linux-gnu/version.txt | 5 +- .../include/moe_kernels.h | 9 +- ...orrt_llm_internal_cutlass_kernels_static.a | 3 - ...nternal_cutlass_kernels_static.pre_cxx11.a | 3 - ...llm_internal_cutlass_kernels_static.tar.xz | 3 + .../x86_64-linux-gnu/version.txt | 5 +- cpp/tensorrt_llm/kernels/lora/lora.cpp | 7 + cpp/tensorrt_llm/kernels/lora/lora.h | 4 + .../kernels/multiHeadAttentionCommon.h | 2 + cpp/tensorrt_llm/pybind/bindings.cpp | 2 +- cpp/tensorrt_llm/runtime/ipcNvlsMemory.cpp | 31 +- cpp/tensorrt_llm/runtime/tllmBuffers.h | 24 +- cpp/tests/CMakeLists.txt | 12 +- cpp/tests/unit_tests/CMakeLists.txt | 12 +- 36 files changed, 217 insertions(+), 407 deletions(-) delete mode 100755 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so create mode 100644 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/tensorrt_llm_nvrtc_wrapper.tar.xz delete mode 100644 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so create mode 100644 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/tensorrt_llm_nvrtc_wrapper.tar.xz delete mode 100644 cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a delete mode 100644 cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a create mode 100644 cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz delete mode 100644 cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a delete mode 100644 cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a create mode 100644 
cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz

diff --git a/.gitattributes b/.gitattributes
index b3f53c5ae7..c57ee78d42 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -2,3 +2,4 @@
 *.lib filter=lfs diff=lfs merge=lfs -text
 *.so filter=lfs diff=lfs merge=lfs -text
 *.dll filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 56be38d9e1..07b7889e31 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -50,36 +50,6 @@ else()
   message(STATUS "NVTX is enabled")
 endif()
 
-if(EXISTS
-   "${CMAKE_CURRENT_SOURCE_DIR}/tensorrt_llm/batch_manager/CMakeLists.txt")
-  set(BUILD_BATCH_MANAGER_DEFAULT ON)
-else()
-  set(BUILD_BATCH_MANAGER_DEFAULT OFF)
-endif()
-
-option(BUILD_BATCH_MANAGER "Build batch manager from source"
-       ${BUILD_BATCH_MANAGER_DEFAULT})
-
-if(BUILD_BATCH_MANAGER)
-  message(STATUS "Building batch manager")
-else()
-  message(STATUS "Importing batch manager")
-endif()
-
-if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/tensorrt_llm/executor/CMakeLists.txt")
-  set(BUILD_EXECUTOR_DEFAULT ON)
-else()
-  set(BUILD_EXECUTOR_DEFAULT OFF)
-endif()
-
-option(BUILD_EXECUTOR "Build executor from source" ${BUILD_EXECUTOR_DEFAULT})
-
-if(BUILD_EXECUTOR)
-  message(STATUS "Building executor")
-else()
-  message(STATUS "Importing executor")
-endif()
-
 if(EXISTS
    "${CMAKE_CURRENT_SOURCE_DIR}/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/CMakeLists.txt"
 )
diff --git a/cpp/include/tensorrt_llm/common/assert.h b/cpp/include/tensorrt_llm/common/assert.h
index 7f51dbf1b4..768bb3e212 100644
--- a/cpp/include/tensorrt_llm/common/assert.h
+++ b/cpp/include/tensorrt_llm/common/assert.h
@@ -25,7 +25,7 @@ namespace tensorrt_llm::common
 {
 [[noreturn]] inline void throwRuntimeError(char const* const file, int const line, std::string const& info = "")
 {
-    throw TllmException(file, line, fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info.c_str()));
+    throw TllmException(file, line, fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info.c_str()).c_str());
 }
 
 } // namespace tensorrt_llm::common
diff --git a/cpp/include/tensorrt_llm/common/cudaUtils.h b/cpp/include/tensorrt_llm/common/cudaUtils.h
index 9a064b459d..95cccae718 100644
--- a/cpp/include/tensorrt_llm/common/cudaUtils.h
+++ b/cpp/include/tensorrt_llm/common/cudaUtils.h
@@ -125,8 +125,8 @@ void check(T ptr, char const* const func, char const* const file, int const line
 {
     if (ptr)
     {
-        throw TllmException(
-            file, line, fmtstr("[TensorRT-LLM][ERROR] CUDA runtime error in %s: %s", func, _cudaGetErrorEnum(ptr)));
+        throw TllmException(file, line,
+            fmtstr("[TensorRT-LLM][ERROR] CUDA runtime error in %s: %s", func, _cudaGetErrorEnum(ptr)).c_str());
     }
 }
 
@@ -136,8 +136,8 @@ void checkEx(
 {
     if (std::all_of(std::begin(validReturns), std::end(validReturns), [&ptr](T const& t) { return t != ptr; }))
     {
-        throw TllmException(
-            file, line, fmtstr("[TensorRT-LLM][ERROR] CUDA runtime error in %s: %s", func, _cudaGetErrorEnum(ptr)));
+        throw TllmException(file, line,
+            fmtstr("[TensorRT-LLM][ERROR] CUDA runtime error in %s: %s", func, _cudaGetErrorEnum(ptr)).c_str());
     }
 }
 
diff --git a/cpp/include/tensorrt_llm/common/stringUtils.h b/cpp/include/tensorrt_llm/common/stringUtils.h
index 107a918738..e9777c173c 100644
--- a/cpp/include/tensorrt_llm/common/stringUtils.h
+++ b/cpp/include/tensorrt_llm/common/stringUtils.h
@@ -21,6 +21,7 @@
 #endif // ENABLE_BF16
 
 #include <cuda_fp16.h>
+#include <cstdarg>
 #include <memory>  // std::make_unique
 #include <sstream> // std::stringstream
 #include <string>
@@ -101,12 +102,40 @@ inline std::string fmtstr(std::string&& s)
     return s;
 }
 
+typedef char* (*fmtstr_allocator)(void* target, size_t count);
+void fmtstr_(char const* format, fmtstr_allocator alloc, void* target, va_list args);
+
 #if defined(_MSC_VER)
-std::string fmtstr(char const* format, ...);
+inline std::string fmtstr(char const* format, ...);
 #else
-std::string fmtstr(char const* format, ...) __attribute__((format(printf, 1, 2)));
+inline std::string fmtstr(char const* format, ...) __attribute__((format(printf, 1, 2)));
 #endif
 
+inline std::string fmtstr(char const* format, ...)
+{
+    std::string result;
+
+    va_list args;
+    va_start(args, format);
+    fmtstr_(
+        format,
+        [](void* target, size_t count) -> char*
+        {
+            if (count <= 0)
+            {
+                return nullptr;
+            }
+
+            const auto str = static_cast<std::string*>(target);
+            str->resize(count);
+            return str->data();
+        },
+        &result, args);
+    va_end(args);
+
+    return result;
+}
+
 // __PRETTY_FUNCTION__ is used for neat debugging printing but is not supported on Windows
 // The alternative is __FUNCSIG__, which is similar but not identical
 #if defined(_WIN32)
diff --git a/cpp/include/tensorrt_llm/common/tllmException.h b/cpp/include/tensorrt_llm/common/tllmException.h
index 47e0e63d3f..15a1a77019 100644
--- a/cpp/include/tensorrt_llm/common/tllmException.h
+++ b/cpp/include/tensorrt_llm/common/tllmException.h
@@ -22,7 +22,7 @@
 #include <string>
 
 #define NEW_TLLM_EXCEPTION(...)                                                                                        \
-    tensorrt_llm::common::TllmException(__FILE__, __LINE__, tensorrt_llm::common::fmtstr(__VA_ARGS__))
+    tensorrt_llm::common::TllmException(__FILE__, __LINE__, tensorrt_llm::common::fmtstr(__VA_ARGS__).c_str())
 
 namespace tensorrt_llm::common
 {
@@ -32,7 +32,7 @@ class TllmException : public std::runtime_error
 public:
     static auto constexpr MAX_FRAMES = 128;
 
-    explicit TllmException(char const* file, std::size_t line, std::string const& msg);
+    explicit TllmException(char const* file, std::size_t line, char const* msg);
 
     ~TllmException() noexcept override;
 
diff --git a/cpp/include/tensorrt_llm/runtime/ipcNvlsMemory.h b/cpp/include/tensorrt_llm/runtime/ipcNvlsMemory.h
index d6f2e64e35..8cf2706949 100644
--- a/cpp/include/tensorrt_llm/runtime/ipcNvlsMemory.h
+++ b/cpp/include/tensorrt_llm/runtime/ipcNvlsMemory.h
@@ -24,10 +24,13 @@ namespace tensorrt_llm::runtime
 {
 struct IpcNvlsHandle
 {
+    // Begin internal kernel visible fields
+    // Changes to these fields must sync with ipcNvlsMemory.h in internal kernel repo
     size_t size = 0;
     // Device pointers used by kernels
     uintptr_t uc_ptr = 0;
     uintptr_t mc_ptr = 0;
+    // End internal kernel visible fields
 
     std::vector<uintptr_t> ipc_uc_ptrs;
     // Device pointers
     CUdeviceptr uc_va;
@@ -43,9 +46,9 @@ void MPI_group_barrier(std::set<int> ranks);
 
 bool ipcNvlsSupported();
 
-IpcNvlsHandle ipcNvlsAllocate(size_t size, std::set<int> ranks);
+IpcNvlsHandle* ipcNvlsAllocate(size_t size, std::set<int> ranks);
 
-void ipcNvlsFree(IpcNvlsHandle handle);
+void ipcNvlsFree(IpcNvlsHandle* handle);
 
 template <typename T>
 class DeviceAllocationNvls
@@ -68,19 +71,19 @@ public:
     // Return device pointer to multicast memory
     [[nodiscard]] T* getMulticastPointer() const
     {
-        return reinterpret_cast<T*>(_handle.mc_ptr);
+        return reinterpret_cast<T*>(_handle->mc_ptr);
     }
 
     // Return device pointer for current rank
     [[nodiscard]] T* getUnicastPointer() const
     {
-        return reinterpret_cast<T*>(_handle.uc_ptr);
+        return reinterpret_cast<T*>(_handle->uc_ptr);
     }
 
     // Return host list of device pointers to memory on each rank
     [[nodiscard]] T** getIpcUnicastPointers()
     {
-        return reinterpret_cast<T**>(_handle.ipc_uc_ptrs.data());
+        return reinterpret_cast<T**>(_handle->ipc_uc_ptrs.data());
     }
 
     [[nodiscard]] size_t getCapacity() const
@@ -99,6 +102,6 @@ public:
 
 private:
     size_t _capacity = 0;
-    IpcNvlsHandle _handle;
+    IpcNvlsHandle* _handle;
 };
 } // namespace tensorrt_llm::runtime
diff --git a/cpp/tensorrt_llm/CMakeLists.txt b/cpp/tensorrt_llm/CMakeLists.txt
index 7b5f7114dc..f35006c5a2 100644
--- a/cpp/tensorrt_llm/CMakeLists.txt
+++ b/cpp/tensorrt_llm/CMakeLists.txt
@@ -85,121 +85,12 @@ endif()
 
 set(BATCH_MANAGER_TARGET tensorrt_llm_batch_manager_static)
 set(BATCH_MANAGER_TARGET_ARCH ${TARGET_ARCH})
-
-if(BUILD_BATCH_MANAGER)
-  add_subdirectory(batch_manager)
-else()
-  add_library(${BATCH_MANAGER_TARGET} STATIC IMPORTED)
-  if(NOT WIN32) # Linux
-    if(USE_CXX11_ABI)
-      set(BATCH_MANAGER_LIB_LOC
-          "${CMAKE_CURRENT_SOURCE_DIR}/batch_manager/${BATCH_MANAGER_TARGET_ARCH}/libtensorrt_llm_batch_manager_static.a"
-      )
-    else()
-      set(BATCH_MANAGER_LIB_LOC
-          "${CMAKE_CURRENT_SOURCE_DIR}/batch_manager/${BATCH_MANAGER_TARGET_ARCH}/libtensorrt_llm_batch_manager_static.pre_cxx11.a"
-      )
-    endif()
-  else() # Windows
-    set(BATCH_MANAGER_LIB_LOC
-        "${CMAKE_CURRENT_SOURCE_DIR}/batch_manager/${BATCH_MANAGER_TARGET_ARCH}/tensorrt_llm_batch_manager_static.lib"
-    )
-  endif()
-  set_property(TARGET ${BATCH_MANAGER_TARGET} PROPERTY IMPORTED_LOCATION
-                                                       ${BATCH_MANAGER_LIB_LOC})
-  file(SIZE ${BATCH_MANAGER_LIB_LOC} BATCH_MANAGER_LIB_SIZE)
-  if(BATCH_MANAGER_LIB_SIZE LESS 1024)
-    message(
-      FATAL_ERROR
-        "The batch manager library is truncated or incomplete. This is usually caused by using Git LFS (Large File Storage) incorrectly. Please try running command `git lfs install && git lfs pull`."
-    )
-  endif()
-endif()
+add_subdirectory(batch_manager)
 
 set(EXECUTOR_TARGET tensorrt_llm_executor_static)
 set(EXECUTOR_TARGET_ARCH ${TARGET_ARCH})
 
 set(UCX_WRAPPER_TARGET tensorrt_llm_ucx_wrapper)
-
-if(BUILD_EXECUTOR)
-  add_subdirectory(executor)
-else()
-  add_library(${EXECUTOR_TARGET} STATIC IMPORTED)
-  if(NOT WIN32) # Linux
-    if(USE_CXX11_ABI)
-      set(EXECUTOR_LIB_LOC
-          "${CMAKE_CURRENT_SOURCE_DIR}/executor/${EXECUTOR_TARGET_ARCH}/libtensorrt_llm_executor_static.a"
-      )
-    else()
-      set(EXECUTOR_LIB_LOC
-          "${CMAKE_CURRENT_SOURCE_DIR}/executor/${EXECUTOR_TARGET_ARCH}/libtensorrt_llm_executor_static.pre_cxx11.a"
-      )
-    endif()
-  else() # Windows
-    set(EXECUTOR_LIB_LOC
-        "${CMAKE_CURRENT_SOURCE_DIR}/executor/${EXECUTOR_TARGET_ARCH}/tensorrt_llm_executor_static.lib"
-    )
-  endif()
-  set_property(TARGET ${EXECUTOR_TARGET} PROPERTY IMPORTED_LOCATION
-                                                  ${EXECUTOR_LIB_LOC})
-  file(SIZE ${EXECUTOR_LIB_LOC} EXECUTOR_LIB_SIZE)
-  if(EXECUTOR_LIB_SIZE LESS 1024)
-    message(
-      FATAL_ERROR
-        "The executor library is truncated or incomplete. This is usually caused by using Git LFS (Large File Storage) incorrectly. Please try running command `git lfs install && git lfs pull`."
- ) - endif() - - if(ENABLE_UCX) - add_library(${UCX_WRAPPER_TARGET} SHARED IMPORTED) - if(NOT WIN32) # Linux - set(UCX_WRAPPER_LIB_SOURCE_REL_LOC - "executor/cache_transmission/ucx_utils/${EXECUTOR_TARGET_ARCH}/libtensorrt_llm_ucx_wrapper.so" - ) - set(UCX_WRAPPER_LIB_BINARY_REL_LOC - "executor/cache_transmission/ucx_utils/libtensorrt_llm_ucx_wrapper.so" - ) - else() - set(UCX_WRAPPER_LIB_BINARY_REL_DIR - "executor/cache_transmission/ucx_utils/") - set(UCX_WRAPPER_DLL_NAME "tensorrt_llm_ucx_wrapper.dll") - set(UCX_WRAPPER_LIB_NAME "tensorrt_llm_ucx_wrapper.lib") - - set(UCX_WRAPPER_LIB_SOURCE_REL_LOC - "${UCX_WRAPPER_LIB_BINARY_REL_DIR}/${EXECUTOR_TARGET_ARCH}/${UCX_WRAPPER_DLL_NAME}" - ) - set(UCX_WRAPPER_LIB_BINARY_REL_LOC - "${UCX_WRAPPER_LIB_BINARY_REL_DIR}/${UCX_WRAPPER_DLL_NAME}") - set(UCX_WRAPPER_IMPLIB_SOURCE_REL_LOC - "${UCX_WRAPPER_LIB_BINARY_REL_DIR}/${EXECUTOR_TARGET_ARCH}/${UCX_WRAPPER_LIB_NAME}" - ) - set(UCX_WRAPPER_IMPLIB_BINARY_REL_LOC - "${UCX_WRAPPER_LIB_BINARY_REL_DIR}/${UCX_WRAPPER_LIB_NAME}") - endif() - set(UCX_WRAPPER_LIB_LOC - "${CMAKE_CURRENT_SOURCE_DIR}/${UCX_WRAPPER_LIB_SOURCE_REL_LOC}") - # Copy the .so to build directory, which is needed in build_wheel.py. - configure_file(${UCX_WRAPPER_LIB_SOURCE_REL_LOC} - ${UCX_WRAPPER_LIB_BINARY_REL_LOC} COPYONLY) - set_property(TARGET ${UCX_WRAPPER_TARGET} PROPERTY IMPORTED_LOCATION - ${UCX_WRAPPER_LIB_LOC}) - if(WIN32) - set(UCX_WRAPPER_IMPLIB_LOC - "${CMAKE_CURRENT_SOURCE_DIR}/${UCX_WRAPPER_IMPLIB_SOURCE_REL_LOC}") - configure_file(${UCX_WRAPPER_IMPLIB_SOURCE_REL_LOC} - ${UCX_WRAPPER_IMPLIB_BINARY_REL_LOC} COPYONLY) - set_property(TARGET ${UCX_WRAPPER_TARGET} - PROPERTY IMPORTED_IMPLIB ${UCX_WRAPPER_IMPLIB_LOC}) - endif() - - file(SIZE ${UCX_WRAPPER_LIB_LOC} UCX_WRAPPER_LIB_SIZE) - if(UCX_WRAPPER_LIB_SIZE LESS 1024) - message( - FATAL_ERROR - "The ucx wrapper library is truncated or incomplete. This is usually caused by using Git LFS (Large File Storage) incorrectly. Please try running command `git lfs install && git lfs pull`." 
-      )
-    endif()
-  endif()
-endif()
+add_subdirectory(executor)
 
 set(INTERNAL_CUTLASS_KERNELS_TARGET
     tensorrt_llm_internal_cutlass_kernels_static)
 
@@ -208,24 +99,33 @@ if(BUILD_INTERNAL_CUTLASS_KERNELS)
   add_subdirectory(kernels/internal_cutlass_kernels)
 else()
   add_library(${INTERNAL_CUTLASS_KERNELS_TARGET} STATIC IMPORTED)
+  set(INTERNAL_CUTLASS_KERNELS_LIB_TARBALL
+      "${CMAKE_CURRENT_SOURCE_DIR}/kernels/internal_cutlass_kernels/${INTERNAL_CUTLASS_KERNELS_TARGET_ARCH}/${INTERNAL_CUTLASS_KERNELS_TARGET}.tar.xz"
+  )
   if(NOT WIN32) # Linux
-    if(USE_CXX11_ABI)
-      set(INTERNAL_CUTLASS_KERNELS_LIB_LOC
-          "${CMAKE_CURRENT_SOURCE_DIR}/kernels/internal_cutlass_kernels/${INTERNAL_CUTLASS_KERNELS_TARGET_ARCH}/libtensorrt_llm_internal_cutlass_kernels_static.a"
-      )
-    else()
-      set(INTERNAL_CUTLASS_KERNELS_LIB_LOC
-          "${CMAKE_CURRENT_SOURCE_DIR}/kernels/internal_cutlass_kernels/${INTERNAL_CUTLASS_KERNELS_TARGET_ARCH}/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a"
-      )
-    endif()
+    set(INTERNAL_CUTLASS_KERNELS_LIB_NAME
+        "lib${INTERNAL_CUTLASS_KERNELS_TARGET}.a")
   else() # Windows
-    set(INTERNAL_CUTLASS_KERNELS_LIB_LOC
-        "${CMAKE_CURRENT_SOURCE_DIR}/kernels/internal_cutlass_kernels/${INTERNAL_CUTLASS_KERNELS_TARGET_ARCH}/tensorrt_llm_internal_cutlass_kernels_static.lib"
-    )
+    set(INTERNAL_CUTLASS_KERNELS_LIB_NAME
+        "${INTERNAL_CUTLASS_KERNELS_TARGET}.lib")
   endif()
+  set(INTERNAL_CUTLASS_KERNELS_LIB_PATH
+      "${CMAKE_CURRENT_BINARY_DIR}/${INTERNAL_CUTLASS_KERNELS_LIB_NAME}")
+  add_custom_command(
+    OUTPUT ${INTERNAL_CUTLASS_KERNELS_LIB_PATH}
+    COMMAND ${CMAKE_COMMAND} -E tar xf ${INTERNAL_CUTLASS_KERNELS_LIB_TARBALL}
+    DEPENDS ${INTERNAL_CUTLASS_KERNELS_LIB_TARBALL}
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    VERBATIM)
+  add_custom_target(${INTERNAL_CUTLASS_KERNELS_TARGET}_helper
+                    DEPENDS ${INTERNAL_CUTLASS_KERNELS_LIB_PATH})
+  add_dependencies(${INTERNAL_CUTLASS_KERNELS_TARGET}
+                   ${INTERNAL_CUTLASS_KERNELS_TARGET}_helper)
   set_property(TARGET ${INTERNAL_CUTLASS_KERNELS_TARGET}
-               PROPERTY IMPORTED_LOCATION ${INTERNAL_CUTLASS_KERNELS_LIB_LOC})
-  file(SIZE ${INTERNAL_CUTLASS_KERNELS_LIB_LOC}
+               PROPERTY IMPORTED_LOCATION ${INTERNAL_CUTLASS_KERNELS_LIB_PATH})
+  target_link_libraries(${INTERNAL_CUTLASS_KERNELS_TARGET}
+                        INTERFACE ${INTERNAL_CUTLASS_KERNELS_LIB_PATH})
+  file(SIZE ${INTERNAL_CUTLASS_KERNELS_LIB_TARBALL}
        INTERNAL_CUTLASS_KERNELS_LIB_SIZE)
   if(INTERNAL_CUTLASS_KERNELS_LIB_SIZE LESS 1024)
     message(
@@ -239,70 +139,6 @@ find_package(Threads REQUIRED)
 target_link_libraries(${BATCH_MANAGER_TARGET} INTERFACE Threads::Threads)
 target_link_libraries(${EXECUTOR_TARGET} INTERFACE Threads::Threads)
 
-if(NOT WIN32)
-  if(USE_CXX11_ABI)
-    add_custom_command(
-      OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol"
-      COMMAND nm -C $<TARGET_FILE:${BATCH_MANAGER_TARGET}> | grep -q
-              'std::__cxx11::'
-      DEPENDS ${BATCH_MANAGER_TARGET})
-  else()
-    add_custom_command(
-      OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol"
-      COMMAND nm -C $<TARGET_FILE:${BATCH_MANAGER_TARGET}> | grep -qv
-              'std::__cxx11::'
-      DEPENDS ${BATCH_MANAGER_TARGET})
-  endif()
-  add_custom_target(check_symbol
-                    DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol")
-else()
-  add_custom_target(check_symbol)
-endif()
-
-if(NOT WIN32)
-  if(USE_CXX11_ABI)
-    add_custom_command(
-      OUTPUT
-        "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol_internal_cutlass_kernels"
-      COMMAND nm -C $<TARGET_FILE:${INTERNAL_CUTLASS_KERNELS_TARGET}> | grep -q
-              'std::__cxx11::'
-      DEPENDS ${INTERNAL_CUTLASS_KERNELS_TARGET})
-  else()
-    add_custom_command(
-      OUTPUT
-        "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol_internal_cutlass_kernels"
-      COMMAND nm -C $<TARGET_FILE:${INTERNAL_CUTLASS_KERNELS_TARGET}> | grep -qv
-              'std::__cxx11::'
-      DEPENDS ${INTERNAL_CUTLASS_KERNELS_TARGET})
-  endif()
-  add_custom_target(
-    check_symbol_internal_cutlass_kernels
-    DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol_internal_cutlass_kernels"
-  )
-else()
-  add_custom_target(check_symbol_internal_cutlass_kernels)
-endif()
-
-if(NOT WIN32)
-  if(USE_CXX11_ABI)
-    add_custom_command(
-      OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol_executor"
-      COMMAND nm -C $<TARGET_FILE:${EXECUTOR_TARGET}> | grep -q 'std::__cxx11::'
-      DEPENDS ${EXECUTOR_TARGET})
-  else()
-    add_custom_command(
-      OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol_executor"
-      COMMAND nm -C $<TARGET_FILE:${EXECUTOR_TARGET}> | grep -qv
-              'std::__cxx11::'
-      DEPENDS ${EXECUTOR_TARGET})
-  endif()
-  add_custom_target(
-    check_symbol_executor
-    DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol_executor")
-else()
-  add_custom_target(check_symbol_executor)
-endif()
-
 set(NVRTC_WRAPPER_TARGET tensorrt_llm_nvrtc_wrapper)
 set(NVRTC_WRAPPER_TARGET_ARCH ${TARGET_ARCH})
 
@@ -311,48 +147,40 @@ if(BUILD_NVRTC_WRAPPER)
     kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper)
 else()
   add_library(${NVRTC_WRAPPER_TARGET} SHARED IMPORTED)
+  set(NVRTC_WRAPPER_LIB_TARBALL
+      "${CMAKE_CURRENT_SOURCE_DIR}/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/${NVRTC_WRAPPER_TARGET_ARCH}/${NVRTC_WRAPPER_TARGET}.tar.xz"
+  )
+  set(NVRTC_WRAPPER_LIB_BINARY_DIR
+      "${CMAKE_CURRENT_BINARY_DIR}/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper"
+  )
   if(NOT WIN32) # Linux
-    set(NVRTC_WRAPPER_LIB_SOURCE_REL_LOC
-        "kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/${NVRTC_WRAPPER_TARGET_ARCH}/libtensorrt_llm_nvrtc_wrapper.so"
-    )
-    set(NVRTC_WRAPPER_LIB_BINARY_REL_LOC
-        "kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/libtensorrt_llm_nvrtc_wrapper.so"
-    )
-  else()
-    set(NVRTC_WRAPPER_LIB_BINARY_REL_DIR
-        "kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper"
-    )
-    set(NVRTC_WRAPPER_DLL_NAME "tensorrt_llm_nvrtc_wrapper.dll")
-    set(NVRTC_WRAPPER_LIB_NAME "tensorrt_llm_nvrtc_wrapper.lib")
-
-    set(NVRTC_WRAPPER_LIB_SOURCE_REL_LOC
-        "${NVRTC_WRAPPER_LIB_BINARY_REL_DIR}/${NVRTC_WRAPPER_TARGET_ARCH}/${NVRTC_WRAPPER_DLL_NAME}"
-    )
-    set(NVRTC_WRAPPER_LIB_BINARY_REL_LOC
-        "${NVRTC_WRAPPER_LIB_BINARY_REL_DIR}/${NVRTC_WRAPPER_DLL_NAME}")
-    set(NVRTC_WRAPPER_IMPLIB_SOURCE_REL_LOC
-        "${NVRTC_WRAPPER_LIB_BINARY_REL_DIR}/${NVRTC_WRAPPER_TARGET_ARCH}/${NVRTC_WRAPPER_LIB_NAME}"
-    )
-    set(NVRTC_WRAPPER_IMPLIB_BINARY_REL_LOC
-        "${NVRTC_WRAPPER_LIB_BINARY_REL_DIR}/${NVRTC_WRAPPER_LIB_NAME}")
+    set(NVRTC_WRAPPER_LIB_NAME "lib${NVRTC_WRAPPER_TARGET}.so")
+  else() # Windows
+    set(NVRTC_WRAPPER_LIB_NAME "${NVRTC_WRAPPER_TARGET}.lib")
+    set(NVRTC_WRAPPER_DLL_NAME "${NVRTC_WRAPPER_TARGET}.dll")
+    set(NVRTC_WRAPPER_DLL_PATH
+        "${NVRTC_WRAPPER_LIB_BINARY_DIR}/${NVRTC_WRAPPER_DLL_NAME}")
   endif()
-  set(NVRTC_WRAPPER_LIB_LOC
-      "${CMAKE_CURRENT_SOURCE_DIR}/${NVRTC_WRAPPER_LIB_SOURCE_REL_LOC}")
-  # Copy the .so to build directory, which is needed in build_wheel.py.
-  configure_file(${NVRTC_WRAPPER_LIB_SOURCE_REL_LOC}
-                 ${NVRTC_WRAPPER_LIB_BINARY_REL_LOC} COPYONLY)
-  set_property(TARGET ${NVRTC_WRAPPER_TARGET} PROPERTY IMPORTED_LOCATION
-                                                       ${NVRTC_WRAPPER_LIB_LOC})
+  set(NVRTC_WRAPPER_LIB_PATH
+      "${NVRTC_WRAPPER_LIB_BINARY_DIR}/${NVRTC_WRAPPER_LIB_NAME}")
+  add_custom_command(
+    OUTPUT ${NVRTC_WRAPPER_LIB_PATH} ${NVRTC_WRAPPER_DLL_PATH}
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${NVRTC_WRAPPER_LIB_BINARY_DIR}
+    COMMAND ${CMAKE_COMMAND} -E chdir ${NVRTC_WRAPPER_LIB_BINARY_DIR}
+            ${CMAKE_COMMAND} -E tar xf ${NVRTC_WRAPPER_LIB_TARBALL}
+    DEPENDS ${NVRTC_WRAPPER_LIB_TARBALL}
+    VERBATIM)
+  add_custom_target(${NVRTC_WRAPPER_TARGET}_helper
+                    DEPENDS ${NVRTC_WRAPPER_LIB_PATH} ${NVRTC_WRAPPER_DLL_PATH})
+  add_dependencies(${NVRTC_WRAPPER_TARGET} ${NVRTC_WRAPPER_TARGET}_helper)
+  set_property(TARGET ${NVRTC_WRAPPER_TARGET}
+               PROPERTY IMPORTED_LOCATION ${NVRTC_WRAPPER_LIB_PATH})
   if(WIN32)
-    set(NVRTC_WRAPPER_IMPLIB_LOC
-        "${CMAKE_CURRENT_SOURCE_DIR}/${NVRTC_WRAPPER_IMPLIB_SOURCE_REL_LOC}")
-    configure_file(${NVRTC_WRAPPER_IMPLIB_SOURCE_REL_LOC}
-                   ${NVRTC_WRAPPER_IMPLIB_BINARY_REL_LOC} COPYONLY)
     set_property(TARGET ${NVRTC_WRAPPER_TARGET}
-                 PROPERTY IMPORTED_IMPLIB ${NVRTC_WRAPPER_IMPLIB_LOC})
+                 PROPERTY IMPORTED_IMPLIB ${NVRTC_WRAPPER_DLL_PATH})
   endif()
-  file(SIZE ${NVRTC_WRAPPER_LIB_LOC} NVRTC_WRAPPER_LIB_SIZE)
+  file(SIZE ${NVRTC_WRAPPER_LIB_TARBALL} NVRTC_WRAPPER_LIB_SIZE)
   if(NVRTC_WRAPPER_LIB_SIZE LESS 1024)
     message(
       FATAL_ERROR
@@ -414,25 +242,14 @@ set_target_properties(
   PROPERTIES CXX_STANDARD "17" CXX_STANDARD_REQUIRED "YES" CXX_EXTENSIONS "NO"
              LINK_FLAGS "${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}")
 
-function(link_whole_archive TARGET LIBRARY_TO_LINK)
-  if(WIN32)
-    target_link_libraries(${TARGET} PUBLIC $<TARGET_FILE:${LIBRARY_TO_LINK}>)
-    set_target_properties(
-      ${TARGET} PROPERTIES LINK_FLAGS "/WHOLEARCHIVE:${LIBRARY_TO_LINK}")
-  else()
-    # Assume everything else is like gcc
-    target_link_libraries(
-      ${TARGET} PRIVATE "-Wl,--whole-archive" $<TARGET_FILE:${LIBRARY_TO_LINK}>
-      "-Wl,--no-whole-archive")
-  endif()
-endfunction()
-
 target_link_libraries(${SHARED_TARGET} PUBLIC ${TRTLLM_LINK_LIBS})
 
-link_whole_archive(${SHARED_TARGET} ${BATCH_MANAGER_TARGET})
-link_whole_archive(${SHARED_TARGET} ${EXECUTOR_TARGET})
-link_whole_archive(${SHARED_TARGET} fp8_blockscale_gemm_src)
-link_whole_archive(${SHARED_TARGET} ${INTERNAL_CUTLASS_KERNELS_TARGET})
+target_link_libraries(
+  ${SHARED_TARGET}
+  PRIVATE $<LINK_LIBRARY:WHOLE_ARCHIVE,${BATCH_MANAGER_TARGET}>
+          $<LINK_LIBRARY:WHOLE_ARCHIVE,${EXECUTOR_TARGET}>
+          $<LINK_LIBRARY:WHOLE_ARCHIVE,fp8_blockscale_gemm_src>
+          $<LINK_LIBRARY:WHOLE_ARCHIVE,${INTERNAL_CUTLASS_KERNELS_TARGET}>)
 
 # Link kernel_src and cutlass_src. static internal cutlass lib overridden.
 target_link_libraries(${SHARED_TARGET} PUBLIC kernels_src cutlass_src)
 
@@ -458,10 +275,6 @@ endif()
 
 target_link_libraries(${SHARED_TARGET} PUBLIC ${NVRTC_WRAPPER_TARGET})
 
-add_dependencies(${SHARED_TARGET} check_symbol)
-add_dependencies(${SHARED_TARGET} check_symbol_executor)
-add_dependencies(${SHARED_TARGET} check_symbol_internal_cutlass_kernels)
-
 if(BUILD_PYT)
   add_subdirectory(thop)
 endif()
diff --git a/cpp/tensorrt_llm/common/cudaDriverWrapper.h b/cpp/tensorrt_llm/common/cudaDriverWrapper.h
index affad6634a..3f61fed5ed 100644
--- a/cpp/tensorrt_llm/common/cudaDriverWrapper.h
+++ b/cpp/tensorrt_llm/common/cudaDriverWrapper.h
@@ -126,8 +126,8 @@ void checkDriver(
     char const* errorString = nullptr;
     wrap.cuGetErrorName(result, &errorName);
     wrap.cuGetErrorString(result, &errorString);
-    throw TllmException(
-        file, line, fmtstr("[TensorRT-LLM][ERROR] CUDA driver error in %s: %s: %s.", func, errorName, errorString));
+    throw TllmException(file, line,
+        fmtstr("[TensorRT-LLM][ERROR] CUDA driver error in %s: %s: %s.", func, errorName, errorString).c_str());
 }
 
diff --git a/cpp/tensorrt_llm/common/stringUtils.cpp b/cpp/tensorrt_llm/common/stringUtils.cpp
index f1c6f88b43..75052ad4fa 100644
--- a/cpp/tensorrt_llm/common/stringUtils.cpp
+++ b/cpp/tensorrt_llm/common/stringUtils.cpp
@@ -26,35 +26,33 @@
 namespace tensorrt_llm::common
 {
 
-namespace
-{
-std::string vformat(char const* fmt, va_list args)
+void fmtstr_(char const* format, fmtstr_allocator alloc, void* target, va_list args)
 {
     va_list args0;
     va_copy(args0, args);
-    auto const size = vsnprintf(nullptr, 0, fmt, args0);
-    if (size <= 0)
-        return "";
-
-    std::string stringBuf(size, char{});
-    auto const size2 = std::vsnprintf(&stringBuf[0], size + 1, fmt, args);
+    size_t constexpr init_size = 2048;
+    char fixed_buffer[init_size];
+    auto const size = std::vsnprintf(fixed_buffer, init_size, format, args0);
+    TLLM_CHECK_WITH_INFO(size >= 0, std::string(std::strerror(errno)));
+    if (size == 0)
+    {
+        return;
+    }
 
-    TLLM_CHECK_WITH_INFO(size2 == size, std::string(std::strerror(errno)));
+    auto* memory = alloc(target, size);
 
-    return stringBuf;
+    if (static_cast<size_t>(size) < init_size)
+    {
+        std::memcpy(memory, fixed_buffer, size + 1);
+    }
+    else
+    {
+        auto const size2 = std::vsnprintf(memory, size + 1, format, args);
+        TLLM_CHECK_WITH_INFO(size2 == size, std::string(std::strerror(errno)));
+    }
 }
 
-} // namespace
-
-std::string fmtstr(char const* format, ...)
-{
-    va_list args;
-    va_start(args, format);
-    std::string result = vformat(format, args);
-    va_end(args);
-    return result;
-};
-
 std::unordered_set<std::string> str2set(std::string const& input, char delimiter)
 {
     std::unordered_set<std::string> values;
diff --git a/cpp/tensorrt_llm/common/tllmException.cpp b/cpp/tensorrt_llm/common/tllmException.cpp
index b410613d05..00fb465b02 100644
--- a/cpp/tensorrt_llm/common/tllmException.cpp
+++ b/cpp/tensorrt_llm/common/tllmException.cpp
@@ -35,18 +35,17 @@ int constexpr VOID_PTR_SZ = 2 + sizeof(void*) * 2;
 
 #if !defined(_MSC_VER)
 
-TllmException::TllmException(char const* file, std::size_t line, std::string const& msg)
+TllmException::TllmException(char const* file, std::size_t line, char const* msg)
     : std::runtime_error{""}
 {
     mNbFrames = backtrace(mCallstack.data(), MAX_FRAMES);
     auto const trace = getTrace();
-    std::runtime_error::operator=(
-        std::runtime_error{fmtstr("%s (%s:%zu)\n%s", msg.c_str(), file, line, trace.c_str())});
+    std::runtime_error::operator=(std::runtime_error{fmtstr("%s (%s:%zu)\n%s", msg, file, line, trace.c_str())});
 }
 
 #else
-TllmException::TllmException(char const* file, std::size_t line, std::string const& msg)
+TllmException::TllmException(char const* file, std::size_t line, char const* msg)
     : mNbFrames{}
-    , std::runtime_error{fmtstr("%s (%s:%zu)", msg.c_str(), file, line)}
+    , std::runtime_error{fmtstr("%s (%s:%zu)", msg, file, line)}
 {
 }
 #endif
diff --git a/cpp/tensorrt_llm/executor/cache_transmission/ucx_utils/CMakeLists.txt b/cpp/tensorrt_llm/executor/cache_transmission/ucx_utils/CMakeLists.txt
index 79e3c0ca83..cd2095dd8d 100644
--- a/cpp/tensorrt_llm/executor/cache_transmission/ucx_utils/CMakeLists.txt
+++ b/cpp/tensorrt_llm/executor/cache_transmission/ucx_utils/CMakeLists.txt
@@ -16,23 +16,8 @@ if(ENABLE_UCX)
   set(TOP_LEVEL_DIR "${PROJECT_SOURCE_DIR}/..")
   target_compile_definitions(${UCX_WRAPPER_TARGET}
                              PUBLIC TOP_LEVEL_DIR="${TOP_LEVEL_DIR}")
-
-  target_include_directories(
-    ${UCX_WRAPPER_TARGET}
-    PRIVATE $<TARGET_PROPERTY:${EXECUTOR_TARGET},INCLUDE_DIRECTORIES>)
-  # link_whole_archive
-  if(WIN32)
-    target_link_libraries(${UCX_WRAPPER_TARGET}
-                          PUBLIC $<TARGET_FILE:ucxx::ucxx>)
-    set_target_properties(${UCX_WRAPPER_TARGET}
-                          PROPERTIES LINK_FLAGS "/WHOLEARCHIVE:ucxx::ucxx")
-  else()
-    # Assume everything else is like gcc
-    target_link_libraries(
-      ${UCX_WRAPPER_TARGET}
-      PRIVATE "-Wl,--whole-archive" $<TARGET_FILE:ucxx::ucxx>
-      "-Wl,--no-whole-archive")
-  endif()
+  target_link_libraries(${UCX_WRAPPER_TARGET}
+                        PRIVATE $<LINK_LIBRARY:WHOLE_ARCHIVE,ucxx::ucxx>)
   target_link_libraries(${UCX_WRAPPER_TARGET} PUBLIC ucxx::ucxx ucx::ucs)
   target_link_libraries(${UCX_WRAPPER_TARGET} PUBLIC ${CUDA_RT_LIB})
 endif()
diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp
index c6132d5b05..bc727ce9cf 100644
--- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp
+++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp
@@ -35,7 +35,8 @@ void CHECK_TLLM_XQA_JIT_ERROR_(tllmXqaJitStatus result, char const* const func,
     std::vector<char> log(tllmXqaJitGetLastErrorStringSize());
     tllmXqaJitGetLastErrorString(log.data());
     throw tensorrt_llm::common::TllmException(file, line,
-        tensorrt_llm::common::fmtstr("[TensorRT-LLM][ERROR] TllmXqaJit runtime error in %s: %s", func, log.data()));
+        tensorrt_llm::common::fmtstr("[TensorRT-LLM][ERROR] TllmXqaJit runtime error in %s: %s", func, log.data())
+            .c_str());
 }
 }
diff --git
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so deleted file mode 100755 index 55979661e2..0000000000 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5ad6be58302fad71488246c4dea6f96d710143988a195d67b304ea251bd0aa89 -size 126817632 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/tensorrt_llm_nvrtc_wrapper.tar.xz b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/tensorrt_llm_nvrtc_wrapper.tar.xz new file mode 100644 index 0000000000..f8290b232d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/tensorrt_llm_nvrtc_wrapper.tar.xz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8702e9bf2ad0e50a86f9d3a9be52fc70b8fdf5be644d585c69d9560b6fe42dad +size 34773116 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt index c073741f97..82e8e50707 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt @@ -1,2 +1,2 @@ -a8252eee786f39e51f70a4c011588c7d libtensorrt_llm_nvrtc_wrapper.so -commit 705292307acd1546f4f9e2b2fab84350d01d41ab +5ad6be58302fad71488246c4dea6f96d710143988a195d67b304ea251bd0aa89 libtensorrt_llm_nvrtc_wrapper.so +commit 9c24486cb2cd9dd9582b311b84e1b428d29a735a diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so deleted file mode 100644 index f601f23899..0000000000 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:253ba70949732bf3d79c759ba3601516cd5b5b03a121f00c3ce45bbb40aea035 -size 133862752 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/tensorrt_llm_nvrtc_wrapper.tar.xz b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/tensorrt_llm_nvrtc_wrapper.tar.xz new file mode 100644 index 0000000000..9420b6c077 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/tensorrt_llm_nvrtc_wrapper.tar.xz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cddaa17269f699ffa0783ed296cabb7fe71cf61910a2799c71c4e39192fc513 +size 38282412 diff --git 
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt index 2db116e657..da3a200ba1 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt @@ -1,2 +1,2 @@ -1dacb3147d5d47d795a447ee563ee92a libtensorrt_llm_nvrtc_wrapper.so -commit 705292307acd1546f4f9e2b2fab84350d01d41ab +9d1104bbe6b4f258482549ec71c9d1aed0de912b5824dced5cf7829bff66ba0d libtensorrt_llm_nvrtc_wrapper.so +commit 9c24486cb2cd9dd9582b311b84e1b428d29a735a diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a deleted file mode 100644 index d712ed09af..0000000000 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:42d72057eac00d2f82cecb27f7401258c2fe932d51a945f1be4baa4271307acb -size 138648070 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a deleted file mode 100644 index 034b44aece..0000000000 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ef06a1b7cc3e1a2e71a2ce2f4081412eded9e75a236e2c4dda0ed636de8148b8 -size 138563288 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz new file mode 100644 index 0000000000..fbc0b77f60 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e50158a750697d719bbca9d4e18680290c10cb2bc30e5711854c49edb92ce95 +size 45029036 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt index 36a07f3718..089be0dff9 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt @@ -1,3 +1,2 @@ -7ef325eb05b4770773732c0f8bc5748d libtensorrt_llm_internal_cutlass_kernels_static.a -ef25d7af2b5d9824ddc50a1b79db36e8 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a -commit b43c46c83bd0833eae16ea0eae7cef6bee81644c +a357a7193265159ac09d7ddcc47e0445f0f348d8f93e08c5d82c98ed38d3e342 libtensorrt_llm_internal_cutlass_kernels_static.a +commit 9c24486cb2cd9dd9582b311b84e1b428d29a735a diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_kernels.h 
b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_kernels.h
index 2d4ba9f529..ceae9cfdd9 100644
--- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_kernels.h
+++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_kernels.h
@@ -18,13 +18,14 @@
 #pragma once
 #include "cutlass/gemm/gemm.h"
 #include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/cudaUtils.h"
 #include "tensorrt_llm/common/quantization.h"
 #include "tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.h"
 #include "tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_gemm_kernels.h"
-#include "tensorrt_llm/kernels/lora/lora.h"
 #ifdef ENABLE_FP4
 #include <cuda_fp4.h>
 #endif
+#include <cuda_runtime_api.h>
 #include <optional>
 #include <random>
 #include <utility>
@@ -221,6 +222,12 @@ struct QuantParams
     }
 };
 
+// Changes to the following declarations must sync with lora.h in the public repo
+class LoraImpl;
+
+int Lora_run(LoraImpl* impl, int64_t numTokens, int64_t numReqs, void const* input, int32_t const* loraRanks,
+    void const* const* loraWeightsPtr, int weightIndex, void* const* outputs, void* workspace, cudaStream_t stream);
+
 struct LoraParams
 {
     using LoraImplPtr = std::shared_ptr<LoraImpl>;
diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a
deleted file mode 100644
index 4a5c140704..0000000000
--- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:651dceb91cafbe24997cdc71c3f984ab6b18e99dbc2ed2958ca08b2cf4897cc3
-size 135328862
diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
deleted file mode 100644
index 7238d9eac5..0000000000
--- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2cbe6c81bae4a338f6a61f081571abff720c2275058c38ed67c879886056cd98
-size 134434150
diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz
new file mode 100644
index 0000000000..51c0c9f993
--- /dev/null
+++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0255e3a868db94e0bb555f692c087ee73c6b800f907c64fc36e0d3846ffa12f6
+size 44693484
diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt
index 0b78d70c5a..2a9cd12dc1 100644
--- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt
+++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt
@@ -1,3 +1,2 @@
-893ae060cdb1d5a54729afca9d1b9b99 libtensorrt_llm_internal_cutlass_kernels_static.a
-a7ee89c7577bf9bf8d8a9ac8072b810d libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
-commit b43c46c83bd0833eae16ea0eae7cef6bee81644c
+4f6da1c3b64b7cef5841dd7507839e718c5f47fa81f3a8e2e6839a81bda459db libtensorrt_llm_internal_cutlass_kernels_static.a
+commit 9c24486cb2cd9dd9582b311b84e1b428d29a735a
diff --git a/cpp/tensorrt_llm/kernels/lora/lora.cpp b/cpp/tensorrt_llm/kernels/lora/lora.cpp
index 0bd19e266f..67e774f60c 100644
--- a/cpp/tensorrt_llm/kernels/lora/lora.cpp
+++ b/cpp/tensorrt_llm/kernels/lora/lora.cpp
@@ -332,4 +332,11 @@ int LoraImpl::run(int64_t numTokens, int64_t numReqs, void const* input, int32_t
     return 0;
 }
 
+int Lora_run(LoraImpl* impl, int64_t numTokens, int64_t numReqs, void const* input, int32_t const* loraRanks,
+    void const* const* loraWeightsPtr, int weightIndex, void* const* outputs, void* workspace, cudaStream_t stream)
+{
+    TLLM_CHECK_WITH_INFO(impl != nullptr, "Attempt to run an empty LoraImpl");
+    return impl->run(numTokens, numReqs, input, loraRanks, loraWeightsPtr, weightIndex, outputs, workspace, stream);
+}
+
 } // namespace tensorrt_llm::kernels
diff --git a/cpp/tensorrt_llm/kernels/lora/lora.h b/cpp/tensorrt_llm/kernels/lora/lora.h
index acf123f33f..38437b5348 100644
--- a/cpp/tensorrt_llm/kernels/lora/lora.h
+++ b/cpp/tensorrt_llm/kernels/lora/lora.h
@@ -66,4 +66,8 @@ private:
     std::optional mBestConfig;
 };
 
+// Changes to the following declarations must sync with moe_kernels.h in the internal kernel repo
+int Lora_run(LoraImpl* impl, int64_t numTokens, int64_t numReqs, void const* input, int32_t const* loraRanks,
+    void const* const* loraWeightsPtr, int weightIndex, void* const* outputs, void* workspace, cudaStream_t stream);
+
 } // namespace tensorrt_llm::kernels
diff --git a/cpp/tensorrt_llm/kernels/multiHeadAttentionCommon.h b/cpp/tensorrt_llm/kernels/multiHeadAttentionCommon.h
index 75fc695848..4ad5d78f2f 100644
--- a/cpp/tensorrt_llm/kernels/multiHeadAttentionCommon.h
+++ b/cpp/tensorrt_llm/kernels/multiHeadAttentionCommon.h
@@ -24,6 +24,8 @@ namespace tensorrt_llm
 {
 namespace kernels
 {
+
+// Changes to this enum must sync with nvrtcWrapper.cpp in the internal kernel repo
 enum Data_type
 {
     DATA_TYPE_BOOL,
diff --git a/cpp/tensorrt_llm/pybind/bindings.cpp b/cpp/tensorrt_llm/pybind/bindings.cpp
index 98f064502d..775106ee16 100644
--- a/cpp/tensorrt_llm/pybind/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/bindings.cpp
@@ -560,7 +560,7 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m)
         .def("get_ipc_ptrs",
             [](tr::IpcNvlsHandle& self) { return reinterpret_cast<intptr_t>(self.ipc_uc_ptrs.data()); });
 
-    m.def("ipc_nvls_allocate", &tr::ipcNvlsAllocate);
+    m.def("ipc_nvls_allocate", &tr::ipcNvlsAllocate, py::return_value_policy::reference);
     m.def("ipc_nvls_free", &tr::ipcNvlsFree);
     m.def("ipc_nvls_supported", &tr::ipcNvlsSupported);
 }
diff --git a/cpp/tensorrt_llm/runtime/ipcNvlsMemory.cpp b/cpp/tensorrt_llm/runtime/ipcNvlsMemory.cpp
index 6537f7810d..b6ec30594c 100644
--- a/cpp/tensorrt_llm/runtime/ipcNvlsMemory.cpp
+++ b/cpp/tensorrt_llm/runtime/ipcNvlsMemory.cpp
@@ -152,7 +152,7 @@ bool ipcNvlsSupported()
     return true;
 }
 
-IpcNvlsHandle ipcNvlsAllocate(size_t size, std::set<int> group)
+IpcNvlsHandle* ipcNvlsAllocate(size_t size, std::set<int> group)
 {
 #if ENABLE_MULTI_DEVICE
     TLLM_CHECK(size > 0);
@@ -324,26 +324,33 @@ IpcNvlsHandle ipcNvlsAllocate(size_t size, std::set<int> group)
 
     printf("Rank %d imported IPC handles successfully\n", rank);
 
-    return handle;
+    return new IpcNvlsHandle(std::move(handle));
 #else
     TLLM_THROW("ipcNvlsAllocate needs to be compiled with ENABLE_MULTI_DEVICE");
 #endif
 }
 
-void ipcNvlsFree(IpcNvlsHandle handle)
+void ipcNvlsFree(IpcNvlsHandle* handle)
 {
 #if ENABLE_MULTI_DEVICE
-    // Unmap and release MC VA
-    CUCHECK(cuMemUnmap(handle.mc_va, handle.size));
-    CUCHECK(cuMemRelease(handle.mc_handle));
-    CUCHECK(cuMemAddressFree(handle.mc_va, handle.size));
-    // Unmap and release UC VA
-    for (size_t i = 0; i < handle.ipc_uc_vas.size(); ++i)
+    if (handle == nullptr)
     {
-        CUCHECK(cuMemUnmap(handle.ipc_uc_vas[i], handle.size));
-        CUCHECK(cuMemRelease(handle.ipc_uc_handles[i]));
-        CUCHECK(cuMemAddressFree(handle.ipc_uc_vas[i], handle.size));
+        return;
     }
+
+    // Unmap and release MC VA
+    CUCHECK(cuMemUnmap(handle->mc_va, handle->size));
+    CUCHECK(cuMemRelease(handle->mc_handle));
+    CUCHECK(cuMemAddressFree(handle->mc_va, handle->size));
+    // Unmap and release UC VA
+    for (size_t i = 0; i < handle->ipc_uc_vas.size(); ++i)
+    {
+        CUCHECK(cuMemUnmap(handle->ipc_uc_vas[i], handle->size));
+        CUCHECK(cuMemRelease(handle->ipc_uc_handles[i]));
+        CUCHECK(cuMemAddressFree(handle->ipc_uc_vas[i], handle->size));
+    }
+
+    delete handle;
 #else
     TLLM_THROW("ipcNvlsFree needs to be compiled with ENABLE_MULTI_DEVICE");
 #endif
diff --git a/cpp/tensorrt_llm/runtime/tllmBuffers.h b/cpp/tensorrt_llm/runtime/tllmBuffers.h
index ada3322ef1..f9ebd76da2 100644
--- a/cpp/tensorrt_llm/runtime/tllmBuffers.h
+++ b/cpp/tensorrt_llm/runtime/tllmBuffers.h
@@ -709,7 +709,7 @@ public:
     {
         other.mSize = 0;
         other.mCapacity = 0;
-        other.mHandle = IpcNvlsHandle{};
+        other.mHandle = nullptr;
     }
 
     ~MulticastBuffer() override
@@ -733,7 +733,7 @@ public:
             // reset other
             other.mSize = 0;
             other.mCapacity = 0;
-            other.mHandle = IpcNvlsHandle{};
+            other.mHandle = nullptr;
         }
         return *this;
     }
@@ -741,22 +741,22 @@ public:
     // Return list of pointers to each rank
     [[nodiscard]] void* dataIpcList()
     {
-        return reinterpret_cast<void*>(mHandle.ipc_uc_ptrs.data());
+        return reinterpret_cast<void*>(mHandle->ipc_uc_ptrs.data());
     }
 
     [[nodiscard]] void const* dataIpcList() const
     {
-        return reinterpret_cast<void const*>(mHandle.ipc_uc_ptrs.data());
+        return reinterpret_cast<void const*>(mHandle->ipc_uc_ptrs.data());
     }
 
     [[nodiscard]] void* dataMC()
     {
-        return reinterpret_cast<void*>(mHandle.mc_ptr);
+        return reinterpret_cast<void*>(mHandle->mc_ptr);
     }
 
     [[nodiscard]] void const* dataMC() const
     {
-        return reinterpret_cast<void const*>(mHandle.mc_ptr);
+        return reinterpret_cast<void const*>(mHandle->mc_ptr);
     }
 
     //////////////////////////
@@ -768,13 +768,13 @@ public:
     // Return unicast pointer
     [[nodiscard]] void* data() override
     {
-        return reinterpret_cast<void*>(mHandle.uc_ptr);
+        return reinterpret_cast<void*>(mHandle->uc_ptr);
     }
 
     // Return unicast pointer
     [[nodiscard]] void const* data() const override
     {
-        return reinterpret_cast<void const*>(mHandle.uc_ptr);
+        return reinterpret_cast<void const*>(mHandle->uc_ptr);
     }
 
     [[nodiscard]] std::size_t getSize() const override
@@ -806,8 +806,8 @@ public:
             printf("MulticastBuffer resize: %d B\n", int(toBytes(newSize)));
 
             mHandle = ipcNvlsAllocate(toBytes(newSize), mRanks);
-            TLLM_CHECK(mHandle.size % BufferDataType(mType).getSize() == 0);
-            mCapacity = mHandle.size / BufferDataType(mType).getSize();
+            TLLM_CHECK(mHandle->size % BufferDataType(mType).getSize() == 0);
+            mCapacity = mHandle->size / BufferDataType(mType).getSize();
         }
         mSize = newSize;
     }
@@ -816,7 +816,7 @@ public:
     {
         if (mCapacity > 0)
         {
-            TLLM_CHECK(mHandle.size > 0);
+            TLLM_CHECK(mHandle->size > 0);
             ipcNvlsFree(mHandle);
         }
     }
@@ -826,7 +826,7 @@ private:
     std::size_t mCapacity = 0;
     nvinfer1::DataType mType;
     std::set<int> mRanks;
-    IpcNvlsHandle mHandle;
+    IpcNvlsHandle* mHandle;
 };
 
 using DeviceBuffer = GenericBuffer<CudaAllocatorAsync>;
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index c7a823801a..c458ddfc74
100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -100,14 +100,10 @@ add_gtest(eagleLayerTest layers/eagleLayerTest.cpp) add_subdirectory(utils) -if(BUILD_BATCH_MANAGER) - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/batch_manager) - add_subdirectory(batch_manager) - endif() +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/batch_manager) + add_subdirectory(batch_manager) endif() -if(BUILD_EXECUTOR) - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/executor) - add_subdirectory(executor) - endif() +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/executor) + add_subdirectory(executor) endif() diff --git a/cpp/tests/unit_tests/CMakeLists.txt b/cpp/tests/unit_tests/CMakeLists.txt index a3e697d4ae..ffe5391014 100644 --- a/cpp/tests/unit_tests/CMakeLists.txt +++ b/cpp/tests/unit_tests/CMakeLists.txt @@ -9,16 +9,12 @@ # license agreement from NVIDIA CORPORATION or its affiliates is strictly # prohibited. -if(BUILD_BATCH_MANAGER) - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/batch_manager) - add_subdirectory(batch_manager) - endif() +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/batch_manager) + add_subdirectory(batch_manager) endif() -if(BUILD_EXECUTOR) - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/executor) - add_subdirectory(executor) - endif() +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/executor) + add_subdirectory(executor) endif() add_subdirectory(common)
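
The fmtstr/fmtstr_ split in this patch is the core trick for keeping std::string off the exported ABI: the compiled library sees only a C function pointer (fmtstr_allocator) and an opaque void* target, while the std::string is created and resized entirely inside the header-inline caller. Below is a minimal standalone sketch of the same callback-allocator pattern; demo_format and demo_format_ are hypothetical names for illustration, not TensorRT-LLM APIs.

#include <cstdarg>
#include <cstddef>
#include <cstdio>
#include <string>

// C-compatible allocation callback: only POD types cross the boundary.
typedef char* (*demo_allocator)(void* target, size_t count);

// "Library" side: measures the formatted length, asks the caller for a
// buffer of that size, then formats into it. No std:: types in the signature.
void demo_format_(char const* format, demo_allocator alloc, void* target, va_list args)
{
    va_list args0;
    va_copy(args0, args);
    int const size = std::vsnprintf(nullptr, 0, format, args0);
    va_end(args0);
    if (size <= 0)
    {
        return;
    }
    char* memory = alloc(target, static_cast<size_t>(size));
    // Writes `size` characters plus the terminator; the terminator lands on
    // the string's own null slot, mirroring the patch's approach.
    std::vsnprintf(memory, static_cast<size_t>(size) + 1, format, args);
}

// "Header" side: inline in the caller's translation unit, so the std::string
// implementation never has to match the one the library was built against.
inline std::string demo_format(char const* format, ...)
{
    std::string result;
    va_list args;
    va_start(args, format);
    demo_format_(
        format,
        [](void* target, size_t count) -> char*
        {
            auto* str = static_cast<std::string*>(target); // caller-owned string
            str->resize(count);
            return str->data(); // non-const data() requires C++17
        },
        &result, args);
    va_end(args);
    return result;
}

int main()
{
    std::printf("%s\n", demo_format("%d + %d = %d", 2, 2, 4).c_str());
    return 0;
}

Because the lambda is captureless it converts to a plain C function pointer, so nothing whose layout depends on the standard library (or on the _GLIBCXX_USE_CXX11_ABI setting) crosses the boundary — which is also why the patch can drop the pre_cxx11 library variants and the check_symbol targets.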
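
The IpcNvlsHandle* and Lora_run changes apply the same idea to objects: handles cross the boundary only as opaque pointers that are created and destroyed on one side, and methods are reached through an exported free function, so the type's layout and its std:: members stay private. A hedged sketch of that idiom follows; Impl and Impl_run are illustrative names, not the TensorRT-LLM API.

#include <cstdio>

class Impl; // a forward declaration is all the consuming side needs

// Exported wrapper with a C-compatible signature, analogous to Lora_run.
int Impl_run(Impl* impl, int value);

// --- definitions below live on the library side of the boundary ---
class Impl
{
public:
    int run(int value)
    {
        return value * 2;
    }
};

int Impl_run(Impl* impl, int value)
{
    // Guard against a null handle, as Lora_run does for LoraImpl.
    if (impl == nullptr)
    {
        return -1;
    }
    return impl->run(value);
}

int main()
{
    Impl impl;
    std::printf("%d\n", Impl_run(&impl, 21)); // prints 42
    return 0;
}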