mirror of https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
chore: Stabilize ABI boundary for internal kernel library (#3117)

Signed-off-by: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com>
This commit is contained in:
parent 410f56357e
commit a139eae425
.gitattributes (vendored)
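The changes below share one theme: no C++ standard-library type crosses the boundary between the open-source tree and the prebuilt internal kernel libraries. std::string is the main offender, because libstdc++ gives it two incompatible layouts depending on _GLIBCXX_USE_CXX11_ABI, while char const* has a single fixed layout everywhere. A minimal sketch of the pattern (illustrative only, not code from this commit):

    #include <cstdio>
    #include <stdexcept>
    #include <string>

    // ABI-stable boundary type: only char const* crosses into code that may
    // have been built with a different _GLIBCXX_USE_CXX11_ABI setting.
    class BoundaryError : public std::runtime_error
    {
    public:
        explicit BoundaryError(char const* msg)
            : std::runtime_error(msg) // copies the bytes before the caller's temporary dies
        {
        }
    };

    int main()
    {
        try
        {
            // The std::string stays on the caller's side; .c_str() lowers it
            // to a plain pointer, valid for the duration of the full expression.
            std::string msg = "bad value: " + std::to_string(42);
            throw BoundaryError(msg.c_str());
        }
        catch (BoundaryError const& e)
        {
            std::puts(e.what());
        }
        return 0;
    }

This is exactly the shape of the TllmException changes below: the constructor takes char const*, and every fmtstr(...) call site appends .c_str().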
@@ -2,3 +2,4 @@
*.lib filter=lfs diff=lfs merge=lfs -text
*.so filter=lfs diff=lfs merge=lfs -text
*.dll filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
@@ -50,36 +50,6 @@ else()
message(STATUS "NVTX is enabled")
endif()

if(EXISTS
"${CMAKE_CURRENT_SOURCE_DIR}/tensorrt_llm/batch_manager/CMakeLists.txt")
set(BUILD_BATCH_MANAGER_DEFAULT ON)
else()
set(BUILD_BATCH_MANAGER_DEFAULT OFF)
endif()

option(BUILD_BATCH_MANAGER "Build batch manager from source"
${BUILD_BATCH_MANAGER_DEFAULT})

if(BUILD_BATCH_MANAGER)
message(STATUS "Building batch manager")
else()
message(STATUS "Importing batch manager")
endif()

if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/tensorrt_llm/executor/CMakeLists.txt")
set(BUILD_EXECUTOR_DEFAULT ON)
else()
set(BUILD_EXECUTOR_DEFAULT OFF)
endif()

option(BUILD_EXECUTOR "Build executor from source" ${BUILD_EXECUTOR_DEFAULT})

if(BUILD_EXECUTOR)
message(STATUS "Building executor")
else()
message(STATUS "Importing executor")
endif()

if(EXISTS
"${CMAKE_CURRENT_SOURCE_DIR}/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/CMakeLists.txt"
)
@@ -25,7 +25,7 @@ namespace tensorrt_llm::common
{
[[noreturn]] inline void throwRuntimeError(char const* const file, int const line, std::string const& info = "")
{
throw TllmException(file, line, fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info.c_str()));
throw TllmException(file, line, fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info.c_str()).c_str());
}

} // namespace tensorrt_llm::common
@@ -125,8 +125,8 @@ void check(T ptr, char const* const func, char const* const file, int const line
{
if (ptr)
{
throw TllmException(
file, line, fmtstr("[TensorRT-LLM][ERROR] CUDA runtime error in %s: %s", func, _cudaGetErrorEnum(ptr)));
throw TllmException(file, line,
fmtstr("[TensorRT-LLM][ERROR] CUDA runtime error in %s: %s", func, _cudaGetErrorEnum(ptr)).c_str());
}
}

@@ -136,8 +136,8 @@ void checkEx(
{
if (std::all_of(std::begin(validReturns), std::end(validReturns), [&ptr](T const& t) { return t != ptr; }))
{
throw TllmException(
file, line, fmtstr("[TensorRT-LLM][ERROR] CUDA runtime error in %s: %s", func, _cudaGetErrorEnum(ptr)));
throw TllmException(file, line,
fmtstr("[TensorRT-LLM][ERROR] CUDA runtime error in %s: %s", func, _cudaGetErrorEnum(ptr)).c_str());
}
}
@@ -21,6 +21,7 @@
#endif // ENABLE_BF16
#include <cuda_fp16.h>

#include <cstdarg>
#include <memory> // std::make_unique
#include <sstream> // std::stringstream
#include <string>
@@ -101,12 +102,40 @@ inline std::string fmtstr(std::string&& s)
return s;
}

typedef char* (*fmtstr_allocator)(void* target, size_t count);
void fmtstr_(char const* format, fmtstr_allocator alloc, void* target, va_list args);

#if defined(_MSC_VER)
std::string fmtstr(char const* format, ...);
inline std::string fmtstr(char const* format, ...);
#else
std::string fmtstr(char const* format, ...) __attribute__((format(printf, 1, 2)));
inline std::string fmtstr(char const* format, ...) __attribute__((format(printf, 1, 2)));
#endif

inline std::string fmtstr(char const* format, ...)
{
std::string result;

va_list args;
va_start(args, format);
fmtstr_(
format,
[](void* target, size_t count) -> char*
{
if (count <= 0)
{
return nullptr;
}

const auto str = static_cast<std::string*>(target);
str->resize(count);
return str->data();
},
&result, args);
va_end(args);

return result;
}

// __PRETTY_FUNCTION__ is used for neat debugging printing but is not supported on Windows
// The alternative is __FUNCSIG__, which is similar but not identical
#if defined(_WIN32)
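The fmtstr_ indirection above is what allows fmtstr to become header-inline: the out-of-line code in stringUtils.cpp only ever sees a function pointer and a void*, never a std::string, so the string is constructed entirely in the caller's translation unit. The same contract works for any container; a hedged sketch with a std::vector<char> target (fmtvec is a hypothetical name, not part of the commit):

    #include <cstdarg>
    #include <vector>

    // Assumes fmtstr_ as declared above. Note that fmtstr_ writes the count
    // formatted characters plus a terminating NUL into the returned memory.
    std::vector<char> fmtvec(char const* format, ...)
    {
        std::vector<char> buf;
        va_list args;
        va_start(args, format);
        fmtstr_(
            format,
            [](void* target, size_t count) -> char*
            {
                auto* v = static_cast<std::vector<char>*>(target);
                v->resize(count + 1); // room for the NUL, unlike std::string
                return v->data();
            },
            &buf, args);
        va_end(args);
        return buf;
    }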
@@ -22,7 +22,7 @@
#include <string>

#define NEW_TLLM_EXCEPTION(...) \
tensorrt_llm::common::TllmException(__FILE__, __LINE__, tensorrt_llm::common::fmtstr(__VA_ARGS__))
tensorrt_llm::common::TllmException(__FILE__, __LINE__, tensorrt_llm::common::fmtstr(__VA_ARGS__).c_str())

namespace tensorrt_llm::common
{
@@ -32,7 +32,7 @@ class TllmException : public std::runtime_error
public:
static auto constexpr MAX_FRAMES = 128;

explicit TllmException(char const* file, std::size_t line, std::string const& msg);
explicit TllmException(char const* file, std::size_t line, char const* msg);

~TllmException() noexcept override;
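With the std::string overload gone, every throw site is forced through the char const* constructor, and NEW_TLLM_EXCEPTION makes that mechanical. Roughly, with rank as a stand-in argument:

    // NEW_TLLM_EXCEPTION("rank %d out of range", rank) now expands to:
    tensorrt_llm::common::TllmException(
        __FILE__, __LINE__, tensorrt_llm::common::fmtstr("rank %d out of range", rank).c_str())

The std::string built by the header-inline fmtstr lives and dies in the throwing translation unit; only the pointer reaches the constructor, which copies it into the std::runtime_error base.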
@@ -24,10 +24,13 @@ namespace tensorrt_llm::runtime
{
struct IpcNvlsHandle
{
// Begin internal kernel visible fields
// Changes to these fields must stay in sync with ipcNvlsMemory.h in the internal kernel repo
size_t size = 0;
// Device pointers used by kernels
uintptr_t uc_ptr = 0;
uintptr_t mc_ptr = 0;
// End internal kernel visible fields
std::vector<uintptr_t> ipc_uc_ptrs;
// Device pointers
CUdeviceptr uc_va;
@@ -43,9 +46,9 @@ void MPI_group_barrier(std::set<int> ranks);

bool ipcNvlsSupported();

IpcNvlsHandle ipcNvlsAllocate(size_t size, std::set<int> ranks);
IpcNvlsHandle* ipcNvlsAllocate(size_t size, std::set<int> ranks);

void ipcNvlsFree(IpcNvlsHandle handle);
void ipcNvlsFree(IpcNvlsHandle* handle);

template <typename T>
class DeviceAllocationNvls
@@ -68,19 +71,19 @@ public:
// Return device pointer to multicast memory
[[nodiscard]] T* getMulticastPointer() const
{
return reinterpret_cast<T*>(_handle.mc_ptr);
return reinterpret_cast<T*>(_handle->mc_ptr);
}

// Return device pointer for current rank
[[nodiscard]] T* getUnicastPointer() const
{
return reinterpret_cast<T*>(_handle.uc_ptr);
return reinterpret_cast<T*>(_handle->uc_ptr);
}

// Return host list of device pointers to memory on each rank
[[nodiscard]] T** getIpcUnicastPointers()
{
return reinterpret_cast<T**>(_handle.ipc_uc_ptrs.data());
return reinterpret_cast<T**>(_handle->ipc_uc_ptrs.data());
}

[[nodiscard]] size_t getCapacity() const
@@ -99,6 +102,6 @@ public:

private:
size_t _capacity = 0;
IpcNvlsHandle _handle;
IpcNvlsHandle* _handle;
};
} // namespace tensorrt_llm::runtime
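Handing out IpcNvlsHandle* instead of the struct by value means the full layout, std::vector bookkeeping included, never crosses into the prebuilt kernels; they only dereference the documented prefix. A hedged sketch of the mirror the comments describe (KernelVisibleHandle is a hypothetical name; the real mirror lives in ipcNvlsMemory.h of the internal kernel repo):

    #include <cstddef>
    #include <cstdint>

    // Hypothetical mirror of the "internal kernel visible fields" prefix.
    // As long as both sides keep these members first, in this order, the
    // fields after the prefix can evolve without rebuilding the kernels.
    struct KernelVisibleHandle
    {
        size_t size;      // allocation size in bytes
        uintptr_t uc_ptr; // unicast device pointer
        uintptr_t mc_ptr; // multicast device pointer
    };

    // Kernel-side code would read only the prefix through the opaque pointer,
    // e.g. reinterpret_cast<KernelVisibleHandle const*>(handle)->mc_ptr.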
@@ -85,121 +85,12 @@ endif()

set(BATCH_MANAGER_TARGET tensorrt_llm_batch_manager_static)
set(BATCH_MANAGER_TARGET_ARCH ${TARGET_ARCH})

if(BUILD_BATCH_MANAGER)
add_subdirectory(batch_manager)
else()
add_library(${BATCH_MANAGER_TARGET} STATIC IMPORTED)
if(NOT WIN32) # Linux
if(USE_CXX11_ABI)
set(BATCH_MANAGER_LIB_LOC
"${CMAKE_CURRENT_SOURCE_DIR}/batch_manager/${BATCH_MANAGER_TARGET_ARCH}/libtensorrt_llm_batch_manager_static.a"
)
else()
set(BATCH_MANAGER_LIB_LOC
"${CMAKE_CURRENT_SOURCE_DIR}/batch_manager/${BATCH_MANAGER_TARGET_ARCH}/libtensorrt_llm_batch_manager_static.pre_cxx11.a"
)
endif()
else() # Windows
set(BATCH_MANAGER_LIB_LOC
"${CMAKE_CURRENT_SOURCE_DIR}/batch_manager/${BATCH_MANAGER_TARGET_ARCH}/tensorrt_llm_batch_manager_static.lib"
)
endif()
set_property(TARGET ${BATCH_MANAGER_TARGET} PROPERTY IMPORTED_LOCATION
${BATCH_MANAGER_LIB_LOC})
file(SIZE ${BATCH_MANAGER_LIB_LOC} BATCH_MANAGER_LIB_SIZE)
if(BATCH_MANAGER_LIB_SIZE LESS 1024)
message(
FATAL_ERROR
"The batch manager library is truncated or incomplete. This is usually caused by using Git LFS (Large File Storage) incorrectly. Please try running command `git lfs install && git lfs pull`."
)
endif()
endif()
add_subdirectory(batch_manager)

set(EXECUTOR_TARGET tensorrt_llm_executor_static)
set(EXECUTOR_TARGET_ARCH ${TARGET_ARCH})
set(UCX_WRAPPER_TARGET tensorrt_llm_ucx_wrapper)

if(BUILD_EXECUTOR)
add_subdirectory(executor)
else()
add_library(${EXECUTOR_TARGET} STATIC IMPORTED)
if(NOT WIN32) # Linux
if(USE_CXX11_ABI)
set(EXECUTOR_LIB_LOC
"${CMAKE_CURRENT_SOURCE_DIR}/executor/${EXECUTOR_TARGET_ARCH}/libtensorrt_llm_executor_static.a"
)
else()
set(EXECUTOR_LIB_LOC
"${CMAKE_CURRENT_SOURCE_DIR}/executor/${EXECUTOR_TARGET_ARCH}/libtensorrt_llm_executor_static.pre_cxx11.a"
)
endif()
else() # Windows
set(EXECUTOR_LIB_LOC
"${CMAKE_CURRENT_SOURCE_DIR}/executor/${EXECUTOR_TARGET_ARCH}/tensorrt_llm_executor_static.lib"
)
endif()
set_property(TARGET ${EXECUTOR_TARGET} PROPERTY IMPORTED_LOCATION
${EXECUTOR_LIB_LOC})
file(SIZE ${EXECUTOR_LIB_LOC} EXECUTOR_LIB_SIZE)
if(EXECUTOR_LIB_SIZE LESS 1024)
message(
FATAL_ERROR
"The executor library is truncated or incomplete. This is usually caused by using Git LFS (Large File Storage) incorrectly. Please try running command `git lfs install && git lfs pull`."
)
endif()

if(ENABLE_UCX)
add_library(${UCX_WRAPPER_TARGET} SHARED IMPORTED)
if(NOT WIN32) # Linux
set(UCX_WRAPPER_LIB_SOURCE_REL_LOC
"executor/cache_transmission/ucx_utils/${EXECUTOR_TARGET_ARCH}/libtensorrt_llm_ucx_wrapper.so"
)
set(UCX_WRAPPER_LIB_BINARY_REL_LOC
"executor/cache_transmission/ucx_utils/libtensorrt_llm_ucx_wrapper.so"
)
else()
set(UCX_WRAPPER_LIB_BINARY_REL_DIR
"executor/cache_transmission/ucx_utils/")
set(UCX_WRAPPER_DLL_NAME "tensorrt_llm_ucx_wrapper.dll")
set(UCX_WRAPPER_LIB_NAME "tensorrt_llm_ucx_wrapper.lib")

set(UCX_WRAPPER_LIB_SOURCE_REL_LOC
"${UCX_WRAPPER_LIB_BINARY_REL_DIR}/${EXECUTOR_TARGET_ARCH}/${UCX_WRAPPER_DLL_NAME}"
)
set(UCX_WRAPPER_LIB_BINARY_REL_LOC
"${UCX_WRAPPER_LIB_BINARY_REL_DIR}/${UCX_WRAPPER_DLL_NAME}")
set(UCX_WRAPPER_IMPLIB_SOURCE_REL_LOC
"${UCX_WRAPPER_LIB_BINARY_REL_DIR}/${EXECUTOR_TARGET_ARCH}/${UCX_WRAPPER_LIB_NAME}"
)
set(UCX_WRAPPER_IMPLIB_BINARY_REL_LOC
"${UCX_WRAPPER_LIB_BINARY_REL_DIR}/${UCX_WRAPPER_LIB_NAME}")
endif()
set(UCX_WRAPPER_LIB_LOC
"${CMAKE_CURRENT_SOURCE_DIR}/${UCX_WRAPPER_LIB_SOURCE_REL_LOC}")
# Copy the .so to build directory, which is needed in build_wheel.py.
configure_file(${UCX_WRAPPER_LIB_SOURCE_REL_LOC}
${UCX_WRAPPER_LIB_BINARY_REL_LOC} COPYONLY)
set_property(TARGET ${UCX_WRAPPER_TARGET} PROPERTY IMPORTED_LOCATION
${UCX_WRAPPER_LIB_LOC})
if(WIN32)
set(UCX_WRAPPER_IMPLIB_LOC
"${CMAKE_CURRENT_SOURCE_DIR}/${UCX_WRAPPER_IMPLIB_SOURCE_REL_LOC}")
configure_file(${UCX_WRAPPER_IMPLIB_SOURCE_REL_LOC}
${UCX_WRAPPER_IMPLIB_BINARY_REL_LOC} COPYONLY)
set_property(TARGET ${UCX_WRAPPER_TARGET}
PROPERTY IMPORTED_IMPLIB ${UCX_WRAPPER_IMPLIB_LOC})
endif()

file(SIZE ${UCX_WRAPPER_LIB_LOC} UCX_WRAPPER_LIB_SIZE)
if(UCX_WRAPPER_LIB_SIZE LESS 1024)
message(
FATAL_ERROR
"The ucx wrapper library is truncated or incomplete. This is usually caused by using Git LFS (Large File Storage) incorrectly. Please try running command `git lfs install && git lfs pull`."
)
endif()
endif()
endif()
add_subdirectory(executor)

set(INTERNAL_CUTLASS_KERNELS_TARGET
tensorrt_llm_internal_cutlass_kernels_static)
@@ -208,24 +99,33 @@ if(BUILD_INTERNAL_CUTLASS_KERNELS)
add_subdirectory(kernels/internal_cutlass_kernels)
else()
add_library(${INTERNAL_CUTLASS_KERNELS_TARGET} STATIC IMPORTED)
set(INTERNAL_CUTLASS_KERNELS_LIB_TARBALL
"${CMAKE_CURRENT_SOURCE_DIR}/kernels/internal_cutlass_kernels/${INTERNAL_CUTLASS_KERNELS_TARGET_ARCH}/${INTERNAL_CUTLASS_KERNELS_TARGET}.tar.xz"
)
if(NOT WIN32) # Linux
if(USE_CXX11_ABI)
set(INTERNAL_CUTLASS_KERNELS_LIB_LOC
"${CMAKE_CURRENT_SOURCE_DIR}/kernels/internal_cutlass_kernels/${INTERNAL_CUTLASS_KERNELS_TARGET_ARCH}/libtensorrt_llm_internal_cutlass_kernels_static.a"
)
else()
set(INTERNAL_CUTLASS_KERNELS_LIB_LOC
"${CMAKE_CURRENT_SOURCE_DIR}/kernels/internal_cutlass_kernels/${INTERNAL_CUTLASS_KERNELS_TARGET_ARCH}/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a"
)
endif()
set(INTERNAL_CUTLASS_KERNELS_LIB_NAME
"lib${INTERNAL_CUTLASS_KERNELS_TARGET}.a")
else() # Windows
set(INTERNAL_CUTLASS_KERNELS_LIB_LOC
"${CMAKE_CURRENT_SOURCE_DIR}/kernels/internal_cutlass_kernels/${INTERNAL_CUTLASS_KERNELS_TARGET_ARCH}/tensorrt_llm_internal_cutlass_kernels_static.lib"
)
set(INTERNAL_CUTLASS_KERNELS_LIB_NAME
"${INTERNAL_CUTLASS_KERNELS_TARGET}.lib")
endif()
set(INTERNAL_CUTLASS_KERNELS_LIB_PATH
"${CMAKE_CURRENT_BINARY_DIR}/${INTERNAL_CUTLASS_KERNELS_LIB_NAME}")
add_custom_command(
OUTPUT ${INTERNAL_CUTLASS_KERNELS_LIB_PATH}
COMMAND ${CMAKE_COMMAND} -E tar xf ${INTERNAL_CUTLASS_KERNELS_LIB_TARBALL}
DEPENDS ${INTERNAL_CUTLASS_KERNELS_LIB_TARBALL}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
VERBATIM)
add_custom_target(${INTERNAL_CUTLASS_KERNELS_TARGET}_helper
DEPENDS ${INTERNAL_CUTLASS_KERNELS_LIB_PATH})
add_dependencies(${INTERNAL_CUTLASS_KERNELS_TARGET}
${INTERNAL_CUTLASS_KERNELS_TARGET}_helper)
set_property(TARGET ${INTERNAL_CUTLASS_KERNELS_TARGET}
PROPERTY IMPORTED_LOCATION ${INTERNAL_CUTLASS_KERNELS_LIB_LOC})
file(SIZE ${INTERNAL_CUTLASS_KERNELS_LIB_LOC}
PROPERTY IMPORTED_LOCATION ${INTERNAL_CUTLASS_KERNELS_LIB_PATH})
target_link_libraries(${INTERNAL_CUTLASS_KERNELS_TARGET}
INTERFACE ${INTERNAL_CUTLASS_KERNELS_LIB_PATH})
file(SIZE ${INTERNAL_CUTLASS_KERNELS_LIB_TARBALL}
INTERNAL_CUTLASS_KERNELS_LIB_SIZE)
if(INTERNAL_CUTLASS_KERNELS_LIB_SIZE LESS 1024)
message(
@@ -239,70 +139,6 @@ find_package(Threads REQUIRED)
target_link_libraries(${BATCH_MANAGER_TARGET} INTERFACE Threads::Threads)
target_link_libraries(${EXECUTOR_TARGET} INTERFACE Threads::Threads)

if(NOT WIN32)
if(USE_CXX11_ABI)
add_custom_command(
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol"
COMMAND nm -C $<TARGET_FILE:${BATCH_MANAGER_TARGET}> | grep -q
'std::__cxx11::'
DEPENDS ${BATCH_MANAGER_TARGET})
else()
add_custom_command(
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol"
COMMAND nm -C $<TARGET_FILE:${BATCH_MANAGER_TARGET}> | grep -qv
'std::__cxx11::'
DEPENDS ${BATCH_MANAGER_TARGET})
endif()
add_custom_target(check_symbol
DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol")
else()
add_custom_target(check_symbol)
endif()

if(NOT WIN32)
if(USE_CXX11_ABI)
add_custom_command(
OUTPUT
"${CMAKE_CURRENT_BINARY_DIR}/.check_symbol_internal_cutlass_kernels"
COMMAND nm -C $<TARGET_FILE:${INTERNAL_CUTLASS_KERNELS_TARGET}> | grep -q
'std::__cxx11::'
DEPENDS ${INTERNAL_CUTLASS_KERNELS_TARGET})
else()
add_custom_command(
OUTPUT
"${CMAKE_CURRENT_BINARY_DIR}/.check_symbol_internal_cutlass_kernels"
COMMAND nm -C $<TARGET_FILE:${INTERNAL_CUTLASS_KERNELS_TARGET}> | grep -qv
'std::__cxx11::'
DEPENDS ${INTERNAL_CUTLASS_KERNELS_TARGET})
endif()
add_custom_target(
check_symbol_internal_cutlass_kernels
DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol_internal_cutlass_kernels"
)
else()
add_custom_target(check_symbol_internal_cutlass_kernels)
endif()

if(NOT WIN32)
if(USE_CXX11_ABI)
add_custom_command(
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol_executor"
COMMAND nm -C $<TARGET_FILE:${EXECUTOR_TARGET}> | grep -q 'std::__cxx11::'
DEPENDS ${EXECUTOR_TARGET})
else()
add_custom_command(
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol_executor"
COMMAND nm -C $<TARGET_FILE:${EXECUTOR_TARGET}> | grep -qv
'std::__cxx11::'
DEPENDS ${EXECUTOR_TARGET})
endif()
add_custom_target(
check_symbol_executor
DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol_executor")
else()
add_custom_target(check_symbol_executor)
endif()

set(NVRTC_WRAPPER_TARGET tensorrt_llm_nvrtc_wrapper)
set(NVRTC_WRAPPER_TARGET_ARCH ${TARGET_ARCH})
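The check_symbol targets above exist because libstdc++ mangles the two string ABIs differently: built with -D_GLIBCXX_USE_CXX11_ABI=1, any std::string parameter demangles as std::__cxx11::basic_string<...>, while the pre-C++11 ABI demangles without the __cxx11 tag. Grepping nm -C output for 'std::__cxx11::' therefore catches an archive built with the wrong setting before link time. A one-function illustration (not from the commit):

    #include <string>

    // With _GLIBCXX_USE_CXX11_ABI=1, `nm -C` on the object file shows
    //   takesString(std::__cxx11::basic_string<char, ...> const&)
    // and with =0 the same symbol demangles as std::basic_string<...>,
    // which is exactly the difference the grep keys on.
    void takesString(std::string const& s)
    {
        (void) s;
    }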
@@ -311,48 +147,40 @@ if(BUILD_NVRTC_WRAPPER)
kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper)
else()
add_library(${NVRTC_WRAPPER_TARGET} SHARED IMPORTED)
set(NVRTC_WRAPPER_LIB_TARBALL
"${CMAKE_CURRENT_SOURCE_DIR}/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/${NVRTC_WRAPPER_TARGET_ARCH}/${NVRTC_WRAPPER_TARGET}.tar.xz"
)
set(NVRTC_WRAPPER_LIB_BINARY_DIR
"${CMAKE_CURRENT_BINARY_DIR}/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper"
)
if(NOT WIN32) # Linux
set(NVRTC_WRAPPER_LIB_SOURCE_REL_LOC
"kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/${NVRTC_WRAPPER_TARGET_ARCH}/libtensorrt_llm_nvrtc_wrapper.so"
)
set(NVRTC_WRAPPER_LIB_BINARY_REL_LOC
"kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/libtensorrt_llm_nvrtc_wrapper.so"
)
else()
set(NVRTC_WRAPPER_LIB_BINARY_REL_DIR
"kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper"
)
set(NVRTC_WRAPPER_DLL_NAME "tensorrt_llm_nvrtc_wrapper.dll")
set(NVRTC_WRAPPER_LIB_NAME "tensorrt_llm_nvrtc_wrapper.lib")

set(NVRTC_WRAPPER_LIB_SOURCE_REL_LOC
"${NVRTC_WRAPPER_LIB_BINARY_REL_DIR}/${NVRTC_WRAPPER_TARGET_ARCH}/${NVRTC_WRAPPER_DLL_NAME}"
)
set(NVRTC_WRAPPER_LIB_BINARY_REL_LOC
"${NVRTC_WRAPPER_LIB_BINARY_REL_DIR}/${NVRTC_WRAPPER_DLL_NAME}")
set(NVRTC_WRAPPER_IMPLIB_SOURCE_REL_LOC
"${NVRTC_WRAPPER_LIB_BINARY_REL_DIR}/${NVRTC_WRAPPER_TARGET_ARCH}/${NVRTC_WRAPPER_LIB_NAME}"
)
set(NVRTC_WRAPPER_IMPLIB_BINARY_REL_LOC
"${NVRTC_WRAPPER_LIB_BINARY_REL_DIR}/${NVRTC_WRAPPER_LIB_NAME}")
set(NVRTC_WRAPPER_LIB_NAME "lib${NVRTC_WRAPPER_TARGET}.so")
else() # Windows
set(NVRTC_WRAPPER_LIB_NAME "${NVRTC_WRAPPER_TARGET}.lib")
set(NVRTC_WRAPPER_DLL_NAME "${NVRTC_WRAPPER_TARGET}.dll")
set(NVRTC_WRAPPER_DLL_PATH
"${NVRTC_WRAPPER_LIB_BINARY_DIR}/${NVRTC_WRAPPER_DLL_NAME}")
endif()
set(NVRTC_WRAPPER_LIB_LOC
"${CMAKE_CURRENT_SOURCE_DIR}/${NVRTC_WRAPPER_LIB_SOURCE_REL_LOC}")
# Copy the .so to build directory, which is needed in build_wheel.py.
configure_file(${NVRTC_WRAPPER_LIB_SOURCE_REL_LOC}
${NVRTC_WRAPPER_LIB_BINARY_REL_LOC} COPYONLY)
set_property(TARGET ${NVRTC_WRAPPER_TARGET} PROPERTY IMPORTED_LOCATION
${NVRTC_WRAPPER_LIB_LOC})
set(NVRTC_WRAPPER_LIB_PATH
"${NVRTC_WRAPPER_LIB_BINARY_DIR}/${NVRTC_WRAPPER_LIB_NAME}")
add_custom_command(
OUTPUT ${NVRTC_WRAPPER_LIB_PATH} ${NVRTC_WRAPPER_DLL_PATH}
COMMAND ${CMAKE_COMMAND} -E make_directory ${NVRTC_WRAPPER_LIB_BINARY_DIR}
COMMAND ${CMAKE_COMMAND} -E chdir ${NVRTC_WRAPPER_LIB_BINARY_DIR}
${CMAKE_COMMAND} -E tar xf ${NVRTC_WRAPPER_LIB_TARBALL}
DEPENDS ${NVRTC_WRAPPER_LIB_TARBALL}
VERBATIM)
add_custom_target(${NVRTC_WRAPPER_TARGET}_helper
DEPENDS ${NVRTC_WRAPPER_LIB_PATH} ${NVRTC_WRAPPER_DLL_PATH})
add_dependencies(${NVRTC_WRAPPER_TARGET} ${NVRTC_WRAPPER_TARGET}_helper)
set_property(TARGET ${NVRTC_WRAPPER_TARGET}
PROPERTY IMPORTED_LOCATION ${NVRTC_WRAPPER_LIB_PATH})
if(WIN32)
set(NVRTC_WRAPPER_IMPLIB_LOC
"${CMAKE_CURRENT_SOURCE_DIR}/${NVRTC_WRAPPER_IMPLIB_SOURCE_REL_LOC}")
configure_file(${NVRTC_WRAPPER_IMPLIB_SOURCE_REL_LOC}
${NVRTC_WRAPPER_IMPLIB_BINARY_REL_LOC} COPYONLY)
set_property(TARGET ${NVRTC_WRAPPER_TARGET}
PROPERTY IMPORTED_IMPLIB ${NVRTC_WRAPPER_IMPLIB_LOC})
PROPERTY IMPORTED_IMPLIB ${NVRTC_WRAPPER_DLL_PATH})
endif()

file(SIZE ${NVRTC_WRAPPER_LIB_LOC} NVRTC_WRAPPER_LIB_SIZE)
file(SIZE ${INTERNAL_CUTLASS_KERNELS_LIB_TARBALL} NVRTC_WRAPPER_LIB_SIZE)
if(NVRTC_WRAPPER_LIB_SIZE LESS 1024)
message(
FATAL_ERROR
@@ -414,25 +242,14 @@ set_target_properties(
PROPERTIES CXX_STANDARD "17" CXX_STANDARD_REQUIRED "YES" CXX_EXTENSIONS "NO"
LINK_FLAGS "${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}")

function(link_whole_archive TARGET LIBRARY_TO_LINK)
if(WIN32)
target_link_libraries(${TARGET} PUBLIC $<TARGET_FILE:${LIBRARY_TO_LINK}>)
set_target_properties(
${TARGET} PROPERTIES LINK_FLAGS "/WHOLEARCHIVE:${LIBRARY_TO_LINK}")
else()
# Assume everything else is like gcc
target_link_libraries(
${TARGET} PRIVATE "-Wl,--whole-archive" $<TARGET_FILE:${LIBRARY_TO_LINK}>
"-Wl,--no-whole-archive")
endif()
endfunction()

target_link_libraries(${SHARED_TARGET} PUBLIC ${TRTLLM_LINK_LIBS})

link_whole_archive(${SHARED_TARGET} ${BATCH_MANAGER_TARGET})
link_whole_archive(${SHARED_TARGET} ${EXECUTOR_TARGET})
link_whole_archive(${SHARED_TARGET} fp8_blockscale_gemm_src)
link_whole_archive(${SHARED_TARGET} ${INTERNAL_CUTLASS_KERNELS_TARGET})
target_link_libraries(
${SHARED_TARGET}
PRIVATE $<LINK_LIBRARY:WHOLE_ARCHIVE,${BATCH_MANAGER_TARGET}>
$<LINK_LIBRARY:WHOLE_ARCHIVE,${EXECUTOR_TARGET}>
$<LINK_LIBRARY:WHOLE_ARCHIVE,fp8_blockscale_gemm_src>
$<LINK_LIBRARY:WHOLE_ARCHIVE,${INTERNAL_CUTLASS_KERNELS_TARGET}>)

# Link kernels_src and cutlass_src; the static internal cutlass lib is overridden.
target_link_libraries(${SHARED_TARGET} PUBLIC kernels_src cutlass_src)
@@ -458,10 +275,6 @@ endif()

target_link_libraries(${SHARED_TARGET} PUBLIC ${NVRTC_WRAPPER_TARGET})

add_dependencies(${SHARED_TARGET} check_symbol)
add_dependencies(${SHARED_TARGET} check_symbol_executor)
add_dependencies(${SHARED_TARGET} check_symbol_internal_cutlass_kernels)

if(BUILD_PYT)
add_subdirectory(thop)
endif()
@@ -126,8 +126,8 @@ void checkDriver(
char const* errorString = nullptr;
wrap.cuGetErrorName(result, &errorName);
wrap.cuGetErrorString(result, &errorString);
throw TllmException(
file, line, fmtstr("[TensorRT-LLM][ERROR] CUDA driver error in %s: %s: %s.", func, errorName, errorString));
throw TllmException(file, line,
fmtstr("[TensorRT-LLM][ERROR] CUDA driver error in %s: %s: %s.", func, errorName, errorString).c_str());
}
}
@@ -26,35 +26,33 @@
namespace tensorrt_llm::common
{

namespace
{
std::string vformat(char const* fmt, va_list args)
void fmtstr_(char const* format, fmtstr_allocator alloc, void* target, va_list args)
{
va_list args0;
va_copy(args0, args);
auto const size = vsnprintf(nullptr, 0, fmt, args0);
if (size <= 0)
return "";

std::string stringBuf(size, char{});
auto const size2 = std::vsnprintf(&stringBuf[0], size + 1, fmt, args);
size_t constexpr init_size = 2048;
char fixed_buffer[init_size];
auto const size = std::vsnprintf(fixed_buffer, init_size, format, args0);
TLLM_CHECK_WITH_INFO(size >= 0, std::string(std::strerror(errno)));
if (size == 0)
{
return;
}

TLLM_CHECK_WITH_INFO(size2 == size, std::string(std::strerror(errno)));
auto* memory = alloc(target, size);

return stringBuf;
if (static_cast<size_t>(size) < init_size)
{
std::memcpy(memory, fixed_buffer, size + 1);
}
else
{
auto const size2 = std::vsnprintf(memory, size + 1, format, args);
TLLM_CHECK_WITH_INFO(size2 == size, std::string(std::strerror(errno)));
}
}

} // namespace

std::string fmtstr(char const* format, ...)
{
va_list args;
va_start(args, format);
std::string result = vformat(format, args);
va_end(args);
return result;
};

std::unordered_set<std::string> str2set(std::string const& input, char delimiter)
{
std::unordered_set<std::string> values;
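The rewritten fmtstr_ formats once into a 2048-byte stack buffer; when the result fits, it is memcpy'd into the memory the callback hands back, and only oversized results pay for a second vsnprintf directly into the destination. A small smoke test of both paths (values are illustrative; assumes tensorrt_llm::common::fmtstr):

    #include <cassert>
    #include <string>

    void fmtstrSmokeTest()
    {
        using tensorrt_llm::common::fmtstr;

        // Fits the fast path: one vsnprintf into the stack buffer, then memcpy.
        assert(fmtstr("%d-%d", 1, 2) == "1-2");

        // Exceeds init_size: the second vsnprintf writes straight into the
        // std::string storage resized by the allocator callback.
        std::string big(4096, 'x');
        assert(fmtstr("%s", big.c_str()) == big);
    }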
@@ -35,18 +35,17 @@ int constexpr VOID_PTR_SZ = 2 + sizeof(void*) * 2;

#if !defined(_MSC_VER)

TllmException::TllmException(char const* file, std::size_t line, std::string const& msg)
TllmException::TllmException(char const* file, std::size_t line, char const* msg)
: std::runtime_error{""}
{
mNbFrames = backtrace(mCallstack.data(), MAX_FRAMES);
auto const trace = getTrace();
std::runtime_error::operator=(
std::runtime_error{fmtstr("%s (%s:%zu)\n%s", msg.c_str(), file, line, trace.c_str())});
std::runtime_error::operator=(std::runtime_error{fmtstr("%s (%s:%zu)\n%s", msg, file, line, trace.c_str())});
}
#else
TllmException::TllmException(char const* file, std::size_t line, std::string const& msg)
TllmException::TllmException(char const* file, std::size_t line, char const* msg)
: mNbFrames{}
, std::runtime_error{fmtstr("%s (%s:%zu)", msg.c_str(), file, line)}
, std::runtime_error{fmtstr("%s (%s:%zu)", msg, file, line)}
{
}
#endif
@@ -16,23 +16,8 @@ if(ENABLE_UCX)
set(TOP_LEVEL_DIR "${PROJECT_SOURCE_DIR}/..")
target_compile_definitions(${UCX_WRAPPER_TARGET}
PUBLIC TOP_LEVEL_DIR="${TOP_LEVEL_DIR}")

target_include_directories(
${UCX_WRAPPER_TARGET}
PRIVATE $<TARGET_PROPERTY:ucxx::ucxx,INTERFACE_INCLUDE_DIRECTORIES>)
# link_whole_archive
if(WIN32)
target_link_libraries(${UCX_WRAPPER_TARGET}
PUBLIC $<TARGET_FILE:ucxx::ucxx>)
set_target_properties(${UCX_WRAPPER_TARGET}
PROPERTIES LINK_FLAGS "/WHOLEARCHIVE:ucxx::ucxx")
else()
# Assume everything else is like gcc
target_link_libraries(
${UCX_WRAPPER_TARGET}
PRIVATE "-Wl,--whole-archive" $<TARGET_FILE:ucxx::ucxx>
"-Wl,--no-whole-archive")
endif()
target_link_libraries(${UCX_WRAPPER_TARGET}
PRIVATE $<LINK_LIBRARY:WHOLE_ARCHIVE,ucxx::ucxx>)
target_link_libraries(${UCX_WRAPPER_TARGET} PUBLIC ucxx::ucxx ucx::ucs)
target_link_libraries(${UCX_WRAPPER_TARGET} PUBLIC ${CUDA_RT_LIB})
endif()
@@ -35,7 +35,8 @@ void CHECK_TLLM_XQA_JIT_ERROR_(tllmXqaJitStatus result, char const* const func,
std::vector<char> log(tllmXqaJitGetLastErrorStringSize());
tllmXqaJitGetLastErrorString(log.data());
throw tensorrt_llm::common::TllmException(file, line,
tensorrt_llm::common::fmtstr("[TensorRT-LLM][ERROR] TllmXqaJit runtime error in %s: %s", func, log.data()));
tensorrt_llm::common::fmtstr("[TensorRT-LLM][ERROR] TllmXqaJit runtime error in %s: %s", func, log.data())
.c_str());
}
}
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5ad6be58302fad71488246c4dea6f96d710143988a195d67b304ea251bd0aa89
size 126817632
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8702e9bf2ad0e50a86f9d3a9be52fc70b8fdf5be644d585c69d9560b6fe42dad
size 34773116
@@ -1,2 +1,2 @@
a8252eee786f39e51f70a4c011588c7d libtensorrt_llm_nvrtc_wrapper.so
commit 705292307acd1546f4f9e2b2fab84350d01d41ab
5ad6be58302fad71488246c4dea6f96d710143988a195d67b304ea251bd0aa89 libtensorrt_llm_nvrtc_wrapper.so
commit 9c24486cb2cd9dd9582b311b84e1b428d29a735a

@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:253ba70949732bf3d79c759ba3601516cd5b5b03a121f00c3ce45bbb40aea035
size 133862752
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7cddaa17269f699ffa0783ed296cabb7fe71cf61910a2799c71c4e39192fc513
size 38282412
@@ -1,2 +1,2 @@
1dacb3147d5d47d795a447ee563ee92a libtensorrt_llm_nvrtc_wrapper.so
commit 705292307acd1546f4f9e2b2fab84350d01d41ab
9d1104bbe6b4f258482549ec71c9d1aed0de912b5824dced5cf7829bff66ba0d libtensorrt_llm_nvrtc_wrapper.so
commit 9c24486cb2cd9dd9582b311b84e1b428d29a735a

@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:42d72057eac00d2f82cecb27f7401258c2fe932d51a945f1be4baa4271307acb
size 138648070
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ef06a1b7cc3e1a2e71a2ce2f4081412eded9e75a236e2c4dda0ed636de8148b8
size 138563288
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5e50158a750697d719bbca9d4e18680290c10cb2bc30e5711854c49edb92ce95
size 45029036
@@ -1,3 +1,2 @@
7ef325eb05b4770773732c0f8bc5748d libtensorrt_llm_internal_cutlass_kernels_static.a
ef25d7af2b5d9824ddc50a1b79db36e8 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
commit b43c46c83bd0833eae16ea0eae7cef6bee81644c
a357a7193265159ac09d7ddcc47e0445f0f348d8f93e08c5d82c98ed38d3e342 libtensorrt_llm_internal_cutlass_kernels_static.a
commit 9c24486cb2cd9dd9582b311b84e1b428d29a735a
@@ -18,13 +18,14 @@
#pragma once
#include "cutlass/gemm/gemm.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/quantization.h"
#include "tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.h"
#include "tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_gemm_kernels.h"
#include "tensorrt_llm/kernels/lora/lora.h"
#ifdef ENABLE_FP4
#include <cuda_fp4.h>
#endif
#include <NvInferRuntime.h>
#include <cuda_runtime_api.h>
#include <optional>
#include <random>
@@ -221,6 +222,12 @@ struct QuantParams
}
};

// Changes to the following declarations must stay in sync with lora.h in the public repo
class LoraImpl;

int Lora_run(LoraImpl* impl, int64_t numTokens, int64_t numReqs, void const* input, int32_t const* loraRanks,
void const* const* loraWeightsPtr, int weightIndex, void* const* outputs, void* workspace, cudaStream_t stream);

struct LoraParams
{
using LoraImplPtr = std::shared_ptr<LoraImpl>;
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:651dceb91cafbe24997cdc71c3f984ab6b18e99dbc2ed2958ca08b2cf4897cc3
size 135328862
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2cbe6c81bae4a338f6a61f081571abff720c2275058c38ed67c879886056cd98
size 134434150
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0255e3a868db94e0bb555f692c087ee73c6b800f907c64fc36e0d3846ffa12f6
size 44693484
@@ -1,3 +1,2 @@
893ae060cdb1d5a54729afca9d1b9b99 libtensorrt_llm_internal_cutlass_kernels_static.a
a7ee89c7577bf9bf8d8a9ac8072b810d libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
commit b43c46c83bd0833eae16ea0eae7cef6bee81644c
4f6da1c3b64b7cef5841dd7507839e718c5f47fa81f3a8e2e6839a81bda459db libtensorrt_llm_internal_cutlass_kernels_static.a
commit 9c24486cb2cd9dd9582b311b84e1b428d29a735a
@@ -332,4 +332,11 @@ int LoraImpl::run(int64_t numTokens, int64_t numReqs, void const* input, int32_t
return 0;
}

int Lora_run(LoraImpl* impl, int64_t numTokens, int64_t numReqs, void const* input, int32_t const* loraRanks,
void const* const* loraWeightsPtr, int weightIndex, void* const* outputs, void* workspace, cudaStream_t stream)
{
TLLM_CHECK_WITH_INFO(impl != nullptr, "Attempt to run an empty LoraImpl");
return impl->run(numTokens, numReqs, input, loraRanks, loraWeightsPtr, weightIndex, outputs, workspace, stream);
}

} // namespace tensorrt_llm::kernels
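Lora_run is the free-function face of LoraImpl: a single symbol whose signature uses only built-in types and an opaque pointer, so class layout, member functions, and inlining stay private to this library. The general shape of the pattern, reduced to a toy (illustrative, not from the commit):

    // Opaque to the other side of the boundary; only the pointer crosses.
    class Impl
    {
    public:
        int run(int arg);
    };

    // The entire cross-library surface is one plain function. The member
    // call happens on the implementing side, so vtables, layout and
    // std:: types never leak into the ABI.
    int Impl_run(Impl* impl, int arg)
    {
        return impl != nullptr ? impl->run(arg) : -1;
    }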
@@ -66,4 +66,8 @@ private:
std::optional<Config> mBestConfig;
};

// Changes to the following declarations must stay in sync with moe_kernels.h in the internal kernel repo
int Lora_run(LoraImpl* impl, int64_t numTokens, int64_t numReqs, void const* input, int32_t const* loraRanks,
void const* const* loraWeightsPtr, int weightIndex, void* const* outputs, void* workspace, cudaStream_t stream);

} // namespace tensorrt_llm::kernels
@@ -24,6 +24,8 @@ namespace tensorrt_llm
{
namespace kernels
{

// Changes to this enum must stay in sync with nvrtcWrapper.cpp in the internal kernel repo
enum Data_type
{
DATA_TYPE_BOOL,
@@ -560,7 +560,7 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m)
.def("get_ipc_ptrs",
[](tr::IpcNvlsHandle& self) { return reinterpret_cast<uintptr_t>(self.ipc_uc_ptrs.data()); });

m.def("ipc_nvls_allocate", &tr::ipcNvlsAllocate);
m.def("ipc_nvls_allocate", &tr::ipcNvlsAllocate, py::return_value_policy::reference);
m.def("ipc_nvls_free", &tr::ipcNvlsFree);
m.def("ipc_nvls_supported", &tr::ipcNvlsSupported);
}
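Because ipcNvlsAllocate now returns a raw pointer that stays valid until ipc_nvls_free, the binding must keep pybind11 from deleting it: return_value_policy::reference wraps the pointer as a non-owning view. The same pattern in miniature (demo, Widget, make_widget are hypothetical names):

    #include <pybind11/pybind11.h>
    namespace py = pybind11;

    struct Widget
    {
        int value = 0;
    };

    Widget* make_widget() { return new Widget{}; } // caller-managed lifetime
    void free_widget(Widget* w) { delete w; }

    PYBIND11_MODULE(demo, m)
    {
        py::class_<Widget>(m, "Widget");
        // reference: Python holds a non-owning handle; free_widget must be
        // called explicitly, exactly like ipc_nvls_allocate / ipc_nvls_free.
        m.def("make_widget", &make_widget, py::return_value_policy::reference);
        m.def("free_widget", &free_widget);
    }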
@@ -152,7 +152,7 @@ bool ipcNvlsSupported()
return true;
}

IpcNvlsHandle ipcNvlsAllocate(size_t size, std::set<int> group)
IpcNvlsHandle* ipcNvlsAllocate(size_t size, std::set<int> group)
{
#if ENABLE_MULTI_DEVICE
TLLM_CHECK(size > 0);
@@ -324,26 +324,33 @@ IpcNvlsHandle ipcNvlsAllocate(size_t size, std::set<int> group)

printf("Rank %d imported IPC handles successfully\n", rank);

return handle;
return new IpcNvlsHandle(std::move(handle));
#else
TLLM_THROW("ipcNvlsAllocate needs to be compiled with ENABLE_MULTI_DEVICE");
#endif
}

void ipcNvlsFree(IpcNvlsHandle handle)
void ipcNvlsFree(IpcNvlsHandle* handle)
{
#if ENABLE_MULTI_DEVICE
// Unmap and release MC VA
CUCHECK(cuMemUnmap(handle.mc_va, handle.size));
CUCHECK(cuMemRelease(handle.mc_handle));
CUCHECK(cuMemAddressFree(handle.mc_va, handle.size));
// Unmap and release UC VA
for (size_t i = 0; i < handle.ipc_uc_vas.size(); ++i)
if (handle == nullptr)
{
CUCHECK(cuMemUnmap(handle.ipc_uc_vas[i], handle.size));
CUCHECK(cuMemRelease(handle.ipc_uc_handles[i]));
CUCHECK(cuMemAddressFree(handle.ipc_uc_vas[i], handle.size));
return;
}

// Unmap and release MC VA
CUCHECK(cuMemUnmap(handle->mc_va, handle->size));
CUCHECK(cuMemRelease(handle->mc_handle));
CUCHECK(cuMemAddressFree(handle->mc_va, handle->size));
// Unmap and release UC VA
for (size_t i = 0; i < handle->ipc_uc_vas.size(); ++i)
{
CUCHECK(cuMemUnmap(handle->ipc_uc_vas[i], handle->size));
CUCHECK(cuMemRelease(handle->ipc_uc_handles[i]));
CUCHECK(cuMemAddressFree(handle->ipc_uc_vas[i], handle->size));
}

delete handle;
#else
TLLM_THROW("ipcNvlsFree needs to be compiled with ENABLE_MULTI_DEVICE");
#endif
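Allocation and free are now a strict new/delete pair, with ipcNvlsFree tolerating nullptr. A C++ caller that does not want to track the pairing by hand could wrap it in a unique_ptr with a custom deleter; a hedged sketch assuming the declarations above:

    #include <memory>
    #include <set>

    using tensorrt_llm::runtime::IpcNvlsHandle;

    struct NvlsDeleter
    {
        // ipcNvlsFree is nullptr-safe, so a default-constructed pointer is fine.
        void operator()(IpcNvlsHandle* h) const { tensorrt_llm::runtime::ipcNvlsFree(h); }
    };
    using NvlsHandlePtr = std::unique_ptr<IpcNvlsHandle, NvlsDeleter>;

    NvlsHandlePtr makeNvlsBuffer(size_t bytes, std::set<int> const& ranks)
    {
        return NvlsHandlePtr(tensorrt_llm::runtime::ipcNvlsAllocate(bytes, ranks));
    }

MulticastBuffer below keeps the raw pointer and calls ipcNvlsFree in its destructor, which amounts to the same discipline.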
@@ -709,7 +709,7 @@ public:
{
other.mSize = 0;
other.mCapacity = 0;
other.mHandle = IpcNvlsHandle{};
other.mHandle = nullptr;
}

~MulticastBuffer() override
@@ -733,7 +733,7 @@ public:
// reset other
other.mSize = 0;
other.mCapacity = 0;
other.mHandle = IpcNvlsHandle{};
other.mHandle = nullptr;
}
return *this;
}
@@ -741,22 +741,22 @@ public:
// Return list of pointers to each rank
[[nodiscard]] void* dataIpcList()
{
return reinterpret_cast<void*>(mHandle.ipc_uc_ptrs.data());
return reinterpret_cast<void*>(mHandle->ipc_uc_ptrs.data());
}

[[nodiscard]] void const* dataIpcList() const
{
return reinterpret_cast<void const*>(mHandle.ipc_uc_ptrs.data());
return reinterpret_cast<void const*>(mHandle->ipc_uc_ptrs.data());
}

[[nodiscard]] void* dataMC()
{
return reinterpret_cast<void*>(mHandle.mc_ptr);
return reinterpret_cast<void*>(mHandle->mc_ptr);
}

[[nodiscard]] void const* dataMC() const
{
return reinterpret_cast<void const*>(mHandle.mc_ptr);
return reinterpret_cast<void const*>(mHandle->mc_ptr);
}

//////////////////////////
@@ -768,13 +768,13 @@ public:
// Return unicast pointer
[[nodiscard]] void* data() override
{
return reinterpret_cast<void*>(mHandle.uc_ptr);
return reinterpret_cast<void*>(mHandle->uc_ptr);
}

// Return unicast pointer
[[nodiscard]] void const* data() const override
{
return reinterpret_cast<void const*>(mHandle.uc_ptr);
return reinterpret_cast<void const*>(mHandle->uc_ptr);
}

[[nodiscard]] std::size_t getSize() const override
@@ -806,8 +806,8 @@ public:
printf("MulticastBuffer resize: %d B\n", int(toBytes(newSize)));
mHandle = ipcNvlsAllocate(toBytes(newSize), mRanks);

TLLM_CHECK(mHandle.size % BufferDataType(mType).getSize() == 0);
mCapacity = mHandle.size / BufferDataType(mType).getSize();
TLLM_CHECK(mHandle->size % BufferDataType(mType).getSize() == 0);
mCapacity = mHandle->size / BufferDataType(mType).getSize();
}
mSize = newSize;
}
@@ -816,7 +816,7 @@ public:
{
if (mCapacity > 0)
{
TLLM_CHECK(mHandle.size > 0);
TLLM_CHECK(mHandle->size > 0);
ipcNvlsFree(mHandle);
}
}
@@ -826,7 +826,7 @@ private:
std::size_t mCapacity = 0;
nvinfer1::DataType mType;
std::set<int> mRanks;
IpcNvlsHandle mHandle;
IpcNvlsHandle* mHandle;
};

using DeviceBuffer = GenericBuffer<CudaAllocatorAsync>;
@@ -100,14 +100,10 @@ add_gtest(eagleLayerTest layers/eagleLayerTest.cpp)

add_subdirectory(utils)

if(BUILD_BATCH_MANAGER)
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/batch_manager)
add_subdirectory(batch_manager)
endif()
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/batch_manager)
add_subdirectory(batch_manager)
endif()

if(BUILD_EXECUTOR)
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/executor)
add_subdirectory(executor)
endif()
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/executor)
add_subdirectory(executor)
endif()

@@ -9,16 +9,12 @@
# license agreement from NVIDIA CORPORATION or its affiliates is strictly
# prohibited.

if(BUILD_BATCH_MANAGER)
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/batch_manager)
add_subdirectory(batch_manager)
endif()
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/batch_manager)
add_subdirectory(batch_manager)
endif()

if(BUILD_EXECUTOR)
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/executor)
add_subdirectory(executor)
endif()
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/executor)
add_subdirectory(executor)
endif()

add_subdirectory(common)