Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-22 19:52:38 +08:00
* Instead of allocating UserBuffers at the beginning of the runtime, UB buffers are now managed by a global allocator, which dynamically assigns a free UB buffer to a torch tensor or allocates a new one. This makes UserBuffers easier to use.
* In the common use case, the UserBuffers are allocated during the warm-up stage, so there is no dynamic allocation during inference.
* The UB fusion pattern is rewritten on top of the new UB allocator and consists of the following passes (a sketch of the allocator follows this list):
  1. Fuse quant with allreduce, replace it with the UB implementation, and insert a copy_to_userbuffers. The normal allreduce still does not support FP8 quant, so this has to be done in the UB pass.
  2. Convert all supported allreduce ops to UB and insert copy_to_userbuffers.
  3. Fuse the op preceding the allreduce with the copy_to_userbuffers, so that op writes directly into the userbuffer.
  4. Remove the userbuffers finalize if its output is connected to another UB allreduce.

Signed-off-by: Jin Li <59594262+liji-nv@users.noreply.github.com>
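The commit message describes a pool-style strategy: reuse a free registered buffer when one is large enough, and register a new one only on a miss, which is why a clean warm-up leaves no allocation work for inference. Below is a minimal C++ sketch of that behavior, assuming hypothetical names (UBAllocator, UBBuffer); the actual implementation (cf. userbuffersTensor.cpp in the source list below) differs in detail.

// Minimal sketch of the pooled userbuffer allocation described above.
// Names are illustrative assumptions, not the TensorRT-LLM API.
#include <cstddef>
#include <deque>
#include <new>

struct UBBuffer
{
    void* addr;       // registered userbuffer memory
    std::size_t size; // capacity in bytes
    bool inUse;       // currently backing a tensor?
};

class UBAllocator
{
public:
    // Hand out the first free registered buffer that fits; otherwise register
    // a new one. After warm-up has sized the pool, steady-state inference
    // always hits the reuse path, so nothing new is allocated.
    UBBuffer* allocate(std::size_t bytes)
    {
        for (auto& buf : mPool)
        {
            if (!buf.inUse && buf.size >= bytes)
            {
                buf.inUse = true;
                return &buf;
            }
        }
        // Placeholder for registering a new userbuffer with the runtime.
        mPool.push_back(UBBuffer{::operator new(bytes), bytes, true});
        return &mPool.back();
    }

    // Return the buffer to the pool but keep its registration for reuse.
    void release(UBBuffer* buf)
    {
        buf->inUse = false;
    }

private:
    std::deque<UBBuffer> mPool; // deque keeps element addresses stable as it grows
};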
89 lines
2.8 KiB
CMake
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION &
# AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

if(NOT WIN32)
  # additional warnings
  #
  # Ignore overloaded-virtual warning. We intentionally change parameters of
  # some methods in derived class.
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
  if(WARNING_IS_ERROR)
    message(STATUS "Treating warnings as errors in GCC compilation")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror")
  endif()
else() # Windows
  # warning level 4
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4")
endif()
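
# th_utils: static helper library built from thUtils.cpp, linked into th_common
# below.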
add_library(th_utils STATIC thUtils.cpp)
set_property(TARGET th_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET th_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(th_utils PUBLIC ${TORCH_LIBRARIES} ${CUBLAS_LIB}
                                      ${CURAND_LIB})
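
# th_common: shared library exposing the Torch custom ops (attention,
# collectives, quantization, userbuffers, ...) compiled from the sources below.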
add_library(
  th_common SHARED
  allgatherOp.cpp
  allreduceOp.cpp
  attentionOp.cpp
  convertSpecDecodingMaskToPackedMaskOp.cpp
  cutlassScaledMM.cpp
  cublasScaledMM.cpp
  deepseekAllreduceFusionOp.cpp
  dynamicDecodeOp.cpp
  fmhaPackMaskOp.cpp
  fp8Op.cpp
  fp4Op.cpp
  fp4Gemm.cpp
  fp4Quantize.cpp
  fp4BatchedQuantize.cpp
  fp8BlockScalingGemm.cpp
  fp8Quantize.cpp
  gatherTreeOp.cpp
  logitsBitmaskOp.cpp
  mambaConv1dOp.cpp
  moeOp.cpp
  fp8BlockScaleMoe.cpp
  noAuxTcOp.cpp
  ncclCommunicatorOp.cpp
  parallelDecodeKVCacheUpdateOp.cpp
  redrafterCurandOp.cpp
  reducescatterOp.cpp
  relativeAttentionBiasOp.cpp
  selectiveScanOp.cpp
  userbuffersFinalizeOp.cpp
  userbuffersTensor.cpp
  weightOnlyQuantOp.cpp
  mtpOp.cpp)
set_property(TARGET th_common PROPERTY POSITION_INDEPENDENT_CODE ON)
target_link_libraries(th_common PRIVATE ${TORCH_LIBRARIES} th_utils
                                        ${Python3_LIBRARIES} ${SHARED_TARGET})
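
# Multi-device builds additionally require MPI, NCCL, and NVML.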
if(ENABLE_MULTI_DEVICE)
  target_include_directories(th_common PUBLIC ${MPI_C_INCLUDE_DIRS})
  target_link_libraries(th_common PRIVATE ${MPI_C_LIBRARIES} ${NCCL_LIB}
                                          CUDA::nvml)
endif()
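
# On Linux, set an $ORIGIN rpath so dependent shared libraries are resolved
# relative to the module itself.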
if(NOT WIN32)
  set_target_properties(
    th_common
    PROPERTIES LINK_FLAGS
               "-Wl,-rpath='$ORIGIN' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}")
else()
  target_link_libraries(th_common PRIVATE context_attention_src)
endif()