mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
* feat: Add group_rms_norm kernel to normalize multiple inputs in a single operator. Previously, the RMSNorm implementation only supported a single input tensor. With group_rms_norm, multiple tensors can be normalized together: ```python input_a, input_b, ... = group_rms_norm([input_a, input_b, ...]) ``` All input tensors must share the same batch dimension. The kernel partitions work by dynamically assigning warp groups proportional to the last dimension of each input, improving launch efficiency and reducing overhead. This MR provides two implementations: GroupRMSNormKernel: Optimized for small-to-medium batch sizes GroupRMSNormKernelLargeBatch: Contains additional optimizations for large batch sizes Both kernels are currently exposed as custom PyTorch ops. A future MR will implement heuristic-based kernel selection and expose a unified interface. Signed-off-by: Simeng Liu <simengl@nvidia.com> * Resolve comments and fix typo with IS_FLASHINFER_AVAILABLE Signed-off-by: Simeng Liu <simengl@nvidia.com> --------- Signed-off-by: Simeng Liu <simengl@nvidia.com>
79 lines
3.1 KiB
CMake
79 lines
3.1 KiB
CMake
#
|
|
# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION &
|
|
# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
|
# use this file except in compliance with the License. You may obtain a copy of
|
|
# the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
# License for the specific language governing permissions and limitations under
|
|
# the License.
|
|
#
|
|
|
|
# Collect every C++ and CUDA source file found beneath this directory.
file(GLOB_RECURSE SRC_CPP *.cpp)
file(GLOB_RECURSE SRC_CU *.cu)

# Remove the sources located in the folders listed below from both lists. Each
# of these folders is also added as a subdirectory later in this file. The
# patterns are unanchored, so they match the folder name anywhere in a path.
foreach(
  kernels_excluded_dir IN
  ITEMS cutlass_kernels
        flashMLA
        contextFusedMultiHeadAttention
        decoderMaskedMultiheadAttention
        trtllmGenKernels
        selectiveScan
        userbuffers)
  list(FILTER SRC_CPP EXCLUDE REGEX "${kernels_excluded_dir}/.*")
  list(FILTER SRC_CU EXCLUDE REGEX "${kernels_excluded_dir}/.*")
endforeach()

# For fusedLayernormKernels only the CUDA list is filtered; SRC_CPP is left
# as-is.
list(FILTER SRC_CU EXCLUDE REGEX "fusedLayernormKernels/.*")

# NOTE(review): groupRmsNormKernels is also added as a subdirectory later in
# this file but is not filtered out here - confirm that is intended.
|
|
|
|
# filter_cuda_archs(<ARCH> <SOURCES_VAR>)
#
# If <ARCH> is not an element of CMAKE_CUDA_ARCHITECTURES_ORIG, remove every
# entry whose path matches the cubin file-name pattern for that SM from the list
# named by <SOURCES_VAR> (the filtered list is written back to the caller's
# scope) and add the compile definition EXCLUDE_SM_<ARCH>. If <ARCH> is in the
# list, the function does nothing.
#
# Arguments:
#   ARCH        - SM identifier as it appears in the cubin file names.
#   SOURCES_VAR - NAME of the list variable to filter (not its value).
#
# Example call (illustrative): filter_cuda_archs("80" SRC_CPP)
function(filter_cuda_archs ARCH SOURCES_VAR)
  if("${ARCH}" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
    return()
  endif()

  set(cubin_regex
      ".*_sm(_)?${ARCH}[.]cubin[.]cpp|^.*Sm(_)?${ARCH}.*cubin.cpp$")

  # Initialise the working lists with set() rather than list(APPEND): a function
  # starts with a copy of the caller's variables, so appending to a same-named
  # variable would silently pull the caller's stale entries into the result (and
  # would duplicate the list when SOURCES_VAR itself has that name).
  set(kept_sources "${${SOURCES_VAR}}")
  set(dropped_sources "${kept_sources}")

  # Report what is about to be removed.
  list(FILTER dropped_sources INCLUDE REGEX "${cubin_regex}")
  list(LENGTH dropped_sources dropped_count)
  message(
    STATUS
      "Excluding ${dropped_count} cubins for SM ${ARCH} from ${CMAKE_CURRENT_SOURCE_DIR}"
  )
  foreach(dropped_item IN LISTS dropped_sources)
    message(VERBOSE "- ${dropped_item}")
  endforeach()

  # Hand the filtered list back to the caller.
  list(FILTER kept_sources EXCLUDE REGEX "${cubin_regex}")
  set(${SOURCES_VAR}
      "${kept_sources}"
      PARENT_SCOPE)

  # Directory-scoped definition so the code built here can tell that this SM
  # was excluded.
  add_compile_definitions("EXCLUDE_SM_${ARCH}")
endfunction()
|
|
|
|
# When ENABLE_MULTI_DEVICE is off, drop the CUDA sources matching the custom
# all-reduce pattern from the build.
# NOTE(review): the pattern is written glob-style; as a regex, "s*" means "zero
# or more 's'" and "." matches any character - confirm this is the intent.
if(NOT ENABLE_MULTI_DEVICE)
  list(FILTER SRC_CU EXCLUDE REGEX "customAllReduceKernels*.*cu$")
endif()

# Static library built from the remaining C++ and CUDA sources.
add_library(kernels_src STATIC ${SRC_CPP} ${SRC_CU})
set_target_properties(
  kernels_src PROPERTIES POSITION_INDEPENDENT_CODE ON
                         CUDA_RESOLVE_DEVICE_SYMBOLS ON)

# Descend into the kernel folders that carry their own CMake listfile, in this
# order.
foreach(
  kernels_subdir IN
  ITEMS cutlass_kernels
        flashMLA
        contextFusedMultiHeadAttention
        decoderMaskedMultiheadAttention
        selectiveScan
        userbuffers
        trtllmGenKernels
        fusedLayernormKernels
        groupRmsNormKernels)
  add_subdirectory(${kernels_subdir})
endforeach()
|