TensorRT-LLMs/cpp/cmake/modules/cuda_configuration.cmake
nv-guomingz 4dfa3ccf43
chore: enhance the cmake experience by ignoring the additional semicolon (#3992)
Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com>
2025-05-08 18:43:36 +08:00

175 lines
6.8 KiB
CMake

#
# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
#
macro(setup_cuda_compiler)
  # Locate the CUDA compiler and determine its version before the CUDA language
  # is enabled, then enforce the minimum supported CUDA version.
  #
  # Reads (from the calling scope): CMAKE_CUDA_COMPILER,
  # CMAKE_CUDA_HOST_COMPILER.
  #
  # Sets (in the calling scope, because this is a macro):
  # CMAKE_CUDA_COMPILER_VERSION ("<major>.<minor>.<patch>" parsed from the
  # compiler's `--version` output) and CUDA_REQUIRED_VERSION.
  #
  # Stops with FATAL_ERROR if no CUDA compiler is found, if its version cannot
  # be parsed, or if the version is older than CUDA_REQUIRED_VERSION.

  # check_language(CUDA) clears CMAKE_CUDA_HOST_COMPILER if CMAKE_CUDA_COMPILER
  # is not set, so remember the host compiler and restore it afterwards.
  include(CheckLanguage)
  if(NOT CMAKE_CUDA_COMPILER AND CMAKE_CUDA_HOST_COMPILER)
    set(CMAKE_CUDA_HOST_COMPILER_BACKUP ${CMAKE_CUDA_HOST_COMPILER})
  endif()
  check_language(CUDA)
  if(CMAKE_CUDA_HOST_COMPILER_BACKUP)
    set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CUDA_HOST_COMPILER_BACKUP})
    check_language(CUDA)
  endif()

  if(NOT CMAKE_CUDA_COMPILER)
    message(FATAL_ERROR "No CUDA compiler found")
  endif()
  message(STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER}")

  # Invoke the compiler directly on every platform instead of going through a
  # `bash -c "... | egrep | cut"` pipeline: the pipeline's exit status is that
  # of its last stage (so compiler failures went unnoticed), and interpolating
  # the compiler path into a shell string breaks on paths containing spaces.
  execute_process(
    COMMAND "${CMAKE_CUDA_COMPILER}" --version
    OUTPUT_VARIABLE versionString
    RESULT_VARIABLE versionResult)
  # The version is the "V<major>.<minor>.<patch>" token of the output; the
  # capture group drops the leading "V".
  if(versionResult EQUAL 0 AND versionString MATCHES
                               "V([0-9]+\\.[0-9]+\\.[0-9]+)")
    set(CMAKE_CUDA_COMPILER_VERSION "${CMAKE_MATCH_1}")
  else()
    message(FATAL_ERROR "Failed to determine CUDA version")
  endif()

  set(CUDA_REQUIRED_VERSION "11.2")
  if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS CUDA_REQUIRED_VERSION)
    message(
      FATAL_ERROR
        "CUDA version ${CMAKE_CUDA_COMPILER_VERSION} must be at least ${CUDA_REQUIRED_VERSION}"
    )
  endif()
endmacro()
macro(setup_cuda_architectures)
  # cmake-format: off
  # Initialize and normalize CMAKE_CUDA_ARCHITECTURES before enabling CUDA.
  # Special values:
  # * `native` is resolved to the output of the detect_cuda_arch helper program.
  #   * Fallback to `all` if detection failed.
  # * `all`/unset/empty is resolved to a set of architectures we optimized for and compiler supports.
  # * `all-major` is unsupported.
  # Numerical architectures:
  # * PTX is never included in result binary.
  #   * `*-virtual` architectures are therefore rejected.
  #   * `-real` suffix is automatically added to exclude PTX.
  # * Always use accelerated (`-a` suffix) target for supported architectures.
  # Reads:  CMAKE_CUDA_ARCHITECTURES, CMAKE_CUDA_COMPILER, CMAKE_CUDA_COMPILER_VERSION.
  # Sets (in the calling scope, because this is a macro):
  # * CMAKE_CUDA_ARCHITECTURES      - normalized list with `-real` / `a-real` suffixes.
  # * CMAKE_CUDA_ARCHITECTURES_ORIG - the same list without the added suffixes.
  # Also adds a -DEXCLUDE_SM_<arch> definition for every architecture in
  # ARCHITECTURES_WITH_KERNELS that is not enabled.
  # cmake-format: on
  if(CMAKE_CUDA_ARCHITECTURES STREQUAL "native")
    # Build and run a small helper program that reports the compute capability.
    set(OUTPUTFILE ${PROJECT_BINARY_DIR}/detect_cuda_arch)
    set(CUDAFILE ${CMAKE_SOURCE_DIR}/cmake/utils/detect_cuda_arch.cu)
    message(VERBOSE "Detecting native CUDA compute capability")
    execute_process(
      COMMAND "${CMAKE_CUDA_COMPILER}" -lcuda "${CUDAFILE}" -o "${OUTPUTFILE}"
      RESULT_VARIABLE CUDA_RETURN_CODE)
    # Only run the helper if it was just built successfully; otherwise a stale
    # binary from an earlier configure run could be executed.
    if(CUDA_RETURN_CODE EQUAL 0)
      execute_process(
        COMMAND "${OUTPUTFILE}"
        RESULT_VARIABLE CUDA_RETURN_CODE
        OUTPUT_VARIABLE CUDA_ARCH_OUTPUT
        OUTPUT_STRIP_TRAILING_WHITESPACE)
    endif()
    # RESULT_VARIABLE may hold an error string (with spaces) rather than a
    # number, so the variable is referenced by name instead of being expanded
    # unquoted into the condition.
    if(NOT CUDA_RETURN_CODE EQUAL 0)
      message(WARNING "Detecting native CUDA compute capability - fail")
      message(
        WARNING
          "CUDA compute capability detection failed, compiling for all optimized architectures"
      )
      # An empty normal variable selects the defaults below. A plain unset()
      # would re-expose a cache entry created by -DCMAKE_CUDA_ARCHITECTURES=...
      set(CMAKE_CUDA_ARCHITECTURES "")
    else()
      message(STATUS "Detecting native CUDA compute capability - done")
      set(CMAKE_CUDA_ARCHITECTURES "${CUDA_ARCH_OUTPUT}")
    endif()
  elseif(CMAKE_CUDA_ARCHITECTURES STREQUAL "all")
    set(CMAKE_CUDA_ARCHITECTURES "")
    message(
      STATUS
        "Setting CMAKE_CUDA_ARCHITECTURES to all enables all architectures TensorRT-LLM optimized for, "
        "not all architectures CUDA compiler supports.")
  elseif(CMAKE_CUDA_ARCHITECTURES STREQUAL "all-major")
    message(
      FATAL_ERROR
        "Setting CMAKE_CUDA_ARCHITECTURES to all-major does not make sense for TensorRT-LLM. "
        "Please enable all architectures you intend to run on, so we can enable optimized kernels for them."
    )
  else()
    # Strip `a` / `-real` suffixes, reject `-virtual`, skip empty elements
    # (e.g. from a stray semicolon) and drop duplicates.
    set(CMAKE_CUDA_ARCHITECTURES_CLEAN "")
    foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES)
      if(CUDA_ARCH STREQUAL "")
        continue()
      endif()
      if(CUDA_ARCH MATCHES "^([1-9])([0-9])+a?-virtual$")
        message(FATAL_ERROR "Including PTX in compiled binary is unsupported.")
      elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)a?(-real)?$")
        list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN ${CMAKE_MATCH_1})
      else()
        message(FATAL_ERROR "Unrecognized CUDA architecture: ${CUDA_ARCH}")
      endif()
    endforeach()
    list(REMOVE_DUPLICATES CMAKE_CUDA_ARCHITECTURES_CLEAN)
    # Quoted so that an empty result still defines (and shadows any cache
    # entry of) CMAKE_CUDA_ARCHITECTURES instead of unsetting it.
    set(CMAKE_CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES_CLEAN}")
  endif()

  # Nothing selected (unset, empty, `all`, or failed `native` detection):
  # enable the default set, limited by what the compiler version supports.
  if("${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "")
    set(CMAKE_CUDA_ARCHITECTURES "80" "86")
    if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8")
      list(APPEND CMAKE_CUDA_ARCHITECTURES "89" "90")
    endif()
    if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.7")
      list(APPEND CMAKE_CUDA_ARCHITECTURES "100" "120")
    endif()
  endif()

  # CMAKE_CUDA_ARCHITECTURES_ORIG contains all architectures enabled, without
  # automatically added -real or -a suffix.
  set(CMAKE_CUDA_ARCHITECTURES_ORIG "${CMAKE_CUDA_ARCHITECTURES}")
  message(STATUS "GPU architectures: ${CMAKE_CUDA_ARCHITECTURES_ORIG}")

  # Define EXCLUDE_SM_<arch> for each listed architecture that is not enabled.
  set(ARCHITECTURES_WITH_KERNELS "80" "86" "89" "90" "100" "120")
  foreach(CUDA_ARCH IN LISTS ARCHITECTURES_WITH_KERNELS)
    if(NOT "${CUDA_ARCH}" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
      add_definitions("-DEXCLUDE_SM_${CUDA_ARCH}")
      message(STATUS "Excluding SM ${CUDA_ARCH}")
    endif()
  endforeach()

  # Append `a-real` for architectures in ARCHITECTURES_WITH_ACCEL and `-real`
  # for all others.
  set(ARCHITECTURES_WITH_ACCEL "90" "100" "101" "120")
  unset(CMAKE_CUDA_ARCHITECTURES_NORMALIZED)
  foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES)
    if("${CUDA_ARCH}" IN_LIST ARCHITECTURES_WITH_ACCEL)
      list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}a-real")
    else()
      list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}-real")
    endif()
  endforeach()
  set(CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES_NORMALIZED})
endmacro()