#
# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
#

# Locate the CUDA compiler and determine its version.
#
# This is a macro, so every variable it sets lands in the caller's scope:
#
# * CMAKE_CUDA_COMPILER_VERSION: version parsed from `<compiler> --version`.
# * CUDA_REQUIRED_VERSION: minimum accepted CUDA version ("11.2").
#
# Configuration is aborted with FATAL_ERROR when no CUDA compiler is found, when
# its version cannot be determined, or when the version is older than
# CUDA_REQUIRED_VERSION.
macro(setup_cuda_compiler)
  # Determine CUDA version before enabling the language extension.
  # check_language(CUDA) clears CMAKE_CUDA_HOST_COMPILER if CMAKE_CUDA_COMPILER
  # is not set, so the host compiler is backed up and restored afterwards.
  include(CheckLanguage)
  if(NOT CMAKE_CUDA_COMPILER AND CMAKE_CUDA_HOST_COMPILER)
    set(CMAKE_CUDA_HOST_COMPILER_BACKUP ${CMAKE_CUDA_HOST_COMPILER})
  endif()
  check_language(CUDA)
  if(CMAKE_CUDA_HOST_COMPILER_BACKUP)
    set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CUDA_HOST_COMPILER_BACKUP})
    check_language(CUDA)
  endif()

  if(NOT CMAKE_CUDA_COMPILER)
    message(FATAL_ERROR "No CUDA compiler found")
  endif()
  message(STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER}")

  # Run the compiler directly and parse its output with CMake's own regex
  # engine on every platform. This avoids a dependency on bash/egrep/cut, works
  # with compiler paths containing spaces, and makes the exit status checked
  # here the compiler's own rather than that of the last command in a shell
  # pipeline.
  execute_process(
    COMMAND ${CMAKE_CUDA_COMPILER} --version
    RESULT_VARIABLE _cuda_version_result
    OUTPUT_VARIABLE _cuda_version_output
    OUTPUT_STRIP_TRAILING_WHITESPACE)
  if(_cuda_version_result EQUAL 0
     AND _cuda_version_output MATCHES "V([0-9]+\\.[0-9]+\\.[0-9]+)")
    # Capture group 1 is the version without the leading "V".
    set(CMAKE_CUDA_COMPILER_VERSION "${CMAKE_MATCH_1}")
  else()
    message(FATAL_ERROR "Failed to determine CUDA version")
  endif()
  # Do not leak the temporaries into the caller's scope.
  unset(_cuda_version_result)
  unset(_cuda_version_output)

  set(CUDA_REQUIRED_VERSION "11.2")
  if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS CUDA_REQUIRED_VERSION)
    message(
      FATAL_ERROR
        "CUDA version ${CMAKE_CUDA_COMPILER_VERSION} must be at least ${CUDA_REQUIRED_VERSION}"
    )
  endif()
endmacro()

function(setup_cuda_architectures)
  # cmake-format: off
  # Initialize and normalize CMAKE_CUDA_ARCHITECTURES.
  #
  # Inputs (read from the calling scope):
  # * CMAKE_CUDA_ARCHITECTURES: user request, see below.
  # * CMAKE_CUDA_COMPILER_VERSION: used to pick defaults and suffix support.
  # * CMAKE_CUDA_COMPILER: only used to build the detection tool for `native`.
  #
  # Outputs (set in the parent scope):
  # * CMAKE_CUDA_ARCHITECTURES: normalized list with -real/-a/-f suffixes.
  # * CMAKE_CUDA_ARCHITECTURES_ORIG: plain numeric architectures enabled.
  # * CMAKE_CUDA_ARCHITECTURES_FAMILIES: family targets (`<arch>f`) enabled.
  # * CMAKE_CUDA_ARCHITECTURES_HAS_FAMILIES: ON if the compiler supports -f.
  # Additionally defines EXCLUDE_SM_<arch> (directory scope) for every
  # architecture with dedicated kernels that is not enabled.
  #
  # Special values:
  # * `native` is resolved to HIGHEST available architecture.
  #   * Fallback to `all` if detection failed.
  # * `all`/unset is resolved to a set of architectures we optimized for and compiler supports.
  # * `all-major` is unsupported.
  # Numerical architectures:
  # * PTX is never included in result binary.
  #   * `*-virtual` architectures are therefore rejected.
  #   * `-real` suffix is automatically added to exclude PTX.
  # * Always use accelerated (`-a` suffix) target for supported architectures.
  # * On CUDA 12.9 or newer, family (`-f` suffix) target will be used for supported architectures to reduce number of
  #   targets to compile for.
  #   * Extra architectures can be requested via add_cuda_architectures
  #     for kernels that benefit from arch specific features.
  # cmake-format: on
  set(CMAKE_CUDA_ARCHITECTURES_RAW ${CMAKE_CUDA_ARCHITECTURES})

  if("${CMAKE_CUDA_ARCHITECTURES_RAW}" STREQUAL "native")
    # Detect highest available compute capability by building and running a
    # small helper program.
    set(OUTPUTFILE ${PROJECT_BINARY_DIR}/detect_cuda_arch)
    set(CUDAFILE ${CMAKE_SOURCE_DIR}/cmake/utils/detect_cuda_arch.cu)
    set(CUDA_ARCH_OUTPUT "")
    execute_process(
      COMMAND ${CMAKE_CUDA_COMPILER} -lcuda ${CUDAFILE} -o ${OUTPUTFILE}
      RESULT_VARIABLE CUDA_RETURN_CODE)
    message(VERBOSE "Detecting native CUDA compute capability")
    # Only run the helper if it was just built successfully; otherwise a stale
    # binary from an earlier configure could be picked up.
    if(CUDA_RETURN_CODE EQUAL 0)
      execute_process(
        COMMAND ${OUTPUTFILE}
        RESULT_VARIABLE CUDA_RETURN_CODE
        OUTPUT_VARIABLE CUDA_ARCH_OUTPUT
        OUTPUT_STRIP_TRAILING_WHITESPACE)
    endif()
    # The result may be a non-numeric message (e.g. "No such file or
    # directory"), so it is referenced by name instead of being expanded into
    # the condition. Empty output is treated as a failed detection as well.
    if(NOT CUDA_RETURN_CODE EQUAL 0 OR "${CUDA_ARCH_OUTPUT}" STREQUAL "")
      message(WARNING "Detecting native CUDA compute capability - fail")
      message(
        WARNING
          "CUDA compute capability detection failed, compiling for all optimized architectures"
      )
      unset(CMAKE_CUDA_ARCHITECTURES_RAW)
    else()
      message(STATUS "Detecting native CUDA compute capability - done")
      set(CMAKE_CUDA_ARCHITECTURES_RAW "${CUDA_ARCH_OUTPUT}")
    endif()
  elseif("${CMAKE_CUDA_ARCHITECTURES_RAW}" STREQUAL "all")
    unset(CMAKE_CUDA_ARCHITECTURES_RAW)
    message(
      STATUS
        "Setting CMAKE_CUDA_ARCHITECTURES to all enables all architectures TensorRT LLM optimized for, "
        "not all architectures CUDA compiler supports.")
  elseif("${CMAKE_CUDA_ARCHITECTURES_RAW}" STREQUAL "all-major")
    message(
      FATAL_ERROR
        "Setting CMAKE_CUDA_ARCHITECTURES to all-major does not make sense for TensorRT-LLM. "
        "Please enable all architectures you intend to run on, so we can enable optimized kernels for them."
    )
  else()
    # Strip -a / -real suffixes and reject anything that is not a plain
    # numeric architecture. Start from an empty list so that a variable of the
    # same name in the calling scope cannot leak into the result.
    set(CMAKE_CUDA_ARCHITECTURES_CLEAN "")
    foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES_RAW)
      if(CUDA_ARCH STREQUAL "")
        continue()
      endif()

      if(CUDA_ARCH MATCHES "^([1-9])([0-9])+a?-virtual$")
        message(FATAL_ERROR "Including PTX in compiled binary is unsupported.")
      elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)a?(-real)?$")
        list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN ${CMAKE_MATCH_1})
      else()
        message(FATAL_ERROR "Unrecognized CUDA architecture: ${CUDA_ARCH}")
      endif()
    endforeach()
    # Requesting 103 implicitly enables 100 as well.
    if("103" IN_LIST CMAKE_CUDA_ARCHITECTURES_CLEAN)
      list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN "100")
    endif()
    list(REMOVE_DUPLICATES CMAKE_CUDA_ARCHITECTURES_CLEAN)
    # An empty list leaves CMAKE_CUDA_ARCHITECTURES_RAW unset, which selects
    # the defaults below.
    set(CMAKE_CUDA_ARCHITECTURES_RAW ${CMAKE_CUDA_ARCHITECTURES_CLEAN})
  endif()

  # Default set: every architecture we optimize for that the compiler supports.
  if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES_RAW)
    set(CMAKE_CUDA_ARCHITECTURES_RAW 80 86)
    if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8")
      list(APPEND CMAKE_CUDA_ARCHITECTURES_RAW 89 90)
    endif()
    if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.7")
      list(APPEND CMAKE_CUDA_ARCHITECTURES_RAW 100 120)
    endif()
    if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.9")
      list(APPEND CMAKE_CUDA_ARCHITECTURES_RAW 103)
    endif()
  endif()

  # CMAKE_CUDA_ARCHITECTURES_ORIG contains all architectures enabled, without
  # automatically added -real or -a suffix.
  set(CMAKE_CUDA_ARCHITECTURES_ORIG "${CMAKE_CUDA_ARCHITECTURES_RAW}")
  message(STATUS "GPU architectures: ${CMAKE_CUDA_ARCHITECTURES_ORIG}")
  set(CMAKE_CUDA_ARCHITECTURES_ORIG
      ${CMAKE_CUDA_ARCHITECTURES_ORIG}
      PARENT_SCOPE)

  # Tell the sources which architecture-specific kernels are not being built.
  set(ARCHITECTURES_WITH_KERNELS 80 86 89 90 100 103 120)
  foreach(CUDA_ARCH IN LISTS ARCHITECTURES_WITH_KERNELS)
    if(NOT CUDA_ARCH IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
      add_definitions("-DEXCLUDE_SM_${CUDA_ARCH}")
      message(STATUS "Excluding SM ${CUDA_ARCH}")
    endif()
  endforeach()

  # -a suffix supported from Hopper (90)
  set(MIN_ARCHITECTURE_HAS_ACCEL 90)

  # -f suffix supported from Blackwell (100) starting from CUDA 12.9.
  if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.9")
    set(MIN_ARCHITECTURE_HAS_FAMILY 100)
    set(CMAKE_CUDA_ARCHITECTURES_HAS_FAMILIES
        ON
        PARENT_SCOPE)
  else()
    # -a provides no cross architecture compatibility, but luckily until CUDA
    # 12.8 we have only one architecture within each family >= 9.
    # 9999 effectively excludes all architectures from the -f path.
    set(MIN_ARCHITECTURE_HAS_FAMILY 9999)
    set(CMAKE_CUDA_ARCHITECTURES_HAS_FAMILIES
        OFF
        PARENT_SCOPE)
  endif()

  # Compatibility low bounds: Always compile kernels for these architectures. 86
  # is enabled to avoid perf regression when using 80 kernels.
  set(ARCHITECTURES_COMPATIBILITY_BASE 80 86 90 100 120)
  # Exclude Tegra architectures
  set(ARCHITECTURES_NO_COMPATIBILITY 87 101)

  # Generate CMAKE_CUDA_ARCHITECTURES_NORMALIZED_LIST from
  # CMAKE_CUDA_ARCHITECTURES_ORIG: every requested architecture is mapped to
  # the architecture that is actually compiled for it.
  set(CMAKE_CUDA_ARCHITECTURES_NORMALIZED_LIST "")
  foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES_ORIG)
    if(CUDA_ARCH IN_LIST ARCHITECTURES_NO_COMPATIBILITY
       OR CUDA_ARCH IN_LIST ARCHITECTURES_COMPATIBILITY_BASE)
      # Compiled as-is.
      set(TARGET_ARCH "${CUDA_ARCH}")
    else()
      # Find the largest BASE_ARCH in ARCHITECTURES_COMPATIBILITY_BASE that is
      # less than CUDA_ARCH and shares its major version.
      set(BEST_BASE_ARCH "")
      math(EXPR ARCH_MAJOR "${CUDA_ARCH} / 10")
      foreach(BASE_ARCH IN LISTS ARCHITECTURES_COMPATIBILITY_BASE)
        math(EXPR BASE_MAJOR "${BASE_ARCH} / 10")
        if(BASE_ARCH LESS CUDA_ARCH AND BASE_MAJOR EQUAL ARCH_MAJOR)
          if("${BEST_BASE_ARCH}" STREQUAL "" OR BASE_ARCH GREATER
                                                 BEST_BASE_ARCH)
            set(BEST_BASE_ARCH "${BASE_ARCH}")
          endif()
        endif()
      endforeach()

      if("${BEST_BASE_ARCH}" STREQUAL "")
        message(FATAL_ERROR "Unsupported CUDA architecture: ${CUDA_ARCH}.")
      endif()
      set(TARGET_ARCH "${BEST_BASE_ARCH}")
    endif()

    # Several requested architectures may map to the same target (e.g. 103
    # and 100, or 89 and 86); emit each target only once.
    if(NOT TARGET_ARCH IN_LIST CMAKE_CUDA_ARCHITECTURES_NORMALIZED_LIST)
      list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED_LIST "${TARGET_ARCH}")
    endif()
  endforeach()

  # Apply suffixes based on architecture capabilities
  set(CMAKE_CUDA_ARCHITECTURES_NORMALIZED "")
  set(CMAKE_CUDA_ARCHITECTURES_FAMILIES "")
  foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES_NORMALIZED_LIST)
    if(CUDA_ARCH GREATER_EQUAL MIN_ARCHITECTURE_HAS_FAMILY
       AND NOT CUDA_ARCH IN_LIST ARCHITECTURES_NO_COMPATIBILITY)
      list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}f-real")
      list(APPEND CMAKE_CUDA_ARCHITECTURES_FAMILIES "${CUDA_ARCH}f")
    elseif(CUDA_ARCH GREATER_EQUAL MIN_ARCHITECTURE_HAS_ACCEL)
      list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}a-real")
    else()
      list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}-real")
    endif()
  endforeach()

  set(CMAKE_CUDA_ARCHITECTURES
      ${CMAKE_CUDA_ARCHITECTURES_NORMALIZED}
      PARENT_SCOPE)
  set(CMAKE_CUDA_ARCHITECTURES_FAMILIES
      ${CMAKE_CUDA_ARCHITECTURES_FAMILIES}
      PARENT_SCOPE)
endfunction()

function(add_cuda_architectures target)
  # cmake-format: off
  # Add CUDA architectures to target.
  #
  # Arguments:
  # * target: target whose CUDA_ARCHITECTURES property is appended to.
  # * ARGN:   plain numeric architectures (e.g. 90 103).
  #
  # -a suffix is added automatically for supported architectures.
  # Architectures are added only if user explicitly requested support for that architecture,
  # i.e. if they are listed in CMAKE_CUDA_ARCHITECTURES_ORIG.
  # cmake-format: on
  set(MIN_ARCHITECTURE_HAS_ACCEL 90)
  foreach(CUDA_ARCH IN LISTS ARGN)
    if(NOT CUDA_ARCH IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
      # Not requested by the user: skip silently.
      continue()
    endif()

    if(CUDA_ARCH GREATER_EQUAL MIN_ARCHITECTURE_HAS_ACCEL)
      set(REAL_CUDA_ARCH "${CUDA_ARCH}a-real")
    else()
      set(REAL_CUDA_ARCH "${CUDA_ARCH}-real")
    endif()
    set_property(
      TARGET ${target}
      APPEND
      PROPERTY CUDA_ARCHITECTURES ${REAL_CUDA_ARCH})
  endforeach()
endfunction()

function(set_cuda_architectures target)
  # cmake-format: off
  # Set CUDA architectures for a target.
  #
  # Arguments:
  # * target: target whose CUDA_ARCHITECTURES property is replaced.
  # * ARGN:   architectures, either plain numeric (e.g. 90) or with -f suffix (e.g. 100f).
  #
  # -a suffix is added automatically for supported architectures.
  # Architectures passed in may be specified with -f suffix to build family conditional version of the kernel.
  # Non-family architectures are added only if user explicitly requested support for that architecture.
  # Family conditional architectures are only added if user requested architectures would enable compilation for it.
  # If user requested no architectures set on the target,
  # the target will be compiled with `PLACEHOLDER_KERNELS` macro defined.
  # cmake-format: on
  set(MIN_ARCHITECTURE_HAS_ACCEL 90)

  set(CUDA_ARCHITECTURES "")
  foreach(CUDA_ARCH IN LISTS ARGN)
    if(CUDA_ARCH MATCHES "^[0-9]+f$")
      if(CMAKE_CUDA_ARCHITECTURES_HAS_FAMILIES)
        if(CUDA_ARCH IN_LIST CMAKE_CUDA_ARCHITECTURES_FAMILIES)
          list(APPEND CUDA_ARCHITECTURES "${CUDA_ARCH}-real")
        endif()
      else()
        # Fallback for compiler without -f support: Enable all architectures in
        # the family and requested
        string(REGEX REPLACE "f$" "" CUDA_ARCH_NUMERIC "${CUDA_ARCH}")
        math(EXPR ARCH_MAJOR "${CUDA_ARCH_NUMERIC} / 10")
        foreach(ORIG_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES_ORIG)
          math(EXPR ORIG_MAJOR "${ORIG_ARCH} / 10")
          if(ORIG_MAJOR EQUAL ARCH_MAJOR)
            list(APPEND CUDA_ARCHITECTURES "${ORIG_ARCH}a-real")
          endif()
        endforeach()
      endif()
    elseif(CUDA_ARCH IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
      if(CUDA_ARCH GREATER_EQUAL MIN_ARCHITECTURE_HAS_ACCEL)
        list(APPEND CUDA_ARCHITECTURES "${CUDA_ARCH}a-real")
      else()
        list(APPEND CUDA_ARCHITECTURES "${CUDA_ARCH}-real")
      endif()
    endif()
  endforeach()
  # A family request and a plain request can resolve to the same target (e.g.
  # `100f` and `100` without -f support); keep each target only once.
  list(REMOVE_DUPLICATES CUDA_ARCHITECTURES)

  if("${CUDA_ARCHITECTURES}" STREQUAL "")
    # We have to at least build for some architectures.
    set_property(TARGET ${target} PROPERTY CUDA_ARCHITECTURES "80-real")
    target_compile_definitions(${target} PRIVATE PLACEHOLDER_KERNELS)
  else()
    set_property(TARGET ${target} PROPERTY CUDA_ARCHITECTURES
                                           ${CUDA_ARCHITECTURES})
  endif()
endfunction()