mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
* Update TensorRT-LLM --------- Co-authored-by: Timur Abishev <abishev.timur@gmail.com> Co-authored-by: MahmoudAshraf97 <hassouna97.ma@gmail.com> Co-authored-by: Saeyoon Oh <saeyoon.oh@furiosa.ai> Co-authored-by: hattizai <hattizai@gmail.com>
208 lines
8.8 KiB
CMake
208 lines
8.8 KiB
CMake
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION &
# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
|
|
|
|
# Fetch GoogleTest/GoogleMock at configure time. Adapted from the GoogleTest
# CMake quickstart: https://google.github.io/googletest/quickstart-cmake.html
include(FetchContent)
FetchContent_Declare(
  googletest
  GIT_REPOSITORY https://github.com/google/googletest.git
  GIT_TAG release-1.12.1)
FetchContent_MakeAvailable(googletest)

# CMake's GoogleTest module provides gtest_discover_tests().
include(GoogleTest)
|
|
|
|
# The ONNX parser library is called nvonnxparser by default. On Windows the
# major version is appended to the nvinfer library names, so override it there.
set(ONNX_PARSER_LIB_NAME nvonnxparser)
if(WIN32)
  set(ONNX_PARSER_LIB_NAME nvonnxparser_10)
endif()

# Create the `nvonnxparser` target that every test executable links against.
find_library_create_target(nvonnxparser ${ONNX_PARSER_LIB_NAME} SHARED
                           ${TRT_OUT_DIR} ${TRT_LIB_DIR})
|
|
|
|
# Header search paths for the tests. This is directory-scoped on purpose: it
# applies to every target created in this directory and is inherited by the
# subdirectories added further down. The order below is the search order.
include_directories(
  ${PROJECT_SOURCE_DIR}/tensorrt_llm/cutlass_extensions/include
  ${PROJECT_SOURCE_DIR}/include
  ${3RDPARTY_DIR}/cutlass/include
  ${3RDPARTY_DIR}/cutlass/tools/util/include
  ${PROJECT_SOURCE_DIR}/tests/batch_manager
  ${PROJECT_SOURCE_DIR}/tests/utils)
|
|
|
|
# Umbrella target: every executable registered through add_gtest() is added as
# a dependency, so building `google-tests` builds all of them.
add_custom_target(google-tests)

# Parent of the project source directory. add_gtest() hands this to each test
# as the TOP_LEVEL_DIR compile definition.
set(TOP_LEVEL_DIR "${PROJECT_SOURCE_DIR}/..")
|
|
|
|
# Create a GoogleTest executable, register its test cases with CTest and make
# the `google-tests` umbrella target depend on it.
#
# Usage: add_gtest(<test_name> <test_src> [NO_GTEST_MAIN] [NO_TLLM_LINKAGE])
#
# * test_name: name of the executable target to create.
# * test_src: the test source file. Pass a quoted ;-list ("a.cpp;b.cu") to
#   build the executable from several sources.
# * NO_GTEST_MAIN: do not link gtest_main (gmock_main is always linked).
# * NO_TLLM_LINKAGE: do not link ${SHARED_TARGET} and
#   nvinfer_plugin_tensorrt_llm.
#
# Anything after <test_src> that is not one of the two options above is NOT
# added to the executable; an author warning is emitted so that it does not go
# unnoticed.
function(add_gtest test_name test_src)
  set(options NO_GTEST_MAIN NO_TLLM_LINKAGE)
  # Only flag-style options are supported. The one-value and multi-value keyword
  # lists are passed as explicit empty strings so that same-named variables in
  # the calling scope cannot change how the arguments are parsed.
  cmake_parse_arguments(ARGS "${options}" "" "" ${ARGN})
  # ARGS_UNPARSED_ARGUMENTS is left undefined when every argument was
  # recognized.
  if(DEFINED ARGS_UNPARSED_ARGUMENTS)
    message(
      AUTHOR_WARNING
        "add_gtest(${test_name}): ignoring unrecognized arguments: ${ARGS_UNPARSED_ARGUMENTS}"
    )
  endif()

  # Left unquoted so that a ;-list in test_src expands to several sources.
  add_executable(${test_name} ${test_src})

  target_link_libraries(${test_name} PUBLIC gmock_main nvonnxparser)
  if(NOT ARGS_NO_GTEST_MAIN)
    target_link_libraries(${test_name} PUBLIC gtest_main)
  endif()
  if(NOT ARGS_NO_TLLM_LINKAGE)
    target_link_libraries(${test_name} PUBLIC ${SHARED_TARGET}
                                              nvinfer_plugin_tensorrt_llm)
  endif()

  target_compile_features(${test_name} PRIVATE cxx_std_17)
  target_compile_definitions(${test_name}
                             PUBLIC TOP_LEVEL_DIR="${TOP_LEVEL_DIR}")
  gtest_discover_tests(
    ${test_name}
    PROPERTIES ENVIRONMENT "CUDA_MODULE_LOADING=LAZY"
    DISCOVERY_MODE PRE_TEST # WAR for DLL discovery on windows.
    DISCOVERY_TIMEOUT 30) # Longer timeout needed because discovery can be
                          # slow on Windows.
  add_dependencies(google-tests ${test_name})
endfunction()
|
|
|
|
# Runtime tests whose source file is runtime/<test name>.cpp.
foreach(
  tllm_runtime_test
  loraManagerTest
  loraUtilsTest
  loraCacheTest
  workerPoolTest
  transposeKVKernelTest
  gptDecoderTest
  gptDecoderBatchTest
  gptSessionTest)
  add_gtest(${tllm_runtime_test} runtime/${tllm_runtime_test}.cpp)
endforeach()

# gptSessionTest additionally links the modelSpecStatic library.
target_link_libraries(gptSessionTest PRIVATE modelSpecStatic)
|
|
# Tests for the common/ utilities.
add_gtest(memoryUtilsTest common/memoryUtilsTest.cu)

# The MPI utilities test is only registered when ENABLE_MULTI_DEVICE equals 1.
if(ENABLE_MULTI_DEVICE EQUAL 1)
  add_gtest(mpiUtilsTest common/mpiUtilsTest.cpp)
endif()

# Remaining common tests; each source file is common/<test name>.cpp.
foreach(
  tllm_common_test
  quantizationTest
  stringUtilsTest
  tllmExceptionTest
  stlUtilsTest
  cudaProfilerUtilsTest
  timestampUtilsTest)
  add_gtest(${tllm_common_test} common/${tllm_common_test}.cpp)
endforeach()
|
|
# More runtime tests; each source file is runtime/<test name>.cpp.
foreach(
  tllm_runtime_test
  tllmRuntimeTest
  tllmBuffersTest
  bufferManagerTest
  runtimeKernelTest
  samplingTest
  samplingConfigTest
  iTensorTest
  iBufferTest
  worldConfigTest
  medusaModuleTest)
  add_gtest(${tllm_runtime_test} runtime/${tllm_runtime_test}.cpp)
endforeach()

# Kernel tests implemented as CUDA sources.
add_gtest(mixtureOfExpertsTest kernels/mixtureOfExpertsTest.cu)
add_gtest(ropeTest kernels/ropeTest.cu)
|
|
# Tests that link against PyTorch; only registered when BUILD_PYT is enabled.
#
# The variable is named directly instead of being expanded with ${...}:
# if(${BUILD_PYT}) hands the variable's *value* to if(), which then treats that
# value as another variable name or constant to evaluate.
if(BUILD_PYT)
  add_gtest(torchTest runtime/torchTest.cpp)
  target_link_libraries(torchTest PUBLIC ${TORCH_LIBRARIES})

  add_gtest(thUtilsTest thop/thUtilsTest.cpp)
  target_link_libraries(thUtilsTest PUBLIC th_utils ${Python3_LIBRARIES}
                                           ${TORCH_LIBRARIES})
endif()
|
|
# All sampling kernel test sources are built into one executable. The list is
# passed quoted so that it reaches add_gtest() as its single test_src argument.
set(SAMPLING_KERNEL_TEST_SRC
    kernels/sampling/samplingTest.cpp
    kernels/sampling/samplingTopKTest.cpp
    kernels/sampling/samplingTopPTest.cpp
    kernels/sampling/samplingAirTopPTest.cpp
    kernels/sampling/samplingPenaltyTest.cpp
    kernels/sampling/samplingUtilsTest.cu)
add_gtest(samplingKernelsTest "${SAMPLING_KERNEL_TEST_SRC}")
|
|
# Kernel tests laid out as kernels/<name>/<name>KernelTest.cpp.
foreach(tllm_kernel_dir weightOnly smoothQuant cudaCoreGemm)
  add_gtest(${tllm_kernel_dir}KernelTest
            kernels/${tllm_kernel_dir}/${tllm_kernel_dir}KernelTest.cpp)
endforeach()

# The all-reduce kernel test is registered unless ENABLE_MULTI_DEVICE equals 0.
if(NOT ENABLE_MULTI_DEVICE EQUAL 0)
  add_gtest(allReduceKernelTest kernels/allReduce/allReduceKernelTest.cu)
endif()

# Kernel tests that live directly under kernels/. Note that decodingKernelsTest
# is built from decodingKernelTest.cpp (singular "Kernel" in the file name).
add_gtest(decodingKernelsTest kernels/decodingKernelTest.cpp)
add_gtest(banRepeatNGramsKernelsTest kernels/banRepeatNGramsKernelsTest.cpp)
add_gtest(stopCriteriaKernelsTest kernels/stopCriteriaKernelsTest.cpp)
add_gtest(shiftKCacheKernelTest kernels/shiftKCacheKernelTest.cu)
|
|
# Sampling layer tests: four sources built into a single executable.
set(SAMPLING_LAYER_TEST_SRC
    layers/baseSamplingLayerTest.cpp
    layers/samplingLayerTest.cpp
    layers/topKSamplingLayerTest.cpp
    layers/topPSamplingLayerTest.cpp)
add_gtest(samplingLayerTest "${SAMPLING_LAYER_TEST_SRC}")

# Decoding layer tests with a single source each.
add_gtest(dynamicDecodeLayerTest layers/dynamicDecodeLayerTest.cpp)
add_gtest(medusaDecodeLayerTest layers/medusaDecodeLayerTest.cpp)

# Lookahead tests: layers/randomLlm.cpp is compiled into each of these
# executables alongside the test's own source file.
set(LOOKAHEAD_POOLMANAGER_TEST_SRC
    layers/randomLlm.cpp
    layers/lookaheadPoolManagerTest.cpp)
add_gtest(lookaheadPoolManagerTest "${LOOKAHEAD_POOLMANAGER_TEST_SRC}")

set(LOOKAHEAD_ALGORITHM_TEST_SRC
    layers/randomLlm.cpp
    layers/lookaheadAlgorithmTest.cpp)
add_gtest(lookaheadAlgorithmTest "${LOOKAHEAD_ALGORITHM_TEST_SRC}")

set(LOOKAHEAD_RANDOMLLM_TEST_SRC
    layers/randomLlm.cpp
    layers/lookaheadRandomLlmTest.cpp)
add_gtest(lookaheadRandomLlmTest "${LOOKAHEAD_RANDOMLLM_TEST_SRC}")

add_gtest(explicitDraftTokensLayerTest layers/explicitDraftTokensLayerTest.cpp)

set(LOOKAHEAD_DECODING_TEST_SRC
    layers/randomLlm.cpp
    layers/lookaheadDecodingLayerTest.cpp)
add_gtest(lookaheadDecodingLayerTest "${LOOKAHEAD_DECODING_TEST_SRC}")
|
|
|
|
# Fused gated GEMM (SwiGLU) tests. Both pass NO_GTEST_MAIN, so gtest_main is
# not linked into them.
#
# NOTE(review): add_gtest() has a single <test_src> parameter, so the
# gemm_swiglu_e4m3.cu path below arrives as an extra argument rather than as
# part of test_src, and add_gtest() only compiles test_src. Confirm whether
# this file is meant to be built into the test; if so, both paths need to be
# passed as one quoted ;-list.
add_gtest(
  gemmSwigluRunnerTest
  kernels/fused_gated_gemm/gemmSwigluRunnerTest.cu
  ${PROJECT_SOURCE_DIR}/tensorrt_llm/cutlass_extensions/kernels/fused_gated_gemm/gemm_swiglu_e4m3.cu
  NO_GTEST_MAIN)

# This test also passes NO_TLLM_LINKAGE, so it links neither ${SHARED_TARGET}
# nor nvinfer_plugin_tensorrt_llm.
add_gtest(
  gemmSwigluKernelTestSm90Fp8
  kernels/fused_gated_gemm/gemmSwigluKernelTestSm90Fp8.cu
  NO_GTEST_MAIN
  NO_TLLM_LINKAGE)
|
|
|
|
# Extra CUDA build settings for the two fused gated GEMM test targets.
foreach(target_name IN ITEMS gemmSwigluRunnerTest gemmSwigluKernelTestSm90Fp8)
  set_target_properties(${target_name} PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS
                                                  ON)

  # Note - we deliberately do not include 90a PTX (even when 9.0+PTX is
  # specified). This is because sm_90a has arch conditional instructions that
  # are not forward compatible. As a result, it does not make sense to embed PTX
  # into the binary anyway.
  if("90" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG
     OR "90-real" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG
     OR "90-real" IN_LIST CMAKE_CUDA_ARCHITECTURES_NATIVE)

    message(STATUS "MANUALLY APPENDING FLAG TO COMPILE FOR SM_90a.")
    # Both flags sit inside one generator expression, so they only apply to
    # CUDA sources. The expression is quoted and the flags are separated by
    # ';' so it is passed as a single argument.
    target_compile_options(
      ${target_name}
      PRIVATE
        "$<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_90a,code=sm_90a;-res-usage>"
    )

    # Hopper kernels require cuda lib for TMA APIs
    target_link_libraries(${target_name} PRIVATE CUDA::cuda_driver)

    # No kernels should be parsed, unless hopper is specified. This is a build
    # time improvement
    target_compile_definitions(${target_name} PRIVATE COMPILE_HOPPER_TMA_GEMMS)
  endif()

  # Suppress the GCC note "the ABI for passing parameters with 64-byte
  # alignment has changed in GCC 4.6". It appears for kernels using TMA and
  # clutters the compilation output.
  if(NOT WIN32)
    target_compile_options(
      ${target_name} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-Wno-psabi>)
  endif()
endforeach()
|
|
|
|
# Shared test utilities are always built.
add_subdirectory(utils)

# The batch manager and executor test directories are optional: each is added
# only when its build option is enabled and the directory exists in this
# source tree.
if(BUILD_BATCH_MANAGER AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/batch_manager")
  add_subdirectory(batch_manager)
endif()

if(BUILD_EXECUTOR AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/executor")
  add_subdirectory(executor)
endif()
|