# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION &
# AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

if(NOT WIN32)
  # Additional warnings.
  #
  # Ignore the overloaded-virtual warning: we intentionally change parameters
  # of some methods in derived classes.
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-overloaded-virtual")
  if(WARNING_IS_ERROR)
    message(STATUS "Treating warnings as errors in GCC compilation")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror")
  endif()
else() # Windows
  # Warning level 4.
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4")
endif()

# Torch helper utilities shared by the custom op sources below. Built as a
# position-independent static library so it can be folded into th_common.
add_library(th_utils STATIC thUtils.cpp)
set_property(TARGET th_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET th_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(th_utils PUBLIC ${TORCH_LIBRARIES} ${CUBLAS_LIB}
                                      ${CURAND_LIB})

# TODO: This does not compile with the internal cutlass MoE GEMM.
add_library(
  th_common SHARED
  mlaPreprocessOp.cpp
  allgatherOp.cpp
  allreduceOp.cpp
  attentionOp.cpp
  causalConv1dOp.cpp
  convertSpecDecodingMaskToPackedMaskOp.cpp
  cutlassScaledMM.cpp
  cublasScaledMM.cpp
  cudaScaledMM.cpp
  dynamicDecodeOp.cpp
  fmhaPackMaskOp.cpp
  fp8Op.cpp
  fp8PerTensorScalingTrtllmGenGemm.cpp
  fp4Op.cpp
  fp4Gemm.cpp
  fp4GemmTrtllmGen.cpp
  fp8BatchedGemmTrtllmGen.cpp
  fp4Quantize.cpp
  fp4BatchedQuantize.cpp
  fp8BlockScalingGemm.cpp
  fp8RowwiseGemm.cpp
  fp8Quantize.cpp
  dsv3FusedAGemmOp.cpp
  fusedQKNormRopeOp.cpp
  fusedTopkSoftmax.cpp
  gatherTreeOp.cpp
  groupRmsNormOp.cpp
  llama4MinLatency.cpp
  logitsBitmaskOp.cpp
  mambaConv1dOp.cpp
  moeOp.cpp
  moeUtilOp.cpp
  moeCommOp.cpp
  moeLoadBalanceOp.cpp
  fp8BlockScaleMoe.cpp
  fp8PerTensorScaleMoe.cpp
  fp4BlockScaleMoe.cpp
  noAuxTcOp.cpp
  ncclCommunicatorOp.cpp
  parallelDecodeKVCacheUpdateOp.cpp
  redrafterCurandOp.cpp
  reducescatterOp.cpp
  relativeAttentionBiasOp.cpp
  dsv3RouterGemmOp.cpp
  renormMoeRoutingOp.cpp
  selectiveScanOp.cpp
  userbuffersFinalizeOp.cpp
  userbuffersTensor.cpp
  weightOnlyQuantGemm.cpp
  weightOnlyQuantOp.cpp
  mtpOp.cpp
  loraOp.cpp
  finegrained_mixed_dtype_gemm_thop.cpp)
set_property(TARGET th_common PROPERTY POSITION_INDEPENDENT_CODE ON)
target_link_libraries(th_common PRIVATE ${TORCH_LIBRARIES} th_utils
                                        ${Python3_LIBRARIES} ${SHARED_TARGET})

if(USING_OSS_CUTLASS_LOW_LATENCY_GEMM)
  target_compile_definitions(th_common
                             PUBLIC "USING_OSS_CUTLASS_LOW_LATENCY_GEMM")
  message(STATUS "Enabling the open-source CUTLASS low-latency GEMM kernel")
endif()

if(USING_OSS_CUTLASS_FP4_GEMM)
  target_compile_definitions(th_common PUBLIC USING_OSS_CUTLASS_FP4_GEMM)
endif()

if(USING_OSS_CUTLASS_MOE_GEMM)
  target_compile_definitions(th_common PUBLIC USING_OSS_CUTLASS_MOE_GEMM)
endif()

if(ENABLE_MULTI_DEVICE)
  target_include_directories(th_common PUBLIC ${MPI_C_INCLUDE_DIRS})
  target_link_libraries(th_common PRIVATE ${MPI_C_LIBRARIES} ${NCCL_LIB}
                                          CUDA::nvml)
endif()

if(NOT WIN32)
  # '$ORIGIN' makes the dynamic linker search for dependent shared libraries
  # next to libth_common.so itself.
  set_target_properties(
    th_common
    PROPERTIES LINK_FLAGS
               "-Wl,-rpath='$ORIGIN' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}")
else()
  target_link_libraries(th_common PRIVATE context_attention_src)
endif()
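
# Note (illustrative, not specific to this project's Python packaging): since
# libth_common.so registers its Torch custom ops when loaded, a typical way to
# expose them from Python is
# torch.ops.load_library("path/to/libth_common.so"); the '$ORIGIN' rpath set
# above then lets the dynamic linker resolve sibling shared libraries placed
# next to it.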
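
# A minimal sketch, assuming a consumer inside the same build tree: linking a
# hypothetical tool against th_utils. The guard variable
# TLLM_THOP_BUILD_EXAMPLE and the source file exampleTool.cpp are illustrative
# names, not part of this project, so the block stays inert by default.
if(TLLM_THOP_BUILD_EXAMPLE) # hypothetical option, undefined (OFF) by default
  add_executable(thop_example exampleTool.cpp)
  # th_utils is static with PIC enabled; its PUBLIC Torch/cuBLAS/cuRAND
  # dependencies propagate to the consumer automatically.
  target_link_libraries(thop_example PRIVATE th_utils)
endif()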