Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-02-11 05:23:38 +08:00

CUDA13 breaking changes: c++ compile successful

Signed-off-by: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com>

commit 5c09dc8304 (parent 303604f82d)
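
This commit adapts the C++/CUDA sources to the CUDA 13 toolkit: the unversioned PFN_cuTensorMapEncodeTiled driver typedef gives way to the versioned PFN_cuTensorMapEncodeTiled_v12000, the relocated CCCL headers join the include path, the removed cub::Max functor becomes cuda::maximum, and cub's removed fancy iterators become their Thrust counterparts.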

@@ -212,6 +212,7 @@ endif()
 include_directories(
   SYSTEM
   ${CUDAToolkit_INCLUDE_DIRS}
+  ${CUDAToolkit_INCLUDE_DIRS}/cccl
   ${CUDNN_ROOT_DIR}/include
   $<TARGET_PROPERTY:TensorRT::NvInfer,INTERFACE_INCLUDE_DIRECTORIES>
   ${3RDPARTY_DIR}/cutlass/include
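
CUDA 13 moves the bundled CCCL headers (CUB, Thrust, libcu++) from the toolkit's top-level include directory into its cccl/ subdirectory, so the build now adds ${CUDAToolkit_INCLUDE_DIRS}/cccl explicitly; translation units compiled by the host compiler that include <cub/...>, <thrust/...> or <cuda/...> would otherwise no longer resolve against the toolkit headers.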

@@ -95,7 +95,7 @@ constexpr CUtensorMapDataType get_CUtensorMapDataType()
     }
 }
 
-PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()
+PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled()
 {
     // Get pointer to `cuTensorMapEncodeTiled`
     cudaDriverEntryPointQueryResult driver_status;

@@ -110,12 +110,12 @@ PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()
 
     if (driver_status != cudaDriverEntryPointSuccess)
         throw std::runtime_error("driver_status != cudaDriverEntryPointSuccess");
-    return reinterpret_cast<PFN_cuTensorMapEncodeTiled>(cuTensorMapEncodeTiled_ptr);
+    return reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(cuTensorMapEncodeTiled_ptr);
 }
 
 template <typename T>
 CUtensorMap make_2d_tma_copy_desc(T* global_address, uint64_t gmem_dim[2], uint64_t stride_in_bytes,
-    uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)
+    uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
 {
     CUtensorMap tensor_map{};
     constexpr uint32_t rank = 2;
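
CUDA 13 drops the unversioned PFN_cuTensorMapEncodeTiled alias from the driver headers; only versioned typedefs such as PFN_cuTensorMapEncodeTiled_v12000 (the entry point's CUDA 12.0 signature) remain, which is why both the return type and the reinterpret_cast target change here, and again in a second file further below. A minimal sketch of the same lookup, assuming a CUDA 12.5+ runtime for cudaGetDriverEntryPointByVersion; the helper name is illustrative, not the file's code:

    // Sketch only: fetch cuTensorMapEncodeTiled at its CUDA 12.0 ABI version.
    #include <cudaTypedefs.h> // versioned PFN_* typedefs
    #include <cuda_runtime.h>
    #include <stdexcept>

    PFN_cuTensorMapEncodeTiled_v12000 loadEncodeTiled()
    {
        void* fn = nullptr;
        cudaDriverEntryPointQueryResult status;
        // Query the driver symbol through the runtime, pinned to CUDA 12.0 (12000).
        cudaGetDriverEntryPointByVersion("cuTensorMapEncodeTiled", &fn, 12000, cudaEnableDefault, &status);
        if (status != cudaDriverEntryPointSuccess)
            throw std::runtime_error("cuTensorMapEncodeTiled not available");
        return reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(fn);
    }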

@@ -134,15 +134,14 @@ void invokeUpdateCacheIndirection(int* tgtCI, int const* srcCI, BeamHypotheses&
     sync_check_cuda_error(stream);
 }
 
-template <typename T>
-__global__ void addCumLogProbs(T* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
+__global__ void addCumLogProbs(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
     FinishedState const* finished, int const* endIds, float const* diversityRates,
     runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM)
 {
     int const bid = blockIdx.x; // Index of request in batch
     runtime::SizeType32 const slot = batchSlots[bid];
     float const diversityRate{diversityRates[slot]};
-    T* pLocalLogProbs = pStage1LogProbs + bid * nBMIn * nBMOut * 2;
+    float* pLocalLogProbs = pStage1LogProbs + bid * nBMIn * nBMOut * 2;
 
     for (int i = threadIdx.x; i < nBMIn * nBMOut * 2; i += blockDim.x)
     {

@@ -160,13 +159,30 @@ __global__ void addCumLogProbs(T* __restrict pStage1LogProbs, float const* __res
     return;
 }
 
-template __global__ void addCumLogProbs<float>(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
-    FinishedState const* finished, int const* endIds, float const* diversityRates,
-    runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
-
-template __global__ void addCumLogProbs<half>(half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
-    FinishedState const* finished, int const* endIds, float const* diversityRates,
-    runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
+__global__ void addCumLogProbs(half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
+    FinishedState const* finished, int const* endIds, float const* diversityRates,
+    runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM)
+{
+    int const bid = blockIdx.x; // Index of request in batch
+    runtime::SizeType32 const slot = batchSlots[bid];
+    float const diversityRate{diversityRates[slot]};
+    half* pLocalLogProbs = pStage1LogProbs + bid * nBMIn * nBMOut * 2;
+
+    for (int i = threadIdx.x; i < nBMIn * nBMOut * 2; i += blockDim.x)
+    {
+        int const iBMIn = i / (nBMOut * 2);
+        if (finished[slot * nBMIn + iBMIn].isFinished())
+        {
+            pLocalLogProbs[i] += (i == endIds[slot]) ? 1.0f : 0.0f;
+        }
+        else
+        {
+            // nBM is used in VBWS since `cumLogProbs` is initialized with kMaxBeamWidth earlier than BeamSearchLayer
+            pLocalLogProbs[i] += cumLogProbs[slot * nBM + iBMIn] + diversityRate * iBMIn;
+        }
+    }
+    return;
+}
 
 __global__ void gatherId(int const* __restrict pStage1Id, int* __restrict pStage2Id, size_t const nBS,
     size_t const nBMIn, size_t const nBMOut, size_t const nV)

@@ -130,8 +130,11 @@ void invokeTopkBeamSearch(T const* logProbs, T const* bias, void* workspace, Bea
 void invokeUpdateCacheIndirection(int* tgtCI, int const* srcCI, BeamHypotheses& bh,
     runtime::SizeType32 const maxAttentionWindow, runtime::SizeType32 sinkTokenLength, cudaStream_t stream);
 
-template <typename T>
-__global__ void addCumLogProbs(T* __restrict pStage1Probs, float const* __restrict cumLogProbs,
+__global__ void addCumLogProbs(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
     FinishedState const* finished, int const* endIds, float const* diversityRates,
     runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
 
+__global__ void addCumLogProbs(half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
+    FinishedState const* finished, int const* endIds, float const* diversityRates,
+    runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
+
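
In these three hunks the templated addCumLogProbs kernel and its explicit template __global__ instantiations are flattened into concrete float and half overloads, in both the .cu definitions and the header declarations. The kernel body is untouched; presumably the explicit instantiations of a __global__ template tripped over an nvcc change in CUDA 13, and the plain overloads compile cleanly either way.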

@@ -84,7 +84,7 @@ inline CUtensorMapDataType get_CUtensorMapDataType()
     }
 }
 
-PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()
+PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled()
 {
     // Get pointer to cuTensorMapEncodeTiled
     cudaDriverEntryPointQueryResult driver_status;

@@ -101,12 +101,12 @@ PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()
         throw std::runtime_error("driver_status != cudaDriverEntryPointSuccess");
     }
 
-    return reinterpret_cast<PFN_cuTensorMapEncodeTiled>(cuTensorMapEncodeTiled_ptr);
+    return reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(cuTensorMapEncodeTiled_ptr);
 }
 
 template <typename data_type>
 CUtensorMap make_2d_tma_copy_desc(data_type* global_address, uint64_t gmem_dim[2], uint64_t stride_in_bytes,
-    uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)
+    uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
 {
     CUtensorMap tensor_map{};
     constexpr uint32_t rank = 2;

@@ -2597,7 +2597,7 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
     __shared__ typename BlockReduce::TempStorage temp_storage;
     // Obtain a segment of consecutive items that are blocked across threads (final_max from above)
     // Compute the block-wide max for thread0
-    final_max = BlockReduce(temp_storage).Reduce(thread_partial_max, cub::Max(), gridDim.z);
+    final_max = BlockReduce(temp_storage).Reduce(thread_partial_max, cuda::maximum(), gridDim.z);
 
     __shared__ float final_max_smem;
     if (tidx == 0)
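
This hunk and the remaining cub::Max hunks below are all the same CCCL 3.0 breaking change (CCCL 3.0 ships with CUDA 13): the cub::Max and cub::Min functors are removed, and libcu++'s cuda::maximum / cuda::minimum function objects are the drop-in replacements. A minimal self-contained sketch, assuming CCCL 3.x headers; the kernel and its names are illustrative:

    #include <cub/block/block_reduce.cuh>
    #include <cuda/functional> // cuda::maximum, cuda::minimum

    template <int BLOCK_THREADS>
    __global__ void blockMaxKernel(float const* in, float* out)
    {
        using BlockReduce = cub::BlockReduce<float, BLOCK_THREADS>;
        __shared__ typename BlockReduce::TempStorage temp_storage;

        float const v = in[blockIdx.x * BLOCK_THREADS + threadIdx.x];
        // cuda::maximum{} slots in exactly where cub::Max{} used to go.
        float const blockMax = BlockReduce(temp_storage).Reduce(v, cuda::maximum{});
        if (threadIdx.x == 0) // thread 0 holds the block-wide result
            out[blockIdx.x] = blockMax;
    }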

@@ -250,7 +250,7 @@ __global__ void sage_quant_kernel(void const* q, void const* k, void const* v, i
 
     // Compute the block-wide max for thread0
     // cuda::maximum<>{}
-    float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cub::Max{});
+    float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cuda::maximum{});
 
     if (row_id == 0 && col_id == 0)
         s_block_amax = static_cast<float>(aggregate);

@@ -429,7 +429,7 @@ __global__ void sage_quant_kernel(void const* q, void const* k, void const* v, i
 
     // Compute the block-wide max for thread0
     // cuda::maximum<>{}
-    float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cub::Max{});
+    float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cuda::maximum{});
 
     if (row_id == 0 && col_id == 0)
         s_block_amax = static_cast<float>(aggregate);

@@ -504,7 +504,7 @@ __global__ void prepareGenEagleNetInputsKernel(SizeType32* nextSequenceLengths,
     BlockScan(tempStorage.scan).ExclusiveSum(numNextLogits, outputLastIndicesBase);
     // Sync because tempStorage is reused.
     __syncthreads();
-    auto const maxGenLength = BlockReduce(tempStorage.reduce).Reduce(nextDraftLen, cub::Max());
+    auto const maxGenLength = BlockReduce(tempStorage.reduce).Reduce(nextDraftLen, cuda::maximum());
 
     // Thread 0 has the result.
     if (bid == 0)

@@ -25,6 +25,8 @@
 #include "topkLastDim.h"
 #include <cub/cub.cuh>
 #include <cuda/atomic>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
 
 namespace tensorrt_llm
 {

@@ -1221,9 +1223,9 @@ void standalone_stable_radix_topk_(void* buf, size_t& buf_size, T const* in, Idx
     IdxT* sort_in_idx = nullptr;
 
     air_topk_stable::ComputeOffset<IdxT> computeoffset(k);
-    cub::CountingInputIterator<IdxT> counting_iter(0);
-    cub::TransformInputIterator<IdxT, air_topk_stable::ComputeOffset<IdxT>, cub::CountingInputIterator<IdxT>>
-        transform_iter(counting_iter, computeoffset);
+    thrust::counting_iterator<IdxT> counting_iter(0);
+    thrust::transform_iterator<air_topk_stable::ComputeOffset<IdxT>, thrust::counting_iterator<IdxT>> transform_iter(
+        counting_iter, computeoffset);
     cub::DeviceSegmentedSort::SortPairs(NULL, temp_storage_bytes, out_idx, out_idx, out, out, k * batch_size,
         batch_size, transform_iter, transform_iter + 1, stream);
     if (sorted)
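
CCCL 3.0 also removes cub's fancy iterators; thrust::counting_iterator and thrust::transform_iterator take their place (hence the new thrust includes above). Note the template-parameter shuffle: cub::TransformInputIterator took the value type as an explicit first argument, while thrust::transform_iterator deduces it from the functor. A minimal sketch of the migration, assuming CCCL 3.x, with an illustrative stand-in for air_topk_stable::ComputeOffset:

    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/transform_iterator.h>

    // Stand-in for air_topk_stable::ComputeOffset<IdxT>: segment i starts at i * k.
    struct ComputeOffset
    {
        int k;
        __host__ __device__ int operator()(int i) const { return i * k; }
    };

    void buildSegmentOffsets(int k)
    {
        // Removed in CCCL 3.0:
        //   cub::CountingInputIterator<int> counting_iter(0);
        //   cub::TransformInputIterator<int, ComputeOffset, cub::CountingInputIterator<int>>
        //       transform_iter(counting_iter, computeoffset);
        thrust::counting_iterator<int> counting_iter(0);
        thrust::transform_iterator<ComputeOffset, thrust::counting_iterator<int>> transform_iter(
            counting_iter, ComputeOffset{k});
        // transform_iter[i] == i * k, so transform_iter and transform_iter + 1 serve as
        // the begin/end offset iterators of cub::DeviceSegmentedSort::SortPairs, as above.
    }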

@@ -1348,9 +1350,9 @@ void standalone_stable_radix_topk_one_block_(void* buf, size_t& buf_size, T cons
     const IdxT buf_len = air_topk_stable::calc_buf_len<T, IdxT, unsigned>(len);
 
     air_topk_stable::ComputeOffset<IdxT> computeoffset(k);
-    cub::CountingInputIterator<IdxT> counting_iter(0);
-    cub::TransformInputIterator<IdxT, air_topk_stable::ComputeOffset<IdxT>, cub::CountingInputIterator<IdxT>>
-        transform_iter(counting_iter, computeoffset);
+    thrust::counting_iterator<IdxT> counting_iter(0);
+    thrust::transform_iterator<air_topk_stable::ComputeOffset<IdxT>, thrust::counting_iterator<IdxT>> transform_iter(
+        counting_iter, computeoffset);
 
     cub::DeviceSegmentedSort::SortPairs(NULL, temp_storage_bytes, out_idx, out_idx, out, out, k * batch_size,
         batch_size, transform_iter, transform_iter + 1, stream);

@@ -154,7 +154,7 @@ __global__ void activationDeepSeekKernel(KernelParams params)
     float constexpr E4m3MaxVal{448.f};
 
     // Compute the absolute max
-    float aMax = BlockReduce(temp_storage).Reduce(fabsf(out), cub::Max());
+    float aMax = BlockReduce(temp_storage).Reduce(fabsf(out), cuda::maximum());
     if (threadIdx.x == 0)
     {
         s_scaleOut = aMax / E4m3MaxVal;

@@ -657,7 +657,7 @@ __global__ void finalizeDeepSeekKernel(KernelParams params)
     float constexpr E4m3MaxVal{448.f};
 
     // Compute the absolute max
-    float aMax = BlockReduce(temp_storage).Reduce(fabsf(acc), cub::Max());
+    float aMax = BlockReduce(temp_storage).Reduce(fabsf(acc), cuda::maximum());
 
     if (threadIdx.x == 0)
     {

@@ -54,7 +54,7 @@ __global__ void checkTensorInvalidKernel(T const* data, std::size_t size, int* f
     __shared__ typename BlockReduceT::TempStorage tempStorage;
 
     // Compute block-wide maximum
-    int blockFound = BlockReduceT(tempStorage).Reduce(found, cub::Max());
+    int blockFound = BlockReduceT(tempStorage).Reduce(found, cuda::maximum());
 
     // Have thread 0 write out block's result
     if (threadIdx.x == 0)