commit 5c09dc8304
parent 303604f82d
Author: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com>
Date:   2025-06-17 17:21:55 +08:00

    CUDA13 breaking changes: C++ compile successful

    Signed-off-by: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com>

11 changed files with 51 additions and 29 deletions

View File

@@ -212,6 +212,7 @@ endif()
 include_directories(
   SYSTEM
   ${CUDAToolkit_INCLUDE_DIRS}
+  ${CUDAToolkit_INCLUDE_DIRS}/cccl
   ${CUDNN_ROOT_DIR}/include
   $<TARGET_PROPERTY:TensorRT::NvInfer,INTERFACE_INCLUDE_DIRECTORIES>
   ${3RDPARTY_DIR}/cutlass/include
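Note: CUDA 13 relocates the bundled CCCL headers (CUB, Thrust, libcu++) from the toolkit's include/ root into include/cccl/, which is why the extra search path is added above. A minimal sketch of the same fix in target-scoped form (the target name my_target is hypothetical):

    # CUDA 13: CCCL headers live under <toolkit>/include/cccl, so that
    # directory is added alongside the regular toolkit include path.
    target_include_directories(my_target SYSTEM PRIVATE
        ${CUDAToolkit_INCLUDE_DIRS}
        ${CUDAToolkit_INCLUDE_DIRS}/cccl)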

View File

@@ -95,7 +95,7 @@ constexpr CUtensorMapDataType get_CUtensorMapDataType()
     }
 }
 
-PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()
+PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled()
 {
     // Get pointer to `cuTensorMapEncodeTiled`
     cudaDriverEntryPointQueryResult driver_status;
@@ -110,12 +110,12 @@ PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()
     if (driver_status != cudaDriverEntryPointSuccess)
         throw std::runtime_error("driver_status != cudaDriverEntryPointSuccess");
 
-    return reinterpret_cast<PFN_cuTensorMapEncodeTiled>(cuTensorMapEncodeTiled_ptr);
+    return reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(cuTensorMapEncodeTiled_ptr);
 }
 
 template <typename T>
 CUtensorMap make_2d_tma_copy_desc(T* global_address, uint64_t gmem_dim[2], uint64_t stride_in_bytes,
-    uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)
+    uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
 {
     CUtensorMap tensor_map{};
     constexpr uint32_t rank = 2;
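Note: CUDA 13 keeps only the versioned driver-API typedefs from cudaTypedefs.h and drops the unversioned PFN_cuTensorMapEncodeTiled alias, hence the renames to PFN_cuTensorMapEncodeTiled_v12000. A minimal sketch of the lookup, assuming the cudaGetDriverEntryPointByVersion API (CUDA 12.5+; the actual file may obtain the symbol differently, and error handling is reduced here to a null return):

    #include <cuda_runtime.h>
    #include <cudaTypedefs.h> // versioned typedef PFN_cuTensorMapEncodeTiled_v12000

    PFN_cuTensorMapEncodeTiled_v12000 loadEncodeTiled()
    {
        void* fn = nullptr;
        cudaDriverEntryPointQueryResult driver_status;
        // Request the driver symbol at the 12.0 ABI, matching the _v12000 typedef.
        cudaGetDriverEntryPointByVersion(
            "cuTensorMapEncodeTiled", &fn, 12000, cudaEnableDefault, &driver_status);
        return driver_status == cudaDriverEntryPointSuccess
            ? reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(fn)
            : nullptr;
    }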

View File

@@ -134,15 +134,14 @@ void invokeUpdateCacheIndirection(int* tgtCI, int const* srcCI, BeamHypotheses&
     sync_check_cuda_error(stream);
 }
 
-template <typename T>
-__global__ void addCumLogProbs(T* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
+__global__ void addCumLogProbs(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
     FinishedState const* finished, int const* endIds, float const* diversityRates,
     runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM)
 {
     int const bid = blockIdx.x; // Index of request in batch
     runtime::SizeType32 const slot = batchSlots[bid];
     float const diversityRate{diversityRates[slot]};
-    T* pLocalLogProbs = pStage1LogProbs + bid * nBMIn * nBMOut * 2;
+    float* pLocalLogProbs = pStage1LogProbs + bid * nBMIn * nBMOut * 2;
 
     for (int i = threadIdx.x; i < nBMIn * nBMOut * 2; i += blockDim.x)
     {
@@ -160,13 +159,30 @@ __global__ void addCumLogProbs(T* __restrict pStage1LogProbs, float const* __res
     return;
 }
 
-template __global__ void addCumLogProbs<float>(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
+__global__ void addCumLogProbs(half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
     FinishedState const* finished, int const* endIds, float const* diversityRates,
-    runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
-
-template __global__ void addCumLogProbs<half>(half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
-    FinishedState const* finished, int const* endIds, float const* diversityRates,
-    runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
+    runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM)
+{
+    int const bid = blockIdx.x; // Index of request in batch
+    runtime::SizeType32 const slot = batchSlots[bid];
+    float const diversityRate{diversityRates[slot]};
+    half* pLocalLogProbs = pStage1LogProbs + bid * nBMIn * nBMOut * 2;
+
+    for (int i = threadIdx.x; i < nBMIn * nBMOut * 2; i += blockDim.x)
+    {
+        int const iBMIn = i / (nBMOut * 2);
+        if (finished[slot * nBMIn + iBMIn].isFinished())
+        {
+            pLocalLogProbs[i] += (i == endIds[slot]) ? 1.0f : 0.0f;
+        }
+        else
+        {
+            // nBM is used in VBWS since `cumLogProbs` is initialized with kMaxBeamWidth earlier than BeamSearchLayer
+            pLocalLogProbs[i] += cumLogProbs[slot * nBM + iBMIn] + diversityRate * iBMIn;
+        }
+    }
+    return;
+}
 
 __global__ void gatherId(int const* __restrict pStage1Id, int* __restrict pStage2Id, size_t const nBS,
     size_t const nBMIn, size_t const nBMOut, size_t const nV)
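Note: the template kernel plus its two explicit instantiations is replaced by two concrete overloads; __global__ functions overload like ordinary functions, so launch sites resolve unchanged. A toy sketch of the pattern (the kernel scaleInPlace is hypothetical, unrelated to the beam-search code):

    #include <cuda_fp16.h>

    // One definition per supported element type; overload resolution at the
    // launch site picks the right kernel, as the template instantiation did.
    __global__ void scaleInPlace(float* p, float s)
    {
        p[threadIdx.x] *= s;
    }

    __global__ void scaleInPlace(half* p, float s)
    {
        p[threadIdx.x] = __float2half(__half2float(p[threadIdx.x]) * s);
    }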

View File

@@ -130,8 +130,11 @@ void invokeTopkBeamSearch(T const* logProbs, T const* bias, void* workspace, Bea
 void invokeUpdateCacheIndirection(int* tgtCI, int const* srcCI, BeamHypotheses& bh,
     runtime::SizeType32 const maxAttentionWindow, runtime::SizeType32 sinkTokenLength, cudaStream_t stream);
 
-template <typename T>
-__global__ void addCumLogProbs(T* __restrict pStage1Probs, float const* __restrict cumLogProbs,
+__global__ void addCumLogProbs(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
     FinishedState const* finished, int const* endIds, float const* diversityRates,
     runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
+__global__ void addCumLogProbs(half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
+    FinishedState const* finished, int const* endIds, float const* diversityRates,
+    runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);

View File

@@ -84,7 +84,7 @@ inline CUtensorMapDataType get_CUtensorMapDataType()
     }
 }
 
-PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()
+PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled()
 {
     // Get pointer to cuTensorMapEncodeTiled
     cudaDriverEntryPointQueryResult driver_status;
@@ -101,12 +101,12 @@ PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()
         throw std::runtime_error("driver_status != cudaDriverEntryPointSuccess");
     }
 
-    return reinterpret_cast<PFN_cuTensorMapEncodeTiled>(cuTensorMapEncodeTiled_ptr);
+    return reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(cuTensorMapEncodeTiled_ptr);
 }
 
 template <typename data_type>
 CUtensorMap make_2d_tma_copy_desc(data_type* global_address, uint64_t gmem_dim[2], uint64_t stride_in_bytes,
-    uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)
+    uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
 {
     CUtensorMap tensor_map{};
     constexpr uint32_t rank = 2;

View File

@@ -2597,7 +2597,7 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
     __shared__ typename BlockReduce::TempStorage temp_storage;
 
     // Obtain a segment of consecutive items that are blocked across threads (final_max from above)
     // Compute the block-wide max for thread0
-    final_max = BlockReduce(temp_storage).Reduce(thread_partial_max, cub::Max(), gridDim.z);
+    final_max = BlockReduce(temp_storage).Reduce(thread_partial_max, cuda::maximum(), gridDim.z);
 
     __shared__ float final_max_smem;
     if (tidx == 0)
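Note: CCCL 3.0, which ships with CUDA 13, drops the cub::Max and cub::Min functors; cuda::maximum and cuda::minimum from <cuda/functional> are the drop-in replacements used here and in the files below. A self-contained sketch of the migrated reduction (kernel name and data layout are illustrative):

    #include <cub/block/block_reduce.cuh>
    #include <cuda/functional> // cuda::maximum, replacing the removed cub::Max

    template <int BLOCK_THREADS>
    __global__ void blockMaxKernel(float const* in, float* out)
    {
        using BlockReduce = cub::BlockReduce<float, BLOCK_THREADS>;
        __shared__ typename BlockReduce::TempStorage temp_storage;

        float const v = in[blockIdx.x * BLOCK_THREADS + threadIdx.x];
        // Block-wide max; only thread 0 holds the valid aggregate.
        float const blockMax = BlockReduce(temp_storage).Reduce(v, cuda::maximum());
        if (threadIdx.x == 0)
        {
            out[blockIdx.x] = blockMax;
        }
    }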

View File

@@ -250,7 +250,7 @@ __global__ void sage_quant_kernel(void const* q, void const* k, void const* v, i
     // Compute the block-wide max for thread0
     // cuda::maximum<>{}
-    float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cub::Max{});
+    float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cuda::maximum{});
 
     if (row_id == 0 && col_id == 0)
         s_block_amax = static_cast<float>(aggregate);
@@ -429,7 +429,7 @@ __global__ void sage_quant_kernel(void const* q, void const* k, void const* v, i
     // Compute the block-wide max for thread0
     // cuda::maximum<>{}
-    float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cub::Max{});
+    float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cuda::maximum{});
 
     if (row_id == 0 && col_id == 0)
         s_block_amax = static_cast<float>(aggregate);

View File

@@ -504,7 +504,7 @@ __global__ void prepareGenEagleNetInputsKernel(SizeType32* nextSequenceLengths,
     BlockScan(tempStorage.scan).ExclusiveSum(numNextLogits, outputLastIndicesBase);
     // Sync because tempStorage is reused.
     __syncthreads();
-    auto const maxGenLength = BlockReduce(tempStorage.reduce).Reduce(nextDraftLen, cub::Max());
+    auto const maxGenLength = BlockReduce(tempStorage.reduce).Reduce(nextDraftLen, cuda::maximum());
 
     // Thread 0 has the result.
     if (bid == 0)

View File

@@ -25,6 +25,8 @@
 #include "topkLastDim.h"
 #include <cub/cub.cuh>
 #include <cuda/atomic>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
 
 namespace tensorrt_llm
 {
@@ -1221,9 +1223,9 @@ void standalone_stable_radix_topk_(void* buf, size_t& buf_size, T const* in, Idx
     IdxT* sort_in_idx = nullptr;
 
     air_topk_stable::ComputeOffset<IdxT> computeoffset(k);
-    cub::CountingInputIterator<IdxT> counting_iter(0);
-    cub::TransformInputIterator<IdxT, air_topk_stable::ComputeOffset<IdxT>, cub::CountingInputIterator<IdxT>>
-        transform_iter(counting_iter, computeoffset);
+    thrust::counting_iterator<IdxT> counting_iter(0);
+    thrust::transform_iterator<air_topk_stable::ComputeOffset<IdxT>, thrust::counting_iterator<IdxT>> transform_iter(
+        counting_iter, computeoffset);
     cub::DeviceSegmentedSort::SortPairs(NULL, temp_storage_bytes, out_idx, out_idx, out, out, k * batch_size,
         batch_size, transform_iter, transform_iter + 1, stream);
     if (sorted)
@@ -1348,9 +1350,9 @@ void standalone_stable_radix_topk_one_block_(void* buf, size_t& buf_size, T cons
     const IdxT buf_len = air_topk_stable::calc_buf_len<T, IdxT, unsigned>(len);
 
     air_topk_stable::ComputeOffset<IdxT> computeoffset(k);
-    cub::CountingInputIterator<IdxT> counting_iter(0);
-    cub::TransformInputIterator<IdxT, air_topk_stable::ComputeOffset<IdxT>, cub::CountingInputIterator<IdxT>>
-        transform_iter(counting_iter, computeoffset);
+    thrust::counting_iterator<IdxT> counting_iter(0);
+    thrust::transform_iterator<air_topk_stable::ComputeOffset<IdxT>, thrust::counting_iterator<IdxT>> transform_iter(
+        counting_iter, computeoffset);
     cub::DeviceSegmentedSort::SortPairs(NULL, temp_storage_bytes, out_idx, out_idx, out, out, k * batch_size,
         batch_size, transform_iter, transform_iter + 1, stream);
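Note: CCCL 3.0 likewise removes cub::CountingInputIterator and cub::TransformInputIterator in favor of the Thrust fancy iterators. Mind the template arguments: the CUB iterator was parameterized as <ValueType, ConversionOp, InputIterator>, while thrust::transform_iterator takes <UnaryFunction, Iterator> and derives the value type from the functor. A small sketch of the segment-offset iterator built above (SegmentOffset and makeSegmentOffsets are illustrative stand-ins for ComputeOffset):

    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/transform_iterator.h>

    // Segment i of a fixed-size-k layout begins at element i * k.
    struct SegmentOffset
    {
        int k;
        __host__ __device__ int operator()(int i) const
        {
            return i * k;
        }
    };

    inline auto makeSegmentOffsets(int k)
    {
        thrust::counting_iterator<int> first(0);
        return thrust::transform_iterator<SegmentOffset, thrust::counting_iterator<int>>(first, SegmentOffset{k});
    }

The resulting iterator serves as the begin/end segment-offset argument of cub::DeviceSegmentedSort::SortPairs, as in the code above.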

View File

@@ -154,7 +154,7 @@ __global__ void activationDeepSeekKernel(KernelParams params)
     float constexpr E4m3MaxVal{448.f};
 
     // Compute the absolute max
-    float aMax = BlockReduce(temp_storage).Reduce(fabsf(out), cub::Max());
+    float aMax = BlockReduce(temp_storage).Reduce(fabsf(out), cuda::maximum());
 
     if (threadIdx.x == 0)
     {
         s_scaleOut = aMax / E4m3MaxVal;
@@ -657,7 +657,7 @@ __global__ void finalizeDeepSeekKernel(KernelParams params)
     float constexpr E4m3MaxVal{448.f};
 
     // Compute the absolute max
-    float aMax = BlockReduce(temp_storage).Reduce(fabsf(acc), cub::Max());
+    float aMax = BlockReduce(temp_storage).Reduce(fabsf(acc), cuda::maximum());
 
     if (threadIdx.x == 0)
     {

View File

@@ -54,7 +54,7 @@ __global__ void checkTensorInvalidKernel(T const* data, std::size_t size, int* f
     __shared__ typename BlockReduceT::TempStorage tempStorage;
 
     // Compute block-wide maximum
-    int blockFound = BlockReduceT(tempStorage).Reduce(found, cub::Max());
+    int blockFound = BlockReduceT(tempStorage).Reduce(found, cuda::maximum());
 
     // Have thread 0 write out block's result
     if (threadIdx.x == 0)