Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-02-11 05:23:38 +08:00

CUDA13 breaking changes: c++ compile successful

Signed-off-by: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com>

commit 5c09dc8304 (parent 303604f82d)
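
This commit adapts the C++/CUDA sources to the CUDA 13 toolkit: the unversioned PFN_cuTensorMapEncodeTiled driver typedef gives way to the versioned PFN_cuTensorMapEncodeTiled_v12000, the relocated CCCL headers join the include path, the removed cub::Max functor becomes cuda::maximum, and cub's removed fancy iterators become their Thrust counterparts.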

@@ -212,6 +212,7 @@ endif()
 include_directories(
   SYSTEM
   ${CUDAToolkit_INCLUDE_DIRS}
+  ${CUDAToolkit_INCLUDE_DIRS}/cccl
   ${CUDNN_ROOT_DIR}/include
   $<TARGET_PROPERTY:TensorRT::NvInfer,INTERFACE_INCLUDE_DIRECTORIES>
   ${3RDPARTY_DIR}/cutlass/include
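
CUDA 13 moves the bundled CCCL headers (CUB, Thrust, libcu++) from the toolkit's top-level include directory into its cccl/ subdirectory, so the build now adds ${CUDAToolkit_INCLUDE_DIRS}/cccl explicitly; translation units compiled by the host compiler that include <cub/...>, <thrust/...> or <cuda/...> would otherwise no longer resolve against the toolkit headers.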

@@ -95,7 +95,7 @@ constexpr CUtensorMapDataType get_CUtensorMapDataType()
     }
 }
 
-PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()
+PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled()
 {
     // Get pointer to `cuTensorMapEncodeTiled`
     cudaDriverEntryPointQueryResult driver_status;

@@ -110,12 +110,12 @@ PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()
 
     if (driver_status != cudaDriverEntryPointSuccess)
         throw std::runtime_error("driver_status != cudaDriverEntryPointSuccess");
-    return reinterpret_cast<PFN_cuTensorMapEncodeTiled>(cuTensorMapEncodeTiled_ptr);
+    return reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(cuTensorMapEncodeTiled_ptr);
 }
 
 template <typename T>
 CUtensorMap make_2d_tma_copy_desc(T* global_address, uint64_t gmem_dim[2], uint64_t stride_in_bytes,
-    uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)
+    uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
 {
     CUtensorMap tensor_map{};
     constexpr uint32_t rank = 2;
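
CUDA 13 drops the unversioned PFN_cuTensorMapEncodeTiled alias from the driver headers; only versioned typedefs such as PFN_cuTensorMapEncodeTiled_v12000 (the entry point's CUDA 12.0 signature) remain, which is why both the return type and the reinterpret_cast target change here, and again in a second file further below. A minimal sketch of the same lookup, assuming a CUDA 12.5+ runtime for cudaGetDriverEntryPointByVersion; the helper name is illustrative, not the file's code:

    // Sketch only: fetch cuTensorMapEncodeTiled at its CUDA 12.0 ABI version.
    #include <cudaTypedefs.h> // versioned PFN_* typedefs
    #include <cuda_runtime.h>
    #include <stdexcept>

    PFN_cuTensorMapEncodeTiled_v12000 loadEncodeTiled()
    {
        void* fn = nullptr;
        cudaDriverEntryPointQueryResult status;
        // Query the driver symbol through the runtime, pinned to CUDA 12.0 (12000).
        cudaGetDriverEntryPointByVersion("cuTensorMapEncodeTiled", &fn, 12000, cudaEnableDefault, &status);
        if (status != cudaDriverEntryPointSuccess)
            throw std::runtime_error("cuTensorMapEncodeTiled not available");
        return reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(fn);
    }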

@@ -134,15 +134,14 @@ void invokeUpdateCacheIndirection(int* tgtCI, int const* srcCI, BeamHypotheses&
     sync_check_cuda_error(stream);
 }
 
-template <typename T>
-__global__ void addCumLogProbs(T* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
+__global__ void addCumLogProbs(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
     FinishedState const* finished, int const* endIds, float const* diversityRates,
     runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM)
 {
     int const bid = blockIdx.x; // Index of request in batch
     runtime::SizeType32 const slot = batchSlots[bid];
     float const diversityRate{diversityRates[slot]};
-    T* pLocalLogProbs = pStage1LogProbs + bid * nBMIn * nBMOut * 2;
+    float* pLocalLogProbs = pStage1LogProbs + bid * nBMIn * nBMOut * 2;
 
     for (int i = threadIdx.x; i < nBMIn * nBMOut * 2; i += blockDim.x)
     {

@@ -160,13 +159,30 @@ __global__ void addCumLogProbs(T* __restrict pStage1LogProbs, float const* __res
     return;
 }
 
-template __global__ void addCumLogProbs<float>(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
-    FinishedState const* finished, int const* endIds, float const* diversityRates,
-    runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
-
-template __global__ void addCumLogProbs<half>(half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
-    FinishedState const* finished, int const* endIds, float const* diversityRates,
-    runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
+__global__ void addCumLogProbs(half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
+    FinishedState const* finished, int const* endIds, float const* diversityRates,
+    runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM)
+{
+    int const bid = blockIdx.x; // Index of request in batch
+    runtime::SizeType32 const slot = batchSlots[bid];
+    float const diversityRate{diversityRates[slot]};
+    half* pLocalLogProbs = pStage1LogProbs + bid * nBMIn * nBMOut * 2;
+
+    for (int i = threadIdx.x; i < nBMIn * nBMOut * 2; i += blockDim.x)
+    {
+        int const iBMIn = i / (nBMOut * 2);
+        if (finished[slot * nBMIn + iBMIn].isFinished())
+        {
+            pLocalLogProbs[i] += (i == endIds[slot]) ? 1.0f : 0.0f;
+        }
+        else
+        {
+            // nBM is used in VBWS since `cumLogProbs` is initialized with kMaxBeamWidth earlier than BeamSearchLayer
+            pLocalLogProbs[i] += cumLogProbs[slot * nBM + iBMIn] + diversityRate * iBMIn;
+        }
+    }
+    return;
+}
 
 __global__ void gatherId(int const* __restrict pStage1Id, int* __restrict pStage2Id, size_t const nBS,
     size_t const nBMIn, size_t const nBMOut, size_t const nV)

@@ -130,8 +130,11 @@ void invokeTopkBeamSearch(T const* logProbs, T const* bias, void* workspace, Bea
 void invokeUpdateCacheIndirection(int* tgtCI, int const* srcCI, BeamHypotheses& bh,
     runtime::SizeType32 const maxAttentionWindow, runtime::SizeType32 sinkTokenLength, cudaStream_t stream);
 
-template <typename T>
-__global__ void addCumLogProbs(T* __restrict pStage1Probs, float const* __restrict cumLogProbs,
+__global__ void addCumLogProbs(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
     FinishedState const* finished, int const* endIds, float const* diversityRates,
     runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
 
+__global__ void addCumLogProbs(half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
+    FinishedState const* finished, int const* endIds, float const* diversityRates,
+    runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
+
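
In these three hunks the templated addCumLogProbs kernel and its explicit template __global__ instantiations are flattened into concrete float and half overloads, in both the .cu definitions and the header declarations. The kernel body is untouched; presumably the explicit instantiations of a __global__ template tripped over an nvcc change in CUDA 13, and the plain overloads compile cleanly either way.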

@@ -84,7 +84,7 @@ inline CUtensorMapDataType get_CUtensorMapDataType()
     }
 }
 
-PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()
+PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled()
 {
     // Get pointer to cuTensorMapEncodeTiled
     cudaDriverEntryPointQueryResult driver_status;

@@ -101,12 +101,12 @@ PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()
         throw std::runtime_error("driver_status != cudaDriverEntryPointSuccess");
     }
 
-    return reinterpret_cast<PFN_cuTensorMapEncodeTiled>(cuTensorMapEncodeTiled_ptr);
+    return reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(cuTensorMapEncodeTiled_ptr);
 }
 
 template <typename data_type>
 CUtensorMap make_2d_tma_copy_desc(data_type* global_address, uint64_t gmem_dim[2], uint64_t stride_in_bytes,
-    uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)
+    uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
 {
     CUtensorMap tensor_map{};
     constexpr uint32_t rank = 2;

@@ -2597,7 +2597,7 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
     __shared__ typename BlockReduce::TempStorage temp_storage;
     // Obtain a segment of consecutive items that are blocked across threads (final_max from above)
     // Compute the block-wide max for thread0
-    final_max = BlockReduce(temp_storage).Reduce(thread_partial_max, cub::Max(), gridDim.z);
+    final_max = BlockReduce(temp_storage).Reduce(thread_partial_max, cuda::maximum(), gridDim.z);
 
     __shared__ float final_max_smem;
     if (tidx == 0)
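
This hunk and the remaining cub::Max hunks below are all the same CCCL 3.0 breaking change (CCCL 3.0 ships with CUDA 13): the cub::Max and cub::Min functors are removed, and libcu++'s cuda::maximum / cuda::minimum function objects are the drop-in replacements. A minimal self-contained sketch, assuming CCCL 3.x headers; the kernel and its names are illustrative:

    #include <cub/block/block_reduce.cuh>
    #include <cuda/functional> // cuda::maximum, cuda::minimum

    template <int BLOCK_THREADS>
    __global__ void blockMaxKernel(float const* in, float* out)
    {
        using BlockReduce = cub::BlockReduce<float, BLOCK_THREADS>;
        __shared__ typename BlockReduce::TempStorage temp_storage;

        float const v = in[blockIdx.x * BLOCK_THREADS + threadIdx.x];
        // cuda::maximum{} slots in exactly where cub::Max{} used to go.
        float const blockMax = BlockReduce(temp_storage).Reduce(v, cuda::maximum{});
        if (threadIdx.x == 0) // thread 0 holds the block-wide result
            out[blockIdx.x] = blockMax;
    }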

@@ -250,7 +250,7 @@ __global__ void sage_quant_kernel(void const* q, void const* k, void const* v, i
 
     // Compute the block-wide max for thread0
     // cuda::maximum<>{}
-    float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cub::Max{});
+    float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cuda::maximum{});
 
     if (row_id == 0 && col_id == 0)
         s_block_amax = static_cast<float>(aggregate);

@@ -429,7 +429,7 @@ __global__ void sage_quant_kernel(void const* q, void const* k, void const* v, i
 
     // Compute the block-wide max for thread0
     // cuda::maximum<>{}
-    float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cub::Max{});
+    float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cuda::maximum{});
 
     if (row_id == 0 && col_id == 0)
         s_block_amax = static_cast<float>(aggregate);

@@ -504,7 +504,7 @@ __global__ void prepareGenEagleNetInputsKernel(SizeType32* nextSequenceLengths,
     BlockScan(tempStorage.scan).ExclusiveSum(numNextLogits, outputLastIndicesBase);
     // Sync because tempStorage is reused.
     __syncthreads();
-    auto const maxGenLength = BlockReduce(tempStorage.reduce).Reduce(nextDraftLen, cub::Max());
+    auto const maxGenLength = BlockReduce(tempStorage.reduce).Reduce(nextDraftLen, cuda::maximum());
 
     // Thread 0 has the result.
     if (bid == 0)

@@ -25,6 +25,8 @@
 #include "topkLastDim.h"
 #include <cub/cub.cuh>
 #include <cuda/atomic>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
 
 namespace tensorrt_llm
 {

@@ -1221,9 +1223,9 @@ void standalone_stable_radix_topk_(void* buf, size_t& buf_size, T const* in, Idx
     IdxT* sort_in_idx = nullptr;
 
     air_topk_stable::ComputeOffset<IdxT> computeoffset(k);
-    cub::CountingInputIterator<IdxT> counting_iter(0);
-    cub::TransformInputIterator<IdxT, air_topk_stable::ComputeOffset<IdxT>, cub::CountingInputIterator<IdxT>>
-        transform_iter(counting_iter, computeoffset);
+    thrust::counting_iterator<IdxT> counting_iter(0);
+    thrust::transform_iterator<air_topk_stable::ComputeOffset<IdxT>, thrust::counting_iterator<IdxT>> transform_iter(
+        counting_iter, computeoffset);
     cub::DeviceSegmentedSort::SortPairs(NULL, temp_storage_bytes, out_idx, out_idx, out, out, k * batch_size,
         batch_size, transform_iter, transform_iter + 1, stream);
     if (sorted)
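
CCCL 3.0 also removes cub's fancy iterators; thrust::counting_iterator and thrust::transform_iterator take their place (hence the new thrust includes above). Note the template-parameter shuffle: cub::TransformInputIterator took the value type as an explicit first argument, while thrust::transform_iterator deduces it from the functor. A minimal sketch of the migration, assuming CCCL 3.x, with an illustrative stand-in for air_topk_stable::ComputeOffset:

    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/transform_iterator.h>

    // Stand-in for air_topk_stable::ComputeOffset<IdxT>: segment i starts at i * k.
    struct ComputeOffset
    {
        int k;
        __host__ __device__ int operator()(int i) const { return i * k; }
    };

    void buildSegmentOffsets(int k)
    {
        // Removed in CCCL 3.0:
        //   cub::CountingInputIterator<int> counting_iter(0);
        //   cub::TransformInputIterator<int, ComputeOffset, cub::CountingInputIterator<int>>
        //       transform_iter(counting_iter, computeoffset);
        thrust::counting_iterator<int> counting_iter(0);
        thrust::transform_iterator<ComputeOffset, thrust::counting_iterator<int>> transform_iter(
            counting_iter, ComputeOffset{k});
        // transform_iter[i] == i * k, so transform_iter and transform_iter + 1 serve as
        // the begin/end offset iterators of cub::DeviceSegmentedSort::SortPairs, as above.
    }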

@@ -1348,9 +1350,9 @@ void standalone_stable_radix_topk_one_block_(void* buf, size_t& buf_size, T cons
     const IdxT buf_len = air_topk_stable::calc_buf_len<T, IdxT, unsigned>(len);
 
     air_topk_stable::ComputeOffset<IdxT> computeoffset(k);
-    cub::CountingInputIterator<IdxT> counting_iter(0);
-    cub::TransformInputIterator<IdxT, air_topk_stable::ComputeOffset<IdxT>, cub::CountingInputIterator<IdxT>>
-        transform_iter(counting_iter, computeoffset);
+    thrust::counting_iterator<IdxT> counting_iter(0);
+    thrust::transform_iterator<air_topk_stable::ComputeOffset<IdxT>, thrust::counting_iterator<IdxT>> transform_iter(
+        counting_iter, computeoffset);
 
     cub::DeviceSegmentedSort::SortPairs(NULL, temp_storage_bytes, out_idx, out_idx, out, out, k * batch_size,
         batch_size, transform_iter, transform_iter + 1, stream);

@@ -154,7 +154,7 @@ __global__ void activationDeepSeekKernel(KernelParams params)
     float constexpr E4m3MaxVal{448.f};
 
     // Compute the absolute max
-    float aMax = BlockReduce(temp_storage).Reduce(fabsf(out), cub::Max());
+    float aMax = BlockReduce(temp_storage).Reduce(fabsf(out), cuda::maximum());
     if (threadIdx.x == 0)
     {
         s_scaleOut = aMax / E4m3MaxVal;

@@ -657,7 +657,7 @@ __global__ void finalizeDeepSeekKernel(KernelParams params)
     float constexpr E4m3MaxVal{448.f};
 
     // Compute the absolute max
-    float aMax = BlockReduce(temp_storage).Reduce(fabsf(acc), cub::Max());
+    float aMax = BlockReduce(temp_storage).Reduce(fabsf(acc), cuda::maximum());
 
     if (threadIdx.x == 0)
     {

@@ -54,7 +54,7 @@ __global__ void checkTensorInvalidKernel(T const* data, std::size_t size, int* f
     __shared__ typename BlockReduceT::TempStorage tempStorage;
 
     // Compute block-wide maximum
-    int blockFound = BlockReduceT(tempStorage).Reduce(found, cub::Max());
+    int blockFound = BlockReduceT(tempStorage).Reduce(found, cuda::maximum());
 
     // Have thread 0 write out block's result
     if (threadIdx.x == 0)