Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-14 06:27:45 +08:00
* feat: Add group_rms_norm kernel to normalize multiple inputs in a single operator.

  Previously, the RMSNorm implementation only supported a single input tensor. With group_rms_norm, multiple tensors can be normalized together:

  ```python
  input_a, input_b, ... = group_rms_norm([input_a, input_b, ...])
  ```

  All input tensors must share the same batch dimension. The kernel partitions work by dynamically assigning warp groups proportional to the last dimension of each input, improving launch efficiency and reducing overhead.

  This MR provides two implementations:
  - GroupRMSNormKernel: optimized for small-to-medium batch sizes
  - GroupRMSNormKernelLargeBatch: contains additional optimizations for large batch sizes

  Both kernels are currently exposed as custom PyTorch ops. A future MR will implement heuristic-based kernel selection and expose a unified interface.

  Signed-off-by: Simeng Liu <simengl@nvidia.com>

* Resolve comments and fix typo with IS_FLASHINFER_AVAILABLE

  Signed-off-by: Simeng Liu <simengl@nvidia.com>

---------

Signed-off-by: Simeng Liu <simengl@nvidia.com>
965 lines
38 KiB
Plaintext
/*
 * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/cudaTypeUtils.cuh"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/memoryUtils.h"

#include <curand_kernel.h>
#include <sys/stat.h>
#include <unordered_map>

#include <sanitizer/asan_interface.h>

namespace tensorrt_llm
{
namespace common
{

#ifdef __has_feature
#if __has_feature(address_sanitizer)
#define TLLM_HAS_ASAN
#endif
#elif defined(__SANITIZE_ADDRESS__)
#define TLLM_HAS_ASAN
#endif

cudaError_t cudaMemcpyAsyncSanitized(
    void* dst, void const* src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream)
{
#if defined(TLLM_HAS_ASAN)
    bool needASAN = false;
    if (kind == cudaMemcpyDeviceToHost)
    {
        needASAN = true;
    }
    else if (kind == cudaMemcpyDefault)
    {
        auto const srcType = getPtrCudaMemoryType(src);
        auto const dstType = getPtrCudaMemoryType(dst);
        needASAN = srcType == cudaMemoryTypeDevice && dstType != cudaMemoryTypeDevice;
    }

    // Poison the memory area during async copy
    if (needASAN)
    {
        ASAN_POISON_MEMORY_REGION(dst, count);
    }

    auto const result = cudaMemcpyAsync(dst, src, count, kind, stream);

    if (result == cudaSuccess && needASAN)
    {
        struct ctxType
        {
            void* ptr;
            size_t count;
        };

        auto const ctx = new ctxType{dst, count};
        auto cb = [](cudaStream_t, cudaError_t, void* data)
        {
            auto const ctx = static_cast<ctxType*>(data);
            ASAN_UNPOISON_MEMORY_REGION(ctx->ptr, ctx->count);
            delete ctx;
        };
        TLLM_CUDA_CHECK(cudaStreamAddCallback(stream, cb, ctx, 0));
    }

    return result;
#else
    return cudaMemcpyAsync(dst, src, count, kind, stream);
#endif
}
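
// Illustrative usage (not part of the original file): under ASAN builds, the destination of a
// device-to-host copy stays poisoned until the copy finishes on the stream, so the host must
// synchronize before touching it. A minimal sketch, assuming `devBuf` already holds `n` floats
// and `stream` is a valid stream (all three names are placeholders):
//
//     std::vector<float> hostBuf(n);
//     TLLM_CUDA_CHECK(cudaMemcpyAsyncSanitized(
//         hostBuf.data(), devBuf, n * sizeof(float), cudaMemcpyDeviceToHost, stream));
//     TLLM_CUDA_CHECK(cudaStreamSynchronize(stream)); // the stream callback unpoisons the region
//     // hostBuf is now safe to read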

template <typename T>
void deviceMalloc(T** ptr, size_t size, bool is_random_initialize)
{
    check_cuda_error(cudaMalloc((void**) (ptr), sizeof(T) * size));
    if (is_random_initialize)
    {
        cudaRandomUniform(*ptr, size);
    }
}

template void deviceMalloc(float** ptr, size_t size, bool is_random_initialize);
template void deviceMalloc(half** ptr, size_t size, bool is_random_initialize);
#ifdef ENABLE_BF16
template void deviceMalloc(__nv_bfloat16** ptr, size_t size, bool is_random_initialize);
#endif
template void deviceMalloc(uint32_t** ptr, size_t size, bool is_random_initialize);
template void deviceMalloc(uint16_t** ptr, size_t size, bool is_random_initialize);
template void deviceMalloc(int** ptr, size_t size, bool is_random_initialize);
template void deviceMalloc(bool** ptr, size_t size, bool is_random_initialize);
template void deviceMalloc(char** ptr, size_t size, bool is_random_initialize);
template void deviceMalloc(int8_t** ptr, size_t size, bool is_random_initialize);
#ifdef ENABLE_FP8
template void deviceMalloc(__nv_fp8_e4m3** ptr, size_t size, bool is_random_initialize);
#endif

template <typename T>
void deviceMemSetZero(T* ptr, size_t size)
{
    check_cuda_error(cudaMemset(static_cast<void*>(ptr), 0, sizeof(T) * size));
}

template void deviceMemSetZero(float* ptr, size_t size);
template void deviceMemSetZero(half* ptr, size_t size);
template void deviceMemSetZero(int* ptr, size_t size);
template void deviceMemSetZero(uint32_t* ptr, size_t size);
template void deviceMemSetZero(bool* ptr, size_t size);
#ifdef ENABLE_FP8
template void deviceMemSetZero(__nv_fp8_e4m3* ptr, size_t size);
#endif
#ifdef ENABLE_BF16
template void deviceMemSetZero(__nv_bfloat16* ptr, size_t size);
#endif

template <typename T>
void deviceFree(T*& ptr)
{
    if (ptr != NULL)
    {
        check_cuda_error(cudaFree(ptr));
        ptr = NULL;
    }
}

template void deviceFree(float*& ptr);
template void deviceFree(half*& ptr);
#ifdef ENABLE_BF16
template void deviceFree(__nv_bfloat16*& ptr);
#endif
template void deviceFree(unsigned short*& ptr);
template void deviceFree(uint32_t*& ptr);
template void deviceFree(int*& ptr);
template void deviceFree(bool*& ptr);
template void deviceFree(char*& ptr);
template void deviceFree(int8_t*& ptr);
#ifdef ENABLE_FP8
template void deviceFree(__nv_fp8_e4m3*& ptr);
#endif
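
// Illustrative usage (not part of the original file): a typical allocate / clear / release cycle
// for a scratch buffer of `n` halves; `n` is a placeholder.
//
//     half* scratch = nullptr;
//     deviceMalloc(&scratch, n, /*is_random_initialize=*/false);
//     deviceMemSetZero(scratch, n);
//     // ... use scratch ...
//     deviceFree(scratch); // frees the allocation and resets the pointer to NULL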

template <typename T>
void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream)
{
    T* arr = new T[size];
    std::fill(arr, arr + size, value);
    check_cuda_error(cudaMemcpyAsync(devptr, arr, sizeof(T) * size, cudaMemcpyHostToDevice, stream));
    delete[] arr;
}

template void deviceFill(float* devptr, size_t size, float value, cudaStream_t stream);
template void deviceFill(half* devptr, size_t size, half value, cudaStream_t stream);
#ifdef ENABLE_BF16
template void deviceFill(__nv_bfloat16* devptr, size_t size, __nv_bfloat16 value, cudaStream_t stream);
#endif
template void deviceFill(int* devptr, size_t size, int value, cudaStream_t stream);
template void deviceFill(bool* devptr, size_t size, bool value, cudaStream_t stream);

template <typename T>
void cudaD2Hcpy(T* tgt, T const* src, const size_t size)
{
    check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyDeviceToHost));
}

template void cudaD2Hcpy(float* tgt, float const* src, size_t size);
template void cudaD2Hcpy(half* tgt, half const* src, size_t size);
#ifdef ENABLE_BF16
template void cudaD2Hcpy(__nv_bfloat16* tgt, __nv_bfloat16 const* src, size_t size);
#endif
template void cudaD2Hcpy(int* tgt, int const* src, size_t size);
template void cudaD2Hcpy(bool* tgt, bool const* src, size_t size);
#ifdef ENABLE_FP8
template void cudaD2Hcpy(__nv_fp8_e4m3* tgt, __nv_fp8_e4m3 const* src, size_t size);
#endif
template void cudaD2Hcpy(unsigned long long* tgt, unsigned long long const* src, size_t size);
template void cudaD2Hcpy(unsigned int* tgt, unsigned int const* src, size_t size);
template void cudaD2Hcpy(int8_t* tgt, int8_t const* src, size_t size);

template <typename T>
void cudaH2Dcpy(T* tgt, T const* src, const size_t size)
{
    check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyHostToDevice));
}

template void cudaH2Dcpy(float* tgt, float const* src, size_t size);
template void cudaH2Dcpy(half* tgt, half const* src, size_t size);
#ifdef ENABLE_BF16
template void cudaH2Dcpy(__nv_bfloat16* tgt, __nv_bfloat16 const* src, size_t size);
#endif
template void cudaH2Dcpy(int* tgt, int const* src, size_t size);
template void cudaH2Dcpy(bool* tgt, bool const* src, size_t size);
#ifdef ENABLE_FP8
template void cudaH2Dcpy(__nv_fp8_e4m3* tgt, __nv_fp8_e4m3 const* src, size_t size);
#endif
template void cudaH2Dcpy(unsigned long long* tgt, unsigned long long const* src, size_t size);
template void cudaH2Dcpy(unsigned int* tgt, unsigned int const* src, size_t size);
template void cudaH2Dcpy(int8_t* tgt, int8_t const* src, size_t size);

template <typename T>
void cudaD2Dcpy(T* tgt, T const* src, const size_t size, cudaStream_t stream)
{
    check_cuda_error(cudaMemcpyAsync(tgt, src, sizeof(T) * size, cudaMemcpyDeviceToDevice, stream));
}

template void cudaD2Dcpy(float* tgt, float const* src, size_t size, cudaStream_t stream);
template void cudaD2Dcpy(half* tgt, half const* src, size_t size, cudaStream_t stream);
#ifdef ENABLE_BF16
template void cudaD2Dcpy(__nv_bfloat16* tgt, __nv_bfloat16 const* src, size_t size, cudaStream_t stream);
#endif
template void cudaD2Dcpy(int* tgt, int const* src, size_t size, cudaStream_t stream);
template void cudaD2Dcpy(bool* tgt, bool const* src, size_t size, cudaStream_t stream);
template void cudaD2Dcpy(int8_t* tgt, int8_t const* src, size_t size, cudaStream_t stream);
#ifdef ENABLE_FP8
template void cudaD2Dcpy(__nv_fp8_e4m3* tgt, __nv_fp8_e4m3 const* src, size_t size, cudaStream_t stream);
#endif
template void cudaD2Dcpy(unsigned long long* tgt, unsigned long long const* src, size_t size, cudaStream_t stream);

template <typename T_OUT, typename T_IN>
__global__ void cudaCast(T_OUT* dst, T_IN* src, const size_t size)
{
    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
    {
        dst[tid] = (T_OUT) ((float) (src[tid]));
    }
}

template <typename T_OUT, typename T_IN>
void invokeCudaCast(T_OUT* dst, T_IN const* const src, const size_t size, cudaStream_t stream)
{
    cudaCast<<<256, 256, 0, stream>>>(dst, src, size);
}

template void invokeCudaCast(float* dst, half const* const src, const size_t size, cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokeCudaCast(float* dst, __nv_bfloat16 const* const src, const size_t size, cudaStream_t stream);
template void invokeCudaCast(__nv_bfloat16* dst, float const* const src, const size_t size, cudaStream_t stream);
template void invokeCudaCast(__nv_bfloat16* dst, half const* const src, const size_t size, cudaStream_t stream);
template void invokeCudaCast(half* dst, __nv_bfloat16 const* const src, const size_t size, cudaStream_t stream);
#endif
#ifdef ENABLE_FP8
template void invokeCudaCast(float* dst, __nv_fp8_e4m3 const* const src, const size_t size, cudaStream_t stream);
template void invokeCudaCast(
    __nv_bfloat16* dst, __nv_fp8_e4m3 const* const src, const size_t size, cudaStream_t stream);
template void invokeCudaCast(half* dst, __nv_fp8_e4m3 const* const src, const size_t size, cudaStream_t stream);
template void invokeCudaCast(__nv_fp8_e4m3* dst, float const* const src, const size_t size, cudaStream_t stream);
template void invokeCudaCast(
    __nv_fp8_e4m3* dst, __nv_bfloat16 const* const src, const size_t size, cudaStream_t stream);
template void invokeCudaCast(__nv_fp8_e4m3* dst, half const* const src, const size_t size, cudaStream_t stream);
#endif

template <typename T>
void cudaAutoCpy(T* tgt, T const* src, const size_t size, cudaStream_t stream)
{
    if (stream != NULL)
    {
        check_cuda_error(cudaMemcpyAsyncSanitized(tgt, src, sizeof(T) * size, cudaMemcpyDefault, stream));
    }
    else
    {
        check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyDefault));
    }
}

template void cudaAutoCpy(float* tgt, float const* src, size_t size, cudaStream_t stream);
template void cudaAutoCpy(half* tgt, half const* src, size_t size, cudaStream_t stream);
#ifdef ENABLE_BF16
template void cudaAutoCpy(__nv_bfloat16* tgt, __nv_bfloat16 const* src, size_t size, cudaStream_t stream);
#endif
template void cudaAutoCpy(int* tgt, int const* src, size_t size, cudaStream_t stream);
template void cudaAutoCpy(bool* tgt, bool const* src, size_t size, cudaStream_t stream);
template void cudaAutoCpy(int8_t* tgt, int8_t const* src, size_t size, cudaStream_t stream);
template void cudaAutoCpy(uint8_t* tgt, uint8_t const* src, size_t size, cudaStream_t stream);
template void cudaAutoCpy(uint32_t* tgt, uint32_t const* src, size_t size, cudaStream_t stream);
template void cudaAutoCpy(unsigned long long* tgt, unsigned long long const* src, size_t size, cudaStream_t stream);
template void cudaAutoCpy(unsigned long* tgt, unsigned long const* src, size_t size, cudaStream_t stream);
template void cudaAutoCpy(char* tgt, char const* src, size_t size, cudaStream_t stream);

template void cudaAutoCpy(float const** tgt, float const* const* src, size_t size, cudaStream_t stream);
template void cudaAutoCpy(half const** tgt, half const* const* src, size_t size, cudaStream_t stream);
#ifdef ENABLE_BF16
template void cudaAutoCpy(__nv_bfloat16 const** tgt, __nv_bfloat16 const* const* src, size_t size, cudaStream_t stream);
#endif
template void cudaAutoCpy(int const** tgt, int const* const* src, size_t size, cudaStream_t stream);
template void cudaAutoCpy(bool const** tgt, bool const* const* src, size_t size, cudaStream_t stream);
template void cudaAutoCpy(int8_t const** tgt, int8_t const* const* src, size_t size, cudaStream_t stream);
template void cudaAutoCpy(
    unsigned long long const** tgt, unsigned long long const* const* src, size_t size, cudaStream_t stream);

template <typename T>
__global__ void cuda_random_uniform_kernel(T* buffer, const size_t size, int const seq_offset)
{
    const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    curandState_t local_state;
    curand_init((unsigned long long int) 1337, idx + seq_offset, 0, &local_state);
    for (size_t index = idx; index < size; index += blockDim.x * gridDim.x)
    {
        buffer[index] = (T) (curand_uniform(&local_state) * 0.2f - 0.1f);
    }
}

template <>
__global__ void cuda_random_uniform_kernel<int>(int* buffer, const size_t size, int const seq_offset)
{
    const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    curandState_t local_state;
    curand_init((float) 1337.f, idx + seq_offset, 0, &local_state);
    for (size_t index = idx; index < size; index += blockDim.x * gridDim.x)
    {
        buffer[index] = curand(&local_state);
    }
}

template <>
__global__ void cuda_random_uniform_kernel<bool>(bool* buffer, const size_t size, int const seq_offset)
{
    const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    curandState_t local_state;
    curand_init((float) 1337.f, idx + seq_offset, 0, &local_state);
    for (size_t index = idx; index < size; index += blockDim.x * gridDim.x)
    {
        buffer[index] = (curand(&local_state) % 2 == 0);
    }
}

template <>
__global__ void cuda_random_uniform_kernel<char>(char* buffer, const size_t size, int const seq_offset)
{
    const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    curandState_t local_state;
    curand_init((float) 1337.f, idx + seq_offset, 0, &local_state);
    for (size_t index = idx; index < size; index += blockDim.x * gridDim.x)
    {
        buffer[index] = curand(&local_state) % 0xFF;
    }
}

template <typename T>
void cudaRandomUniform(T* buffer, const size_t size)
{
    static int seq_offset = 0;
    cuda_random_uniform_kernel<T><<<256, 256>>>(buffer, size, seq_offset);
    seq_offset += 256 * 256;
}

template void cudaRandomUniform(float* buffer, const size_t size);
template void cudaRandomUniform(half* buffer, const size_t size);
#ifdef ENABLE_BF16
template void cudaRandomUniform(__nv_bfloat16* buffer, const size_t size);
#endif
template void cudaRandomUniform(int* buffer, const size_t size);
template void cudaRandomUniform(bool* buffer, const size_t size);
template void cudaRandomUniform(char* buffer, const size_t size);
#ifdef ENABLE_FP8
template void cudaRandomUniform(__nv_fp8_e4m3* buffer, const size_t size);
#endif

// loads data from binary file. If it succeeds, returns a non-empty vector. If loading fails or
// the product of the elements in shape is 0, this function will return an empty vector.
template <typename T>
std::vector<T> loadWeightFromBinHelper(std::vector<size_t> shape, std::string filename)
{
    if (shape.size() > 2)
    {
        printf("[ERROR] shape should have at most two dims \n");
        return std::vector<T>();
    }
    size_t dim0 = shape[0], dim1 = 1;
    if (shape.size() == 2)
    {
        dim1 = shape[1];
    }
    size_t size = dim0 * dim1;
    if (size == 0)
    {
        TLLM_LOG_WARNING("shape is zero, skip loading weight from file %s \n", filename.c_str());
        return std::vector<T>();
    }

    std::vector<T> host_array(size);
    std::ifstream in(filename, std::ios::in | std::ios::binary);
    if (!in.is_open())
    {
        TLLM_LOG_WARNING("file %s cannot be opened, loading model fails! \n", filename.c_str());
        return std::vector<T>();
    }

    size_t loaded_data_size = sizeof(T) * size;
    in.seekg(0, in.end);
    in.seekg(0, in.beg);

    TLLM_LOG_DEBUG("Read " + std::to_string(loaded_data_size) + " bytes from " + filename);
    in.read((char*) host_array.data(), loaded_data_size);

    size_t in_get_size = in.gcount();
    if (in_get_size != loaded_data_size)
    {
        TLLM_LOG_WARNING("file %s only has %ld bytes, but %ld bytes are requested, loading model fails! \n",
            filename.c_str(), in_get_size, loaded_data_size);
        return std::vector<T>();
    }
    in.close();
    // If we succeed, return an array with values.
    return host_array;
}

template <typename T, typename T_IN>
int loadWeightFromBinFunc(T* ptr, std::vector<size_t> shape, std::string filename)
{
    std::vector<T_IN> host_array = loadWeightFromBinHelper<T_IN>(shape, filename);

    if (host_array.empty())
    {
        return 0;
    }

    if (std::is_same<T, T_IN>::value == true)
    {
        cudaH2Dcpy(ptr, (T*) host_array.data(), host_array.size());
    }
    else
    {
        T_IN* ptr_2 = nullptr;
        deviceMalloc(&ptr_2, host_array.size(), false);
        cudaH2Dcpy(ptr_2, host_array.data(), host_array.size());
        invokeCudaD2DcpyConvert(ptr, ptr_2, host_array.size());
        deviceFree(ptr_2);
    }
    return 0;
}

template int loadWeightFromBinFunc<float, float>(float* ptr, std::vector<size_t> shape, std::string filename);
template int loadWeightFromBinFunc<half, float>(half* ptr, std::vector<size_t> shape, std::string filename);
template int loadWeightFromBinFunc<float, half>(float* ptr, std::vector<size_t> shape, std::string filename);
template int loadWeightFromBinFunc<half, half>(half* ptr, std::vector<size_t> shape, std::string filename);
template int loadWeightFromBinFunc<int8_t, int8_t>(int8_t* ptr, std::vector<size_t> shape, std::string filename);
#ifdef ENABLE_BF16
template int loadWeightFromBinFunc<__nv_bfloat16, float>(
    __nv_bfloat16* ptr, std::vector<size_t> shape, std::string filename);
template int loadWeightFromBinFunc<__nv_bfloat16, half>(
    __nv_bfloat16* ptr, std::vector<size_t> shape, std::string filename);
template int loadWeightFromBinFunc<float, __nv_bfloat16>(float* ptr, std::vector<size_t> shape, std::string filename);
template int loadWeightFromBinFunc<half, __nv_bfloat16>(half* ptr, std::vector<size_t> shape, std::string filename);
template int loadWeightFromBinFunc<__nv_bfloat16, __nv_bfloat16>(
    __nv_bfloat16* ptr, std::vector<size_t> shape, std::string filename);
#endif // ENABLE_BF16
template int loadWeightFromBinFunc<int, int>(int* ptr, std::vector<size_t> shape, std::string filename);
#ifdef ENABLE_FP8
template int loadWeightFromBinFunc<__nv_fp8_e4m3, float>(
    __nv_fp8_e4m3* ptr, std::vector<size_t> shape, std::string filename);
#endif // ENABLE_FP8

template <typename T>
int loadWeightFromBin(T* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type)
{
    switch (model_file_type)
    {
    case TRTLLMCudaDataType::FP32: loadWeightFromBinFunc<T, float>(ptr, shape, filename); break;
    case TRTLLMCudaDataType::FP16: loadWeightFromBinFunc<T, half>(ptr, shape, filename); break;
    case TRTLLMCudaDataType::INT8: loadWeightFromBinFunc<T, int8_t>(ptr, shape, filename); break;
#ifdef ENABLE_BF16
    case TRTLLMCudaDataType::BF16: loadWeightFromBinFunc<T, __nv_bfloat16>(ptr, shape, filename); break;
#endif
#ifdef ENABLE_FP8
    case TRTLLMCudaDataType::FP8: loadWeightFromBinFunc<T, float>(ptr, shape, filename); break;
#endif
    default: TLLM_LOG_ERROR("Does not support TRTLLMCudaDataType=%d", model_file_type); TLLM_CHECK(false);
    }
    return 0;
}

template <>
int loadWeightFromBin(int* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type)
{
    loadWeightFromBinFunc<int, int>(ptr, shape, filename);
    return 0;
}

template int loadWeightFromBin(
    float* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type);
template int loadWeightFromBin(
    half* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type);
template int loadWeightFromBin(
    int8_t* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type);
#ifdef ENABLE_BF16
template int loadWeightFromBin(
    __nv_bfloat16* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type);
#endif
#ifdef ENABLE_FP8
template int loadWeightFromBin(
    __nv_fp8_e4m3* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type);
#endif
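
// Illustrative usage (not part of the original file): load a row-major [rows, cols] FP32 weight
// file into a half-precision device buffer; when T differs from the file type, the helper stages
// the data in a temporary device buffer and converts it with invokeCudaD2DcpyConvert.
// `rows`, `cols`, and the file path are placeholders.
//
//     half* d_weight = nullptr;
//     deviceMalloc(&d_weight, rows * cols, /*is_random_initialize=*/false);
//     loadWeightFromBin(d_weight, {rows, cols}, "/path/to/weight.bin", TRTLLMCudaDataType::FP32);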

template <typename T_IN, typename T_OUT>
__global__ void cudaD2DcpyConvert(T_OUT* dst, const T_IN* src, const size_t size)
{
    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
    {
        dst[tid] = cuda_cast<T_OUT>(src[tid]);
    }
}

template <typename T_IN, typename T_OUT>
void invokeCudaD2DcpyConvert(T_OUT* tgt, const T_IN* src, const size_t size, cudaStream_t stream)
{
    cudaD2DcpyConvert<<<256, 256, 0, stream>>>(tgt, src, size);
}

template void invokeCudaD2DcpyConvert(int8_t* tgt, float const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(float* tgt, int8_t const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(float* tgt, int const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(half* tgt, int const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(float* tgt, float const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(half* tgt, float const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(float* tgt, half const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(uint32_t* tgt, int const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(int* tgt, uint32_t const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(int* tgt, float const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(int* tgt, half const* src, const size_t size, cudaStream_t stream);

#ifdef ENABLE_BF16
template void invokeCudaD2DcpyConvert(__nv_bfloat16* tgt, float const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(__nv_bfloat16* tgt, int const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(float* tgt, __nv_bfloat16 const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(int* tgt, __nv_bfloat16 const* src, const size_t size, cudaStream_t stream);
#endif // ENABLE_BF16
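
// Illustrative usage (not part of the original file): down-convert an FP32 device buffer into an
// FP16 device buffer of the same element count. `d_dst`, `d_src`, `n`, and `stream` are
// placeholders; element i becomes cuda_cast<half>(d_src[i]) and the launch is asynchronous.
//
//     invokeCudaD2DcpyConvert(d_dst /* half* */, d_src /* float const* */, n, stream);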

template <typename T_IN, typename T_OUT>
__global__ void cudaD2DScaleCpyConvert(
    T_OUT* dst, const T_IN* src, float const* scale, bool invert_scale, const size_t size)
{
    float const scale_value = invert_scale ? 1.0f / scale[0] : scale[0];
    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
    {
        dst[tid] = cuda_cast<T_OUT>(cuda_cast<float>(src[tid]) * scale_value);
    }
}

template <typename T_IN, typename T_OUT>
void invokeCudaD2DScaleCpyConvert(
    T_OUT* tgt, const T_IN* src, float const* scale, bool invert_scale, const size_t size, cudaStream_t stream)
{
    cudaD2DScaleCpyConvert<<<256, 256, 0, stream>>>(tgt, src, scale, invert_scale, size);
}

// clang-format off
template void invokeCudaD2DScaleCpyConvert(float* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const float* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
template void invokeCudaD2DScaleCpyConvert(half* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const half* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokeCudaD2DScaleCpyConvert(__nv_bfloat16* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const __nv_bfloat16* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
#endif // ENABLE_BF16
#ifdef ENABLE_FP8
template void invokeCudaD2DScaleCpyConvert(float* tgt, const __nv_fp8_e4m3* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
#endif // ENABLE_FP8
// clang-format on

void invokeCudaD2DcpyHalf2Float(float* dst, half* src, const size_t size, cudaStream_t stream)
{
    invokeCudaD2DcpyConvert(dst, src, size, stream);
}

void invokeCudaD2DcpyFloat2Half(half* dst, float* src, const size_t size, cudaStream_t stream)
{
    invokeCudaD2DcpyConvert(dst, src, size, stream);
}

template <typename T>
void saveToBinary(T const* ptr, const size_t size, std::string filename)
{
    std::vector<T> h_ptr(size);
    cudaD2Hcpy(h_ptr.data(), ptr, size);
    std::vector<float> float_ptr(size);
    for (size_t i = 0; i < size; i++)
    {
        float_ptr[i] = (float) h_ptr[i];
    }

    std::ofstream out(filename, std::ios::out | std::ios::binary);
    TLLM_CHECK_WITH_INFO(out.is_open(), "Fail to open file " + filename);

    out.write((char*) float_ptr.data(), size * sizeof(float));
}

template void saveToBinary(float const* ptr, const size_t size, std::string filename);
template void saveToBinary(half const* ptr, const size_t size, std::string filename);
#ifdef ENABLE_BF16
template void saveToBinary(__nv_bfloat16 const* ptr, const size_t size, std::string filename);
#endif // ENABLE_BF16

template <>
void saveToBinary(int const* ptr, const size_t size, std::string filename)
{
    std::vector<int> h_ptr(size);
    cudaD2Hcpy(h_ptr.data(), ptr, size);
    std::ofstream out(filename, std::ios::out | std::ios::binary);
    TLLM_CHECK_WITH_INFO(out.is_open(), "Fail to open file " + filename);
    out.write((char*) h_ptr.data(), size * sizeof(int));
}

template <typename T_IN, typename T_fake_type>
__global__ void fakeCast(T_IN* input_ptr, const size_t size)
{
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x)
    {
        T_fake_type tmp_val = (T_fake_type) ((float) input_ptr[i]);
        input_ptr[i] = (T_IN) ((float) tmp_val);
    }
}

template <typename T_IN, typename T_fake_type>
void invokeFakeCast(T_IN* input_ptr, const size_t size, cudaStream_t stream)
{
    dim3 block(256);
    dim3 grid((size + 255) / 256);
    fakeCast<T_IN, T_fake_type><<<grid, block, 0, stream>>>(input_ptr, size);
}

#ifdef ENABLE_FP8
__global__ void cudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size)
{
    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
    {
        dst[tid] = (float) (src[tid]);
    }
}

void invokeCudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream)
{
    cudaD2Dcpyfp82Float<<<256, 256, 0, stream>>>(dst, src, size);
}

__global__ void cudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size)
{
    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
    {
        dst[tid] = (half) ((float) (src[tid]));
    }
}

void invokeCudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream)
{
    cudaD2Dcpyfp82Half<<<256, 256, 0, stream>>>(dst, src, size);
}

__global__ void cudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, const size_t size)
{
    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
    {
        dst[tid] = (__nv_fp8_e4m3) src[tid];
    }
}

void invokeCudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, const size_t size, cudaStream_t stream)
{
    cudaD2DcpyFloat2fp8<<<256, 256, 0, stream>>>(dst, src, size);
}

__global__ void cudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, const size_t size)
{
    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
    {
        dst[tid] = (__nv_fp8_e4m3) src[tid];
    }
}

void invokeCudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, const size_t size, cudaStream_t stream)
{
    cudaD2DcpyHalf2fp8<<<256, 256, 0, stream>>>(dst, src, size);
}

__global__ void cudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const size_t size)
{
    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
    {
        dst[tid] = (__nv_fp8_e4m3) src[tid];
    }
}

void invokeCudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const size_t size, cudaStream_t stream)
{
    cudaD2DcpyBfloat2fp8<<<256, 256, 0, stream>>>(dst, src, size);
}

#endif // ENABLE_FP8

template <typename T_OUT, typename T_IN>
__global__ void transpose(T_OUT* dst, T_IN* src, const size_t dim0, const size_t dim1)
{
    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1; tid += blockDim.x * gridDim.x)
    {
        const size_t src_col_id = tid % dim1;
        const size_t src_row_id = tid / dim1;
        dst[src_col_id * dim0 + src_row_id] = (T_OUT) (src[tid]);
    }
}

template <typename T>
void invokeInPlaceTranspose(T* data, T* workspace, const size_t dim0, const size_t dim1)
{
    // copy data to workspace, and then transpose from workspace to data
    cudaD2Dcpy(workspace, data, dim0 * dim1);
    transpose<<<256, 256>>>(data, workspace, dim0, dim1);
}

#ifdef ENABLE_FP8
template void invokeInPlaceTranspose(
    __nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const size_t dim0, const size_t dim1);
#endif // ENABLE_FP8
#ifdef ENABLE_BF16
template void invokeInPlaceTranspose(
    __nv_bfloat16* data, __nv_bfloat16* workspace, const size_t dim0, const size_t dim1);
#endif // ENABLE_BF16
template void invokeInPlaceTranspose(float* data, float* workspace, const size_t dim0, const size_t dim1);

template <typename T_OUT, typename T_IN>
__global__ void transpose0213(
    T_OUT* dst, T_IN* src, const size_t dim0, const size_t dim1, const size_t dim2, const size_t dim3)
{
    // src permutation: [0, 1, 2, 3]
    // dst permutation: [0, 2, 1, 3]
    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1 * dim2 * dim3;
         tid += blockDim.x * gridDim.x)
    {
        size_t tmp_idx = tid;
        const size_t dim_3_idx = tmp_idx % dim3;
        tmp_idx = (tmp_idx - dim_3_idx) / dim3;
        const size_t dim_2_idx = tmp_idx % dim2;
        tmp_idx = (tmp_idx - dim_2_idx) / dim2;
        const size_t dim_1_idx = tmp_idx % dim1;
        tmp_idx = (tmp_idx - dim_1_idx) / dim1;
        const size_t dim_0_idx = tmp_idx % dim0;
        dst[dim_0_idx * dim1 * dim2 * dim3 + dim_2_idx * dim1 * dim3 + dim_1_idx * dim3 + dim_3_idx] = src[tid];
    }
}

template <typename T>
void invokeInPlaceTranspose0213(
    T* data, T* workspace, const size_t dim0, const size_t dim1, const size_t dim2, const size_t dim3)
{
    // copy data to workspace, and then transpose from workspace to data
    // Note that this kernel is used for pre-processing and not very efficient.
    cudaD2Dcpy(workspace, data, dim0 * dim1 * dim2 * dim3);
    transpose0213<<<256, 256>>>(data, workspace, dim0, dim1, dim2, dim3);
}

#ifdef ENABLE_FP8
template void invokeInPlaceTranspose0213(__nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const size_t dim0,
    const size_t dim1, const size_t dim2, const size_t dim3);
#endif // ENABLE_FP8
#ifdef ENABLE_BF16
template void invokeInPlaceTranspose0213(__nv_bfloat16* data, __nv_bfloat16* workspace, const size_t dim0,
    const size_t dim1, const size_t dim2, const size_t dim3);
#endif // ENABLE_BF16
template void invokeInPlaceTranspose0213(
    float* data, float* workspace, const size_t dim0, const size_t dim1, const size_t dim2, const size_t dim3);

template <typename T_OUT, typename T_IN>
__global__ void transpose102(T_OUT* dst, T_IN* src, const size_t dim0, const size_t dim1, const size_t dim2)
{
    // src permutation: [0, 1, 2]
    // dst permutation: [1, 0, 2]
    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1 * dim2; tid += blockDim.x * gridDim.x)
    {
        size_t tmp_idx = tid;
        const size_t dim_2_idx = tmp_idx % dim2;
        tmp_idx = (tmp_idx - dim_2_idx) / dim2;
        const size_t dim_1_idx = tmp_idx % dim1;
        tmp_idx = (tmp_idx - dim_1_idx) / dim1;
        const size_t dim_0_idx = tmp_idx % dim0;
        dst[dim_1_idx * dim0 * dim2 + dim_0_idx * dim2 + dim_2_idx] = src[tid];
    }
}

template <typename T>
void invokeInPlaceTranspose102(T* data, T* workspace, const size_t dim0, const size_t dim1, const size_t dim2)
{
    // copy data to workspace, and then transpose from workspace to data
    // Note that this kernel is used for pre-processing and not very efficient.
    cudaD2Dcpy(workspace, data, dim0 * dim1 * dim2);
    transpose102<<<256, 256>>>(data, workspace, dim0, dim1, dim2);
}

#ifdef ENABLE_FP8
template void invokeInPlaceTranspose102(
    __nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const size_t dim0, const size_t dim1, const size_t dim2);
#endif // ENABLE_FP8
#ifdef ENABLE_BF16
template void invokeInPlaceTranspose102(
    __nv_bfloat16* data, __nv_bfloat16* workspace, const size_t dim0, const size_t dim1, const size_t dim2);
#endif // ENABLE_BF16
template void invokeInPlaceTranspose102(
    float* data, float* workspace, const size_t dim0, const size_t dim1, const size_t dim2);

template <typename T>
void __global__ multiplyScale(T* tensor, float scale, const size_t size)
{
    for (size_t index = threadIdx.x + blockIdx.x * blockDim.x; index < size; index += blockDim.x * gridDim.x)
    {
        tensor[index] = (T) (((float) tensor[index]) * scale);
    }
}

template <typename T>
void invokeMultiplyScale(T* tensor, float scale, const size_t size, cudaStream_t stream)
{
    int block = 256;
    int grid = (size + 255) / 256;
    multiplyScale<<<grid, block, 0, stream>>>(tensor, scale, size);
}

template void invokeMultiplyScale(float* tensor, float scale, const size_t size, cudaStream_t stream);
template void invokeMultiplyScale(half* tensor, float scale, const size_t size, cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokeMultiplyScale(__nv_bfloat16* tensor, float scale, const size_t size, cudaStream_t stream);
#endif
#ifdef ENABLE_FP8
template void invokeMultiplyScale(__nv_fp8_e4m3* tensor, float scale, const size_t size, cudaStream_t stream);
#endif

template <typename T>
void __global__ divideScale(T* tensor, float scale, const size_t size)
{
    for (size_t index = threadIdx.x + blockIdx.x * blockDim.x; index < size; index += blockDim.x * gridDim.x)
    {
        tensor[index] = (T) (((float) tensor[index]) / scale);
    }
}

template <typename T>
void invokeDivideScale(T* tensor, float scale, const size_t size, cudaStream_t stream)
{
    int block = 256;
    int grid = (size + 255) / 256;
    divideScale<<<grid, block, 0, stream>>>(tensor, scale, size);
}

template void invokeDivideScale(float* tensor, float scale, const size_t size, cudaStream_t stream);
template void invokeDivideScale(half* tensor, float scale, const size_t size, cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokeDivideScale(__nv_bfloat16* tensor, float scale, const size_t size, cudaStream_t stream);
#endif
#ifdef ENABLE_FP8
template void invokeDivideScale(__nv_fp8_e4m3* tensor, float scale, const size_t size, cudaStream_t stream);
#endif
#ifdef ENABLE_BF16
template void invokeFakeCast<float, __nv_bfloat16>(float* input_ptr, const size_t size, cudaStream_t stream);
template void invokeFakeCast<__nv_bfloat16, __nv_bfloat16>(
    __nv_bfloat16* input_ptr, const size_t size, cudaStream_t stream);
template void invokeFakeCast<half, __nv_bfloat16>(half* input_ptr, const size_t size, cudaStream_t stream);
#endif
template void invokeFakeCast<float, half>(float* input_ptr, const size_t size, cudaStream_t stream);
template void invokeFakeCast<float, float>(float* input_ptr, const size_t size, cudaStream_t stream);
#ifdef ENABLE_FP8
template void invokeFakeCast<float, __nv_fp8_e4m3>(float* input_ptr, const size_t size, cudaStream_t stream);
template void invokeFakeCast<half, __nv_fp8_e4m3>(half* input_ptr, const size_t size, cudaStream_t stream);
template void invokeFakeCast<__nv_bfloat16, __nv_fp8_e4m3>(
    __nv_bfloat16* input_ptr, const size_t size, cudaStream_t stream);
#endif

size_t cuda_datatype_size(TRTLLMCudaDataType dt)
{
    static const std::unordered_map<TRTLLMCudaDataType, size_t> sizes{
        {TRTLLMCudaDataType::FP32, sizeof(float)}, {TRTLLMCudaDataType::FP16, sizeof(half)}
#ifdef ENABLE_BF16
        ,
        {TRTLLMCudaDataType::BF16, sizeof(__nv_bfloat16)}
#endif
    };

    return sizes.at(dt);
}

template <typename T>
__global__ void check_range(T const* buffer, size_t size, T min, T max, bool* d_within_range)
{
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x)
    {
        const T val = buffer[i];
        if (val < min || val > max)
        {
            *d_within_range = false;
        }
    }
}

template <typename T>
bool invokeCheckRange(T const* buffer, const size_t size, T min, T max, bool* d_within_range, cudaStream_t stream)
{
    cudaMemsetAsync(d_within_range, true, sizeof(bool), stream);

    dim3 block(256);
    dim3 grid((size + 255) / 256);
    check_range<T><<<grid, block, 0, stream>>>(buffer, size, min, max, d_within_range);

    bool result;
    cudaD2Hcpy(&result, d_within_range, 1);
    return result;
}

template bool invokeCheckRange<int>(
    int const* buffer, const size_t size, int min, int max, bool* d_within_range, cudaStream_t stream);
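
// Illustrative usage (not part of the original file): verify that a buffer of token ids lies
// inside the vocabulary range before indexing with it. `d_token_ids`, `n`, `vocabSize`, `d_flag`,
// and `stream` are placeholders; `d_flag` must point to a single device-side bool.
//
//     bool ok = invokeCheckRange<int>(d_token_ids, n, 0, vocabSize - 1, d_flag, stream);
//     TLLM_CHECK_WITH_INFO(ok, "token id out of range");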

/*
 * Determine the total workspace size based on a vector containing multiple variable sizes.
 */
size_t calcAlignedSize(std::vector<size_t> const& sizes, const size_t ALIGN_BYTES)
{
    const size_t ALIGN_MASK = ~(ALIGN_BYTES - 1);
    // Check ALIGN_BYTES is a power of 2
    assert((ALIGN_BYTES & (ALIGN_BYTES - 1)) == 0);

    size_t total = 0;
    for (auto sz : sizes)
    {
        total += (sz + ALIGN_BYTES - 1) & ALIGN_MASK;
    }

    // We add extra "ALIGN_BYTES - 1" bytes in case the start address passed to the function calcAlignedPointers() is
    // not aligned.
    return total + ALIGN_BYTES - 1;
}

/*
 * Given the address of the workspace and the vector containing multiple variable sizes, calculate the start addresses
 * of each variable.
 */
void calcAlignedPointers(
    std::vector<void*>& outPtrs, void const* p, std::vector<size_t> const& sizes, size_t ALIGN_BYTES)
{
    const size_t ALIGN_MASK = ~(ALIGN_BYTES - 1);
    // Check ALIGN_BYTES is a power of 2
    assert((ALIGN_BYTES & (ALIGN_BYTES - 1)) == 0);

    // In case the start address is not aligned
    char* ptr = reinterpret_cast<char*>((reinterpret_cast<size_t>(p) + ALIGN_BYTES - 1) & ALIGN_MASK);

    outPtrs.reserve(sizes.size());
    for (auto sz : sizes)
    {
        outPtrs.push_back(ptr);
        ptr += (sz + ALIGN_BYTES - 1) & ALIGN_MASK;
    }
}
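
// Illustrative usage (not part of the original file): carve one device allocation into three
// aligned sub-buffers. The sizes and the 128-byte alignment below are placeholders; the two
// helpers must be called with the same size list and the same alignment.
//
//     std::vector<size_t> const partSizes{bytesA, bytesB, bytesC};
//     void* workspace = nullptr;
//     check_cuda_error(cudaMalloc(&workspace, calcAlignedSize(partSizes, 128)));
//     std::vector<void*> parts;
//     calcAlignedPointers(parts, workspace, partSizes, 128);
//     // parts[0], parts[1], parts[2] are 128-byte aligned and non-overlapping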

} // namespace common
} // namespace tensorrt_llm