/*
 * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/cudaTypeUtils.cuh"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/memoryUtils.h"

#include <algorithm>
#include <curand_kernel.h>
#include <fstream>
#include <unordered_map>

namespace tensorrt_llm
{
namespace common
{

#ifdef __has_feature
#if __has_feature(address_sanitizer)
#define TLLM_HAS_ASAN
#endif
#elif defined(__SANITIZE_ADDRESS__)
#define TLLM_HAS_ASAN
#endif

#if defined(TLLM_HAS_ASAN)
#include <sanitizer/asan_interface.h>
#endif

cudaError_t cudaMemcpyAsyncSanitized(
    void* dst, void const* src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream)
{
#if defined(TLLM_HAS_ASAN)
    bool needASAN = false;
    if (kind == cudaMemcpyDeviceToHost)
    {
        needASAN = true;
    }
    else if (kind == cudaMemcpyDefault)
    {
        auto const srcType = getPtrCudaMemoryType(src);
        auto const dstType = getPtrCudaMemoryType(dst);
        needASAN = srcType == cudaMemoryTypeDevice && dstType != cudaMemoryTypeDevice;
    }

    // Poison the destination memory area while the async copy is in flight.
    if (needASAN)
    {
        ASAN_POISON_MEMORY_REGION(dst, count);
    }

    auto const result = cudaMemcpyAsync(dst, src, count, kind, stream);
    if (result == cudaSuccess && needASAN)
    {
        struct ctxType
        {
            void* ptr;
            size_t count;
        };

        auto const ctx = new ctxType{dst, count};
        auto cb = [](cudaStream_t, cudaError_t, void* data)
        {
            auto const ctx = static_cast<ctxType*>(data);
            ASAN_UNPOISON_MEMORY_REGION(ctx->ptr, ctx->count);
            delete ctx;
        };
        TLLM_CUDA_CHECK(cudaStreamAddCallback(stream, cb, ctx, 0));
    }
    return result;
#else
    return cudaMemcpyAsync(dst, src, count, kind, stream);
#endif
}

template <typename T>
void deviceMalloc(T** ptr, size_t size, bool is_random_initialize)
{
    check_cuda_error(cudaMalloc((void**) (ptr), sizeof(T) * size));
    if (is_random_initialize)
    {
        cudaRandomUniform(*ptr, size);
    }
}

template void deviceMalloc(float** ptr, size_t size, bool is_random_initialize);
template void deviceMalloc(half** ptr, size_t size, bool is_random_initialize);
#ifdef ENABLE_BF16
template void deviceMalloc(__nv_bfloat16** ptr, size_t size, bool is_random_initialize);
#endif
template void deviceMalloc(uint32_t** ptr, size_t size, bool is_random_initialize);
template void deviceMalloc(uint16_t** ptr, size_t size, bool is_random_initialize);
template void deviceMalloc(int** ptr, size_t size, bool is_random_initialize);
template void deviceMalloc(bool** ptr, size_t size, bool is_random_initialize);
template void deviceMalloc(char** ptr, size_t size, bool is_random_initialize);
template void deviceMalloc(int8_t** ptr, size_t size, bool is_random_initialize);
#ifdef ENABLE_FP8
template void deviceMalloc(__nv_fp8_e4m3** ptr, size_t size, bool is_random_initialize);
#endif

template <typename T>
void deviceMemSetZero(T* ptr, size_t size)
{
    check_cuda_error(cudaMemset(static_cast<void*>(ptr), 0, sizeof(T) * size));
}

template void deviceMemSetZero(float* ptr, size_t size);
template void deviceMemSetZero(half* ptr, size_t size);
template void deviceMemSetZero(int* ptr, size_t size);
template void deviceMemSetZero(uint32_t* ptr, size_t size);
template void deviceMemSetZero(bool* ptr, size_t size);
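// Usage sketch (illustrative, not part of the original API surface): typical lifecycle of the
// device buffer helpers above, assuming the declarations from memoryUtils.h are visible and a
// CUDA context is initialized. The buffer name and size are hypothetical.
//
//     float* d_buf = nullptr;
//     deviceMalloc(&d_buf, 1024, /*is_random_initialize=*/false); // plain cudaMalloc
//     deviceMemSetZero(d_buf, 1024);                              // cudaMemset to all zeros
//     deviceFree(d_buf);                                          // cudaFree + reset to NULL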
#ifdef ENABLE_FP8
template void deviceMemSetZero(__nv_fp8_e4m3* ptr, size_t size);
#endif
#ifdef ENABLE_BF16
template void deviceMemSetZero(__nv_bfloat16* ptr, size_t size);
#endif

template <typename T>
void deviceFree(T*& ptr)
{
    if (ptr != NULL)
    {
        check_cuda_error(cudaFree(ptr));
        ptr = NULL;
    }
}

template void deviceFree(float*& ptr);
template void deviceFree(half*& ptr);
#ifdef ENABLE_BF16
template void deviceFree(__nv_bfloat16*& ptr);
#endif
template void deviceFree(unsigned short*& ptr);
template void deviceFree(uint32_t*& ptr);
template void deviceFree(int*& ptr);
template void deviceFree(bool*& ptr);
template void deviceFree(char*& ptr);
template void deviceFree(int8_t*& ptr);
#ifdef ENABLE_FP8
template void deviceFree(__nv_fp8_e4m3*& ptr);
#endif

template <typename T>
void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream)
{
    T* arr = new T[size];
    std::fill(arr, arr + size, value);
    check_cuda_error(cudaMemcpyAsync(devptr, arr, sizeof(T) * size, cudaMemcpyHostToDevice, stream));
    delete[] arr;
}

template void deviceFill(float* devptr, size_t size, float value, cudaStream_t stream);
template void deviceFill(half* devptr, size_t size, half value, cudaStream_t stream);
#ifdef ENABLE_BF16
template void deviceFill(__nv_bfloat16* devptr, size_t size, __nv_bfloat16 value, cudaStream_t stream);
#endif
template void deviceFill(int* devptr, size_t size, int value, cudaStream_t stream);
template void deviceFill(bool* devptr, size_t size, bool value, cudaStream_t stream);

template <typename T>
void cudaD2Hcpy(T* tgt, T const* src, const size_t size)
{
    check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyDeviceToHost));
}

template void cudaD2Hcpy(float* tgt, float const* src, size_t size);
template void cudaD2Hcpy(half* tgt, half const* src, size_t size);
#ifdef ENABLE_BF16
template void cudaD2Hcpy(__nv_bfloat16* tgt, __nv_bfloat16 const* src, size_t size);
#endif
template void cudaD2Hcpy(int* tgt, int const* src, size_t size);
template void cudaD2Hcpy(bool* tgt, bool const* src, size_t size);
#ifdef ENABLE_FP8
template void cudaD2Hcpy(__nv_fp8_e4m3* tgt, __nv_fp8_e4m3 const* src, size_t size);
#endif
template void cudaD2Hcpy(unsigned long long* tgt, unsigned long long const* src, size_t size);
template void cudaD2Hcpy(unsigned int* tgt, unsigned int const* src, size_t size);
template void cudaD2Hcpy(int8_t* tgt, int8_t const* src, size_t size);

template <typename T>
void cudaH2Dcpy(T* tgt, T const* src, const size_t size)
{
    check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyHostToDevice));
}

template void cudaH2Dcpy(float* tgt, float const* src, size_t size);
template void cudaH2Dcpy(half* tgt, half const* src, size_t size);
#ifdef ENABLE_BF16
template void cudaH2Dcpy(__nv_bfloat16* tgt, __nv_bfloat16 const* src, size_t size);
#endif
template void cudaH2Dcpy(int* tgt, int const* src, size_t size);
template void cudaH2Dcpy(bool* tgt, bool const* src, size_t size);
#ifdef ENABLE_FP8
template void cudaH2Dcpy(__nv_fp8_e4m3* tgt, __nv_fp8_e4m3 const* src, size_t size);
#endif
template void cudaH2Dcpy(unsigned long long* tgt, unsigned long long const* src, size_t size);
template void cudaH2Dcpy(unsigned int* tgt, unsigned int const* src, size_t size);
template void cudaH2Dcpy(int8_t* tgt, int8_t const* src, size_t size);
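// Usage sketch (illustrative only): stage host data on the device and read it back with the
// blocking copy helpers above. All names are hypothetical.
//
//     std::vector<int> h_in(n, 42), h_out(n);
//     int* d_buf = nullptr;
//     deviceMalloc(&d_buf, n, false);
//     cudaH2Dcpy(d_buf, h_in.data(), n);  // host -> device (synchronous cudaMemcpy)
//     cudaD2Hcpy(h_out.data(), d_buf, n); // device -> host (synchronous cudaMemcpy)
//     deviceFree(d_buf);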
template <typename T>
void cudaD2Dcpy(T* tgt, T const* src, const size_t size, cudaStream_t stream)
{
    check_cuda_error(cudaMemcpyAsync(tgt, src, sizeof(T) * size, cudaMemcpyDeviceToDevice, stream));
}

template void cudaD2Dcpy(float* tgt, float const* src, size_t size, cudaStream_t stream);
template void cudaD2Dcpy(half* tgt, half const* src, size_t size, cudaStream_t stream);
#ifdef ENABLE_BF16
template void cudaD2Dcpy(__nv_bfloat16* tgt, __nv_bfloat16 const* src, size_t size, cudaStream_t stream);
#endif
template void cudaD2Dcpy(int* tgt, int const* src, size_t size, cudaStream_t stream);
template void cudaD2Dcpy(bool* tgt, bool const* src, size_t size, cudaStream_t stream);
template void cudaD2Dcpy(int8_t* tgt, int8_t const* src, size_t size, cudaStream_t stream);
#ifdef ENABLE_FP8
template void cudaD2Dcpy(__nv_fp8_e4m3* tgt, __nv_fp8_e4m3 const* src, size_t size, cudaStream_t stream);
#endif
template void cudaD2Dcpy(unsigned long long* tgt, unsigned long long const* src, size_t size, cudaStream_t stream);

template <typename T_OUT, typename T_IN>
__global__ void cudaCast(T_OUT* dst, T_IN* src, const size_t size)
{
    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
    {
        dst[tid] = (T_OUT) ((float) (src[tid]));
    }
}

template <typename T_OUT, typename T_IN>
void invokeCudaCast(T_OUT* dst, T_IN const* const src, const size_t size, cudaStream_t stream)
{
    cudaCast<<<256, 256, 0, stream>>>(dst, src, size);
}

template void invokeCudaCast(float* dst, half const* const src, const size_t size, cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokeCudaCast(float* dst, __nv_bfloat16 const* const src, const size_t size, cudaStream_t stream);
template void invokeCudaCast(__nv_bfloat16* dst, float const* const src, const size_t size, cudaStream_t stream);
template void invokeCudaCast(__nv_bfloat16* dst, half const* const src, const size_t size, cudaStream_t stream);
template void invokeCudaCast(half* dst, __nv_bfloat16 const* const src, const size_t size, cudaStream_t stream);
#endif
#ifdef ENABLE_FP8
template void invokeCudaCast(float* dst, __nv_fp8_e4m3 const* const src, const size_t size, cudaStream_t stream);
template void invokeCudaCast(
    __nv_bfloat16* dst, __nv_fp8_e4m3 const* const src, const size_t size, cudaStream_t stream);
template void invokeCudaCast(half* dst, __nv_fp8_e4m3 const* const src, const size_t size, cudaStream_t stream);
template void invokeCudaCast(__nv_fp8_e4m3* dst, float const* const src, const size_t size, cudaStream_t stream);
template void invokeCudaCast(
    __nv_fp8_e4m3* dst, __nv_bfloat16 const* const src, const size_t size, cudaStream_t stream);
template void invokeCudaCast(__nv_fp8_e4m3* dst, half const* const src, const size_t size, cudaStream_t stream);
#endif
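// Usage sketch (illustrative only): widen a device-resident half buffer into a float buffer on a
// stream. d_dst, d_src, n, and stream are hypothetical; conversion goes element-wise via float.
//
//     invokeCudaCast(d_dst /* float* */, d_src /* half const* */, n, stream);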
template <typename T>
void cudaAutoCpy(T* tgt, T const* src, const size_t size, cudaStream_t stream)
{
    if (stream != NULL)
    {
        check_cuda_error(cudaMemcpyAsyncSanitized(tgt, src, sizeof(T) * size, cudaMemcpyDefault, stream));
    }
    else
    {
        check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyDefault));
    }
}

template void cudaAutoCpy(float* tgt, float const* src, size_t size, cudaStream_t stream);
template void cudaAutoCpy(half* tgt, half const* src, size_t size, cudaStream_t stream);
#ifdef ENABLE_BF16
template void cudaAutoCpy(__nv_bfloat16* tgt, __nv_bfloat16 const* src, size_t size, cudaStream_t stream);
#endif
template void cudaAutoCpy(int* tgt, int const* src, size_t size, cudaStream_t stream);
template void cudaAutoCpy(bool* tgt, bool const* src, size_t size, cudaStream_t stream);
template void cudaAutoCpy(int8_t* tgt, int8_t const* src, size_t size, cudaStream_t stream);
template void cudaAutoCpy(uint8_t* tgt, uint8_t const* src, size_t size, cudaStream_t stream);
template void cudaAutoCpy(uint32_t* tgt, uint32_t const* src, size_t size, cudaStream_t stream);
template void cudaAutoCpy(unsigned long long* tgt, unsigned long long const* src, size_t size, cudaStream_t stream);
template void cudaAutoCpy(unsigned long* tgt, unsigned long const* src, size_t size, cudaStream_t stream);
template void cudaAutoCpy(char* tgt, char const* src, size_t size, cudaStream_t stream);

template void cudaAutoCpy(float const** tgt, float const* const* src, size_t size, cudaStream_t stream);
template void cudaAutoCpy(half const** tgt, half const* const* src, size_t size, cudaStream_t stream);
#ifdef ENABLE_BF16
template void cudaAutoCpy(
    __nv_bfloat16 const** tgt, __nv_bfloat16 const* const* src, size_t size, cudaStream_t stream);
#endif
template void cudaAutoCpy(int const** tgt, int const* const* src, size_t size, cudaStream_t stream);
template void cudaAutoCpy(bool const** tgt, bool const* const* src, size_t size, cudaStream_t stream);
template void cudaAutoCpy(int8_t const** tgt, int8_t const* const* src, size_t size, cudaStream_t stream);
template void cudaAutoCpy(
    unsigned long long const** tgt, unsigned long long const* const* src, size_t size, cudaStream_t stream);

template <typename T>
__global__ void cuda_random_uniform_kernel(T* buffer, const size_t size, int const seq_offset)
{
    const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    curandState_t local_state;
    curand_init((unsigned long long int) 1337, idx + seq_offset, 0, &local_state);
    for (size_t index = idx; index < size; index += blockDim.x * gridDim.x)
    {
        buffer[index] = (T) (curand_uniform(&local_state) * 0.2f - 0.1f);
    }
}

template <>
__global__ void cuda_random_uniform_kernel<int>(int* buffer, const size_t size, int const seq_offset)
{
    const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    curandState_t local_state;
    curand_init((float) 1337.f, idx + seq_offset, 0, &local_state);
    for (size_t index = idx; index < size; index += blockDim.x * gridDim.x)
    {
        buffer[index] = curand(&local_state);
    }
}

template <>
__global__ void cuda_random_uniform_kernel<bool>(bool* buffer, const size_t size, int const seq_offset)
{
    const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    curandState_t local_state;
    curand_init((float) 1337.f, idx + seq_offset, 0, &local_state);
    for (size_t index = idx; index < size; index += blockDim.x * gridDim.x)
    {
        buffer[index] = (curand(&local_state) % 2 == 0);
    }
}

template <>
__global__ void cuda_random_uniform_kernel<char>(char* buffer, const size_t size, int const seq_offset)
{
    const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    curandState_t local_state;
    curand_init((float) 1337.f, idx + seq_offset, 0, &local_state);
    for (size_t index = idx; index < size; index += blockDim.x * gridDim.x)
    {
        buffer[index] = curand(&local_state) % 0xFF;
    }
}

template <typename T>
void cudaRandomUniform(T* buffer, const size_t size)
{
    static int seq_offset = 0;
    cuda_random_uniform_kernel<<<256, 256>>>(buffer, size, seq_offset);
    seq_offset += 256 * 256;
}

template void cudaRandomUniform(float* buffer, const size_t size);
template void cudaRandomUniform(half* buffer, const size_t size);
#ifdef ENABLE_BF16
template void cudaRandomUniform(__nv_bfloat16* buffer, const size_t size);
#endif
template void cudaRandomUniform(int* buffer, const size_t size);
template void cudaRandomUniform(bool* buffer, const size_t size);
template void cudaRandomUniform(char* buffer, const size_t size);
#ifdef ENABLE_FP8
template void cudaRandomUniform(__nv_fp8_e4m3* buffer, const size_t size);
#endif
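// Usage sketch (illustrative only): fill a freshly allocated device buffer with small random
// values in (-0.1, 0.1] for testing; deviceMalloc(&d_buf, n, /*is_random_initialize=*/true)
// performs the same two steps.
//
//     half* d_buf = nullptr;
//     deviceMalloc(&d_buf, n, false);
//     cudaRandomUniform(d_buf, n); // launched on the default stream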
// Loads data from a binary file. If it succeeds, returns a non-empty vector. If loading fails or
// the product of the elements in shape is 0, this function returns an empty vector.
template <typename T>
std::vector<T> loadWeightFromBinHelper(std::vector<size_t> shape, std::string filename)
{
    if (shape.size() > 2)
    {
        printf("[ERROR] shape should have at most two dims \n");
        return std::vector<T>();
    }

    size_t dim0 = shape[0], dim1 = 1;
    if (shape.size() == 2)
    {
        dim1 = shape[1];
    }
    size_t size = dim0 * dim1;
    if (size == 0)
    {
        TLLM_LOG_WARNING("shape is zero, skip loading weight from file %s \n", filename.c_str());
        return std::vector<T>();
    }

    std::vector<T> host_array(size);
    std::ifstream in(filename, std::ios::in | std::ios::binary);
    if (!in.is_open())
    {
        TLLM_LOG_WARNING("file %s cannot be opened, loading model fails! \n", filename.c_str());
        return std::vector<T>();
    }

    size_t loaded_data_size = sizeof(T) * size;
    in.seekg(0, in.end);
    in.seekg(0, in.beg);

    TLLM_LOG_DEBUG("Read " + std::to_string(loaded_data_size) + " bytes from " + filename);
    in.read((char*) host_array.data(), loaded_data_size);

    size_t in_get_size = in.gcount();
    if (in_get_size != loaded_data_size)
    {
        TLLM_LOG_WARNING("file %s only has %ld bytes, but %ld bytes are requested, loading model fails! \n",
            filename.c_str(), in_get_size, loaded_data_size);
        return std::vector<T>();
    }
    in.close();
    // If we succeed, return an array with values.
    return host_array;
}

template <typename T, typename T_IN>
int loadWeightFromBinFunc(T* ptr, std::vector<size_t> shape, std::string filename)
{
    std::vector<T_IN> host_array = loadWeightFromBinHelper<T_IN>(shape, filename);

    if (host_array.empty())
    {
        return 0;
    }

    if (std::is_same<T, T_IN>::value == true)
    {
        cudaH2Dcpy(ptr, (T*) host_array.data(), host_array.size());
    }
    else
    {
        T_IN* ptr_2 = nullptr;
        deviceMalloc(&ptr_2, host_array.size(), false);
        cudaH2Dcpy(ptr_2, host_array.data(), host_array.size());
        invokeCudaD2DcpyConvert(ptr, ptr_2, host_array.size());
        deviceFree(ptr_2);
    }
    return 0;
}

template int loadWeightFromBinFunc<float, float>(float* ptr, std::vector<size_t> shape, std::string filename);
template int loadWeightFromBinFunc<half, float>(half* ptr, std::vector<size_t> shape, std::string filename);
template int loadWeightFromBinFunc<float, half>(float* ptr, std::vector<size_t> shape, std::string filename);
template int loadWeightFromBinFunc<half, half>(half* ptr, std::vector<size_t> shape, std::string filename);
template int loadWeightFromBinFunc<int8_t, int8_t>(int8_t* ptr, std::vector<size_t> shape, std::string filename);
#ifdef ENABLE_BF16
template int loadWeightFromBinFunc<__nv_bfloat16, float>(
    __nv_bfloat16* ptr, std::vector<size_t> shape, std::string filename);
template int loadWeightFromBinFunc<__nv_bfloat16, half>(
    __nv_bfloat16* ptr, std::vector<size_t> shape, std::string filename);
template int loadWeightFromBinFunc<float, __nv_bfloat16>(float* ptr, std::vector<size_t> shape, std::string filename);
template int loadWeightFromBinFunc<half, __nv_bfloat16>(half* ptr, std::vector<size_t> shape, std::string filename);
template int loadWeightFromBinFunc<__nv_bfloat16, __nv_bfloat16>(
    __nv_bfloat16* ptr, std::vector<size_t> shape, std::string filename);
#endif // ENABLE_BF16
template int loadWeightFromBinFunc<int, int>(int* ptr, std::vector<size_t> shape, std::string filename);
#ifdef ENABLE_FP8
template int loadWeightFromBinFunc<__nv_fp8_e4m3, float>(
    __nv_fp8_e4m3* ptr, std::vector<size_t> shape, std::string filename);
#endif // ENABLE_FP8
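// Usage sketch (illustrative only): load a [rows, cols] FP32 weight file from disk into an FP16
// device buffer. loadWeightFromBin (defined below) dispatches to loadWeightFromBinFunc<half, float>,
// which stages the data as float on the device and converts it with invokeCudaD2DcpyConvert.
// The file name is hypothetical.
//
//     half* d_weight = nullptr;
//     deviceMalloc(&d_weight, rows * cols, false);
//     loadWeightFromBin(d_weight, {rows, cols}, "model.layers.0.weight.bin", TRTLLMCudaDataType::FP32);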
template <typename T>
int loadWeightFromBin(T* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type)
{
    switch (model_file_type)
    {
    case TRTLLMCudaDataType::FP32: loadWeightFromBinFunc<T, float>(ptr, shape, filename); break;
    case TRTLLMCudaDataType::FP16: loadWeightFromBinFunc<T, half>(ptr, shape, filename); break;
    case TRTLLMCudaDataType::INT8: loadWeightFromBinFunc<T, int8_t>(ptr, shape, filename); break;
#ifdef ENABLE_BF16
    case TRTLLMCudaDataType::BF16: loadWeightFromBinFunc<T, __nv_bfloat16>(ptr, shape, filename); break;
#endif
#ifdef ENABLE_FP8
    case TRTLLMCudaDataType::FP8: loadWeightFromBinFunc<T, float>(ptr, shape, filename); break;
#endif
    default:
        TLLM_LOG_ERROR("Does not support TRTLLMCudaDataType=%d", model_file_type);
        TLLM_CHECK(false);
    }
    return 0;
}

template <>
int loadWeightFromBin(int* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type)
{
    loadWeightFromBinFunc<int, int>(ptr, shape, filename);
    return 0;
}

template int loadWeightFromBin(
    float* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type);
template int loadWeightFromBin(
    half* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type);
template int loadWeightFromBin(
    int8_t* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type);
#ifdef ENABLE_BF16
template int loadWeightFromBin(
    __nv_bfloat16* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type);
#endif
#ifdef ENABLE_FP8
template int loadWeightFromBin(
    __nv_fp8_e4m3* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type);
#endif

template <typename T_OUT, typename T_IN>
__global__ void cudaD2DcpyConvert(T_OUT* dst, const T_IN* src, const size_t size)
{
    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
    {
        dst[tid] = cuda_cast<T_OUT>(src[tid]);
    }
}

template <typename T_OUT, typename T_IN>
void invokeCudaD2DcpyConvert(T_OUT* tgt, const T_IN* src, const size_t size, cudaStream_t stream)
{
    cudaD2DcpyConvert<<<256, 256, 0, stream>>>(tgt, src, size);
}

template void invokeCudaD2DcpyConvert(int8_t* tgt, float const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(float* tgt, int8_t const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(float* tgt, int const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(half* tgt, int const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(float* tgt, float const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(half* tgt, float const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(float* tgt, half const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(uint32_t* tgt, int const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(int* tgt, uint32_t const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(int* tgt, float const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(int* tgt, half const* src, const size_t size, cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokeCudaD2DcpyConvert(__nv_bfloat16* tgt, float const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(__nv_bfloat16* tgt, int const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(float* tgt, __nv_bfloat16 const* src, const size_t size, cudaStream_t stream);
template void invokeCudaD2DcpyConvert(int* tgt, __nv_bfloat16 const* src, const size_t size, cudaStream_t stream);
#endif // ENABLE_BF16
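// Usage sketch (illustrative only): convert an int32 device buffer to float on a stream without a
// host round trip. d_dst, d_src, n, and stream are hypothetical.
//
//     invokeCudaD2DcpyConvert(d_dst /* float* */, d_src /* int const* */, n, stream);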
template <typename T_OUT, typename T_IN>
__global__ void cudaD2DScaleCpyConvert(
    T_OUT* dst, const T_IN* src, float const* scale, bool invert_scale, const size_t size)
{
    float const scale_value = invert_scale ? 1.0f / scale[0] : scale[0];
    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
    {
        dst[tid] = cuda_cast<T_OUT>(cuda_cast<float>(src[tid]) * scale_value);
    }
}

template <typename T_OUT, typename T_IN>
void invokeCudaD2DScaleCpyConvert(
    T_OUT* tgt, const T_IN* src, float const* scale, bool invert_scale, const size_t size, cudaStream_t stream)
{
    cudaD2DScaleCpyConvert<<<256, 256, 0, stream>>>(tgt, src, scale, invert_scale, size);
}

// clang-format off
template void invokeCudaD2DScaleCpyConvert(float* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const float* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
template void invokeCudaD2DScaleCpyConvert(half* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const half* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokeCudaD2DScaleCpyConvert(__nv_bfloat16* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const __nv_bfloat16* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
#endif // ENABLE_BF16
#ifdef ENABLE_FP8
template void invokeCudaD2DScaleCpyConvert(float* tgt, const __nv_fp8_e4m3* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
#endif // ENABLE_FP8
// clang-format on

void invokeCudaD2DcpyHalf2Float(float* dst, half* src, const size_t size, cudaStream_t stream)
{
    invokeCudaD2DcpyConvert(dst, src, size, stream);
}

void invokeCudaD2DcpyFloat2Half(half* dst, float* src, const size_t size, cudaStream_t stream)
{
    invokeCudaD2DcpyConvert(dst, src, size, stream);
}

template <typename T>
void saveToBinary(T const* ptr, const size_t size, std::string filename)
{
    std::vector<T> h_ptr(size);
    cudaD2Hcpy(h_ptr.data(), ptr, size);
    std::vector<float> float_ptr(size);
    for (size_t i = 0; i < size; i++)
    {
        float_ptr[i] = (float) h_ptr[i];
    }

    std::ofstream out(filename, std::ios::out | std::ios::binary);
    TLLM_CHECK_WITH_INFO(out.is_open(), "Fail to open file " + filename);

    out.write((char*) float_ptr.data(), size * sizeof(float));
}

template void saveToBinary(float const* ptr, const size_t size, std::string filename);
template void saveToBinary(half const* ptr, const size_t size, std::string filename);
#ifdef ENABLE_BF16
template void saveToBinary(__nv_bfloat16 const* ptr, const size_t size, std::string filename);
#endif // ENABLE_BF16

template <>
void saveToBinary(int const* ptr, const size_t size, std::string filename)
{
    std::vector<int> h_ptr(size);
    cudaD2Hcpy(h_ptr.data(), ptr, size);
    std::ofstream out(filename, std::ios::out | std::ios::binary);
    TLLM_CHECK_WITH_INFO(out.is_open(), "Fail to open file " + filename);
    out.write((char*) h_ptr.data(), size * sizeof(int));
}

template <typename T_IN, typename T_fake_type>
__global__ void fakeCast(T_IN* input_ptr, const size_t size)
{
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x)
    {
        T_fake_type tmp_val = (T_fake_type) ((float) input_ptr[i]);
        input_ptr[i] = (T_IN) ((float) tmp_val);
    }
}

template <typename T_IN, typename T_fake_type>
void invokeFakeCast(T_IN* input_ptr, const size_t size, cudaStream_t stream)
{
    dim3 block(256);
    dim3 grid((size + 255) / 256);
    fakeCast<T_IN, T_fake_type><<<grid, block, 0, stream>>>(input_ptr, size);
}
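// Usage sketch (illustrative only): simulate BF16 precision loss on an FP32 buffer in place; this
// is the float -> T_fake_type -> float round trip the fakeCast kernel above implements.
//
//     invokeFakeCast<float, __nv_bfloat16>(d_buf, n, stream); // requires ENABLE_BF16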
#ifdef ENABLE_FP8
__global__ void cudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size)
{
    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
    {
        dst[tid] = (float) (src[tid]);
    }
}

void invokeCudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream)
{
    cudaD2Dcpyfp82Float<<<256, 256, 0, stream>>>(dst, src, size);
}

__global__ void cudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size)
{
    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
    {
        dst[tid] = (half) ((float) (src[tid]));
    }
}

void invokeCudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream)
{
    cudaD2Dcpyfp82Half<<<256, 256, 0, stream>>>(dst, src, size);
}

__global__ void cudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, const size_t size)
{
    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
    {
        dst[tid] = (__nv_fp8_e4m3) src[tid];
    }
}

void invokeCudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, const size_t size, cudaStream_t stream)
{
    cudaD2DcpyFloat2fp8<<<256, 256, 0, stream>>>(dst, src, size);
}

__global__ void cudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, const size_t size)
{
    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
    {
        dst[tid] = (__nv_fp8_e4m3) src[tid];
    }
}

void invokeCudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, const size_t size, cudaStream_t stream)
{
    cudaD2DcpyHalf2fp8<<<256, 256, 0, stream>>>(dst, src, size);
}

__global__ void cudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const size_t size)
{
    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
    {
        dst[tid] = (__nv_fp8_e4m3) src[tid];
    }
}

void invokeCudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const size_t size, cudaStream_t stream)
{
    cudaD2DcpyBfloat2fp8<<<256, 256, 0, stream>>>(dst, src, size);
}

#endif // ENABLE_FP8

template <typename T_OUT, typename T_IN>
__global__ void transpose(T_OUT* dst, T_IN* src, const size_t dim0, const size_t dim1)
{
    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1; tid += blockDim.x * gridDim.x)
    {
        const size_t src_col_id = tid % dim1;
        const size_t src_row_id = tid / dim1;
        dst[src_col_id * dim0 + src_row_id] = (T_OUT) (src[tid]);
    }
}

template <typename T>
void invokeInPlaceTranspose(T* data, T* workspace, const size_t dim0, const size_t dim1)
{
    // copy data to workspace, and then transpose from workspace to data
    cudaD2Dcpy(workspace, data, dim0 * dim1);
    transpose<<<256, 256>>>(data, workspace, dim0, dim1);
}

#ifdef ENABLE_FP8
template void invokeInPlaceTranspose(
    __nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const size_t dim0, const size_t dim1);
#endif // ENABLE_FP8
#ifdef ENABLE_BF16
template void invokeInPlaceTranspose(
    __nv_bfloat16* data, __nv_bfloat16* workspace, const size_t dim0, const size_t dim1);
#endif // ENABLE_BF16
template void invokeInPlaceTranspose(float* data, float* workspace, const size_t dim0, const size_t dim1);
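// Usage sketch (illustrative only): transpose a [dim0, dim1] device matrix in place. The helper
// needs a scratch buffer of the same size because it stages the input before transposing.
//
//     float* d_scratch = nullptr;
//     deviceMalloc(&d_scratch, dim0 * dim1, false);
//     invokeInPlaceTranspose(d_matrix, d_scratch, dim0, dim1); // d_matrix is now [dim1, dim0]
//     deviceFree(d_scratch);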
template <typename T_OUT, typename T_IN>
__global__ void transpose0213(
    T_OUT* dst, T_IN* src, const size_t dim0, const size_t dim1, const size_t dim2, const size_t dim3)
{
    // src permutation: [0, 1, 2, 3]
    // dst permutation: [0, 2, 1, 3]
    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1 * dim2 * dim3;
         tid += blockDim.x * gridDim.x)
    {
        size_t tmp_idx = tid;
        const size_t dim_3_idx = tmp_idx % dim3;
        tmp_idx = (tmp_idx - dim_3_idx) / dim3;
        const size_t dim_2_idx = tmp_idx % dim2;
        tmp_idx = (tmp_idx - dim_2_idx) / dim2;
        const size_t dim_1_idx = tmp_idx % dim1;
        tmp_idx = (tmp_idx - dim_1_idx) / dim1;
        const size_t dim_0_idx = tmp_idx % dim0;
        dst[dim_0_idx * dim1 * dim2 * dim3 + dim_2_idx * dim1 * dim3 + dim_1_idx * dim3 + dim_3_idx] = src[tid];
    }
}

template <typename T>
void invokeInPlaceTranspose0213(
    T* data, T* workspace, const size_t dim0, const size_t dim1, const size_t dim2, const size_t dim3)
{
    // copy data to workspace, and then transpose from workspace to data
    // Note that this kernel is used for pre-processing and not very efficient.
    cudaD2Dcpy(workspace, data, dim0 * dim1 * dim2 * dim3);
    transpose0213<<<256, 256>>>(data, workspace, dim0, dim1, dim2, dim3);
}

#ifdef ENABLE_FP8
template void invokeInPlaceTranspose0213(__nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const size_t dim0,
    const size_t dim1, const size_t dim2, const size_t dim3);
#endif // ENABLE_FP8
#ifdef ENABLE_BF16
template void invokeInPlaceTranspose0213(__nv_bfloat16* data, __nv_bfloat16* workspace, const size_t dim0,
    const size_t dim1, const size_t dim2, const size_t dim3);
#endif // ENABLE_BF16
template void invokeInPlaceTranspose0213(
    float* data, float* workspace, const size_t dim0, const size_t dim1, const size_t dim2, const size_t dim3);

template <typename T_OUT, typename T_IN>
__global__ void transpose102(T_OUT* dst, T_IN* src, const size_t dim0, const size_t dim1, const size_t dim2)
{
    // src permutation: [0, 1, 2]
    // dst permutation: [1, 0, 2]
    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1 * dim2; tid += blockDim.x * gridDim.x)
    {
        size_t tmp_idx = tid;
        const size_t dim_2_idx = tmp_idx % dim2;
        tmp_idx = (tmp_idx - dim_2_idx) / dim2;
        const size_t dim_1_idx = tmp_idx % dim1;
        tmp_idx = (tmp_idx - dim_1_idx) / dim1;
        const size_t dim_0_idx = tmp_idx % dim0;
        dst[dim_1_idx * dim0 * dim2 + dim_0_idx * dim2 + dim_2_idx] = src[tid];
    }
}

template <typename T>
void invokeInPlaceTranspose102(T* data, T* workspace, const size_t dim0, const size_t dim1, const size_t dim2)
{
    // copy data to workspace, and then transpose from workspace to data
    // Note that this kernel is used for pre-processing and not very efficient.
    cudaD2Dcpy(workspace, data, dim0 * dim1 * dim2);
    transpose102<<<256, 256>>>(data, workspace, dim0, dim1, dim2);
}
#ifdef ENABLE_FP8
template void invokeInPlaceTranspose102(
    __nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const size_t dim0, const size_t dim1, const size_t dim2);
#endif // ENABLE_FP8
#ifdef ENABLE_BF16
template void invokeInPlaceTranspose102(
    __nv_bfloat16* data, __nv_bfloat16* workspace, const size_t dim0, const size_t dim1, const size_t dim2);
#endif // ENABLE_BF16
template void invokeInPlaceTranspose102(
    float* data, float* workspace, const size_t dim0, const size_t dim1, const size_t dim2);

template <typename T>
void __global__ multiplyScale(T* tensor, float scale, const size_t size)
{
    for (size_t index = threadIdx.x + blockIdx.x * blockDim.x; index < size; index += blockDim.x * gridDim.x)
    {
        tensor[index] = (T) (((float) tensor[index]) * scale);
    }
}

template <typename T>
void invokeMultiplyScale(T* tensor, float scale, const size_t size, cudaStream_t stream)
{
    int block = 256;
    int grid = (size + 255) / 256;
    multiplyScale<<<grid, block, 0, stream>>>(tensor, scale, size);
}

template void invokeMultiplyScale(float* tensor, float scale, const size_t size, cudaStream_t stream);
template void invokeMultiplyScale(half* tensor, float scale, const size_t size, cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokeMultiplyScale(__nv_bfloat16* tensor, float scale, const size_t size, cudaStream_t stream);
#endif
#ifdef ENABLE_FP8
template void invokeMultiplyScale(__nv_fp8_e4m3* tensor, float scale, const size_t size, cudaStream_t stream);
#endif

template <typename T>
void __global__ divideScale(T* tensor, float scale, const size_t size)
{
    for (size_t index = threadIdx.x + blockIdx.x * blockDim.x; index < size; index += blockDim.x * gridDim.x)
    {
        tensor[index] = (T) (((float) tensor[index]) / scale);
    }
}

template <typename T>
void invokeDivideScale(T* tensor, float scale, const size_t size, cudaStream_t stream)
{
    int block = 256;
    int grid = (size + 255) / 256;
    divideScale<<<grid, block, 0, stream>>>(tensor, scale, size);
}

template void invokeDivideScale(float* tensor, float scale, const size_t size, cudaStream_t stream);
template void invokeDivideScale(half* tensor, float scale, const size_t size, cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokeDivideScale(__nv_bfloat16* tensor, float scale, const size_t size, cudaStream_t stream);
#endif
#ifdef ENABLE_FP8
template void invokeDivideScale(__nv_fp8_e4m3* tensor, float scale, const size_t size, cudaStream_t stream);
#endif

#ifdef ENABLE_BF16
template void invokeFakeCast<float, __nv_bfloat16>(float* input_ptr, const size_t size, cudaStream_t stream);
template void invokeFakeCast<__nv_bfloat16, __nv_bfloat16>(
    __nv_bfloat16* input_ptr, const size_t size, cudaStream_t stream);
template void invokeFakeCast<half, __nv_bfloat16>(half* input_ptr, const size_t size, cudaStream_t stream);
#endif
template void invokeFakeCast<float, half>(float* input_ptr, const size_t size, cudaStream_t stream);
template void invokeFakeCast<float, float>(float* input_ptr, const size_t size, cudaStream_t stream);
#ifdef ENABLE_FP8
template void invokeFakeCast<float, __nv_fp8_e4m3>(float* input_ptr, const size_t size, cudaStream_t stream);
template void invokeFakeCast<half, __nv_fp8_e4m3>(half* input_ptr, const size_t size, cudaStream_t stream);
template void invokeFakeCast<__nv_bfloat16, __nv_fp8_e4m3>(
    __nv_bfloat16* input_ptr, const size_t size, cudaStream_t stream);
#endif

size_t cuda_datatype_size(TRTLLMCudaDataType dt)
{
    static const std::unordered_map<TRTLLMCudaDataType, size_t> sizes{
        {TRTLLMCudaDataType::FP32, sizeof(float)}, {TRTLLMCudaDataType::FP16, sizeof(half)}
#ifdef ENABLE_BF16
        , {TRTLLMCudaDataType::BF16, sizeof(__nv_bfloat16)}
#endif
    };

    return sizes.at(dt);
}
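// Usage sketch (illustrative only): rescale a device tensor in place and compute a host-side byte
// count from the data type enum. d_tensor, scale, n, and stream are hypothetical.
//
//     invokeMultiplyScale(d_tensor /* half* */, 1.0f / scale, n, stream);
//     size_t bytes = n * cuda_datatype_size(TRTLLMCudaDataType::FP16);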
template <typename T>
__global__ void check_range(T const* buffer, size_t size, T min, T max, bool* d_within_range)
{
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x)
    {
        const T val = buffer[i];
        if (val < min || val > max)
        {
            *d_within_range = false;
        }
    }
}

template <typename T>
bool invokeCheckRange(T const* buffer, const size_t size, T min, T max, bool* d_within_range, cudaStream_t stream)
{
    cudaMemsetAsync(d_within_range, true, sizeof(bool), stream);

    dim3 block(256);
    dim3 grid((size + 255) / 256);
    check_range<<<grid, block, 0, stream>>>(buffer, size, min, max, d_within_range);

    bool result;
    cudaD2Hcpy(&result, d_within_range, 1);
    return result;
}

template bool invokeCheckRange(
    int const* buffer, const size_t size, int min, int max, bool* d_within_range, cudaStream_t stream);

/*
 * Determine the total workspace size based on a vector containing multiple variable sizes.
 */
size_t calcAlignedSize(std::vector<size_t> const& sizes, const size_t ALIGN_BYTES)
{
    const size_t ALIGN_MASK = ~(ALIGN_BYTES - 1);
    // Check ALIGN_BYTES is a power of 2
    assert((ALIGN_BYTES & (ALIGN_BYTES - 1)) == 0);

    size_t total = 0;
    for (auto sz : sizes)
    {
        total += (sz + ALIGN_BYTES - 1) & ALIGN_MASK;
    }

    // We add extra "ALIGN_BYTES - 1" bytes in case the start address passed to the function calcAlignedPointers() is
    // not aligned.
    return total + ALIGN_BYTES - 1;
}

/*
 * Given the address of the workspace and the vector containing multiple variable sizes, calculate the start addresses
 * of each variable.
 */
void calcAlignedPointers(
    std::vector<void*>& outPtrs, void const* p, std::vector<size_t> const& sizes, size_t ALIGN_BYTES)
{
    const size_t ALIGN_MASK = ~(ALIGN_BYTES - 1);
    // Check ALIGN_BYTES is a power of 2
    assert((ALIGN_BYTES & (ALIGN_BYTES - 1)) == 0);

    // In case the start address is not aligned
    char* ptr = reinterpret_cast<char*>((reinterpret_cast<size_t>(p) + ALIGN_BYTES - 1) & ALIGN_MASK);

    outPtrs.reserve(sizes.size());
    for (auto sz : sizes)
    {
        outPtrs.push_back(ptr);
        ptr += (sz + ALIGN_BYTES - 1) & ALIGN_MASK;
    }
}

} // namespace common
} // namespace tensorrt_llm
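// Usage sketch (illustrative only): carve one device allocation into aligned sub-buffers with
// calcAlignedSize() / calcAlignedPointers(). Sizes are in bytes; the extra ALIGN_BYTES - 1 slack
// returned by calcAlignedSize() covers an unaligned base address. All names are hypothetical.
//
//     std::vector<size_t> sizes{bytesA, bytesB, bytesC};
//     char* workspace = nullptr;
//     deviceMalloc(&workspace, calcAlignedSize(sizes, 256), false);
//     std::vector<void*> ptrs;
//     calcAlignedPointers(ptrs, workspace, sizes, 256); // ptrs[i] is the 256-byte-aligned start of buffer i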