/*
 * Adapted from https://github.com/state-spaces/mamba/blob/main/csrc/selective_scan/selective_scan.h
 * and https://github.com/Dao-AILab/causal-conv1d/blob/main/csrc/static_switch.h
 * Copyright (c) 2023, Tri Dao.
 *
 * Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"

#include <algorithm>
#include <cstdint>

TRTLLM_NAMESPACE_BEGIN

namespace kernels::causal_conv1d
{

#define TLLM_CUDA_KERNEL_LAUNCH_CHECK() TLLM_CUDA_CHECK(cudaGetLastError())

/// @param COND - a boolean expression to switch by
/// @param CONST_NAME - a name given for the constexpr bool variable.
/// @param ... - code to execute for true and false
///
/// Usage:
/// ```
/// BOOL_SWITCH(flag, BoolConst, [&] {
///     some_function(...);
/// });
/// ```
#define BOOL_SWITCH(COND, CONST_NAME, ...)                                                                             \
    [&]                                                                                                                \
    {                                                                                                                  \
        if (COND)                                                                                                      \
        {                                                                                                              \
            static constexpr bool CONST_NAME = true;                                                                   \
            return __VA_ARGS__();                                                                                      \
        }                                                                                                              \
        else                                                                                                           \
        {                                                                                                              \
            static constexpr bool CONST_NAME = false;                                                                  \
            return __VA_ARGS__();                                                                                      \
        }                                                                                                              \
    }()

struct ConvParamsBase
{
    using index_t = uint32_t;

    int batch, dim, seqlen, width;
    int64_t pad_slot_id;
    bool silu_activation;

    index_t x_batch_stride;
    index_t x_c_stride;
    index_t x_l_stride;
    index_t weight_c_stride;
    index_t weight_width_stride;
    index_t out_batch_stride;
    index_t out_c_stride;
    index_t out_l_stride;

    int conv_state_len;
    index_t conv_state_batch_stride;
    index_t conv_state_c_stride;
    index_t conv_state_l_stride;

    // Common data pointers.
    void* __restrict__ x_ptr;
    void* __restrict__ weight_ptr;
    void* __restrict__ bias_ptr;
    void* __restrict__ out_ptr;
    void* __restrict__ conv_state_ptr;
    void* __restrict__ query_start_loc_ptr;
    void* __restrict__ has_initial_state_ptr;
    void* __restrict__ cache_indices_ptr;
    int32_t* __restrict__ cache_seqlens;

    // For the continuous batching case. Makes it so that the mamba state for
    // the current batch doesn't need to be a contiguous tensor.
    int32_t* __restrict__ conv_state_indices_ptr;

    void* __restrict__ seq_idx_ptr;

    // No __restrict__ since initial_states could be the same as final_states.
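    // initial_states provides the convolution window carried in from a previous
    // chunk, and final_states receives the trailing window of the current chunk
    // so a later chunk can resume from it.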
    void* initial_states_ptr;
    index_t initial_states_batch_stride;
    index_t initial_states_l_stride;
    index_t initial_states_c_stride;

    void* final_states_ptr;
    index_t final_states_batch_stride;
    index_t final_states_l_stride;
    index_t final_states_c_stride;

    void* conv_states_ptr;
    index_t conv_states_batch_stride;
    index_t conv_states_l_stride;
    index_t conv_states_c_stride;
};

template <typename T>
__device__ inline T shuffle_xor(T val, int offset)
{
    return __shfl_xor_sync(uint32_t(-1), val, offset);
}

constexpr size_t custom_max(std::initializer_list<size_t> ilist)
{
    return std::max(ilist);
}

template <typename T>
constexpr T constexpr_min(T a, T b)
{
    return std::min(a, b);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int BYTES>
struct BytesToType
{
};

template <>
struct BytesToType<16>
{
    using Type = uint4;
    static_assert(sizeof(Type) == 16);
};

template <>
struct BytesToType<8>
{
    using Type = uint64_t;
    static_assert(sizeof(Type) == 8);
};

template <>
struct BytesToType<4>
{
    using Type = uint32_t;
    static_assert(sizeof(Type) == 4);
};

template <>
struct BytesToType<2>
{
    using Type = uint16_t;
    static_assert(sizeof(Type) == 2);
};

template <>
struct BytesToType<1>
{
    using Type = uint8_t;
    static_assert(sizeof(Type) == 1);
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename T>
struct SumOp
{
    __device__ inline T operator()(T const& x, T const& y)
    {
        return x + y;
    }
};

template <int THREADS>
struct Allreduce
{
    static_assert(THREADS == 32 || THREADS == 16 || THREADS == 8 || THREADS == 4);

    template <typename T, typename Operator>
    static __device__ inline T run(T x, Operator& op)
    {
        constexpr int OFFSET = THREADS / 2;
        x = op(x, __shfl_xor_sync(uint32_t(-1), x, OFFSET));
        return Allreduce<OFFSET>::run(x, op);
    }
};

template <>
struct Allreduce<2>
{
    template <typename T, typename Operator>
    static __device__ inline T run(T x, Operator& op)
    {
        x = op(x, __shfl_xor_sync(uint32_t(-1), x, 1));
        return x;
    }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename input_t, typename weight_t>
void causal_conv1d_fwd_cuda(ConvParamsBase& params, cudaStream_t stream);

template <typename input_t, typename weight_t>
void causal_conv1d_update_cuda(ConvParamsBase& params, cudaStream_t stream);

} // namespace kernels::causal_conv1d

TRTLLM_NAMESPACE_END
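
// Illustrative usage from a caller inside kernels::causal_conv1d (a sketch only;
// the <half, half> instantiation and the parameter setup are assumptions, not
// something this header prescribes):
//
//     ConvParamsBase params{};
//     // ... fill batch/dim/seqlen/width, strides, and data pointers ...
//     params.silu_activation = true;
//     causal_conv1d_fwd_cuda<half, half>(params, stream);
//     TLLM_CUDA_KERNEL_LAUNCH_CHECK();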