/* * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #pragma once #include namespace tensorrt_llm { namespace kernels { void invokeTopPInitialize(int* topp_id_val_buf, int* topp_offset_buf, int* begin_topp_offset_buf_, const size_t batch_size, const int n, cudaStream_t stream); template void invokeTopPSampling(void* workspace, size_t& workspace_size, size_t& cub_temp_storage_size, int** output_ids, int* sequence_length, bool* finished_buf, float* cum_log_probs, float* output_log_probs, const T* log_probs, const int* id_vals, int* offset_buf, int* begin_offset_buf, curandState_t* curandstate, const int batch_size, const size_t vocab_size_padded, const int* end_ids, const float top_p, cudaStream_t stream, cudaDeviceProp* cuda_device_prop, const bool* skip_decode); template void invokeBatchTopPSampling(void* workspace, size_t& workspace_size, size_t& cub_temp_storage_size, int** output_ids, int* sequence_length, bool* finished_buf, float* cum_log_probs, float* output_log_probs, const T* log_probs, const int* id_vals, int* offset_buf, int* begin_offset_buf, curandState_t* curandstate, const int batch_size, const size_t vocab_size_padded, const int* end_ids, const float max_top_p, const float* top_ps, cudaStream_t stream, cudaDeviceProp* cuda_device_prop, const bool* skip_decode); template void invokeAddBiasSoftMax(T* logits, const T* bias, const int* end_ids, const bool* finished, const int m, const int n_padded, const int n, cudaStream_t stream); namespace segmented_topp_impl { enum DType_t { kFLOAT, kHALF, kINT8 }; template struct Segmented_topk_kernel_params { typedef Key_Data_Type_ Key_Data_Type; typedef Value_Data_Type_ Value_Data_Type; enum { BLOCK_THREADS = BLOCK_THREADS_ }; enum { ITEMS_INCREMENT = 32 }; // enum { KEYS_PER_LDG = 2 * 4 / sizeof(Key_Data_Type_) }; enum { KEYS_PER_LDG = KEYS_PER_LDG_ }; }; struct TopKPerSegmentContext { TopKPerSegmentContext() : sm_count(0) , sm_shared_size(0) , sm_version(0){}; int sm_count; int sm_shared_size; int sm_version; }; struct TopKPerSegmentParams { // input/output keys and values void *gmem_src_keys, *gmem_dst_keys, *gmem_dst_vals; // not used in the custom implementation void* gmem_src_vals; // int array of size num_segments int* gmem_active_count_per_segment; int* gmem_active_count_total; int* gmem_begin_offsets; // gmem_end_offsets will be populated int* gmem_end_offsets; void* workspace; // total number of items for all segments int num_items; int num_segments; // top_k per segment int num_top_k; float top_p; float confidence_threshold; }; int topPPerSegment(const TopKPerSegmentContext& context, TopKPerSegmentParams& params, const DType_t DT_SCORE, void* temp_storage, size_t& temp_storage_bytes, cudaStream_t stream); } // namespace segmented_topp_impl void invokeComputeToppDecay(float* runtime_top_p, const float* runtime_initial_top_p, const int** output_ids, const float* top_p_decay, const float* top_p_min, const int32_t* top_p_reset_ids, const int* sequence_lengths, const int local_batch_size, cudaStream_t stream); } // namespace kernels } // namespace tensorrt_llm