/*
 * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include <curand_kernel.h>

namespace tensorrt_llm
{
namespace kernels
{

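// Initializes the per-batch token-id and segment-offset buffers consumed by the top-p sampling
// kernels declared below (the exact layout lives in the corresponding .cu implementation).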
void invokeTopPInitialize(int* topp_id_val_buf, int* topp_offset_buf, int* begin_topp_offset_buf_,
    const size_t batch_size, const int n, cudaStream_t stream);

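// Top-p (nucleus) sampling over `log_probs` with a single `top_p` shared by the whole batch.
// Sampled token ids are written through `output_ids`; `sequence_length`, `finished_buf`,
// `cum_log_probs` and `output_log_probs` are updated when provided, and batch entries flagged in
// `skip_decode` are left untouched. A minimal calling sketch, assuming the query-then-run
// workspace convention used by these kernels (buffer names are caller-owned placeholders):
//
//   size_t wsSize = 0, cubSize = 0;
//   invokeTopPSampling<float>(nullptr, wsSize, cubSize, /* buffers and config as declared */);
//   cudaMalloc(&workspace, wsSize);                      // allocate the reported workspace
//   invokeTopPSampling<float>(workspace, wsSize, cubSize, /* same arguments */);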
template <typename T>
void invokeTopPSampling(void* workspace, size_t& workspace_size, size_t& cub_temp_storage_size, int** output_ids,
    int* sequence_length, bool* finished_buf, float* cum_log_probs, float* output_log_probs, const T* log_probs,
    const int* id_vals, int* offset_buf, int* begin_offset_buf, curandState_t* curandstate, const int batch_size,
    const size_t vocab_size_padded, const int* end_ids, const float top_p, cudaStream_t stream,
    cudaDeviceProp* cuda_device_prop, const bool* skip_decode);

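// Batched variant of invokeTopPSampling: `top_ps` holds one top_p value per batch entry and
// `max_top_p` is expected to be the largest of those values.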
template <typename T>
void invokeBatchTopPSampling(void* workspace, size_t& workspace_size, size_t& cub_temp_storage_size, int** output_ids,
    int* sequence_length, bool* finished_buf, float* cum_log_probs, float* output_log_probs, const T* log_probs,
    const int* id_vals, int* offset_buf, int* begin_offset_buf, curandState_t* curandstate, const int batch_size,
    const size_t vocab_size_padded, const int* end_ids, const float max_top_p, const float* top_ps, cudaStream_t stream,
    cudaDeviceProp* cuda_device_prop, const bool* skip_decode);

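// Adds the optional `bias` to the `m x n_padded` logits and applies a softmax in place, where
// `n` is the unpadded vocabulary size; rows already marked in `finished` are handled via
// `end_ids` (see the kernel implementation for the exact behaviour).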
template <typename T>
void invokeAddBiasSoftMax(T* logits, const T* bias, const int* end_ids, const bool* finished, const int m,
    const int n_padded, const int n, cudaStream_t stream);

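// Helpers for segmented top-p selection over `num_segments` independent segments of a flat
// key/value array (in the sampling use case, one segment per batch entry).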
namespace segmented_topp_impl
{
enum DType_t
{
    kFLOAT,
    kHALF,
    kINT8
};

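// Compile-time kernel configuration: key/value element types, threads per block, and how many
// keys each thread reads per load (KEYS_PER_LDG).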
template <typename Key_Data_Type_ = float, typename Value_Data_Type_ = int32_t, int BLOCK_THREADS_ = 256,
    int KEYS_PER_LDG_ = 1>
struct Segmented_topk_kernel_params
{
    typedef Key_Data_Type_ Key_Data_Type;
    typedef Value_Data_Type_ Value_Data_Type;

    enum
    {
        BLOCK_THREADS = BLOCK_THREADS_
    };

    enum
    {
        ITEMS_INCREMENT = 32
    };

    // enum { KEYS_PER_LDG = 2 * 4 / sizeof(Key_Data_Type_) };
    enum
    {
        KEYS_PER_LDG = KEYS_PER_LDG_
    };
};

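// Device properties cached by the caller: number of SMs, shared memory available per SM, and the
// SM version.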
struct TopKPerSegmentContext
{
    TopKPerSegmentContext()
        : sm_count(0)
        , sm_shared_size(0)
        , sm_version(0){};
    int sm_count;
    int sm_shared_size;
    int sm_version;
};

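// Launch arguments for one segmented top-k/top-p pass; the gmem_* pointers refer to buffers in
// GPU global memory.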
struct TopKPerSegmentParams
{
    // input/output keys and values
    void *gmem_src_keys, *gmem_dst_keys, *gmem_dst_vals;
    // not used in the custom implementation
    void* gmem_src_vals;
    // int array of size num_segments
    int* gmem_active_count_per_segment;
    int* gmem_active_count_total;
    int* gmem_begin_offsets;
    // gmem_end_offsets will be populated
    int* gmem_end_offsets;
    void* workspace;
    // total number of items for all segments
    int num_items;
    int num_segments;
    // top_k per segment
    int num_top_k;
    float top_p;
    float confidence_threshold;
};

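// Runs the segmented top-p selection described by `params`. Passing `temp_storage == nullptr`
// is assumed to only report the required `temp_storage_bytes`, following the same
// query-then-run convention as the sampling entry points above.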
int topPPerSegment(const TopKPerSegmentContext& context, TopKPerSegmentParams& params, const DType_t DT_SCORE,
    void* temp_storage, size_t& temp_storage_bytes, cudaStream_t stream);
} // namespace segmented_topp_impl

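// Per-entry top-p decay: `runtime_top_p` is decayed from `runtime_initial_top_p` by the
// `top_p_decay` factor, floored at `top_p_min`, and reset to the initial value when the most
// recently generated token matches `top_p_reset_ids` (behaviour inferred from the parameter
// names; see the kernel implementation for details).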
void invokeComputeToppDecay(float* runtime_top_p, const float* runtime_initial_top_p, const int** output_ids,
    const float* top_p_decay, const float* top_p_min, const int32_t* top_p_reset_ids, const int* sequence_lengths,
    const int local_batch_size, cudaStream_t stream);

} // namespace kernels
} // namespace tensorrt_llm