/*
 * Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved.
 * Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/decodingCommon.h"
#include "tensorrt_llm/layers/topKSamplingLayer.h"
#include "tensorrt_llm/layers/topPSamplingLayer.h"

#include "samplingLayer.h"

#include <algorithm>

using namespace tensorrt_llm::common;
using namespace tensorrt_llm::kernels;
using namespace tensorrt_llm::runtime;

namespace tensorrt_llm::layers
{

template <typename T>
SamplingLayer<T>::SamplingLayer(executor::DecodingMode const& mode, DecoderDomain const& decoderDomain,
    std::shared_ptr<BufferManager> bufferManager)
    : BaseLayer(decoderDomain, bufferManager)
    , mDecodingMode(mode)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);

    TLLM_CHECK_WITH_INFO(!mDecodingMode.isBeamSearch(), "SamplingLayer does not support Beam search mode");
    TLLM_CHECK_WITH_INFO(mDecodingMode.isTopKorTopP(), "SamplingLayer requires TopK or TopP mode");

    // Instantiate only the sub-layers required by the requested decoding mode.
    if (mDecodingMode.isTopK())
    {
        mSamplingLayers.emplace_back(std::make_unique<TopKSamplingLayer<T>>(decoderDomain, mBufferManager));
    }

    if (mDecodingMode.isTopP())
    {
        mSamplingLayers.emplace_back(
            std::make_unique<TopPSamplingLayer<T>>(decoderDomain, mBufferManager, /* deterministic */ true));
    }

    allocateBuffer(decoderDomain.getBatchSize());

    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

template <typename T>
void SamplingLayer<T>::allocateBuffer(SizeType32 batchSize)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);

    // All sub-layers share a single workspace, so the required size is the maximum over them.
    size_t workspaceSize = 0;
    for (auto&& layer : mSamplingLayers)
    {
        workspaceSize = std::max(workspaceSize, layer->getWorkspaceSize());
    }
    mWorkspaceSize = workspaceSize;

    auto const batchSizeShape = ITensor::makeShape({batchSize});
    mSetupWorkspaceSize = DecodingLayerWorkspace::calculateRequiredWorkspaceSize(
        std::make_pair(batchSizeShape, TRTDataType<uint64_t>::value));

    mSkipDecodeDevice = mBufferManager->gpu(batchSizeShape, TRTDataType<bool>::value);

    // curand states are stored as raw bytes, one curandState_t per batch slot.
    mCurandStatesDevice
        = mBufferManager->gpu(ITensor::makeShape({batchSize, sizeof(curandState_t)}), TRTDataType<int8_t>::value);

    // host buffers.
    mSkipDecodeHost = mBufferManager->pinnedPool(batchSizeShape, TRTDataType<bool>::value);
    TLLM_CHECK(mSkipDecodeHost != nullptr);

    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

template <typename T>
void SamplingLayer<T>::setup(SizeType32 batchSize, SizeType32 beamWidth, TensorConstPtr batchSlots,
    std::shared_ptr<BaseSetupParams> const& baseSetupParams,
    std::shared_ptr<runtime::DecodingLayerWorkspace> const& workspace)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);

    auto setupParams = std::dynamic_pointer_cast<SamplingSetupParams>(baseSetupParams);

    workspace->initializeDeviceCurandStates(
        setupParams->randomSeed, batchSize, workspace->getDeviceBatchSlots(), mCurandStatesDevice);

    if (setupParams->outputLogProbs)
    {
        // FIXME(nkorobov): monotonically growing
        mOutputLogProbs = std::any_of(setupParams->outputLogProbs->begin(), setupParams->outputLogProbs->end(),
            [this](bool outputLogProbs) { return this->mOutputLogProbs | outputLogProbs; });
    }

    if (setupParams->cumLogProbs)
    {
        // FIXME(nkorobov): monotonically growing
        mCumLogProbs = std::any_of(setupParams->cumLogProbs->begin(), setupParams->cumLogProbs->end(),
            [this](bool cumLogProbs) { return this->mCumLogProbs | cumLogProbs; });
    }

    for (auto&& layer : mSamplingLayers)
    {
        layer->setup(batchSize, beamWidth, batchSlots, setupParams, workspace);
    }

    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

template <typename T>
void SamplingLayer<T>::forwardAsync(std::shared_ptr<BaseDecodingOutputs> const& outputs,
    std::shared_ptr<BaseDecodingInputs> const& baseInputs,
    std::shared_ptr<runtime::DecodingLayerWorkspace> const& workspace)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);

    auto inputs = std::dynamic_pointer_cast<SamplingInputs>(baseInputs);

    auto const batchSize = inputs->logits.value()->getDimension<0>();

    auto const* endIds = bufferCast<TokenIdType>(*inputs->endIds);

    FinishedState const* finishedInput = (inputs->finished)
        ? reinterpret_cast<FinishedState const*>(bufferCast<FinishedState::UnderlyingType>(*inputs->finished.value()))
        : nullptr;

    auto const skipTopP = !mDecodingMode.isTopP();

    // Compute probabilities either for TopP or if cumLogProbs or outputLogProbs are specified
    bool const skipSoftMax = skipTopP && !mOutputLogProbs && !mCumLogProbs;

    inputs->curandStates = reinterpret_cast<curandState_t*>(bufferCast<int8_t>(*mCurandStatesDevice));
    inputs->probsComputed = !skipSoftMax;

    if (!skipSoftMax)
    {
        auto runtimeLogitsPtr = bufferCast<T>(*workspace->getDeviceRuntimeLogits());
        auto logitsPtrsPtr = static_cast<T**>(nullptr);
        auto biasPtr = static_cast<T*>(nullptr);
        auto const* batchSlotsPtr = workspace->getDeviceBatchSlotsPtr();

        // Convert logits to probabilities in place; no bias is applied here.
        BiasSoftmaxParams<T> biasSoftmaxParams;
        biasSoftmaxParams.logits = runtimeLogitsPtr;
        biasSoftmaxParams.logitsPtrs = logitsPtrsPtr;
        biasSoftmaxParams.probs = runtimeLogitsPtr;
        biasSoftmaxParams.bias = biasPtr;
        biasSoftmaxParams.endIds = endIds;
        biasSoftmaxParams.finished = finishedInput;
        biasSoftmaxParams.batchSlots = batchSlotsPtr;
        biasSoftmaxParams.batchSize = batchSize;
        biasSoftmaxParams.maxBatchSize = mDecoderDomain.getBatchSize();
        biasSoftmaxParams.maxBeamWidth = 1;
        biasSoftmaxParams.vocabSize = mDecoderDomain.getVocabSize();
        biasSoftmaxParams.vocabSizePadded = mDecoderDomain.getVocabSizePadded();
        biasSoftmaxParams.skipSoftMax = skipSoftMax;
        biasSoftmaxParams.batchSlotsLogits = false;
        biasSoftmaxParams.checkParams();
        invokeAddBiasSoftMax(biasSoftmaxParams, getStream());
        sync_check_cuda_error();
    }

    for (auto&& layer : mSamplingLayers)
    {
        layer->forwardAsync(outputs, baseInputs, workspace);
    }

    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

template <typename T>
size_t SamplingLayer<T>::getWorkspaceSize() const noexcept
{
    // Setup and forward reuse the same workspace; report the larger of the two requirements.
    return std::max(mWorkspaceSize, mSetupWorkspaceSize);
}

template class SamplingLayer<float>;
template class SamplingLayer<half>;

} // namespace tensorrt_llm::layers
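
// Illustrative sketch only, based solely on the interfaces exercised above: how a caller might
// drive SamplingLayer. DecoderDomain, SamplingSetupParams, SamplingInputs, and
// DecodingLayerWorkspace are assumed to come from the corresponding TensorRT-LLM headers;
// `stream`, `domain`, `batchSlots`, `setupParams`, `inputs`, `outputs`, and `workspace` are
// hypothetical caller-provided objects, and DecodingMode::TopKTopP() is an assumed factory name.
//
//   using namespace tensorrt_llm;
//   auto bufferManager = std::make_shared<runtime::BufferManager>(stream);
//   auto layer = std::make_unique<layers::SamplingLayer<float>>(
//       executor::DecodingMode::TopKTopP(), domain, bufferManager);
//   layer->setup(batchSize, /* beamWidth */ 1, batchSlots, setupParams, workspace); // seeds curand, configures TopK/TopP
//   layer->forwardAsync(outputs, inputs, workspace);                                // softmax (if needed) + sampling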