/*
 * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "lookaheadDecodingLayer.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/memoryUtils.h"
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/kernels/decodingKernels.h"
#include "tensorrt_llm/kernels/samplingTopKKernels.h"
#include "tensorrt_llm/layers/decodingParams.h"
#include "tensorrt_llm/layers/defaultDecodingParams.h"
#include "tensorrt_llm/layers/lookaheadAlgorithm.h"
#include "tensorrt_llm/layers/lookaheadDecodingUtils.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/iBuffer.h"
#include "tensorrt_llm/runtime/iTensor.h"

#include <curand_kernel.h>

#include <memory>
#include <optional>
#include <tuple>
#include <utility>
#include <vector>

namespace tensorrt_llm::layers
{

using namespace tensorrt_llm::common;
using namespace tensorrt_llm::kernels;
using namespace tensorrt_llm::runtime;

template <typename T>
LookaheadDecodingLayer<T>::CpuAlgorithmResources::CpuAlgorithmResources(DecoderDomain const& decoderDomain)
{
    auto maxBatchSize = decoderDomain.getBatchSize();
    auto lookaheadModule
        = std::dynamic_pointer_cast<LookaheadModule const>(decoderDomain.getSpeculativeDecodingModule());
    auto const [maxW, maxN, maxG] = lookaheadModule->getExecutionConfig().get();

    for (SizeType32 id = 0; id < maxBatchSize; id++)
    {
        mAlgos.emplace_back(maxW, maxN, maxG, id);
    }

    SizeType32 maxTokensPerStep, maxNumNewTokens, maxDraftLen;
    std::tie(maxTokensPerStep, maxNumNewTokens, maxDraftLen, std::ignore)
        = executor::LookaheadDecodingConfig(maxW, maxN, maxG).calculateSpeculativeResource();

    auto const maxBatchShape1D = ITensor::makeShape({maxBatchSize});
    mBatchSlots = BufferManager::cpu(maxBatchShape1D, nvinfer1::DataType::kINT32);
    mTargetTokens
        = BufferManager::cpu(ITensor::makeShape({maxBatchSize, maxTokensPerStep}), nvinfer1::DataType::kINT32);
    mTokensPerStep = BufferManager::cpu(maxBatchShape1D, nvinfer1::DataType::kINT32);
    mEndIds = BufferManager::cpu(maxBatchShape1D, nvinfer1::DataType::kINT32);
    mOutputIds = BufferManager::cpu(ITensor::makeShape({maxBatchSize, maxNumNewTokens}), nvinfer1::DataType::kINT32);
    mPathsOffsets
        = BufferManager::cpu(ITensor::makeShape({maxBatchSize, maxNumNewTokens}), nvinfer1::DataType::kINT32);
    mNumNewTokens = BufferManager::cpu(maxBatchShape1D, nvinfer1::DataType::kINT32);
    mNumNewTokensCumSum = BufferManager::cpu(ITensor::makeShape({maxBatchSize + 1}), nvinfer1::DataType::kINT32);
    mNextDraftTokens = BufferManager::cpu(ITensor::makeShape({maxBatchSize, maxDraftLen}), nvinfer1::DataType::kINT32);
    mNextDraftPosIds = BufferManager::cpu(ITensor::makeShape({maxBatchSize, maxDraftLen}), nvinfer1::DataType::kINT32);

    auto divUp32 = [](SizeType32 x) { return x / 32 + ((x % 32) ? 1 : 0); };
    mPackedMasks = BufferManager::cpu(
        ITensor::makeShape({maxBatchSize, maxTokensPerStep, divUp32(maxTokensPerStep)}), nvinfer1::DataType::kINT32);
    mSamplingMask = BufferManager::cpu(ITensor::makeShape({maxBatchSize, maxDraftLen}), nvinfer1::DataType::kBOOL);
    mNextDraftLengths = BufferManager::cpu(maxBatchShape1D, nvinfer1::DataType::kINT32);
    mSequenceLengths = BufferManager::cpu(maxBatchShape1D, nvinfer1::DataType::kINT32);
}

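// Device-side resources of the layer: a top-K (K = 1) sampling workspace sized for
// maxBatchSize x maxTokensPerStep candidates, plus device buffers for the sampled target tokens,
// per-request random seeds, sampling masks and curand states.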
template <typename T>
LookaheadDecodingLayer<T>::LookaheadDecodingLayer(
    DecoderDomain const& decoderDomain, std::shared_ptr<BufferManager> bufferManager)
    : BaseLayer(decoderDomain, bufferManager)
    , mCpuAlgo(std::make_optional<CpuAlgorithmResources>(decoderDomain))
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);

    auto const maxBatchSize = mDecoderDomain.getBatchSize();
    auto const maxTokensPerStep = mDecoderDomain.getMaxDecodingTokens();
    auto const vocabSizePadded = mDecoderDomain.getVocabSizePadded();
    auto const maxTopK = 1;
    auto const maxBatchShape1D = ITensor::makeShape({maxBatchSize});
    auto const maxBatchShape2D = ITensor::makeShape({maxBatchSize, maxTokensPerStep});

    auto workspaceSize = getTopKWorkspaceSize<T>(maxBatchSize, maxTokensPerStep, maxTopK, vocabSizePadded);
    mSamplingWorkspaceDevice
        = mBufferManager->gpu(ITensor::makeShape({static_cast<SizeType32>(workspaceSize)}), nvinfer1::DataType::kINT8);
    TLLM_LOG_DEBUG("workspaceSize=%zu", getWorkspaceSize());
    mTargetTokensDevice = mBufferManager->gpu(maxBatchShape2D, nvinfer1::DataType::kINT32);
    mRandomSeedsDevice = mBufferManager->gpu(maxBatchShape1D, nvinfer1::DataType::kINT64);
    mSamplingMaskDevice = mBufferManager->gpu(maxBatchShape2D, nvinfer1::DataType::kBOOL);
    mCurandStatesDevice = mBufferManager->gpu(maxBatchShape1D, nvinfer1::DataType::kINT8);

    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

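// Configures the per-slot CPU lookahead algorithm with the runtime (W, N, G) values, checks that the
// requested configuration fits within maxDecodingTokens, and initializes the curand states from the
// user-provided seed(s) or the default seed.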
template <typename T>
void LookaheadDecodingLayer<T>::setup(SizeType32 batchSize, SizeType32 beamWidth, BufferConstPtr batchSlots,
    std::shared_ptr<BaseSetupParams> const& baseSetupParams)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);

    auto setupParams = std::dynamic_pointer_cast<LookaheadSetupParams>(baseSetupParams);

    if (mCpuAlgo)
    {
        auto& algoConfigs = setupParams->algoConfigs;
        TLLM_CHECK_WITH_INFO(algoConfigs.size() == 1 || algoConfigs.size() == batchSize,
            "Lookahead runtime configuration size should be either 1 or batchSize");
        auto const batchSlotsRange = BufferRange<SizeType32 const>(*batchSlots);
        for (SizeType32 bi = 0; bi < batchSize; bi++)
        {
            auto const gbi = batchSlotsRange[bi];
            SizeType32 bi1orN = (algoConfigs.size() == 1) ? 0 : bi;
            TLLM_LOG_DEBUG("CPU ALGO [ %d ] setup", gbi);
            PRINT_TOKENS(setupParams->prompt[bi]);
            auto [w, n, g] = algoConfigs[bi1orN].get();
            SizeType32 runtimeTokensPerStep;
            std::tie(runtimeTokensPerStep, std::ignore, std::ignore, std::ignore)
                = executor::LookaheadDecodingConfig(w, n, g).calculateSpeculativeResource();
            TLLM_CHECK_WITH_INFO(runtimeTokensPerStep <= mDecoderDomain.getMaxDecodingTokens(),
                "runtime w(%d) n(%d) g(%d) exceeds maxTokensPerStep(%d)", w, n, g,
                mDecoderDomain.getMaxDecodingTokens());
            mCpuAlgo->mAlgos[gbi].setup(setupParams->prompt[bi], w, n, g);
        }
    }

    auto curandStatesDevicePtr = reinterpret_cast<curandState_t*>(bufferCast<int8_t>(*mCurandStatesDevice));
    auto batchSlotsPtr = bufferCastOrNull<SizeType32>(batchSlots);
    if (setupParams->randomSeed)
    {
        auto& randomSeed = setupParams->randomSeed.value();
        if (randomSeed.size() == 1)
        {
            invokeCurandInitialize(curandStatesDevicePtr, batchSlotsPtr, batchSize, randomSeed.front(), getStream());
            sync_check_cuda_error();
        }
        else
        {
            TLLM_CHECK_WITH_INFO(randomSeed.size() == batchSize, "Random seed vector size mismatch.");
            cudaAutoCpy(bufferCast<uint64_t>(*mRandomSeedsDevice), randomSeed.data(), batchSize, getStream());
            invokeCurandBatchInitialize(curandStatesDevicePtr, batchSlotsPtr, batchSize,
                bufferCast<uint64_t>(*mRandomSeedsDevice), getStream());
            sync_check_cuda_error();
        }
    }
    else
    {
        invokeCurandInitialize(
            curandStatesDevicePtr, batchSlotsPtr, batchSize, DefaultDecodingParams::getSeed(), getStream());
    }

    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

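// GPU half of a decoding step: copies the inputs needed by the CPU algorithm into host buffers,
// samples the top-1 token for every candidate position of every request with invokeBatchTopKSampling,
// and copies the sampled tokens back to the host for forwardSyncCPU to verify.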
template <typename T>
void LookaheadDecodingLayer<T>::forwardAsync(
    std::shared_ptr<BaseDecodingOutputs> const& outputParams, std::shared_ptr<BaseDecodingInputs> const& inputParams)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
    auto inputs = std::dynamic_pointer_cast<LookaheadDecodingInputs>(inputParams);
    auto outputs = std::dynamic_pointer_cast<LookaheadDecodingOutputs>(outputParams);
    auto batchSize = inputs->localBatchSize;

    TLLM_CHECK_WITH_INFO(inputs->batchSlots, "Batch slots must be provided for LookaheadDecoding");
    TLLM_CHECK_WITH_INFO(inputs->curTokensPerStep, "curTokensPerStep must be provided for LookaheadDecoding");
    TLLM_CHECK_WITH_INFO(outputs->sequenceLength, "sequenceLength must be provided for LookaheadDecoding");
    // TODO(liweim) to be confirmed.
    TLLM_CHECK(inputs->logits);

    mBufferManager->copy(
        bufferCast<SizeType32>(*inputs->batchSlots.value()), *mCpuAlgo->mBatchSlots, runtime::MemoryType::kGPU);
    mBufferManager->copy(bufferCast<SizeType32>(*inputs->curTokensPerStep.value()), *mCpuAlgo->mTokensPerStep,
        runtime::MemoryType::kGPU);
    mBufferManager->copy(bufferCast<TokenIdType>(*inputs->endIds), *mCpuAlgo->mEndIds, runtime::MemoryType::kGPU);
    mBufferManager->copy(bufferCast<SizeType32>(*outputs->sequenceLength.value()), *mCpuAlgo->mSequenceLengths,
        runtime::MemoryType::kGPU);

    TopKSamplingKernelParams<T> params;
    params.maxBatchSize = mDecoderDomain.getBatchSize();
    params.batchSize = batchSize;
    params.maxTopK = 1;
    params.returnAllTopK = true;
    params.maxTokensPerStep = mDecoderDomain.getMaxDecodingTokens();
    params.maxSeqLen = mDecoderDomain.getMaxDecodingTokens();
    params.vocabSizePadded = mDecoderDomain.getVocabSizePadded();
    params.batchSlots = bufferCast<SizeType32>(*inputs->batchSlots.value());
    TLLM_LOG_DEBUG("batchSize = %d", batchSize);
    params.logProbs = bufferCastOrNull<T>(inputs->logits);
    params.outputIds = bufferCast<TokenIdType>(*mTargetTokensDevice);
    params.workspace = bufferCast<int8_t>(*mSamplingWorkspaceDevice);
    params.curandState = reinterpret_cast<curandState_t*>(bufferCast<int8_t>(*mCurandStatesDevice));
    params.tokensPerStep = bufferCast<SizeType32>(*inputs->curTokensPerStep.value());

    TLLM_LOG_DEBUG(
        "invokeBatchTopKSampling: maxBatchSize=%d, batchSize=%d, maxTopK=%d, maxTokensPerStep=%d, maxSeqLen=%d, "
        "vocabSizePadded=%d",
        params.maxBatchSize, params.batchSize, params.maxTopK, params.maxTokensPerStep, params.maxSeqLen,
        params.vocabSizePadded);
    // Sample multiple tokens per request and store them in a separate buffer to be accepted/rejected later.
    // Sequence length is not modified, endIds is not checked, outputLogProbs are not supported.
    // Finished state is not set.
    invokeBatchTopKSampling(params, getStream());

    mBufferManager->copy(*mTargetTokensDevice, *mCpuAlgo->mTargetTokens);

    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

template <typename T>
void LookaheadDecodingLayer<T>::forwardSync(
    std::shared_ptr<BaseDecodingOutputs> const& outputParams, std::shared_ptr<BaseDecodingInputs> const& inputParams)
{
    if (mCpuAlgo)
    {
        forwardSyncCPU(outputParams, inputParams);
    }
}

template <typename T>
size_t LookaheadDecodingLayer<T>::getWorkspaceSize() const noexcept
{
    return mSamplingWorkspaceDevice->getSizeInBytes();
}

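// Converts draft-token position ids into a packed attention mask (32 positions per int32 word).
// Bit j of row i is set iff draft token i may attend to token j; row 0 (the non-draft base token)
// only has its own bit set. The stack tracks the ancestor chain of the branch currently being walked.
// Illustrative trace (not from the original sources): posIds = {1, 2, 3, 1, 2} produces rows with
// set-bit positions {0}, {0,1}, {0,1,2}, {0,1,2,3}, {0,4}, {0,4,5}.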
template <typename T>
void LookaheadDecodingLayer<T>::posIdsToMask(TensorPtr mask, TensorConstPtr posIds)
{
    auto len = ITensor::volume(posIds->getShape());
    TLLM_CHECK(mask->getDimension<0>() > len);
    TLLM_CHECK(mask->getDimension<1>() * 32 > len);
    auto posIdsRange = BufferRange<SizeType32 const>(*posIds);
    auto maskLocation = BufferLocation<SizeType32>(*mask);

    for (auto i = 0; i < maskLocation.size(); i++)
    {
        maskLocation[i] = 0;
    }
    maskLocation.at(0, 0) = 1;
    auto setBit = [](SizeType32& x, SizeType32 idx) { x |= (1 << idx); };

    if (len > 0)
    {
        std::vector<std::pair<SizeType32, SizeType32>> stack;
        stack.push_back(std::make_pair(0, posIdsRange[0] - 1));
        for (auto i = 1; i < len + 1; i++)
        {
            auto cur = posIdsRange[i - 1];
            while (stack.size() > 0 && cur <= stack.back().second)
            {
                stack.pop_back();
            }
            TLLM_CHECK(stack.size() > 0 ? cur == stack.back().second + 1 : true);
            stack.push_back(std::make_pair(i, cur));
            for (auto prev : stack)
            {
                setBit(maskLocation.at(i, prev.first / 32), prev.first % 32);
            }
        }
    }
}

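// CPU half of a decoding step: for each active slot, either accepts the single sampled token (first
// generation step, no draft yet) or lets LookaheadAlgorithm::update() verify the draft against the
// sampled tokens; appends the accepted tokens to outputIds, asks the algorithm to prepare the next
// draft (tokens, position ids, sampling mask, lengths), packs the corresponding attention masks,
// builds the cumulative sum of accepted-token counts, and copies all results to the device outputs.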
template <typename T>
void LookaheadDecodingLayer<T>::forwardSyncCPU(
    std::shared_ptr<BaseDecodingOutputs> const& outputParams, std::shared_ptr<BaseDecodingInputs> const& inputParams)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
    auto inputs = std::dynamic_pointer_cast<LookaheadDecodingInputs>(inputParams);
    auto outputs = std::dynamic_pointer_cast<LookaheadDecodingOutputs>(outputParams);
    auto const batchSize = inputs->localBatchSize;

    BufferRange<SizeType32> tokensPerStepRange(*mCpuAlgo->mTokensPerStep);
    BufferRange<SizeType32> numNewTokensRange(*mCpuAlgo->mNumNewTokens);
    BufferRange<SizeType32> numNewTokensCumSumRange(*mCpuAlgo->mNumNewTokensCumSum);
    BufferRange<SizeType32> batchSlotsRange(*mCpuAlgo->mBatchSlots);
    BufferRange<SizeType32> nextDraftLengthsRange(*mCpuAlgo->mNextDraftLengths);
    BufferRange<SizeType32> sequenceLengthsRange(*mCpuAlgo->mSequenceLengths);

    for (SizeType32 bi = 0; bi < batchSize; bi++)
    {
        SizeType32 gbi = batchSlotsRange[bi];
        LookaheadAlgorithm& theAlgo(mCpuAlgo->mAlgos[gbi]);

        SizeType32 const tokensPerStep = tokensPerStepRange[gbi];
        TensorPtr sampledTokens = ITensor::slice(mCpuAlgo->mTargetTokens, {gbi, 0}, tokensPerStep);

        if (tokensPerStep == 1)
        {
            // The first step in generation phase has no draft tokens.
            theAlgo.accept(sampledTokens);
            mBufferManager->copy(*sampledTokens, *ITensor::slice(mCpuAlgo->mOutputIds, {gbi, 0}, tokensPerStep));
            BufferLocation<SizeType32>(*mCpuAlgo->mPathsOffsets).at(gbi, 0) = 0;
            numNewTokensRange[gbi] = tokensPerStep;
            BufferLocation<SizeType32>(*mCpuAlgo->mNextDraftLengths).at(gbi) = 0;
        }
        else
        {
            theAlgo.update(                                  //
                ITensor::at(mCpuAlgo->mOutputIds, {gbi}),    //
                ITensor::at(mCpuAlgo->mPathsOffsets, {gbi}), //
                ITensor::at(mCpuAlgo->mNumNewTokens, {gbi}), //
                sampledTokens,                               //
                ITensor::at(mCpuAlgo->mEndIds, {gbi}));
        }

        auto maxNumNewTokens = mCpuAlgo->mOutputIds->getShape().d[1];

        mBufferManager->copy(*ITensor::at(mCpuAlgo->mOutputIds, {gbi}),
            *ITensor::slice(outputs->outputIds, {gbi, sequenceLengthsRange[gbi]}, maxNumNewTokens));

        sequenceLengthsRange[gbi] += numNewTokensRange[gbi];

        theAlgo.prepare(                                         //
            ITensor::at(mCpuAlgo->mNextDraftTokens, {gbi}),      //
            ITensor::at(mCpuAlgo->mNextDraftPosIds, {gbi}),      //
            ITensor::at(mCpuAlgo->mSamplingMask, {gbi}),         //
            ITensor::at(mCpuAlgo->mNextDraftLengths, {gbi}),     //
            ITensor::at(mCpuAlgo->mSequenceLengths, {gbi}),      //
            ITensor::at(mCpuAlgo->mOutputIds, {gbi, numNewTokensRange[gbi] - 1}));

        posIdsToMask(                                            //
            ITensor::at(mCpuAlgo->mPackedMasks, {gbi}),          //
            ITensor::slice(mCpuAlgo->mNextDraftPosIds, {gbi, 0}, nextDraftLengthsRange[gbi]));
    }

    numNewTokensCumSumRange[0] = 0;
    for (SizeType32 i = 0; i < numNewTokensRange.size(); i++)
    {
        numNewTokensCumSumRange[i + 1] = numNewTokensCumSumRange[i] + numNewTokensRange[i];
    }

    TLLM_CHECK(outputs->numNewTokens);

    mBufferManager->copy(*mCpuAlgo->mSequenceLengths, //
        const_cast<void*>(outputs->sequenceLength.value()->data()), runtime::MemoryType::kGPU);
    mBufferManager->copy(*mCpuAlgo->mPathsOffsets, //
        const_cast<void*>(outputs->pathsOffsets->data()), runtime::MemoryType::kGPU);
    mBufferManager->copy(*mCpuAlgo->mNumNewTokens, //
        const_cast<void*>(outputs->numNewTokens.value()->data()), runtime::MemoryType::kGPU);
    mBufferManager->copy(*mCpuAlgo->mNumNewTokensCumSum, //
        const_cast<void*>(outputs->numNewTokensCumSum->data()), runtime::MemoryType::kGPU);
    mBufferManager->copy(*mCpuAlgo->mNextDraftTokens, //
        const_cast<void*>(outputs->nextDraftTokens->data()), runtime::MemoryType::kGPU);
    mBufferManager->copy(*mCpuAlgo->mNextDraftPosIds, //
        const_cast<void*>(outputs->nextDraftPosIds->data()), runtime::MemoryType::kGPU);
    mBufferManager->copy(*mCpuAlgo->mPackedMasks, //
        const_cast<void*>(outputs->packedMasks->data()), runtime::MemoryType::kGPU);
    mBufferManager->copy(*mCpuAlgo->mNextDraftLengths, //
        const_cast<void*>(outputs->nextDraftLengths->data()), runtime::MemoryType::kGPU);

    // TODO(liweim) do we need this?
    // mBufferManager->getStream().synchronize();

    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

template class LookaheadDecodingLayer<float>;
template class LookaheadDecodingLayer<half>;

} // namespace tensorrt_llm::layers