/*
 * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <gtest/gtest.h>
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <list>
#include <memory>
#include <optional>
#include <queue>
#include <set>
#include <sstream>
#include <string>
#include <tuple>
#include <vector>

#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/layers/decodingParams.h"
#include "tensorrt_llm/layers/lookaheadDecodingLayer.h"
#include "tensorrt_llm/layers/lookaheadDecodingUtils.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/iBuffer.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/lookaheadModule.h"
#include "tensorrt_llm/runtime/runtimeKernels.h"
#include "tests/layers/randomLlm.h"

namespace tensorrt_llm::tests::layers
{
using namespace tensorrt_llm::runtime;
using namespace tensorrt_llm::layers;

namespace trk = tensorrt_llm::runtime::kernels;

using TensorPtr = runtime::ITensor::SharedPtr;
using TensorConstPtr = runtime::ITensor::SharedConstPtr;

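// One test configuration: the batch capacity, how batch slots are exercised,
// and the lookahead (maxW, maxN, maxG) limits used to size the decoder,
// together with the per-request (w, n, g) values, which must not exceed them.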
struct TestParam
{
    SizeType32 maxBatchSize;

    enum BatchType
    {
        SINGLE_ONCE,
        SINGLE_TWICE,
        DYNAMIC
    } batchType;

    SizeType32 maxW;
    SizeType32 w;
    SizeType32 maxN;
    SizeType32 n;
    SizeType32 maxG;
    SizeType32 g;
};

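// Abstract slot allocator that drives a test: it hands out batch slots for new
// requests and tracks how many of the planned cases are still outstanding.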
class BatchSlotsManager
{
public:
    BatchSlotsManager(SizeType32 maxBatchSize, SizeType32 cases)
        : mMaxBatchSize(maxBatchSize)
        , mCases(cases)
    {
    }

    virtual ~BatchSlotsManager() = default;

    virtual std::vector<SizeType32> alloc() = 0;
    virtual void free(SizeType32 id) = 0;

    bool finished() const
    {
        return mCases == 0;
    }

protected:
    SizeType32 quota() const
    {
        return mCases - mRunning;
    }

    void consume(SizeType32 cases)
    {
        TLLM_CHECK(cases >= 0);
        TLLM_CHECK_DEBUG_WITH_INFO(cases <= mCases, "cases=%d, mCases=%d", cases, mCases);
        mRunning -= cases;
        mCases -= cases;
    }

protected:
    SizeType32 mMaxBatchSize{0};
    SizeType32 mCases{0};
    SizeType32 mRunning{0};
};

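// Runs all cases one at a time through a single, fixed batch slot.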
class SingleBatchSlotsManager : public BatchSlotsManager
{
public:
    SingleBatchSlotsManager(SizeType32 maxBatchSize, SizeType32 cases, SizeType32 id)
        : BatchSlotsManager(maxBatchSize, cases)
        , mId(id)
    {
        TLLM_CHECK(id < maxBatchSize);
    }

    std::vector<SizeType32> alloc() override
    {
        if (mState == FREE && quota() > 0)
        {
            mState = BUSY;
            mRunning += 1;
            return std::vector<SizeType32>({mId});
        }
        else
        {
            return std::vector<SizeType32>();
        }
    }

    void free(SizeType32 id) override
    {
        TLLM_CHECK(id == mId);
        mState = FREE;
        consume(1);
    }

private:
    enum
    {
        FREE,
        BUSY
    } mState{FREE};

    SizeType32 mId;
};

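// Admits a random number of requests into non-monotonically ordered slots, so
// requests of different ages and lengths coexist in one batch.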
class DynamicBatchSlotsManager : public BatchSlotsManager
{
public:
    DynamicBatchSlotsManager(SizeType32 maxBatchSize, SizeType32 cases)
        : BatchSlotsManager(maxBatchSize, cases)
    {
        // Seed the free list in a scrambled (non-monotonic) slot order.
        for (SizeType32 bi = 0; bi * 3 + 2 < maxBatchSize; bi++)
        {
            mFreeList.push(bi * 3 + 1);
            mFreeList.push(bi * 3 + 2);
            mFreeList.push(bi * 3);
        }
    }

    std::vector<SizeType32> alloc() override
    {
        // Admit a random number of new requests only when the batch is less
        // than a quarter full, bounded by the remaining case quota.
        SizeType32 waterline = mMaxBatchSize / 4;
        SizeType32 plan = static_cast<SizeType32>(mBusySet.size()) < waterline ? rand() % (mMaxBatchSize / 4) : 0;
        SizeType32 num = std::min(plan, quota());
        std::vector<SizeType32> result;
        for (SizeType32 i = 0; i < num && !mFreeList.empty(); i++)
        {
            SizeType32 id = mFreeList.front();
            result.push_back(id);
            mBusySet.insert(id);
            mFreeList.pop();
        }
        mRunning += static_cast<SizeType32>(result.size());
        return result;
    }

    void free(SizeType32 id) override
    {
        auto search = mBusySet.find(id);
        TLLM_CHECK(search != mBusySet.end());
        mBusySet.erase(search);
        mFreeList.push(id);
        consume(1);
    }

private:
    std::queue<SizeType32> mFreeList;
    std::set<SizeType32> mBusySet;
};

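// End-to-end test for LookaheadDecodingLayer: a RandomLlm plays the LLM and
// scores draft tokens against a known oracle string, while the layer under
// test proposes drafts and accepts tokens until each request emits the end
// token.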
class LookaheadDecodingLayerTest : public testing::Test
{
public:
    void SetUp() override;
    void TearDown() override;
    void runTest(TestParam const& param);

private:
    void allocateBuffers();

    void setupBuffers();

    void newRequests(std::vector<SizeType32> requestIds);

    void manageBatch();

    void llmForward();

    void decodeForward();

    void verifyDecode();

protected:
    std::shared_ptr<tensorrt_llm::runtime::BufferManager> mBufferManager;
    std::shared_ptr<tensorrt_llm::runtime::CudaStream> mStream;

    cudaDeviceProp mDeviceProp;

    TensorPtr mAlgoConfigBatch;

    TensorPtr mOutputIds;
    TensorPtr mSequenceLengths;
    TensorPtr mProbs;
    TensorPtr mEndIds;
    TensorPtr mTokensPerStep;
    TensorPtr mGoldenSampledTokens;
    TensorPtr mBatchSlots;
    TensorPtr mBatchSlotsMax;

    TensorPtr mNewTokens;
    TensorPtr mNumNewTokens;
    TensorPtr mNumNewTokensCumSum;
    TensorPtr mPathsOffsets;
    TensorPtr mDraftLengths;
    TensorPtr mPrevDraftLengths;
    TensorPtr mDraftTokens;
    TensorPtr mPackedMasks;
    TensorPtr mPackedMasksBool;
    TensorPtr mGenerationLengths;
    TensorPtr mPositionOffsets;
    TensorPtr mPositionIds;
    TensorPtr mAttentionPackedMask;

    TensorPtr mInputTokensBatch;
    TensorPtr mPositionIdsBatch;

    int32_t mMaxTopK = 1;
    static constexpr int32_t mMaxSeqLen = 512;
    float mMaxTopP = 1.0;
    std::shared_ptr<AsciiRandomTokenLogits> mAscii;
    std::vector<std::string> mOracle;
    std::vector<TensorPtr> mPrompt;
    std::vector<std::shared_ptr<RandomLlm>> mLlm;
    std::shared_ptr<LookaheadDecodingLayer<float>> mDecoder;
    std::shared_ptr<DecodingLayerWorkspace> mDecodingWorkspace;
    SizeType32 mVocabSize;
    SizeType32 mMaxTokensPerStep;
    TestParam mTestParam;
    std::shared_ptr<BatchSlotsManager> mBatchSlotsManager;
    std::vector<std::ostringstream> mScoreBoard;
    std::vector<TensorPtr> mHistogram;
    std::list<std::string> mReports;
};

void LookaheadDecodingLayerTest::SetUp()
{
    mStream = std::make_shared<tensorrt_llm::runtime::CudaStream>();
    mBufferManager = std::make_shared<tensorrt_llm::runtime::BufferManager>(mStream);

    int32_t device = 0;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&mDeviceProp, device);

    mAscii = std::make_shared<AsciiRandomTokenLogits>();
    mVocabSize = mAscii->getVocabSize();
}

void LookaheadDecodingLayerTest::TearDown() {}

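// Sizes all buffers from the (maxW, maxN, maxG) speculative-resource estimate,
// builds one RandomLlm oracle per slot, and constructs the decoder layer.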
void LookaheadDecodingLayerTest::allocateBuffers()
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
    auto const maxBatchSize = mTestParam.maxBatchSize;
    auto const vocabSize = mAscii->getVocabSize();
    auto const maxBeamSize = 1;

    SizeType32 maxNumNewTokens, maxDraftLen, maxAcceptedDraftLen;
    std::tie(mMaxTokensPerStep, maxNumNewTokens, maxDraftLen, maxAcceptedDraftLen)
        = executor::LookaheadDecodingConfig(mTestParam.maxW, mTestParam.maxN, mTestParam.maxG)
              .calculateSpeculativeResource();

    auto const vocabSizePadded = vocabSize;

    std::vector<std::string> text({//
        std::string("To be, or not to be: that is the question. "
                    "To Be, Or Not To Be: That Is The Question.&"),
        std::string("Be not afraid of greatness. Some are born great, some achieve greatness, and others have "
                    "greatness thrust upon them. "
                    "Be Not Afraid Of Greatness. Some Are Born Great, Some Achieve Greatness, And Others Have "
                    "Greatness Thrust Upon Them.&"),
        std::string("Sweet are the uses of adversity which, like the toad, ugly and venomous, wears yet a precious "
                    "jewel in his head. "
                    "Sweet Are The Uses Of Adversity Which, Like The Toad, Ugly And Venomous, Wears Yet A Precious "
                    "Jewel In His Head.&"),
        std::string("Talking isn't doing. It is a kind of good deed to say well; and yet words are not deeds. "
                    "Talking Isn't Doing. It Is A Kind Of Good Deed To Say Well; And Yet Words Are Not Deeds.&"),
        std::string(
            "Reputation is an idle and most false imposition; oft got without merit, and lost without deserving. "
            "Reputation Is An Idle And Most False Imposition; Oft Got Without Merit, And Lost Without Deserving.&")});

    mOracle.resize(maxBatchSize);
    mLlm.resize(maxBatchSize);
    mPrompt.resize(maxBatchSize);
    mScoreBoard.resize(maxBatchSize);
    mHistogram.resize(maxBatchSize);
    for (SizeType32 gbi = 0; gbi < maxBatchSize; gbi++)
    {
        mOracle[gbi] = text[rand() % text.size()];
        mLlm[gbi] = std::make_shared<LookaheadRandomLlm>(mAscii, mOracle[gbi], gbi);

        mScoreBoard[gbi] = std::ostringstream();
        mHistogram[gbi] = BufferManager::cpu(ITensor::makeShape({mTestParam.n + 1}), nvinfer1::DataType::kINT32);
    }
    switch (mTestParam.batchType)
    {
    case TestParam::SINGLE_ONCE:
        mBatchSlotsManager = std::make_shared<SingleBatchSlotsManager>(maxBatchSize, 1, 1);
        break;
    case TestParam::SINGLE_TWICE:
        mBatchSlotsManager = std::make_shared<SingleBatchSlotsManager>(maxBatchSize, 2, 1);
        break;
    case TestParam::DYNAMIC:
        mBatchSlotsManager = std::make_shared<DynamicBatchSlotsManager>(maxBatchSize, maxBatchSize * 2);
        break;
    }

    auto lookaheadModule = std::make_shared<LookaheadModule>(mTestParam.maxN, mMaxTokensPerStep - 1);

    lookaheadModule->setExecutionConfig(
        executor::LookaheadDecodingConfig(mTestParam.maxW, mTestParam.maxN, mTestParam.maxG));
    auto const decodingDomain
        = tensorrt_llm::layers::DecoderDomain(maxBatchSize, 1, vocabSize, vocabSizePadded, lookaheadModule);

    mDecoder = std::make_shared<LookaheadDecodingLayer<float>>(decodingDomain, mBufferManager);

    TLLM_LOG_DEBUG("decoder ok");

    auto maxBatchShape1D = ITensor::makeShape({maxBatchSize});

    mAlgoConfigBatch = BufferManager::pinnedPool(ITensor::makeShape({maxBatchSize, 3}), nvinfer1::DataType::kINT32);

    mEndIds = BufferManager::pinnedPool(maxBatchShape1D, nvinfer1::DataType::kINT32);
    mTokensPerStep = BufferManager::pinnedPool(maxBatchShape1D, nvinfer1::DataType::kINT32);

    mOutputIds = BufferManager::pinnedPool(
        ITensor::makeShape({maxBatchSize, maxBeamSize, mMaxSeqLen + mMaxTokensPerStep}), nvinfer1::DataType::kINT32);
    mSequenceLengths = BufferManager::pinnedPool(maxBatchShape1D, nvinfer1::DataType::kINT32);

    mProbs = BufferManager::pinnedPool(
        ITensor::makeShape({maxBatchSize, mMaxTokensPerStep, vocabSize}), nvinfer1::DataType::kFLOAT);

    mGoldenSampledTokens
        = BufferManager::cpu(ITensor::makeShape({maxBatchSize, mMaxTokensPerStep}), nvinfer1::DataType::kINT32);
    mInputTokensBatch
        = BufferManager::pinnedPool(ITensor::makeShape({maxBatchSize, mMaxTokensPerStep}), nvinfer1::DataType::kINT32);
    mPositionIdsBatch
        = BufferManager::pinnedPool(ITensor::makeShape({maxBatchSize, mMaxTokensPerStep}), nvinfer1::DataType::kINT32);

    mNewTokens = BufferManager::pinnedPool(
        ITensor::makeShape({mMaxTokensPerStep, maxBatchSize, 1}), nvinfer1::DataType::kINT32);
    mNumNewTokens = BufferManager::pinnedPool(maxBatchShape1D, nvinfer1::DataType::kINT32);
    mDraftLengths = BufferManager::pinnedPool(maxBatchShape1D, nvinfer1::DataType::kINT32);
    mPrevDraftLengths = BufferManager::pinnedPool(maxBatchShape1D, nvinfer1::DataType::kINT32);
    mDraftTokens
        = BufferManager::pinnedPool(ITensor::makeShape({maxBatchSize, maxDraftLen}), nvinfer1::DataType::kINT32);
    auto packedMaskShape = ITensor::makeShape(
        {maxBatchSize, mMaxTokensPerStep, static_cast<ITensor::DimType64>(common::divUp(mMaxTokensPerStep, 32))});
    mPackedMasks = BufferManager::pinnedPool(packedMaskShape, nvinfer1::DataType::kINT32);
    mPackedMasksBool = BufferManager::pinnedPool(
        ITensor::makeShape({maxBatchSize, mMaxTokensPerStep, mMaxTokensPerStep}), nvinfer1::DataType::kBOOL);
    mNumNewTokensCumSum = BufferManager::pinnedPool(ITensor::makeShape({maxBatchSize + 1}), nvinfer1::DataType::kINT32);
    mPathsOffsets = BufferManager::pinnedPool(
        ITensor::makeShape({maxBatchSize, maxAcceptedDraftLen}), nvinfer1::DataType::kINT32);
    mGenerationLengths = BufferManager::pinnedPool(maxBatchShape1D, nvinfer1::DataType::kINT32);
    mPositionOffsets
        = BufferManager::pinnedPool(ITensor::makeShape({maxBatchSize, mMaxTokensPerStep}), nvinfer1::DataType::kINT32);
    mPositionIds
        = BufferManager::pinnedPool(ITensor::makeShape({maxBatchSize, mMaxTokensPerStep}), nvinfer1::DataType::kINT32);
    mAttentionPackedMask = BufferManager::pinnedPool(packedMaskShape, nvinfer1::DataType::kINT32);

    mBatchSlotsMax = BufferManager::pinnedPool(maxBatchShape1D, nvinfer1::DataType::kINT32);

    auto const batchSize = 0;
    mBatchSlots = ITensor::slice(mBatchSlotsMax, 0, batchSize);

    trk::invokeFill(*mEndIds, mAscii->getEndToken(), *mStream);
    trk::invokeFill(*mOutputIds, int32_t{0}, *mStream);
    trk::invokeFill(*mSequenceLengths, int32_t{0}, *mStream);
    trk::invokeFill(*mTokensPerStep, mMaxTokensPerStep, *mStream);
    mDecodingWorkspace = std::make_unique<tensorrt_llm::runtime::DecodingLayerWorkspace>(
        mBufferManager, decodingDomain, TRTDataType<float>::value, mDecoder->getWorkspaceSize());

    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

void LookaheadDecodingLayerTest::setupBuffers() {}

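// Starts the given requests: seeds each output sequence with a random-length
// oracle prefix plus one context token, appends the request ids to the batch
// slots, and calls LookaheadDecodingLayer::setup for the new slots.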
void LookaheadDecodingLayerTest::newRequests(std::vector<SizeType32> requestIds)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
    auto const requestSize = static_cast<SizeType32>(requestIds.size());

    auto const beamSize = 1;

    ////////////////////////////////
    for (auto gbi : requestIds)
    {
        // Take a short random prefix of the oracle as the prompt, followed by
        // one context token.
        auto len = 5 + rand() % 10;
        auto prompt = mOracle[gbi].substr(0, len);

        TokenIdType contextToken = mOracle[gbi][len];
        SizeType32 contextLen = len + 1;

        BufferRange<TokenIdType> outputRange(*ITensor::at(mOutputIds, {gbi, 0}));
        for (auto& v : outputRange)
        {
            v = 0;
        }
        std::copy(prompt.begin(), prompt.end(), outputRange.begin());
        outputRange[len] = contextToken;
        BufferLocation<SizeType32>(*mSequenceLengths).at(gbi) = contextLen;
        BufferLocation<SizeType32>(*mDraftLengths).at(gbi) = 0;
        BufferLocation<SizeType32>(*mNumNewTokens).at(gbi) = 0;

        mPrompt[gbi] = ITensor::slice(mOutputIds, {gbi, 0, 0}, contextLen);

        for (auto& v : BufferRange<SizeType32>(*mHistogram[gbi]))
        {
            v = 0;
        }
        mScoreBoard[gbi] << "request id=[" << gbi << "] starts. prompt len=[" << len << "].";
    }

    TLLM_LOG_DEBUG("batch slots");
    ////////////////////////////////
    // Append the new request ids after the currently active batch slots.
    auto batchSize = ITensor::volume(mBatchSlots->getShape());
    BufferRange<SizeType32> batchSlotMaxRange(*mBatchSlotsMax);
    std::copy(requestIds.begin(), requestIds.end(), batchSlotMaxRange.begin() + batchSize);

    ////////////////////////////////
    auto setupParams = std::make_shared<LookaheadSetupParams>();
    setupParams->prompt.resize(0);
    setupParams->algoConfigs.resize(0);
    for (SizeType32 bi = 0; bi < requestSize; bi++)
    {
        SizeType32 gbi = requestIds[bi];
        setupParams->prompt.emplace_back(mPrompt[gbi]);
        setupParams->algoConfigs.emplace_back(mTestParam.w, mTestParam.n, mTestParam.g);
        PRINT_TOKENS(setupParams->prompt[bi]);
    }
    setupParams->generationLengths = mGenerationLengths;
    setupParams->positionOffsets = mPositionOffsets;
    setupParams->attentionPackedMasks = mPackedMasks;
    std::vector<uint64_t> seed(requestIds.begin(), requestIds.end());
    setupParams->randomSeed = std::make_optional(seed);
    TensorPtr newRequestSlots = ITensor::slice(mBatchSlotsMax, batchSize, requestSize);
    PRINT_VALUES(newRequestSlots);
    PRINT_VALUES(mBatchSlotsMax);
    mBatchSlots = ITensor::slice(mBatchSlotsMax, 0, batchSize);
    mDecodingWorkspace->setDeviceBatchSlots(newRequestSlots);
    mDecoder->setup(requestSize, beamSize, newRequestSlots, setupParams, mDecodingWorkspace);

    PRINT_VALUES(mPositionOffsets);

    batchSize += requestIds.size();
    mBatchSlots = ITensor::slice(mBatchSlotsMax, 0, batchSize);
    TLLM_LOG_DEBUG("new Requests mBatchSlots %s", D(mBatchSlots).values<int32_t>().c_str());
    PRINT_VALUES(mSequenceLengths);

    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

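// Per-step bookkeeping: admits new requests, verifies and retires finished
// ones, and packs the next step's input tokens (last accepted token plus the
// current draft) and position ids for the surviving requests.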
void LookaheadDecodingLayerTest::manageBatch()
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
    auto requests = mBatchSlotsManager->alloc();
    if (requests.size() > 0)
    {
        newRequests(requests);
    }
    PRINT_VALUES(mSequenceLengths);

    auto batchSize = ITensor::volume(mBatchSlots->getShape());
    BufferRange<SizeType32> batchSlotsRange(*mBatchSlots);
    auto newBatchSize = 0;
    PRINT_VALUES(mBatchSlots);
    for (SizeType32 bi = 0; bi < batchSize; bi++)
    {
        SizeType32 gbi = batchSlotsRange[bi];
        SizeType32 nbi = newBatchSize;

        TensorPtr theSequence = ITensor::at(mOutputIds, {gbi, 0});
        BufferRange<SizeType32> theSequenceRange(*theSequence);
        auto theSequenceLength = BufferRange<SizeType32>(*mSequenceLengths)[gbi];
        auto theNumNewTokens = BufferRange<SizeType32>(*mNumNewTokens)[gbi];

        TensorPtr generated = ITensor::slice(theSequence, 0, theSequenceLength);

        PRINT_TOKENS(generated);
        EXPECT_TRUE(mLlm[gbi]->verify(0, generated));

        BufferRange<SizeType32>(*mHistogram[gbi])[theNumNewTokens] += 1;

        if (BufferLocation<TokenIdType>(*theSequence).at(theSequenceLength - 1) == mAscii->getEndToken())
        {
            // The request has generated the end token; report and release its slot.
            TLLM_LOG_DEBUG("request[%d] ends: '%s'", gbi, D(theSequence).string().c_str());
            mScoreBoard[gbi] << "[" << gbi << "] ends. " << D(mHistogram[gbi]).values<SizeType32>();
            mReports.push_back(mScoreBoard[gbi].str());
            mScoreBoard[gbi].str("");
            mScoreBoard[gbi].clear();
            mBatchSlotsManager->free(gbi);
        }
        else
        {
            batchSlotsRange[newBatchSize++] = gbi;
        }

        // For an ended request, nbi is not claimed above, so the values written
        // below are either overwritten by a later surviving request or ignored,
        // since only the first newBatchSize rows are consumed next step.
        auto theDraftLen = BufferRange<SizeType32>(*mDraftLengths)[gbi];
        auto theGenerationLength = BufferRange<SizeType32>(*mGenerationLengths)[gbi];
        TLLM_CHECK_DEBUG_WITH_INFO(
            theDraftLen + 1 == theGenerationLength, "%d + 1 == %d", theDraftLen, theGenerationLength);
        BufferLocation<SizeType32>(*mTokensPerStep).at(gbi) = theGenerationLength;

        // The next step's input is the last accepted token followed by the new draft.
        BufferLocation<TokenIdType>(*mInputTokensBatch).at(nbi, 0) = theSequenceRange[theSequenceLength - 1];
        mBufferManager->copy(*ITensor::slice(mDraftTokens, {gbi, 0}, theDraftLen),
            *ITensor::slice(mInputTokensBatch, {nbi, 1}, theDraftLen));
        mBufferManager->copy(*ITensor::slice(mPositionIds, {gbi, 0}), *ITensor::slice(mPositionIdsBatch, {nbi, 0}));
        BufferLocation<SizeType32>(*mPositionIdsBatch).at(nbi, 0) = theSequenceLength - 1;

        TLLM_LOG_DEBUG("W=%d, N=%d, G=%d, w=%d, n=%d, g=%d, draftLen = %d", mTestParam.maxW, mTestParam.maxN,
            mTestParam.maxG, mTestParam.w, mTestParam.n, mTestParam.g, theDraftLen);

        auto len = BufferRange<SizeType32>(*mTokensPerStep)[gbi];
        PRINT_TOKENS(ITensor::slice(mInputTokensBatch, {nbi, 0}, len));
        PRINT_VALUES(ITensor::slice(mPositionIdsBatch, {nbi, 0}, len));
    }
    mBatchSlots = ITensor::slice(mBatchSlotsMax, 0, newBatchSize);
    PRINT_VALUES(mBatchSlots);

    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

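// Expands a bit-packed int32 attention mask into one bool per position; used
// to feed RandomLlm, which consumes plain bool masks.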
void convertInt32ToBool(TensorPtr const& dst, TensorConstPtr const& src)
{
    auto dstShape = dst->getShape();
    auto srcShape = src->getShape();
    TLLM_CHECK(dstShape.d[0] == srcShape.d[0]);
    TLLM_CHECK(dstShape.d[1] <= srcShape.d[1] * 32);
    BufferLocation<bool> dstLocation(*dst);
    BufferLocation<SizeType32 const> srcLocation(*src);
    // Use unsigned arithmetic so testing bit 31 is well defined.
    auto testBit = [](SizeType32 x, SizeType32 idx) { return ((static_cast<uint32_t>(x) >> idx) & 1u) != 0; };
    for (auto i = 0; i < dstShape.d[0]; i++)
    {
        for (auto j = 0; j < dstShape.d[1]; j++)
        {
            dstLocation.at(i, j) = testBit(srcLocation.at(i, j / 32), j % 32);
        }
    }
}

// Packs a bool mask into int32 words, 32 positions per word; the inverse of
// convertInt32ToBool.
void convertBoolToInt32(TensorPtr const& dst, TensorConstPtr const& src)
{
    auto dstShape = dst->getShape();
    auto srcShape = src->getShape();
    TLLM_CHECK(dstShape.d[0] == srcShape.d[0]);
    TLLM_CHECK(dstShape.d[1] * 32 >= srcShape.d[1]);
    BufferLocation<SizeType32> dstLocation(*dst);
    BufferLocation<bool const> srcLocation(*src);

    for (auto i = 0; i < dstLocation.size(); i++)
    {
        dstLocation[i] = 0;
    }

    // Use unsigned arithmetic so setting bit 31 is well defined.
    auto setBit = [](SizeType32& x, SizeType32 idx, bool value)
    { x = static_cast<SizeType32>(static_cast<uint32_t>(x) | (static_cast<uint32_t>(value) << idx)); };
    for (auto i = 0; i < srcShape.d[0]; i++)
    {
        for (auto j = 0; j < srcShape.d[1]; j++)
        {
            setBit(dstLocation.at(i, j / 32), j % 32, srcLocation.at(i, j));
        }
    }
}

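// A minimal round-trip sanity check for the two helpers above (added here as
// an illustrative sketch, not part of the original suite; the test name is
// new): packing a bool mask with convertBoolToInt32 and expanding it back with
// convertInt32ToBool should reproduce the input bits exactly.
TEST(LookaheadPackedMaskConversionTest, conversionRoundTrip)
{
    auto const rows = 4;
    auto const bits = 40; // not a multiple of 32, to cover the partial last word
    TensorPtr boolMask = BufferManager::cpu(ITensor::makeShape({rows, bits}), nvinfer1::DataType::kBOOL);
    TensorPtr packed = BufferManager::cpu(ITensor::makeShape({rows, 2}), nvinfer1::DataType::kINT32);
    TensorPtr restored = BufferManager::cpu(ITensor::makeShape({rows, bits}), nvinfer1::DataType::kBOOL);

    srand(42);
    BufferRange<bool> boolRange(*boolMask);
    for (auto i = 0; i < boolRange.size(); i++)
    {
        boolRange[i] = (rand() % 2) == 1;
    }

    convertBoolToInt32(packed, boolMask);
    convertInt32ToBool(restored, packed);

    BufferRange<bool> restoredRange(*restored);
    for (auto i = 0; i < boolRange.size(); i++)
    {
        EXPECT_EQ(boolRange[i], restoredRange[i]);
    }
}
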
void LookaheadDecodingLayerTest::llmForward()
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);

    auto batchSize = ITensor::volume(mBatchSlots->getShape());

    for (SizeType32 bi = 0; bi < batchSize; bi++)
    {
        auto gbi = BufferRange<SizeType32>(*mBatchSlots)[bi];
        auto start = BufferRange<SizeType32>(*mSequenceLengths)[gbi] - 1;
        auto len = BufferRange<SizeType32>(*mTokensPerStep)[gbi];
        TLLM_LOG_DEBUG("LookaheadDecodingLayerTest::llmForward input len=%d", len);
        TensorPtr output = ITensor::slice(mProbs, {bi, 0}, len);
        TensorPtr golden = ITensor::slice(mGoldenSampledTokens, {gbi, 0}, len);

        // Position ids must equal the sequence start plus the per-step offsets.
        BufferRange<SizeType32> idRange(*ITensor::slice(mPositionIdsBatch, {bi, 0}, len));
        BufferRange<SizeType32> offsetRange(*ITensor::slice(mPositionOffsets, {gbi, 0}, len));
        PRINT_VALUES(ITensor::slice(mPositionIdsBatch, {bi, 0}));
        PRINT_VALUES(ITensor::slice(mPositionOffsets, {gbi, 0}));
        for (auto i = 0; i < idRange.size(); i++)
        {
            TLLM_CHECK(idRange[i] == start + offsetRange[i]);
        }

        convertInt32ToBool(ITensor::at(mPackedMasksBool, {gbi}), ITensor::at(mPackedMasks, {gbi}));
        // Forward the fake LLM with the sequence start plus per-token position
        // offsets (a disabled code path formerly passed absolute position ids).
        mLlm[gbi]->forward(output, //
            start, //
            ITensor::slice(mInputTokensBatch, {bi, 0}, len), //
            ITensor::slice(mPositionOffsets, {gbi, 0}, len), //
            ITensor::at(mPackedMasksBool, {gbi}));

        mAscii->logitsToTensor(golden, output);
        TLLM_LOG_DEBUG("batch[%d] LLM golden: '%s'", gbi, D(golden).tokens().c_str());
    }

    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

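// Runs one forwardAsync/forwardSync pass of the layer under test over the
// current batch, wiring up all lookahead input and output tensors.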
void LookaheadDecodingLayerTest::decodeForward()
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);

    auto batchSize = ITensor::volume(mBatchSlots->getShape());
    PRINT_VALUES(mBatchSlots);

    auto inputParams = std::make_shared<LookaheadDecodingInputs>(mEndIds, mBatchSlots);
    inputParams->localBatchSize = batchSize;
    inputParams->logits = ITensor::slice(mProbs, 0, batchSize);
    inputParams->batchSlots = mBatchSlots;
    inputParams->curTokensPerStep = mTokensPerStep;

    auto outputParams = std::make_shared<LookaheadDecodingOutputs>(mOutputIds);

    PRINT_VALUES(mSequenceLengths);
    outputParams->sequenceLength = mSequenceLengths;
    outputParams->nextDraftLengths = mDraftLengths;
    outputParams->prevDraftLengths = mPrevDraftLengths;
    outputParams->nextDraftTokens = mDraftTokens;
    outputParams->packedMasks = mPackedMasks;
    outputParams->numNewTokens = mNumNewTokens;
    outputParams->newTokens = mNewTokens;
    outputParams->numNewTokensCumSum = mNumNewTokensCumSum;
    outputParams->pathsOffsets = mPathsOffsets;
    outputParams->generationLengths = mGenerationLengths;
    outputParams->positionOffsets = mPositionOffsets;
    outputParams->positionIds = mPositionIds;

    PRINT_VALUES(mTokensPerStep);

    mDecodingWorkspace->setDeviceBatchSlots(mBatchSlots);
    mDecoder->forwardAsync(outputParams, inputParams, mDecodingWorkspace);

    mStream->synchronize();

    mDecodingWorkspace->setDeviceBatchSlots(mBatchSlots);
    mDecoder->forwardSync(outputParams, inputParams, mDecodingWorkspace);

    mStream->synchronize();

    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

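// Checks the layer's outputs for consistency: draft/generation length
// invariants, position id layout, and that every accepted token matches the
// golden token sampled along the accepted path.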
void LookaheadDecodingLayerTest::verifyDecode()
{
    auto batchSize = ITensor::volume(mBatchSlots->getShape());
    for (SizeType32 bi = 0; bi < batchSize; bi++)
    {
        auto gbi = BufferRange<SizeType32>(*mBatchSlots)[bi];
        auto sequenceLength = BufferLocation<SizeType32>(*mSequenceLengths).at(gbi);

        auto draftLength = BufferLocation<SizeType32>(*mDraftLengths).at(gbi);
        auto generationLength = BufferLocation<SizeType32>(*mGenerationLengths).at(gbi);
        BufferRange<SizeType32> posOffsetRange(*ITensor::slice(mPositionOffsets, {gbi, 0}, generationLength));
        BufferRange<SizeType32> posIdRange(*ITensor::slice(mPositionIds, {gbi, 0}, generationLength));
        TLLM_LOG_DEBUG("generationLength = %d, draftLength = %d", generationLength, draftLength);
        TLLM_CHECK(draftLength + 1 == generationLength);
        TLLM_CHECK(posOffsetRange[0] == 0);
        TLLM_CHECK(posIdRange[0] == sequenceLength - 1);
        for (SizeType32 i = 0; i < posIdRange.size(); i++)
        {
            TLLM_CHECK(posIdRange[i] == posOffsetRange[i] + sequenceLength - 1);
        }
    }

    // Accepted tokens must match the golden sampled tokens along the accepted
    // path, as addressed by the per-request path offsets.
    BufferRange<SizeType32> cumSumRange(*mNumNewTokensCumSum);
    BufferRange<SizeType32> pathOffsetsRange(*mPathsOffsets);
    PRINT_VALUES(mNumNewTokensCumSum);
    for (SizeType32 bi = 0; bi < batchSize; bi++)
    {
        auto gbi = BufferRange<SizeType32>(*mBatchSlots)[bi];
        SizeType32 pathOffsetBegin = cumSumRange[bi];
        SizeType32 pathOffsetEnd = cumSumRange[bi + 1];
        auto sequenceLength = BufferLocation<SizeType32>(*mSequenceLengths).at(gbi);
        auto numNewTokens = BufferLocation<SizeType32>(*mNumNewTokens).at(gbi);
        TensorPtr newTokens = ITensor::slice(mOutputIds, {gbi, 0, sequenceLength - numNewTokens}, numNewTokens);
        BufferRange<SizeType32> goldenRange(*ITensor::at(mGoldenSampledTokens, {gbi}));
        BufferRange<TokenIdType> newTokensRange(*newTokens);

        SizeType32 ni = 1;
        for (SizeType32 poi = pathOffsetBegin; poi < pathOffsetEnd; poi++)
        {
            TLLM_CHECK(goldenRange[pathOffsetsRange[poi] + 1] == newTokensRange[ni++]);
        }
    }
}

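// Drives a full scenario: repeatedly manages the batch, runs the fake LLM and
// the decoding layer, and verifies results until all cases finish or the step
// budget is exhausted.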
void LookaheadDecodingLayerTest::runTest(TestParam const& param)
{
    TLLM_LOG_DEBUG("TEST BEGIN: maxBatchSize=%d, mode=%d, WNG=(%d, %d, %d), wng=(%d, %d, %d)", param.maxBatchSize,
        param.batchType, param.maxW, param.maxN, param.maxG, param.w, param.n, param.g);
    srand(42);

    mTestParam = param;
    allocateBuffers();

    int step = 0;
    for (; !mBatchSlotsManager->finished() && step < 3000; step++)
    {
        TLLM_LOG_DEBUG("!!!!!!!!!!!!!!!! < %d > !!!!!!!!!!!!!!!!", step);
        manageBatch();
        if (ITensor::volume(mBatchSlots->getShape()))
        {
            llmForward();
            mStream->synchronize();
            decodeForward();
            verifyDecode();
        }
    }

    for (auto& r : mReports)
    {
        TLLM_LOG_DEBUG("%s", r.c_str());
    }
    if (!mBatchSlotsManager->finished())
    {
        TLLM_LOG_INFO("step=%d is not enough", step);
    }
}

TEST_F(LookaheadDecodingLayerTest, singleOnce)
{
    this->runTest(TestParam{16, TestParam::SINGLE_ONCE, 5, 3, 5, 3, 5, 3});
}

TEST_F(LookaheadDecodingLayerTest, singleTwice)
{
    this->runTest(TestParam{16, TestParam::SINGLE_TWICE, 7, 5, 7, 5, 7, 5});
}

TEST_F(LookaheadDecodingLayerTest, dynamic)
{
    this->runTest(TestParam{16, TestParam::DYNAMIC, 5, 5, 5, 5, 5, 5});
}

TEST_F(LookaheadDecodingLayerTest, dynamicLarge)
{
    this->runTest(TestParam{32, TestParam::DYNAMIC, 7, 6, 7, 6, 9, 8});
}

TEST_F(LookaheadDecodingLayerTest, dynamicSmall_110)
{
    this->runTest(TestParam{16, TestParam::SINGLE_TWICE, 1, 1, 2, 2, 0, 0});
}

TEST_F(LookaheadDecodingLayerTest, dynamicSmall_311)
{
    this->runTest(TestParam{32, TestParam::DYNAMIC, 3, 2, 2, 2, 1, 1});
}

TEST_F(LookaheadDecodingLayerTest, dynamicSmall_131)
{
    this->runTest(TestParam{32, TestParam::DYNAMIC, 1, 1, 3, 2, 1, 1});
}

TEST_F(LookaheadDecodingLayerTest, dynamicSmall_113)
{
    this->runTest(TestParam{32, TestParam::DYNAMIC, 1, 1, 2, 2, 3, 2});
}

TEST_F(LookaheadDecodingLayerTest, dynamicSmall_112110)
{
    this->runTest(TestParam{4, TestParam::SINGLE_TWICE, 1, 1, 2, 1, 1, 0});
}

using ParamType = std::tuple<SizeType32, TestParam::BatchType, std::tuple<SizeType32, SizeType32>,
    std::tuple<SizeType32, SizeType32>, std::tuple<SizeType32, SizeType32>>;

static int g_id = 0;

std::string generateTestName(testing::TestParamInfo<ParamType> const& info)
{
    auto [maxBatchSize, mode, Ww, Nn, Gg] = info.param;
    auto [W, w] = Ww;
    auto [N, n] = Nn;
    auto [G, g] = Gg;
    std::ostringstream buf;
    buf << (g_id++) << "_maxBatchSize_" << maxBatchSize << "__mode_" << mode << '_' << '_' << W << '_' << w << '_'
        << '_' << N << '_' << n << '_' << '_' << G << '_' << g << '_';
    return buf.str();
}

class ParamTest : public LookaheadDecodingLayerTest, public ::testing::WithParamInterface<ParamType>
{
};

TEST_P(ParamTest, Test)
{
    srand(42);

    auto [maxBatchSize, mode, Ww, Nn, Gg] = GetParam();
    auto [W, w] = Ww;
    auto [N, n] = Nn;
    auto [G, g] = Gg;
    if (!executor::LookaheadDecodingConfig::isLegal(W, N, G) || !executor::LookaheadDecodingConfig::isLegal(w, n, g))
    {
        GTEST_SKIP() << "Algorithm does not support these parameters WNG=(" << W << ", " << N << ", " << G << "), wng=("
                     << w << ", " << n << ", " << g << ")";
    }
    runTest(TestParam{maxBatchSize, mode, W, w, N, n, G, g});
}

INSTANTIATE_TEST_SUITE_P(LookaheadDecodingLayerParamTest, ParamTest,
    testing::Combine( //
        testing::Values(4, 16), testing::Values(TestParam::DYNAMIC),
        testing::Values(std::make_tuple(1, 1), std::make_tuple(3, 3), std::make_tuple(5, 5), std::make_tuple(2, 1),
            std::make_tuple(3, 2), std::make_tuple(5, 3)),
        testing::Values(std::make_tuple(1, 1), std::make_tuple(3, 3), std::make_tuple(5, 5), std::make_tuple(2, 1),
            std::make_tuple(3, 2), std::make_tuple(5, 3)),
        testing::Values(std::make_tuple(0, 0), std::make_tuple(3, 3), std::make_tuple(5, 5), std::make_tuple(1, 0),
            std::make_tuple(3, 2), std::make_tuple(5, 3))),
    generateTestName);

} // namespace tensorrt_llm::tests::layers