mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
304 lines
8.2 KiB
C++
304 lines
8.2 KiB
C++
/*
|
|
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
#pragma once
|
|
|
|
#include "tensorrt_llm/layers/eagleDecodingLayer.h"
|
|
#include "tensorrt_llm/runtime/bufferManager.h"
|
|
#include "tensorrt_llm/runtime/cudaStream.h"
|
|
|
|
#include <gtest/gtest.h>
|
|
|
|
#include <memory>
|
|
|
|
namespace tensorrt_llm::tests::layers
|
|
{
|
|
|
|
class SamplingParams
|
|
{
|
|
public:
|
|
SamplingParams() {}
|
|
|
|
inline void setBatchSize(runtime::SizeType32 batchSize)
|
|
{
|
|
mBatchSize = batchSize;
|
|
}
|
|
|
|
inline void setMaxPathLen(runtime::SizeType32 maxPathLen)
|
|
{
|
|
mMaxPathLen = maxPathLen;
|
|
}
|
|
|
|
inline void setMaxDecodingTokens(runtime::SizeType32 maxDecodingTokens)
|
|
{
|
|
mMaxDecodingTokens = maxDecodingTokens;
|
|
}
|
|
|
|
[[nodiscard]] inline runtime::SizeType32 getBatchSize() const
|
|
{
|
|
return mBatchSize;
|
|
}
|
|
|
|
[[nodiscard]] inline runtime::SizeType32 getVocabSize() const
|
|
{
|
|
return mVocabSize;
|
|
}
|
|
|
|
[[nodiscard]] inline runtime::SizeType32 getMaxBatchSize() const
|
|
{
|
|
return 2 * getBatchSize();
|
|
}
|
|
|
|
[[nodiscard]] inline runtime::SizeType32 getMaxPathLen() const
|
|
{
|
|
return mMaxPathLen;
|
|
}
|
|
|
|
[[nodiscard]] inline runtime::SizeType32 getMaxDraftPathLen() const
|
|
{
|
|
return getMaxPathLen() - 1;
|
|
}
|
|
|
|
[[nodiscard]] inline runtime::SizeType32 getMaxDecodingTokens() const
|
|
{
|
|
return mMaxDecodingTokens;
|
|
}
|
|
|
|
[[nodiscard]] inline runtime::SizeType32 getMaxDecodingDraftTokens() const
|
|
{
|
|
return getMaxDecodingTokens() - 1;
|
|
}
|
|
|
|
[[nodiscard]] inline runtime::SizeType32 getMaxSeqLen() const
|
|
{
|
|
return getMaxDecodingTokens() * 2;
|
|
}
|
|
|
|
[[nodiscard]] inline runtime::TokenIdType getPadId() const
|
|
{
|
|
return mPadId;
|
|
}
|
|
|
|
private:
|
|
runtime::SizeType32 mBatchSize{6};
|
|
runtime::SizeType32 mMaxPathLen{4};
|
|
runtime::SizeType32 mMaxDecodingTokens{32};
|
|
runtime::SizeType32 mVocabSize{256};
|
|
runtime::TokenIdType mPadId{-1};
|
|
};
|
|
|
|
using TensorPtr = tensorrt_llm::runtime::ITensor::SharedPtr;
|
|
using BufferPtr = tensorrt_llm::runtime::IBuffer::SharedPtr;
|
|
using SizeType32 = tensorrt_llm::runtime::SizeType32;
|
|
using TokenIdType = tensorrt_llm::runtime::TokenIdType;
|
|
|
|
using TokensVec = std::vector<TokenIdType>;
|
|
using DraftLettersVec = std::vector<std::string>;
|
|
using DraftTokensVec = std::vector<TokensVec>;
|
|
using DraftPath = std::vector<std::vector<SizeType32>>;
|
|
using DraftPaths = std::vector<DraftPath>;
|
|
|
|
class EagleDummyNetwork
|
|
{
|
|
public:
|
|
void forward(SamplingParams const& params, std::vector<std::string> const& prompts,
|
|
std::vector<std::vector<std::string>> const& predictionLetters,
|
|
std::vector<DraftLettersVec> const& nextDraftLetters, std::vector<DraftLettersVec> const& lastDraftLetters);
|
|
|
|
TokensVec tokenize(std::string const& letters) const;
|
|
|
|
std::string detokenize(TokensVec const& tokens) const;
|
|
|
|
SizeType32 longestCommonPrefixLength(TokensVec const& a, TokensVec const& b) const;
|
|
|
|
DraftTokensVec draftLettersToTokens(DraftLettersVec const& draftLetters) const;
|
|
|
|
DraftPath pathFromDraftTokens(
|
|
DraftTokensVec const& tokens, SizeType32 maxDecodingTokens, SizeType32 maxPathLen) const;
|
|
|
|
TokensVec flattenTokens(DraftTokensVec const& tokens, DraftPath const& path, bool isDraftTokens) const;
|
|
|
|
void acceptTokens(std::vector<TokensVec> const& predictionTokens, DraftTokensVec const& lastDraftTokens,
|
|
DraftPaths const& lastDraftPaths);
|
|
|
|
std::vector<std::vector<std::vector<bool>>> createMasks(DraftPaths const& paths) const;
|
|
|
|
void setSamplingParams(SamplingParams const& params)
|
|
{
|
|
mSamplingParams = params;
|
|
}
|
|
|
|
std::vector<TokensVec> getPrompts() const
|
|
{
|
|
return mPrompts;
|
|
}
|
|
|
|
std::vector<TokensVec> getOutputIds() const
|
|
{
|
|
return mOutputIds;
|
|
}
|
|
|
|
DraftTokensVec getNextDraftTokens() const
|
|
{
|
|
return mNextDraftTokens;
|
|
}
|
|
|
|
std::vector<SizeType32> getNextDraftLens() const
|
|
{
|
|
return mNextDraftLens;
|
|
}
|
|
|
|
DraftPaths getNextDraftPaths() const
|
|
{
|
|
return mNextDraftPaths;
|
|
}
|
|
|
|
DraftTokensVec getLastDraftTokens() const
|
|
{
|
|
return mLastDraftTokens;
|
|
}
|
|
|
|
std::vector<SizeType32> getLastDraftLens() const
|
|
{
|
|
return mLastDraftLens;
|
|
}
|
|
|
|
DraftPaths getLastDraftPaths() const
|
|
{
|
|
return mLastDraftPaths;
|
|
}
|
|
|
|
std::vector<TokensVec> getAcceptedTokens() const
|
|
{
|
|
return mAcceptedTokens;
|
|
}
|
|
|
|
std::vector<SizeType32> getAcceptedLens() const
|
|
{
|
|
return mAcceptedLens;
|
|
}
|
|
|
|
std::vector<SizeType32> getAcceptedPathIds() const
|
|
{
|
|
return mAcceptedPathIds;
|
|
}
|
|
|
|
std::vector<std::vector<std::vector<bool>>> getNextMasks() const
|
|
{
|
|
return mMasks;
|
|
}
|
|
|
|
private:
|
|
SamplingParams mSamplingParams;
|
|
|
|
std::vector<TokensVec> mPrompts;
|
|
std::vector<TokensVec> mOutputIds;
|
|
|
|
DraftTokensVec mNextDraftTokens;
|
|
std::vector<SizeType32> mNextDraftLens;
|
|
DraftPaths mNextDraftPaths;
|
|
|
|
DraftTokensVec mLastDraftTokens;
|
|
std::vector<SizeType32> mLastDraftLens;
|
|
DraftPaths mLastDraftPaths;
|
|
|
|
std::vector<TokensVec> mAcceptedTokens;
|
|
std::vector<SizeType32> mAcceptedLens;
|
|
std::vector<SizeType32> mAcceptedPathIds;
|
|
|
|
std::vector<std::vector<std::vector<bool>>> mMasks;
|
|
};
|
|
|
|
template <typename T>
|
|
class EagleDecodingLayerTest : public testing::Test
|
|
{
|
|
private:
|
|
void SetUp() override;
|
|
|
|
private:
|
|
SamplingParams mSamplingParams;
|
|
|
|
// Outputs
|
|
TensorPtr mOutputIds;
|
|
TensorPtr mSeqLengths;
|
|
TensorPtr mOutputNextDraftTokens;
|
|
TensorPtr mOutputUnpackedNextDraftTokens;
|
|
TensorPtr mAcceptedLengths;
|
|
TensorPtr mNextPosIds;
|
|
TensorPtr mPrevDraftLengths;
|
|
TensorPtr mNextDraftLengths;
|
|
TensorPtr mNextGenerationLengths;
|
|
TensorPtr mNextGenerationLengthsHost;
|
|
TensorPtr mAcceptedLengthCumSum;
|
|
TensorPtr mPathsOffsets;
|
|
TensorPtr mPackedMasks;
|
|
TensorPtr mRandomDataSample;
|
|
TensorPtr mRandomDataValidation;
|
|
TensorPtr mOutputTemperatures;
|
|
TensorPtr mOutputNextDraftPaths;
|
|
TensorPtr mEagleNetCtxRequestTypesHost;
|
|
TensorPtr mEagleNetCtxContextLengthsHost;
|
|
TensorPtr mEagleNetCtxPastKeyValueLengthsHost;
|
|
TensorPtr mEagleNetGenRequestTypesHost;
|
|
TensorPtr mEagleNetGenContextLengthsHost;
|
|
TensorPtr mEagleNetGenPastKeyValueLengthsHost;
|
|
|
|
// inputs
|
|
TensorPtr mBatchSlots;
|
|
TensorPtr mEndIds;
|
|
|
|
TensorPtr mInputNextDraftTokens;
|
|
TensorPtr mInputNextDraftLens;
|
|
TensorPtr mInputNextDraftPaths;
|
|
TensorPtr mInputLastDraftTokens;
|
|
TensorPtr mInputLastDraftLens;
|
|
TensorPtr mInputLastDraftPaths;
|
|
TensorPtr mInputAcceptedTokens;
|
|
TensorPtr mInputAcceptedLens;
|
|
TensorPtr mInputAcceptedPathIds;
|
|
TensorPtr mChunkedContextNextTokens;
|
|
|
|
// Setup params
|
|
std::vector<uint64_t> mRandomSeeds;
|
|
std::vector<float> mTemperatures;
|
|
|
|
std::shared_ptr<tensorrt_llm::runtime::CudaStream> mStream;
|
|
std::shared_ptr<tensorrt_llm::runtime::BufferManager> mBufferManager;
|
|
std::shared_ptr<tensorrt_llm::layers::EagleDecodingLayer<T>> mEagleLayer;
|
|
std::shared_ptr<runtime::DecodingLayerWorkspace> mDecodingWorkspace;
|
|
|
|
EagleDummyNetwork mNetwork;
|
|
|
|
private:
|
|
void allocateBuffers();
|
|
|
|
void setup();
|
|
|
|
std::shared_ptr<tensorrt_llm::layers::EagleInputs> createInputTensors();
|
|
|
|
std::shared_ptr<tensorrt_llm::layers::EagleOutputs> createOutputTensors();
|
|
|
|
void checkLayerResult();
|
|
|
|
public:
|
|
void runTest(std::vector<std::string> const& prompts, std::vector<DraftLettersVec> const& predictions,
|
|
std::vector<DraftLettersVec> const& nextDraftLetters, std::vector<DraftLettersVec> const& lastDraftLetters,
|
|
SamplingParams& params);
|
|
};
|
|
|
|
// gtest type list holding float and half.
// NOTE(review): presumably the list the typed test suite is instantiated with; confirm at the
// TYPED_TEST_SUITE site.
using FloatAndHalfTypes = testing::Types<float, half>;
|
|
|
|
} // namespace tensorrt_llm::tests::layers
|