/* * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #pragma once #include "tensorrt_llm/layers/eagleDecodingLayer.h" #include "tensorrt_llm/runtime/bufferManager.h" #include "tensorrt_llm/runtime/cudaStream.h" #include #include namespace tensorrt_llm::tests::layers { class SamplingParams { public: SamplingParams() {} inline void setBatchSize(runtime::SizeType32 batchSize) { mBatchSize = batchSize; } inline void setMaxPathLen(runtime::SizeType32 maxPathLen) { mMaxPathLen = maxPathLen; } inline void setMaxDecodingTokens(runtime::SizeType32 maxDecodingTokens) { mMaxDecodingTokens = maxDecodingTokens; } [[nodiscard]] inline runtime::SizeType32 getBatchSize() const { return mBatchSize; } [[nodiscard]] inline runtime::SizeType32 getVocabSize() const { return mVocabSize; } [[nodiscard]] inline runtime::SizeType32 getMaxBatchSize() const { return 2 * getBatchSize(); } [[nodiscard]] inline runtime::SizeType32 getMaxPathLen() const { return mMaxPathLen; } [[nodiscard]] inline runtime::SizeType32 getMaxDraftPathLen() const { return getMaxPathLen() - 1; } [[nodiscard]] inline runtime::SizeType32 getMaxDecodingTokens() const { return mMaxDecodingTokens; } [[nodiscard]] inline runtime::SizeType32 getMaxDecodingDraftTokens() const { return getMaxDecodingTokens() - 1; } [[nodiscard]] inline runtime::SizeType32 getMaxSeqLen() const { return getMaxDecodingTokens() * 2; } [[nodiscard]] inline runtime::TokenIdType getPadId() const { return mPadId; } private: runtime::SizeType32 mBatchSize{6}; runtime::SizeType32 mMaxPathLen{4}; runtime::SizeType32 mMaxDecodingTokens{32}; runtime::SizeType32 mVocabSize{256}; runtime::TokenIdType mPadId{-1}; }; using TensorPtr = tensorrt_llm::runtime::ITensor::SharedPtr; using BufferPtr = tensorrt_llm::runtime::IBuffer::SharedPtr; using SizeType32 = tensorrt_llm::runtime::SizeType32; using TokenIdType = tensorrt_llm::runtime::TokenIdType; using TokensVec = std::vector; using DraftLettersVec = std::vector; using DraftTokensVec = std::vector; using DraftPath = std::vector>; using DraftPaths = std::vector; class EagleDummyNetwork { public: void forward(SamplingParams const& params, std::vector const& prompts, std::vector> const& predictionLetters, std::vector const& nextDraftLetters, std::vector const& lastDraftLetters); TokensVec tokenize(std::string const& letters) const; std::string detokenize(TokensVec const& tokens) const; SizeType32 longestCommonPrefixLength(TokensVec const& a, TokensVec const& b) const; DraftTokensVec draftLettersToTokens(DraftLettersVec const& draftLetters) const; DraftPath pathFromDraftTokens( DraftTokensVec const& tokens, SizeType32 maxDecodingTokens, SizeType32 maxPathLen) const; TokensVec flattenTokens(DraftTokensVec const& tokens, DraftPath const& path, bool isDraftTokens) const; void acceptTokens(std::vector const& predictionTokens, DraftTokensVec const& lastDraftTokens, DraftPaths const& lastDraftPaths); std::vector>> createMasks(DraftPaths const& paths) const; void setSamplingParams(SamplingParams const& params) { mSamplingParams = params; } std::vector getPrompts() const { return mPrompts; } std::vector getOutputIds() const { return mOutputIds; } DraftTokensVec getNextDraftTokens() const { return mNextDraftTokens; } std::vector getNextDraftLens() const { return mNextDraftLens; } DraftPaths getNextDraftPaths() const { return mNextDraftPaths; } DraftTokensVec getLastDraftTokens() const { return mLastDraftTokens; } std::vector getLastDraftLens() const { return mLastDraftLens; } DraftPaths getLastDraftPaths() const { return mLastDraftPaths; } std::vector getAcceptedTokens() const { return mAcceptedTokens; } std::vector getAcceptedLens() const { return mAcceptedLens; } std::vector getAcceptedPathIds() const { return mAcceptedPathIds; } std::vector>> getNextMasks() const { return mMasks; } private: SamplingParams mSamplingParams; std::vector mPrompts; std::vector mOutputIds; DraftTokensVec mNextDraftTokens; std::vector mNextDraftLens; DraftPaths mNextDraftPaths; DraftTokensVec mLastDraftTokens; std::vector mLastDraftLens; DraftPaths mLastDraftPaths; std::vector mAcceptedTokens; std::vector mAcceptedLens; std::vector mAcceptedPathIds; std::vector>> mMasks; }; template class EagleDecodingLayerTest : public testing::Test { private: void SetUp() override; private: SamplingParams mSamplingParams; // Outputs TensorPtr mOutputIds; TensorPtr mSeqLengths; TensorPtr mOutputNextDraftTokens; TensorPtr mOutputUnpackedNextDraftTokens; TensorPtr mAcceptedLengths; TensorPtr mNextPosIds; TensorPtr mPrevDraftLengths; TensorPtr mNextDraftLengths; TensorPtr mNextGenerationLengths; TensorPtr mNextGenerationLengthsHost; TensorPtr mAcceptedLengthCumSum; TensorPtr mPathsOffsets; TensorPtr mPackedMasks; TensorPtr mRandomDataSample; TensorPtr mRandomDataValidation; TensorPtr mOutputTemperatures; TensorPtr mOutputNextDraftPaths; TensorPtr mEagleNetCtxRequestTypesHost; TensorPtr mEagleNetCtxContextLengthsHost; TensorPtr mEagleNetCtxPastKeyValueLengthsHost; TensorPtr mEagleNetGenRequestTypesHost; TensorPtr mEagleNetGenContextLengthsHost; TensorPtr mEagleNetGenPastKeyValueLengthsHost; // inputs TensorPtr mBatchSlots; TensorPtr mEndIds; TensorPtr mInputNextDraftTokens; TensorPtr mInputNextDraftLens; TensorPtr mInputNextDraftPaths; TensorPtr mInputLastDraftTokens; TensorPtr mInputLastDraftLens; TensorPtr mInputLastDraftPaths; TensorPtr mInputAcceptedTokens; TensorPtr mInputAcceptedLens; TensorPtr mInputAcceptedPathIds; TensorPtr mChunkedContextNextTokens; // Setup params std::vector mRandomSeeds; std::vector mTemperatures; std::shared_ptr mStream; std::shared_ptr mBufferManager; std::shared_ptr> mEagleLayer; std::shared_ptr mDecodingWorkspace; EagleDummyNetwork mNetwork; private: void allocateBuffers(); void setup(); std::shared_ptr createInputTensors(); std::shared_ptr createOutputTensors(); void checkLayerResult(); public: void runTest(std::vector const& prompts, std::vector const& predictions, std::vector const& nextDraftLetters, std::vector const& lastDraftLetters, SamplingParams& params); }; typedef testing::Types FloatAndHalfTypes; } // namespace tensorrt_llm::tests::layers