/* * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #pragma once #include "tensorrt_llm/layers/explicitDraftTokensLayer.h" #include "tensorrt_llm/runtime/bufferManager.h" #include "tensorrt_llm/runtime/cudaStream.h" #include #include namespace tensorrt_llm::tests::layers { class SamplingParams { public: SamplingParams() {} inline void setBatchSize(runtime::SizeType32 batchSize) { mBatchSize = batchSize; } inline void setMaxNumPaths(runtime::SizeType32 maxNumPaths) { mMaxNumPaths = maxNumPaths; } inline void setMaxDraftPathLen(runtime::SizeType32 maxDraftPathLen) { mMaxDraftPathLen = maxDraftPathLen; } [[nodiscard]] inline runtime::SizeType32 getBatchSize() const { return mBatchSize; } [[nodiscard]] inline runtime::SizeType32 getVocabSize() const { return mVocabSize; } [[nodiscard]] inline runtime::SizeType32 getMaxBatchSize() const { return 2 * getBatchSize(); } [[nodiscard]] inline runtime::SizeType32 getMaxDraftPathLen() const { return mMaxDraftPathLen; } [[nodiscard]] inline runtime::SizeType32 getMaxPathLen() const { return getMaxDraftPathLen() + 1; } [[nodiscard]] inline runtime::SizeType32 getMaxNumPaths() const { return mMaxNumPaths; } [[nodiscard]] inline runtime::SizeType32 getMaxDecodingDraftTokens() const { return getMaxDraftPathLen() * getMaxNumPaths(); } [[nodiscard]] inline runtime::SizeType32 getMaxDecodingTokens() const { return getMaxDecodingDraftTokens() + 1; } [[nodiscard]] inline runtime::SizeType32 getMaxSeqLen() const { return getMaxDecodingTokens() * 2; } [[nodiscard]] inline runtime::TokenIdType getPadId() const { return mPadId; } private: runtime::SizeType32 mBatchSize{6}; runtime::SizeType32 mMaxDraftPathLen{6}; runtime::SizeType32 mMaxNumPaths{4}; runtime::TokenIdType mPadId{-1}; runtime::SizeType32 mVocabSize{256}; }; using TensorPtr = tensorrt_llm::runtime::ITensor::SharedPtr; using BufferPtr = tensorrt_llm::runtime::IBuffer::SharedPtr; using SizeType32 = tensorrt_llm::runtime::SizeType32; using TokenIdType = tensorrt_llm::runtime::TokenIdType; using TokensVec = std::vector; using DraftLettersVec = std::vector>; using DraftTokensVec = std::vector>; using DraftTokensIndices = std::vector>>; class ExplicitDraftTokensDummyNetwork { public: void forward(SamplingParams const& params, std::vector const& prompts, std::vector const& predictionLetters, DraftLettersVec const& nextDraftLetters, DraftLettersVec const& lastDraftLetters); TokensVec tokenize(std::string const& letters) const; std::string detokenize(TokensVec const& tokens) const; DraftTokensVec draftLettersToTokens(DraftLettersVec const& draftLetters) const; SizeType32 longestCommonPrefixLength(TokensVec const& a, TokensVec const& b) const; SizeType32 computeCompressedVectorAndIndices(TokensVec& compressedVector, std::vector& packedPosIds, DraftTokensIndices& indices, std::vector const& vectors, SizeType32 basePosId); void compressTokens(TokensVec& compressedVector, std::vector& packedPosIds, DraftTokensIndices& indices, std::vector& generationLengths, DraftTokensVec const& draftTokens, std::vector const& basePosIds); void acceptTokens(std::vector const& predictionTokens, DraftTokensVec const& lastDraftTokens, DraftTokensVec const& nextDraftTokens); void createNextMasks(DraftTokensIndices const& indices, DraftTokensVec const& draftTokens, SizeType32 maxGenLength); void setSamplingParams(SamplingParams const& params) { mSamplingParams = params; } std::vector getPrompts() const { return mPrompts; } std::vector getOutputIds() const { return mOutputIds; } TokensVec getNextFlatTokens() const { return mNextCompressedVector; } DraftTokensVec getNextDraftTokens() const { return mNextDraftTokens; } DraftTokensIndices getNextDraftIndices() const { return mNextDraftTokenIndices; } DraftTokensIndices getLastDraftIndices() const { return mLastDraftTokenIndices; } DraftTokensVec getLastDraftTokens() const { return mLastDraftTokens; } std::vector getBestPathLengths() const { return mBestPathLengths; } std::vector getBestPathIndices() const { return mBestPathIndices; } std::vector getNextPackedPosId() const { return mNextPackedPosIds; } std::vector getNextGenerationLengths() const { return mNextGenerationLengths; } SizeType32 getMaxNextGenerationLength() const { return mMaxNextGenLength; } std::vector>> getNextMasks() const { return mMasks; } private: SamplingParams mSamplingParams; std::vector mPrompts; std::vector mOutputIds; DraftTokensVec mNextDraftTokens; DraftTokensVec mLastDraftTokens; TokensVec mNextCompressedVector; std::vector mNextPackedPosIds; DraftTokensIndices mNextDraftTokenIndices; TokensVec mLastCompressedVector; std::vector mLastPackedPosIds; DraftTokensIndices mLastDraftTokenIndices; std::vector mBestPathLengths; std::vector mBestPathIndices; std::vector mNextGenerationLengths; std::vector mLastGenerationLengths; SizeType32 mMaxNextGenLength; std::vector>> mMasks; }; template class ExplicitDraftTokensLayerTest : public testing::Test { private: void SetUp() override; private: SamplingParams mSamplingParams; // Outputs TensorPtr mSeqLengths; TensorPtr mAcceptedLengths; TensorPtr mOutputIds; TensorPtr mOutputNextDraftTokens; TensorPtr mOutputPositionIdsBase; TensorPtr mRandomDataSample; TensorPtr mRandomDataValidation; TensorPtr mAcceptedLengthCumSum; TensorPtr mPackedMasks; TensorPtr mPathsOffsets; TensorPtr mNextPosIds; TensorPtr mNextDraftLengths; TensorPtr mPrevDraftLengths; TensorPtr mOutputUnpackedNextDraftTokens; TensorPtr mOutputUnpackedNextDraftIndices; TensorPtr mOutputDraftProbs; TensorPtr mOutputTemperatures; TensorPtr mOutputGenerationLengths; TensorPtr mOutputGenerationLengthsHost; TensorPtr mMaxGenLengthHost; // inputs TensorPtr mBatchSlots; TensorPtr mMasks; TensorPtr mInputNextDraftTokens; TensorPtr mNextDraftIndices; TensorPtr mLastDraftTokens; TensorPtr mLastDraftIndices; TensorPtr mNextDraftProbs; TensorPtr mPackedPosIds; TensorPtr mBestPathLengths; TensorPtr mBestPathIndices; TensorPtr mSpecDecodingGenerationLengths; TensorPtr mTokensPerStep; TensorPtr mNextFlatTokens; TensorPtr mInputPositionIdsBase; TensorPtr mEndIds; TensorPtr mMaxGenLengthDevice; // Packed inputs TensorPtr mMaxGenerationLength; TensorPtr mCumSumGenerationLengths; // Packed outputs TensorPtr mPackedPositionIdsBase; TensorPtr mPackedGenerationLengths; TensorPtr mPackedRandomDataSample; TensorPtr mPackedRandomDataVerification; TensorPtr mPackedNextDraftTokens; TensorPtr mPackedNextDraftIndices; TensorPtr mPackedPackedMasks; TensorPtr mPackedPositionOffsets; TensorPtr mPackedPackedPosIds; TensorPtr mPackedDraftProbs; TensorPtr mPackedTemperatures; // Setup params std::vector mRandomSeeds; std::vector mTemperatures; std::shared_ptr mStream; std::shared_ptr mBufferManager; std::shared_ptr> mExplicitDraftTokensLayer; std::shared_ptr mDecodingWorkspace; ExplicitDraftTokensDummyNetwork mNetwork; private: void allocateBuffers(); void setup(); std::shared_ptr createInputTensors(); std::shared_ptr createOutputTensors(); void checkLayerResult(); void packData(); void checkPackResult(); public: void runTest(std::vector const& prompts, std::vector const& predictions, DraftLettersVec const& nextDraftLetters, DraftLettersVec const& lastDraftLetters, SamplingParams& params); }; template struct TypePair { using LayerType = T; using DataType = U; }; #ifdef ENABLE_BF16 using TestTypes = testing::Types, TypePair, TypePair>; #else using TestTypes = testing::Types, TypePair>; #endif // ENABLE_BF16 } // namespace tensorrt_llm::tests::layers