/*
 * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include <gtest/gtest.h>

#include <memory>

#include "tensorrt_llm/layers/dynamicDecodeLayer.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/cudaStream.h"

#include "tensorrt_llm/kernels/penaltyKernels.h"
#include "tensorrt_llm/kernels/samplingTopKKernels.h"
#include "tensorrt_llm/kernels/samplingTopPKernels.h"
#include "tensorrt_llm/runtime/runtimeKernels.h"
#include "tensorrt_llm/runtime/tllmLogger.h"

#include "tensorrt_llm/common/cudaAllocator.h"
#include "tensorrt_llm/common/tensorConversion.h"
#include "tensorrt_llm/common/tllmException.h"

namespace tensorrt_llm::tests::layers::sampling
{

struct SamplingParams
{
    std::vector<int32_t> topKs;
    std::vector<float> topPs;
    std::vector<float> temperatures;
    std::vector<float> repetitionPenalties;
    std::vector<float> presencePenalties;
    std::vector<float> frequencyPenalties;
    std::vector<int32_t> minLengths;
    std::vector<float> decay;
    std::vector<float> minTopP;
    std::vector<int32_t> topPResetIds;
    std::vector<std::vector<std::vector<int32_t>>> badWords;
    std::vector<std::vector<std::vector<int32_t>>> stopWords;
    bool useBias = false;
};

template <typename T>
class DynamicDecodeLayerTest : public testing::Test
{
private:
    void SetUp() override;

    using TensorPtr = tensorrt_llm::runtime::ITensor::SharedPtr;
    using BufferPtr = tensorrt_llm::runtime::IBuffer::SharedPtr;

    int32_t seed = 0;
    static uint64_t const mMaxSeed = 32;
    int32_t const mBatchSize = 6;
    int32_t const mMaxBatchSize = 2 * mBatchSize;
    int32_t const mBeamWidth = 1;
    int32_t const mBatchBeam = mBatchSize * mBeamWidth;
    int32_t const mVocabSize = 8;
    int32_t const mVocabSizePadded = mVocabSize;

    int32_t const mMaxInputLen = 0; // has no effect.
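    // The fixture decodes mMaxOutputLen tokens from an empty prompt, so all
    // output buffers are sized for mMaxSeqLen = mMaxInputLen + mMaxOutputLen steps.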
    int32_t const mMaxOutputLen = 4;
    int32_t const mMaxSeqLen = mMaxInputLen + mMaxOutputLen;
    int32_t const mSinkTokenLength = 0;
    int32_t mEndId = mVocabSize;

    bool mUseLogitsVec = false;

    TensorPtr mLogitsDevice;
    TensorPtr mLogitsRefHost;
    TensorPtr mContextLengthDevice;
    TensorPtr mSeqLengthsDevice;
    TensorPtr mFinishedDevice;
    TensorPtr mFinishedSumDevice;
    TensorPtr mOutputIdsDevice;
    TensorPtr mNewTokens;
    TensorPtr mEndIdsDevice;
    TensorPtr mBatchSlots;

    TensorPtr mBadWordsLens;
    TensorPtr mBadWords;
    TensorPtr mBadWordsPtrs;

    TensorPtr mStopWordsLens;
    TensorPtr mStopWords;
    TensorPtr mStopWordsPtrs;

    TensorPtr mEmbeddingBiasHost;
    TensorPtr mEmbeddingBiasDevice;

    TensorPtr mCumLogProbsDevice;

    std::vector<tensorrt_llm::common::Tensor> mLogitsVec;

    struct cudaDeviceProp mDeviceProp;

    tensorrt_llm::common::DataType const data_type = tensorrt_llm::common::getTensorType<T>();

    // Order is important because we pass mAllocator to mDecodeLayer and it is used in the destructor.
    std::shared_ptr<tensorrt_llm::runtime::CudaStream> mStream;
    std::shared_ptr<tensorrt_llm::runtime::BufferManager> mBufferManager;
    std::shared_ptr<tensorrt_llm::common::CudaAllocator> mAllocator;
    std::shared_ptr<tensorrt_llm::layers::DynamicDecodeLayer<T>> mDecodeLayer;

    std::vector<T> mTestLogitsInit;

    int32_t mMaxBadWordsLen{0};
    int32_t mMaxStopWordsLen{0};

private:
    void setup(uint64_t seed, SamplingParams const& params);

    int32_t getMaxWordsLen(std::vector<std::vector<std::vector<int32_t>>> const& inputWords);
    void initXWordsTensors(int32_t* batchSlotsPtr, int32_t* wordsData, int32_t** wordsPtr, int32_t* wordsLenData,
        int32_t maxWordsLen, std::vector<std::vector<std::vector<int32_t>>> const& inputWords);

    typename tensorrt_llm::layers::DynamicDecodeLayer<T>::ForwardParams createInputTensors(int32_t step);

    typename tensorrt_llm::layers::DynamicDecodeLayer<T>::OutputParams createOutputTensors();

    void batchCopy(int32_t step);
    bool checkResult(int32_t* outputIds, std::vector<std::set<int32_t>>& expectedIds, int32_t* seqLens,
        int32_t leadingDim, int32_t stride, int32_t step);

    void runTestImpl(
        std::vector<std::set<int32_t>> expectedOutputIds, SamplingParams const& params, int32_t endId = -1);

public:
    void runTest(std::vector<std::set<int32_t>> expectedOutputIds, SamplingParams const& params, int32_t endId = -1);
};

typedef testing::Types<float, half> FloatAndHalfTypes;

} // namespace tensorrt_llm::tests::layers::sampling
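// Example usage (a sketch, not part of this header): a test translation unit
// instantiates the typed suite with FloatAndHalfTypes and drives the fixture
// through runTest. The parameter values and expected-id sets below are
// hypothetical; runTest compares each decode step's output token against the
// corresponding set of acceptable ids.
//
//   using namespace tensorrt_llm::tests::layers::sampling;
//
//   TYPED_TEST_SUITE(DynamicDecodeLayerTest, FloatAndHalfTypes);
//
//   TYPED_TEST(DynamicDecodeLayerTest, TopK)
//   {
//       SamplingParams params;
//       params.topKs = {2};          // hypothetical: one value broadcast to the batch
//       params.temperatures = {1.0f};
//       std::vector<std::set<int32_t>> expectedOutputIds{
//           {4, 5}, {4, 5}, {4, 5}, {4, 5}}; // hypothetical per-step candidates
//       this->runTest(expectedOutputIds, params);
//   }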