/* * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #pragma once #include #include #include "tensorrt_llm/layers/samplingLayer.h" #include "tensorrt_llm/layers/topKSamplingLayer.h" #include "tensorrt_llm/layers/topPSamplingLayer.h" #include "tensorrt_llm/runtime/bufferManager.h" #include "tensorrt_llm/runtime/cudaStream.h" #include "tensorrt_llm/kernels/penaltyKernels.h" #include "tensorrt_llm/kernels/samplingTopKKernels.h" #include "tensorrt_llm/kernels/samplingTopPKernels.h" #include "tensorrt_llm/runtime/bufferManager.h" #include "tensorrt_llm/runtime/cudaStream.h" #include "tensorrt_llm/runtime/runtimeKernels.h" #include "tensorrt_llm/runtime/tllmLogger.h" #include "tensorrt_llm/common/tllmException.h" namespace tensorrt_llm::tests::layers::sampling { constexpr float EPSILON = 1e-20f; template void computeProb(T* probs, T const* logits, int batchSize, int vocabSize) { // Compute the log probability from logits. // logits = batchSize x vocabSize. // probs = softmax(logits) (softmax along with vocab dimension) // float is used for either T=float or half, since operations of half are // not fully supported in a host function. for (int bidx = 0; bidx < batchSize; ++bidx) { float maxval = -FLT_MAX; for (int i = 0; i < vocabSize; ++i) { float logit = static_cast(logits[bidx * vocabSize + i]); if (logit > maxval) { maxval = logit; } } float sum = 0.0f; for (int i = 0; i < vocabSize; ++i) { sum += expf(static_cast(logits[bidx * vocabSize + i]) - maxval); } for (int i = 0; i < vocabSize; ++i) { int idx = bidx * vocabSize + i; float logit = static_cast(logits[idx]) - maxval; probs[idx] = static_cast(expf(logit) / (sum + EPSILON)); } } } struct TestSamplingParams { std::vector topKs; std::vector topPs; std::vector temperatures; std::vector repetitionPenalties; std::vector presencePenalties; std::vector frequencyPenalties; std::vector minLengths; std::vector decay; std::vector minTopP; std::vector topPResetIds; bool useBias = false; }; template class BaseSamplingLayerTest : public testing::Test { protected: using TensorPtr = tensorrt_llm::runtime::ITensor::SharedPtr; using BufferPtr = tensorrt_llm::runtime::IBuffer::SharedPtr; int32_t seed = 0; static uint64_t const mMaxSeed = 32; int32_t const mBatchSize = 6; int32_t const mMaxBatchSize = 2 * mBatchSize; int32_t const mBeamWidth = 1; int32_t const mBatchBeam = mBatchSize * mBeamWidth; int32_t const mVocabSize = 8; int32_t const mVocabSizePadded = mVocabSize; int32_t const mMaxInputLen = 0; // has no effect. int32_t const mMaxOutputLen = 4; int32_t const mMaxSeqLen = mMaxInputLen + mMaxOutputLen; int32_t mEndId = mVocabSize; bool mComputeProbs = false; TensorPtr mLogitsDevice; TensorPtr mContextLengthDevice; TensorPtr mSeqLengthsDevice; TensorPtr mFinishedDevice; TensorPtr mOutputIdsDevice; TensorPtr mEndIdsDevice; TensorPtr mIdsPtrHost; TensorPtr mBatchSlots; TensorPtr mEmbeddingBiasHost; TensorPtr mEmbeddingBiasDevice; TensorPtr mCumLogProbsDevice; TensorPtr mOutputLogProbsDevice; TensorPtr mCurandStatesDevice; TensorPtr mPenaltyWorkspaceDevice; BufferPtr mSamplingWorkspaceDevice; std::shared_ptr mStream; std::shared_ptr mBufferManager; std::shared_ptr mSamplingLayer; std::vector mTestLogitsInit; void setup(uint64_t seed, TestSamplingParams const& params); virtual void initLayer(TestSamplingParams const& params) = 0; std::shared_ptr createInputTensors(int32_t step); std::shared_ptr createOutputTensors(); void batchCopy(int32_t step); bool checkResult(int32_t* outputIds, std::vector>& expectedIds); public: void runTest( std::vector> expectedOutputIds, TestSamplingParams const& params, int32_t endId = -1); }; typedef testing::Types FloatAndHalfTypes; } // namespace tensorrt_llm::tests::layers::sampling