/*
 * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "tests/kernels/sampling/samplingTest.h"

namespace tensorrt_llm::tests::kernels::sampling
{

using namespace tensorrt_llm::runtime;

namespace tc = tensorrt_llm::common;
namespace tk = tensorrt_llm::kernels;
namespace trk = tensorrt_llm::runtime::kernels;

template <typename T>
void SamplingKernelTest<T>::SetUp()
{
    mStream = std::make_shared<CudaStream>();
    mBufferManager = std::make_shared<BufferManager>(mStream);

    auto const device = tc::getDevice();
    cudaGetDeviceProperties(&mDeviceProp, device);
}

template <typename T>
void SamplingKernelTest<T>::TearDown()
{
}

template <typename T>
void SamplingKernelTest<T>::allocateBuffers(SamplingKernelTestParam const& param)
{
    auto const batchSize = param.batchSize;
    auto const maxBatchSize = 2 * batchSize;
    auto const vocabSize = param.vocabSize;
    auto const maxTokensPerStep = param.maxTokensPerStep;

    auto const dataType = TRTDataType<T>::value;
    auto const ptrType = TRTDataType<T*>::value;

    // Allocate GPU data
    mSeqLengthsHost = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kINT32);
    mSeqLengthsDevice = mBufferManager->gpu(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kINT32);

    mFinishedHost = BufferManager::pinned(
        ITensor::makeShape({maxBatchSize}), TRTDataType<tk::FinishedState::UnderlyingType>::value);
    mFinishedDevice = mBufferManager->gpu(
        ITensor::makeShape({maxBatchSize}), TRTDataType<tk::FinishedState::UnderlyingType>::value);

    mOutputIdsHost = BufferManager::pinned(ITensor::makeShape({maxBatchSize, mMaxSeqLen}), nvinfer1::DataType::kINT32);
    mOutputIdsDevice = mBufferManager->gpu(ITensor::makeShape({maxBatchSize, mMaxSeqLen}), nvinfer1::DataType::kINT32);

    mProbsHost = BufferManager::pinned(ITensor::makeShape({batchSize, maxTokensPerStep, vocabSize}), dataType);
    mProbsDevice = mBufferManager->gpu(ITensor::makeShape({batchSize, maxTokensPerStep, vocabSize}), dataType);
    mProbsPtrsDevice
        = BufferManager::pinned(ITensor::makeShape({batchSize, maxTokensPerStep}), nvinfer1::DataType::kINT64);

    mCumLogProbsDevice = mBufferManager->gpu(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    mOutputLogProbsDevice
        = mBufferManager->gpu(ITensor::makeShape({mMaxSeqLen, maxBatchSize}), nvinfer1::DataType::kFLOAT);

    mZeroParentIdsDevice
        = mBufferManager->gpu(ITensor::makeShape({maxBatchSize, maxTokensPerStep}), nvinfer1::DataType::kINT32);

    mLogitsHost = BufferManager::pinned(ITensor::makeShape({batchSize, maxTokensPerStep, vocabSize}), dataType);
    mLogProbsHost = BufferManager::pinned(ITensor::makeShape({batchSize, maxTokensPerStep, vocabSize}), dataType);
    mIdsPtrHost = BufferManager::pinned(ITensor::makeShape({2 * maxBatchSize}), ptrType);

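    // Per-request sampling parameters. These are sized with maxBatchSize because the test scatters the
    // batchSize active requests across every other slot of a larger decoder batch (see setupBuffers).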
    mEndIdsHost = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kINT32);
    mEndIdsDevice = mBufferManager->gpu(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kINT32);

    mTopPsHost = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    mTopPsDevice = mBufferManager->gpu(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);

    mTopKsHost = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kINT32);
    mTopKsDevice = mBufferManager->gpu(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kINT32);

    mSkipDecodeHost = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kBOOL);
    mSkipDecodeDevice = mBufferManager->gpu(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kBOOL);

    mTokensPerStep = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kINT32);

    mBatchSlots = BufferManager::pinned(ITensor::makeShape({batchSize}), nvinfer1::DataType::kINT32);

    mExpectedCumLogProbsHost = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);

    mCurandStatesDevice
        = mBufferManager->gpu(ITensor::makeShape({maxBatchSize, sizeof(curandState_t)}), nvinfer1::DataType::kINT8);
}

template <typename T>
void SamplingKernelTest<T>::setupBuffers(SamplingKernelTestParam const& param)
{
    auto const batchSize = param.batchSize;
    auto const maxBatchSize = 2 * batchSize;
    auto const vocabSize = param.vocabSize;
    auto const maxTokensPerStep = param.maxTokensPerStep;

    auto const topK = param.topK;
    auto const topP = param.topP;
    // TopK == 0 case (TopP kernel)
    auto const topKDistUpperBound = std::max(topK, static_cast<SizeType32>(1));

    std::mt19937 gen(42);

    auto* batchSlotsPtr = bufferCast<SizeType32>(*mBatchSlots);
    auto probsPtr = BufferRange<T*>(*mProbsPtrsDevice);
    auto probsDevicePtr = bufferCast<T>(*mProbsDevice);
    for (SizeType32 bi = 0; bi < batchSize; ++bi)
    {
        batchSlotsPtr[bi] = 2 * bi;
        for (SizeType32 ti = 0; ti < maxTokensPerStep; ++ti)
        {
            probsPtr[bi * maxTokensPerStep + ti] = probsDevicePtr + bi * maxTokensPerStep * vocabSize + ti * vocabSize;
        }
    }

    // Allocate and init curand states
    tk::invokeCurandInitialize(reinterpret_cast<curandState_t*>(bufferCast<int8_t>(*mCurandStatesDevice)),
        batchSlotsPtr, batchSize, mSeed, mStream->get());

    std::uniform_int_distribution<> endIdsDistr(
        0, vocabSize - 1); // -1 because uniform_int_distribution generates over a closed interval
    std::uniform_real_distribution<> skipDecodeDist(0, 1);
    std::uniform_real_distribution<> topPDist(0, topP);
    std::uniform_int_distribution<> topKDist(1, topKDistUpperBound);
    std::uniform_int_distribution<> tokensPerStepDist(1, maxTokensPerStep);
    std::uniform_int_distribution<> seqLenDist(0, mMaxSeqLen - maxTokensPerStep);
    std::uniform_real_distribution<> logProbDist(-3.f, 3.f);
    std::uniform_real_distribution<> finishedDist(0, 1);

    // Init by zero.
    trk::invokeFill(*mFinishedDevice, uint8_t{0}, *mStream);
    trk::invokeFill(*mOutputLogProbsDevice, float{0.0f}, *mStream);
    trk::invokeFill(*mZeroParentIdsDevice, int32_t{0}, *mStream);
    trk::invokeFill(*mOutputIdsDevice, int32_t{0}, *mStream);

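    // skipDecode and finished states are randomized as well, so the batch mixes active, skipped, and
    // already-finished slots; verifyResult later checks that skipped/finished slots are left untouched.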
    // Init topK, topP and endIds for each request in batch
    auto skipDecodeHostPtr = bufferCast<bool>(*mSkipDecodeHost);
    auto topPsHostPtr = bufferCast<float>(*mTopPsHost);
    auto topKsHostPtr = bufferCast<SizeType32>(*mTopKsHost);
    auto endIdsHostPtr = bufferCast<TokenIdType>(*mEndIdsHost);
    auto tokensPerStepPtr = bufferCast<SizeType32>(*mTokensPerStep);
    auto finishedHostPtr
        = reinterpret_cast<tk::FinishedState*>(bufferCast<tk::FinishedState::UnderlyingType>(*mFinishedHost));
    for (SizeType32 bi = 0; bi < maxBatchSize; ++bi)
    {
        endIdsHostPtr[bi] = endIdsDistr(gen);
        skipDecodeHostPtr[bi] = skipDecodeDist(gen) > 0.8;
        topPsHostPtr[bi] = topPDist(gen);
        topKsHostPtr[bi] = topK == 0 ? 0 : topKDist(gen);
        tokensPerStepPtr[bi] = tokensPerStepDist(gen);
        finishedHostPtr[bi] = finishedDist(gen) > 0.8 ? tk::FinishedState::finished() : tk::FinishedState::empty();
    }
    mMaxTopK = topK;
    mMaxTopP = topP;

    TLLM_CHECK(mMaxTopK * maxTokensPerStep <= mMaxSeqLen);

    // Setup pointers to output ids for each request in batch
    auto idsPtrHostPtr = BufferRange<void*>(*mIdsPtrHost);
    auto outputIdsDevicePtr = bufferCast<TokenIdType>(*mOutputIdsDevice);
    auto zeroParentIdsDevicePtr = bufferCast<TokenIdType>(*mZeroParentIdsDevice);
    auto seqLensHostPtr = bufferCast<SizeType32>(*mSeqLengthsHost);
    auto logProbHostPtr = bufferCast<float>(*mExpectedCumLogProbsHost);
    for (SizeType32 bi = 0; bi < maxBatchSize; bi++)
    {
        idsPtrHostPtr[bi] = outputIdsDevicePtr + bi * mMaxSeqLen;
        idsPtrHostPtr[maxBatchSize + bi] = zeroParentIdsDevicePtr + bi * mMaxSeqLen;
    }

    for (SizeType32 bi = 0; bi < maxBatchSize; bi++)
    {
        seqLensHostPtr[bi] = seqLenDist(gen);
        logProbHostPtr[bi] = logProbDist(gen);
    }

    mBufferManager->copy(*mEndIdsHost, *mEndIdsDevice);
    mBufferManager->copy(*mSkipDecodeHost, *mSkipDecodeDevice);
    mBufferManager->copy(*mTopPsHost, *mTopPsDevice);
    mBufferManager->copy(*mTopKsHost, *mTopKsDevice);
    mBufferManager->copy(*mSeqLengthsHost, *mSeqLengthsDevice);
    mBufferManager->copy(*mExpectedCumLogProbsHost, *mCumLogProbsDevice);
    mBufferManager->copy(*mFinishedHost, *mFinishedDevice);

    // Init logits randomly
    auto logitsHostPtr = bufferCast<T>(*mLogitsHost);
    initRandom(logitsHostPtr, batchSize * maxTokensPerStep * vocabSize, -3.0f, 3.0f);

    // Only in greedy search can we guarantee the selected token and the stop condition:
    // TopK == 1 for the TopK kernel, TopK == 0 for the TopP kernels.
    if (topK <= 1)
    {
        for (SizeType32 bi = 0; bi < batchSize; ++bi)
        {
            auto const batchSlot = batchSlotsPtr[bi];
            for (int32_t ti = 0; ti < maxTokensPerStep; ++ti)
            {
                // Set the logit of the endId for the finished request to a value above all others.
                // NOTE that we can guarantee finishing only in greedy search.
                logitsHostPtr[(bi * maxTokensPerStep + ti) * vocabSize + endIdsHostPtr[batchSlot]] = 4.0f;
            }
        }
    }

    // Compute probabilities for each token
    computeProb(bufferCast<T>(*mProbsHost), logitsHostPtr, batchSize * maxTokensPerStep, vocabSize);
    mBufferManager->copy(*mProbsHost, *mProbsDevice);
}

template <typename T>
std::vector<SizeType32> SamplingKernelTest<T>::computeTopKTopPVariants(
    int32_t bi, int32_t batchSlot, int32_t ti, int32_t maxTokensPerStep, int32_t vocabSize)
{
    std::vector<SizeType32> allowedTokens;
    auto probsPtr = bufferCast<T>(*mProbsHost) + (bi * maxTokensPerStep + ti) * vocabSize;
    std::vector<SizeType32> indices(vocabSize);
    std::iota(indices.begin(), indices.end(), 0);
    std::sort(indices.begin(), indices.end(),
        [probsPtr](SizeType32 i1, SizeType32 i2) { return probsPtr[i1] > probsPtr[i2]; });

    auto topK = bufferCast<SizeType32>(*mTopKsHost)[batchSlot];
    auto topP = bufferCast<float>(*mTopPsHost)[batchSlot];

    if (topK > 0) // handling the TopK kernel; the TopP result is based on the topK tokens
    {
        float sSum = 0.f; // sSum as in samplingTopKKernels.cu
        for (auto ki = 0; ki < topK; ki++)
        {
            sSum += static_cast<float>(probsPtr[indices[ki]]);
        }
        topP *= sSum; // the adjusted topP within the selected topK distribution
    }

    float totalProb = 0.f;
    SizeType32 idx = 0;
    while (totalProb < topP && idx < vocabSize)
    {
        allowedTokens.push_back(indices[idx]);
        totalProb += static_cast<float>(probsPtr[indices[idx++]]);
        // CUDA may select a different index with the same probability in the kernel reduction;
        // the test accepts any of them.
        while (idx < vocabSize
            && static_cast<float>(probsPtr[indices[idx]]) == static_cast<float>(probsPtr[indices[idx - 1]]))
        {
            allowedTokens.push_back(indices[idx]);
            totalProb += static_cast<float>(probsPtr[indices[idx++]]);
        }
    }
    return allowedTokens;
}

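// Checks the sampled output ids, sequence lengths, finished states, and cumulative log probs
// against the host-side reference computed from the same logits and sampling parameters.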
template <typename T>
void SamplingKernelTest<T>::verifyResult(SamplingKernelTestParam const& param)
{
    auto const batchSize = param.batchSize;
    auto const vocabSize = param.vocabSize;
    auto const maxTokensPerStep = param.maxTokensPerStep;

    auto const outputIdsHost = mBufferManager->copyFrom(*mOutputIdsDevice, MemoryType::kCPU);
    auto const seqLenHost = mBufferManager->copyFrom(*mSeqLengthsDevice, MemoryType::kCPU);
    auto const finishedHost = mBufferManager->copyFrom(*mFinishedDevice, MemoryType::kCPU);
    auto const cumLogProbsHost = mBufferManager->copyFrom(*mCumLogProbsDevice, MemoryType::kCPU);

    // Synchronize to get valid data on host
    mStream->synchronize();

    // Compute reference.
    computeLogProb(bufferCast<T>(*mLogProbsHost), bufferCast<T>(*mLogitsHost), batchSize * maxTokensPerStep, vocabSize);

    auto const batchSlotsPtr = bufferCast<SizeType32>(*mBatchSlots);

    auto const outputIdsHostPtr = bufferCast<TokenIdType>(*outputIdsHost);
    auto const seqLengthsHostPtr = bufferCast<SizeType32>(*seqLenHost);
    auto const finishedHostPtr
        = reinterpret_cast<tk::FinishedState*>(bufferCast<tk::FinishedState::UnderlyingType>(*finishedHost));

    auto const outputIdsOrigHostPtr = bufferCast<TokenIdType>(*mOutputIdsHost);
    auto const seqLengthsOrigHostPtr = bufferCast<SizeType32>(*mSeqLengthsHost);
    auto const finishedOrigHostPtr
        = reinterpret_cast<tk::FinishedState*>(bufferCast<tk::FinishedState::UnderlyingType>(*mFinishedHost));

    auto const logProbsHostPtr = bufferCast<T>(*mLogProbsHost);
    auto const endIdsHostPtr = bufferCast<TokenIdType>(*mEndIdsHost);
    auto const skipDecodeHostPtr = bufferCast<bool>(*mSkipDecodeHost);
    auto const tokensPerStepPtr = bufferCast<SizeType32>(*mTokensPerStep);
    auto const expectedCumLogProbsHostPtr = bufferCast<float>(*mExpectedCumLogProbsHost);

    for (SizeType32 bi = 0; bi < batchSize; ++bi)
    {
        auto const batchSlot = batchSlotsPtr[bi];
        auto const tokensPerStep = tokensPerStepPtr[batchSlot];
        for (SizeType32 ti = 0; ti < tokensPerStep; ++ti)
        {
            auto topK = bufferCast<SizeType32>(*mTopKsHost)[batchSlot];
            auto kResults = param.returnAllSelectedTokens ? (topK == 0 ? vocabSize : topK) : 1;

            auto topKTopPVariants = computeTopKTopPVariants(bi, batchSlot, ti, maxTokensPerStep, vocabSize);

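            // Output layout: with returnAllSelectedTokens the kernel writes up to kResults ids per step
            // at offset ti * mMaxTopK; otherwise a single sampled id is appended at the current sequence length.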
            SizeType32 ki;
            for (ki = 0; ki < kResults && ki < topKTopPVariants.size(); ++ki)
            {
                // Set reference finished state to true if we finished before or at the current step
                auto const idsIdx
                    = param.returnAllSelectedTokens ? ti * mMaxTopK + ki : seqLengthsOrigHostPtr[batchSlot] + ti;
                auto const outputId = outputIdsHostPtr[batchSlot * mMaxSeqLen + idsIdx];
                // Check the range of the returned token ([0, vocabSize))
                EXPECT_TRUE((outputId >= 0) && (outputId < vocabSize));
                bool const generatedEOS = outputId == endIdsHostPtr[batchSlot];

                // If decoding for this batch slot is skipped, ignore the cumLogProbs calculation
                if (!skipDecodeHostPtr[batchSlot] && !finishedOrigHostPtr[batchSlot].isFinished()
                    && !finishedOrigHostPtr[batchSlot].isSkipDecoding())
                {
                    if (maxTokensPerStep == 1 && !param.returnAllSelectedTokens)
                    {
                        if (generatedEOS)
                        {
                            EXPECT_EQ(seqLengthsHostPtr[batchSlot], seqLengthsOrigHostPtr[batchSlot]);
                            EXPECT_TRUE(finishedHostPtr[batchSlot].isFinished());
                        }
                        else
                        {
                            EXPECT_EQ(seqLengthsHostPtr[batchSlot], seqLengthsOrigHostPtr[batchSlot] + tokensPerStep);
                            EXPECT_EQ(
                                finishedHostPtr[batchSlot].isFinished(), finishedOrigHostPtr[batchSlot].isFinished());
                        }
                    }

                    bool found = false;
                    for (auto const& var : topKTopPVariants)
                    {
                        if (outputId == var)
                        {
                            found = true;
                            break;
                        }
                    }
                    EXPECT_TRUE(found) << "Incorrect output id token";

                    // Compute reference cumLogProb by summing all logProbs up to the stop token
                    expectedCumLogProbsHostPtr[batchSlot]
                        += static_cast<float>(logProbsHostPtr[bi * vocabSize + outputId]);
                }
                else
                {
                    // Check that tensors are not modified
                    auto const idsIdx = batchSlot * mMaxSeqLen + seqLengthsOrigHostPtr[batchSlot] + ti;
                    EXPECT_EQ(outputId, outputIdsOrigHostPtr[idsIdx]);
                    EXPECT_EQ(seqLengthsHostPtr[batchSlot], seqLengthsOrigHostPtr[batchSlot]);
                    EXPECT_EQ(finishedHostPtr[batchSlot].isFinished(), finishedOrigHostPtr[batchSlot].isFinished());
                }
            }

            // Boundary check for returnAllSelectedTokens in the TopP kernel, and for the TopK kernel
            // when TopP selects fewer than topK indices.
            if (!skipDecodeHostPtr[batchSlot] && !finishedOrigHostPtr[batchSlot].isFinished()
                && !finishedOrigHostPtr[batchSlot].isSkipDecoding())
            {
                if (param.returnAllSelectedTokens && (topK == 0 || ki != topK))
                {
                    auto const idsIdx = ti * mMaxTopK + ki;
                    auto const outputId = outputIdsHostPtr[batchSlot * mMaxSeqLen + idsIdx];
                    EXPECT_EQ(outputId, -1);
                }
            }
        }
    }

    // Cum log probs are not supported for multiple tokens per step or when all top K tokens are returned
    if (maxTokensPerStep == 1 && !param.returnAllSelectedTokens)
    {
        for (int32_t bi = 0; bi < batchSize; ++bi)
        {
            auto* batchSlotsPtr = bufferCast<SizeType32>(*mBatchSlots);
            auto const batchSlot = batchSlotsPtr[bi];
            bool passed = checkResult("cum log probs", bufferCast<float>(*cumLogProbsHost) + batchSlot,
                bufferCast<float>(*mExpectedCumLogProbsHost) + batchSlot, 1);
            EXPECT_TRUE(passed);
        }
    }
}

template <typename T>
void SamplingKernelTest<T>::runTest(SamplingKernelTestParam const& param)
{
    // Allocate buffers
    allocateBuffers(param);

    // Setup buffers
    setupBuffers(param);

    // Retrieve the workspace size of the sampling kernel.
    auto const workspaceSize = getWorkspaceSize(param);
    TensorPtr workspaceDevice
        = mBufferManager->gpu(ITensor::makeShape({static_cast<SizeType32>(workspaceSize)}), nvinfer1::DataType::kINT8);

    // Call the tested sampling function
    callTestedFunction(param, workspaceDevice);

    // Verify results
    verifyResult(param);
}

template class SamplingKernelTest<float>;
template class SamplingKernelTest<half>;

} // namespace tensorrt_llm::tests::kernels::sampling