TensorRT-LLMs/cpp/tests/unit_tests/kernels/sampling/samplingPenaltyTest.cpp
2025-10-27 13:12:31 -04:00

1690 lines
73 KiB
C++

/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tensorrt_llm/kernels/penaltyTypes.h"
#include "tests/unit_tests/kernels/sampling/samplingTest.h"
using namespace tensorrt_llm::tests::kernels::sampling;
namespace
{
namespace tc = tensorrt_llm::common;
namespace tk = tensorrt_llm::kernels;
namespace trk = tensorrt_llm::runtime::kernels;
using TensorPtr = tensorrt_llm::runtime::ITensor::SharedPtr;
using namespace tensorrt_llm::runtime;
using namespace tensorrt_llm::kernels;
// Builder-style parameter bundle describing one temperature-penalty test case.
struct TemperatureTestParam
{
    int32_t batchSize;
    int32_t vocabSize;
    TensorPtr temperatures;     // host tensor holding one temperature per batch slot
    int32_t temperaturesSize;
    int32_t maxTokensPerStep{1};

    TemperatureTestParam& setBatchSize(int32_t value)
    {
        batchSize = value;
        return *this;
    }

    TemperatureTestParam& setVocabSize(int32_t value)
    {
        vocabSize = value;
        return *this;
    }

    TemperatureTestParam& setTemperaturesSize(int32_t value)
    {
        temperaturesSize = value;
        return *this;
    }

    TemperatureTestParam& setTemperatures(TensorPtr value)
    {
        temperatures = value;
        return *this;
    }

    TemperatureTestParam& setMaxTokensPerStep(int32_t value)
    {
        maxTokensPerStep = value;
        return *this;
    }

    // Human-readable summary used in failure messages.
    std::string toString() const
    {
        return tc::fmtstr("TemperatureTestParam[batch=%d, vocab=%d, temperatures=%s]", batchSize, vocabSize,
            tc::arr2str(bufferCast<float>(*temperatures), temperaturesSize).c_str());
    }
};
// Rounds vocabSize up to the nearest multiple of pad (8 by default).
size_t padVocabSize(size_t vocabSize, size_t pad = 8)
{
    auto const remainder = vocabSize % pad;
    return remainder == 0 ? vocabSize : vocabSize + (pad - remainder);
}
// Fills logits (and optionally bias) with random values in [-5, 5], then
// forces the padded tail of the vocab dimension to a large negative value in
// the logits (so padded tokens can never win sampling) and to zero in the
// bias.
//
// Fix: the padded bias tail was previously cleared only for batch row 0
// (`i == 0` guard), leaving the pad region of every other row random, while
// the logits pads were cleared for every row. The pad bias is never read by
// the host references in this file, but the inconsistent initialization was
// clearly unintended; clear it for every row.
template <typename T>
void initLogitsAndBias(T* logits, T* bias, size_t const batchSize, size_t const maxTokensPerStep,
    size_t const vocabSize, size_t const vocabSizePadded)
{
    initRandom(logits, batchSize * maxTokensPerStep * vocabSizePadded, -5.0f, 5.0f);
    if (bias != nullptr)
    {
        initRandom(bias, batchSize * vocabSizePadded, -5.0f, 5.0f);
    }
    bool const isHalf = sizeof(T) == 2;
    for (size_t i = 0; i < batchSize; ++i)
    {
        for (size_t t = 0; t < maxTokensPerStep; ++t)
        {
            for (size_t j = vocabSize; j < vocabSizePadded; ++j)
            {
                // -inf-like value: padded vocab entries must never be selected.
                logits[(i * maxTokensPerStep + t) * vocabSizePadded + j]
                    = static_cast<T>(isHalf ? -HALF_FLT_MAX : -FLT_MAX);
            }
        }
        if (bias != nullptr)
        {
            for (size_t j = vocabSize; j < vocabSizePadded; ++j)
            {
                bias[i * vocabSizePadded + j] = static_cast<T>(0.0f);
            }
        }
    }
}
/////////////////////////////////// Tests //////////////////////////////////////////
// Typed test fixture that checks invokeBatchApplyPenalty's temperature
// scaling (combined with a per-slot logit bias) against a host reference.
//
// Fix: subsetup previously issued the identical pair of invokeFill calls
// twice back to back (four fills total); the duplicates were redundant and
// have been removed.
template <typename T>
class TemperaturePenaltyTest : public SamplingKernelTest<T>
{
protected:
    // Set up test
    int32_t mMaxBatchSize; // 2 * mBatchSize; active requests occupy the even slots
    int32_t mBatchSize;
    int32_t mVocabSize;
    int32_t mVocabSizePadded;
    int32_t mMaxTokensPerStep;
    using SamplingKernelTest<T>::mBufferManager;
    using SamplingKernelTest<T>::mStream;
    using SamplingKernelTest<T>::mLogitsHost;
    TensorPtr mLogitsDevice;
    TensorPtr mOutLogitsDevice;
    TensorPtr mLogitsRefHost;
    TensorPtr mLogitsPtrs;
    TensorPtr mPenaltyWorkspaceDevice;
    TensorPtr mTokensPerStep;
    TensorPtr mBiasHost;
    TensorPtr mBiasDevice;
    TensorPtr mTemperaturesDevice;
    TensorPtr mBatchSlots;

    // Allocates all host/device buffers and fills them for one test case.
    void subsetup(TemperatureTestParam const& param)
    {
        auto const dataType = TRTDataType<T>::value;
        auto const ptrType = TRTDataType<T*>::value;
        mBatchSize = param.batchSize;
        mMaxBatchSize = 2 * mBatchSize;
        mVocabSize = param.vocabSize;
        mVocabSizePadded = padVocabSize(mVocabSize);
        mMaxTokensPerStep = param.maxTokensPerStep;
        mLogitsHost
            = BufferManager::pinned(ITensor::makeShape({mBatchSize, mMaxTokensPerStep, mVocabSizePadded}), dataType);
        mLogitsRefHost
            = BufferManager::pinned(ITensor::makeShape({mBatchSize, mMaxTokensPerStep, mVocabSizePadded}), dataType);
        mLogitsDevice
            = mBufferManager->gpu(ITensor::makeShape({mBatchSize, mMaxTokensPerStep, mVocabSizePadded}), dataType);
        mOutLogitsDevice
            = mBufferManager->gpu(ITensor::makeShape({mBatchSize, mMaxTokensPerStep, mVocabSizePadded}), dataType);
        mLogitsPtrs = BufferManager::pinned(ITensor::makeShape({mBatchSize}), ptrType);
        mPenaltyWorkspaceDevice = mBufferManager->gpu(
            ITensor::makeShape({mMaxBatchSize, mMaxTokensPerStep, mVocabSize * 2}), nvinfer1::DataType::kINT32);
        mTokensPerStep = BufferManager::pinned(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32);
        mBiasHost = BufferManager::pinned(ITensor::makeShape({mMaxBatchSize, mVocabSizePadded}), dataType);
        mBiasDevice = mBufferManager->gpu(ITensor::makeShape({mMaxBatchSize, mVocabSizePadded}), dataType);
        mBatchSlots = BufferManager::pinned(ITensor::makeShape({mBatchSize}), nvinfer1::DataType::kINT32);
        // Zero-init reference and output logits: rows past tokensPerStep are
        // never written by either the kernel or the reference, so both sides
        // must agree on zeros there.
        trk::invokeFill(*mLogitsRefHost, T{0.0f}, *mStream);
        trk::invokeFill(*mOutLogitsDevice, T{0.0f}, *mStream);
        // Active requests are mapped to the even batch slots.
        auto batchSlotsPtr = bufferCast<int32_t>(*mBatchSlots);
        for (SizeType32 bi = 0; bi < mBatchSize; ++bi)
        {
            batchSlotsPtr[bi] = 2 * bi;
        }
        initLogitsAndBias(bufferCast<T>(*mLogitsHost), bufferCast<T>(*mBiasHost), mBatchSize, mMaxTokensPerStep,
            mVocabSize, mVocabSizePadded);
        mBufferManager->copy(*mLogitsHost, *mLogitsDevice);
        // Logit pointers are intentionally laid out in reverse batch order;
        // computeReference mirrors this indexing when gathering the input.
        auto logitsPtrs = BufferRange<T*>(*mLogitsPtrs);
        for (SizeType32 bi = 0; bi < mBatchSize; ++bi)
        {
            logitsPtrs[bi]
                = bufferCast<T>(*mLogitsDevice) + (mBatchSize - bi - 1) * mMaxTokensPerStep * mVocabSizePadded;
        }
        auto tokensPerStepPtr = bufferCast<int32_t>(*mTokensPerStep);
        for (SizeType32 bi = 0; bi < mMaxBatchSize; ++bi)
        {
            tokensPerStepPtr[bi] = (std::rand() % mMaxTokensPerStep) + 1;
        }
        mBufferManager->copy(*mBiasHost, *mBiasDevice);
        ASSERT_EQ(param.temperaturesSize, mMaxBatchSize) << "Invalid test configuration.";
        mTemperaturesDevice
            = mBufferManager->gpu(ITensor::makeShape({param.temperaturesSize}), nvinfer1::DataType::kFLOAT);
        mBufferManager->copy(*param.temperatures, *mTemperaturesDevice);
    }

    // Host reference: gather the (reverse-ordered) input logits, add the
    // per-slot bias, divide by temperature, and force padded vocab entries
    // to -MAX.
    void computeReference(T const* const inLogits, T* const outLogits, T const* const bias,
        float const* const temperatures, size_t const temperaturesSize)
    {
        bool const IS_FP16 = std::is_same<T, half>::value;
        T const MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX;
        auto const batchSlotsPtr = bufferCast<int32_t>(*mBatchSlots);
        auto const tokensPerStepPtr = bufferCast<int32_t>(*mTokensPerStep);
        // Gather pass: undo the reversed logits-pointer layout.
        for (int32_t bi = 0; bi < mBatchSize; ++bi)
        {
            auto const batchSlot = batchSlotsPtr[bi];
            for (int32_t ti = 0; ti < tokensPerStepPtr[batchSlot]; ++ti)
            {
                for (int32_t vi = 0; vi < mVocabSizePadded; ++vi)
                {
                    auto const srcIdx = ((mBatchSize - bi - 1) * mMaxTokensPerStep + ti) * mVocabSizePadded + vi;
                    auto const dstIdx = (bi * mMaxTokensPerStep + ti) * mVocabSizePadded + vi;
                    outLogits[dstIdx] = inLogits[srcIdx];
                }
            }
        }
        // Penalty pass: bias + temperature scaling in fp32, like the kernel.
        for (size_t bi = 0; bi < mBatchSize; ++bi)
        {
            auto const batchSlot = batchSlotsPtr[bi];
            for (int32_t ti = 0; ti < tokensPerStepPtr[batchSlot]; ++ti)
            {
                auto temperature = temperatures[batchSlot];
                ASSERT_GT(temperature, 0.0f) << "temperature should be positive but got " << temperature;
                for (size_t j = 0; j < mVocabSizePadded; ++j)
                {
                    size_t index = (bi * mMaxTokensPerStep + ti) * mVocabSizePadded + j;
                    auto logit = static_cast<float>(outLogits[index]);
                    if (j < mVocabSize && bias != nullptr)
                    {
                        logit += static_cast<float>(bias[batchSlot * mVocabSizePadded + j]);
                    }
                    outLogits[index] = j < mVocabSize ? static_cast<T>(logit / temperature) : -MAX_T_VAL;
                }
            }
        }
    }

public:
    // Runs the kernel under test and compares its output with the reference.
    void runTest(TemperatureTestParam param)
    {
        subsetup(param);
        // Do test
        InvokeBatchApplyPenaltyParams<T> penaltyParams{reinterpret_cast<T**>(bufferCast<int64_t>(*mLogitsPtrs)),
            bufferCast<T>(*mOutLogitsDevice), bufferCast<T>(*mBiasDevice),
            bufferCast<int32_t>(*mPenaltyWorkspaceDevice), nullptr, bufferCast<float>(*mTemperaturesDevice), nullptr,
            nullptr, nullptr, nullptr, mBatchSize, 1, 1, mVocabSize, mVocabSizePadded, nullptr, nullptr, nullptr,
            nullptr, nullptr, nullptr, bufferCast<int32_t>(*mBatchSlots), mMaxTokensPerStep,
            bufferCast<int32_t>(*mTokensPerStep), nullptr, mStream->get()};
        tk::invokeBatchApplyPenalty(penaltyParams);
        auto logitsOutHost = mBufferManager->copyFrom(*mOutLogitsDevice, MemoryType::kCPU);
        mStream->synchronize();
        computeReference(bufferCast<T>(*mLogitsHost), bufferCast<T>(*mLogitsRefHost), bufferCast<T>(*mBiasHost),
            bufferCast<float>(*param.temperatures), param.temperaturesSize);
        bool passed = checkResult(param.toString(), bufferCast<T>(*logitsOutHost), bufferCast<T>(*mLogitsRefHost),
            mBatchSize * mMaxTokensPerStep * mVocabSizePadded);
        EXPECT_TRUE(passed);
    }
};
TYPED_TEST_SUITE(TemperaturePenaltyTest, FloatAndHalfTypes);

// Temperature 1.0 for every slot: the kernel must leave logits unscaled.
TYPED_TEST(TemperaturePenaltyTest, NoPenalty)
{
    int32_t constexpr batchSize = 6;
    int32_t constexpr maxBatchSize = 2 * batchSize;
    TensorPtr temperaturesHost = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    auto* temperaturePtr = bufferCast<float>(*temperaturesHost);
    std::fill(temperaturePtr, temperaturePtr + maxBatchSize, 1.0f);
    this->runTest(TemperatureTestParam()
                      .setBatchSize(batchSize)
                      .setVocabSize(4)
                      .setTemperaturesSize(maxBatchSize)
                      .setTemperatures(temperaturesHost));
}
// Temperature below 1.0 sharpens the distribution (logits scaled up).
TYPED_TEST(TemperaturePenaltyTest, LessThanOne)
{
    int32_t constexpr batchSize = 6;
    int32_t constexpr maxBatchSize = 2 * batchSize;
    TensorPtr temperaturesHost = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    auto* temperaturePtr = bufferCast<float>(*temperaturesHost);
    std::fill(temperaturePtr, temperaturePtr + maxBatchSize, 0.53f);
    this->runTest(TemperatureTestParam()
                      .setBatchSize(batchSize)
                      .setVocabSize(4)
                      .setTemperaturesSize(maxBatchSize)
                      .setTemperatures(temperaturesHost));
}
// Temperature above 1.0 flattens the distribution (logits scaled down).
TYPED_TEST(TemperaturePenaltyTest, GreaterThaneOne)
{
    int32_t constexpr batchSize = 6;
    int32_t constexpr maxBatchSize = 2 * batchSize;
    TensorPtr temperaturesHost = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    auto* temperaturePtr = bufferCast<float>(*temperaturesHost);
    std::fill(temperaturePtr, temperaturePtr + maxBatchSize, 2.01f);
    this->runTest(TemperatureTestParam()
                      .setBatchSize(batchSize)
                      .setVocabSize(4)
                      .setTemperaturesSize(maxBatchSize)
                      .setTemperatures(temperaturesHost));
}
// A different temperature per batch slot, spanning values below and above 1.
TYPED_TEST(TemperaturePenaltyTest, Mixed)
{
    int32_t constexpr batchSize = 6;
    int32_t constexpr maxBatchSize = 2 * batchSize;
    TensorPtr temperaturesHost = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    auto* temperaturePtr = bufferCast<float>(*temperaturesHost);
    for (int32_t slot = 0; slot < maxBatchSize; ++slot)
    {
        temperaturePtr[slot] = 0.53f + 0.2f * slot;
    }
    this->runTest(TemperatureTestParam()
                      .setBatchSize(batchSize)
                      .setVocabSize(4)
                      .setTemperaturesSize(maxBatchSize)
                      .setTemperatures(temperaturesHost));
}
// Mixed temperatures with a realistic (non-multiple-of-8) vocabulary size.
TYPED_TEST(TemperaturePenaltyTest, LargeVocab)
{
    int32_t constexpr batchSize = 6;
    int32_t constexpr maxBatchSize = 2 * batchSize;
    TensorPtr temperaturesHost = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    auto* temperaturePtr = bufferCast<float>(*temperaturesHost);
    for (int32_t slot = 0; slot < maxBatchSize; ++slot)
    {
        temperaturePtr[slot] = 0.53f + 0.2f * slot;
    }
    this->runTest(TemperatureTestParam()
                      .setBatchSize(batchSize)
                      .setVocabSize(50001)
                      .setTemperaturesSize(maxBatchSize)
                      .setTemperatures(temperaturesHost));
}
// Exercises the multi-token-per-step path with neutral temperatures.
TYPED_TEST(TemperaturePenaltyTest, LargeVocabTokensPerStep)
{
    int32_t constexpr batchSize = 6;
    int32_t constexpr maxBatchSize = 2 * batchSize;
    TensorPtr temperaturesHost = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    auto* temperaturePtr = bufferCast<float>(*temperaturesHost);
    std::fill(temperaturePtr, temperaturePtr + maxBatchSize, 1.f);
    this->runTest(TemperatureTestParam()
                      .setBatchSize(batchSize)
                      .setVocabSize(8)
                      .setTemperaturesSize(maxBatchSize)
                      .setTemperatures(temperaturesHost)
                      .setMaxTokensPerStep(4));
}
// Builder-style parameter bundle describing one repetition/presence/frequency
// penalty test case.
struct RepetitionPenaltyTestCase
{
    int32_t batchSize;
    int32_t vocabSize;
    int32_t maxInputLength;
    TensorPtr repetitionPenalties;  // multiplicative penalty per batch slot
    TensorPtr presencePenalties;    // subtracted once per token seen
    TensorPtr frequencyPenalties;   // subtracted per token occurrence
    TensorPtr promptIgnoreLengths;  // prompt prefix exempt from presence/frequency
    int32_t repetitionPenaltiesSize;
    int32_t presencePenaltiesSize;
    int32_t frequencyPenaltiesSize;
    int32_t promptIgnoreLengthsSize;
    int32_t maxTokensPerStep{1};

    RepetitionPenaltyTestCase& setBatchSize(int32_t value)
    {
        batchSize = value;
        return *this;
    }

    RepetitionPenaltyTestCase& setVocabSize(int32_t value)
    {
        vocabSize = value;
        return *this;
    }

    RepetitionPenaltyTestCase& setMaxInputLength(int32_t value)
    {
        maxInputLength = value;
        return *this;
    }

    RepetitionPenaltyTestCase& setRepetitionPenalties(TensorPtr value)
    {
        repetitionPenalties = value;
        return *this;
    }

    RepetitionPenaltyTestCase& setPresencePenalties(TensorPtr value)
    {
        presencePenalties = value;
        return *this;
    }

    RepetitionPenaltyTestCase& setFrequencyPenalties(TensorPtr value)
    {
        frequencyPenalties = value;
        return *this;
    }

    RepetitionPenaltyTestCase& setPromptIgnoreLengths(TensorPtr value)
    {
        promptIgnoreLengths = value;
        return *this;
    }

    RepetitionPenaltyTestCase& setRepetitionPenaltiesSize(int32_t value)
    {
        repetitionPenaltiesSize = value;
        return *this;
    }

    RepetitionPenaltyTestCase& setPresencePenaltiesSize(int32_t value)
    {
        presencePenaltiesSize = value;
        return *this;
    }

    RepetitionPenaltyTestCase& setFrequencyPenaltiesSize(int32_t value)
    {
        frequencyPenaltiesSize = value;
        return *this;
    }

    RepetitionPenaltyTestCase& setPromptIgnoreLengthsSize(int32_t value)
    {
        promptIgnoreLengthsSize = value;
        return *this;
    }

    RepetitionPenaltyTestCase& setMaxTokensPerStep(int32_t value)
    {
        maxTokensPerStep = value;
        return *this;
    }

    // Human-readable summary used in failure messages.
    std::string toString() const
    {
        return tc::fmtstr(
            "RepetitionPenaltyTestCase[batch=%d, vocab=%d, maxInputLength=%d, "
            "repetitionPenalties=%s, presencePenalties=%s, frequencyPenalties=%s, promptIgnoreLengths=%s]",
            batchSize, vocabSize, maxInputLength,
            tc::arr2str(bufferCast<float>(*repetitionPenalties), repetitionPenaltiesSize).c_str(),
            tc::arr2str(bufferCast<float>(*presencePenalties), presencePenaltiesSize).c_str(),
            tc::arr2str(bufferCast<float>(*frequencyPenalties), frequencyPenaltiesSize).c_str(),
            tc::arr2str(bufferCast<int32_t>(*promptIgnoreLengths), promptIgnoreLengthsSize).c_str());
    }
};
// Typed test fixture that checks invokeBatchApplyPenalty's repetition,
// presence, and frequency penalties against a host-side reference.
template <typename T>
class RepetitionPenaltyTest : public SamplingKernelTest<T>
{
protected:
    // Set up test
    int32_t mBatchSize;      // number of active requests
    int32_t mMaxBatchSize;   // 2 * mBatchSize; active requests occupy even slots
    int32_t mVocabSize;
    int32_t mVocabSizePadded;
    int32_t mMaxInputLength;
    int32_t mSequenceLength; // 2 * mMaxInputLength (input + output)
    int32_t mMaxTokensPerStep;
    using SamplingKernelTest<T>::mBufferManager;
    using SamplingKernelTest<T>::mStream;
    using SamplingKernelTest<T>::mLogitsHost;
    TensorPtr mLogitsDevice;
    TensorPtr mOutLogitsDevice;
    TensorPtr mLogitsRefHost;
    TensorPtr mLogitsPtrs;
    TensorPtr mPenaltyWorkspaceDevice;
    TensorPtr mTokensPerStep;
    TensorPtr mOutputIdsHost;
    TensorPtr mOutputIdsDevice;
    TensorPtr mContextLengthHost;
    TensorPtr mContextLengthDevice;
    TensorPtr mSeqLengthHost;
    TensorPtr mSeqLengthDevice;
    TensorPtr mIdsPtrHost;
    TensorPtr mIdsPtrDevice;
    TensorPtr mRepetitionPenaltiesDevice;
    TensorPtr mPresencePenaltiesDevice;
    TensorPtr mFrequencyPenaltiesDevice;
    TensorPtr mPromptIgnoreLengthsDevice;
    TensorPtr mBatchSlots;
    // Allocates all host/device buffers and fills them for one test case.
    void subsetup(RepetitionPenaltyTestCase param)
    {
        auto const dataType = TRTDataType<T>::value;
        auto const ptrType = TRTDataType<T*>::value;
        mBatchSize = param.batchSize;
        mMaxBatchSize = 2 * mBatchSize;
        mVocabSize = param.vocabSize;
        mVocabSizePadded = padVocabSize(mVocabSize);
        mMaxInputLength = param.maxInputLength;
        mSequenceLength = 2 * mMaxInputLength; // input + output
        mMaxTokensPerStep = param.maxTokensPerStep;
        mLogitsHost
            = BufferManager::pinned(ITensor::makeShape({mBatchSize, mMaxTokensPerStep, mVocabSizePadded}), dataType);
        mLogitsRefHost
            = BufferManager::pinned(ITensor::makeShape({mBatchSize, mMaxTokensPerStep, mVocabSizePadded}), dataType);
        mLogitsDevice
            = mBufferManager->gpu(ITensor::makeShape({mBatchSize, mMaxTokensPerStep, mVocabSizePadded}), dataType);
        mOutLogitsDevice
            = mBufferManager->gpu(ITensor::makeShape({mBatchSize, mMaxTokensPerStep, mVocabSizePadded}), dataType);
        mLogitsPtrs = BufferManager::pinned(ITensor::makeShape({mBatchSize}), ptrType);
        // NOTE(review): sized with mBatchSize here but with mMaxBatchSize in
        // TemperaturePenaltyTest — confirm which size the kernel expects when
        // indexing the workspace.
        mPenaltyWorkspaceDevice = mBufferManager->gpu(
            ITensor::makeShape({mBatchSize, mMaxTokensPerStep, mVocabSize * 2}), nvinfer1::DataType::kINT32);
        mTokensPerStep = BufferManager::pinned(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32);
        mOutputIdsHost
            = BufferManager::pinned(ITensor::makeShape({mMaxBatchSize, mSequenceLength}), nvinfer1::DataType::kINT32);
        mOutputIdsDevice
            = mBufferManager->gpu(ITensor::makeShape({mMaxBatchSize, mSequenceLength}), nvinfer1::DataType::kINT32);
        mSeqLengthHost = BufferManager::pinned(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32);
        mSeqLengthDevice = mBufferManager->gpu(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32);
        mContextLengthHost = BufferManager::pinned(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32);
        mContextLengthDevice = mBufferManager->gpu(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32);
        mIdsPtrHost = BufferManager::pinned(ITensor::makeShape({mMaxBatchSize}), ptrType);
        // NOTE(review): allocated pinned despite the "Device" suffix —
        // presumably relies on pinned host memory being device-accessible;
        // confirm intent.
        mIdsPtrDevice = BufferManager::pinned(ITensor::makeShape({mMaxBatchSize}), ptrType);
        mBatchSlots = BufferManager::pinned(ITensor::makeShape({mBatchSize}), nvinfer1::DataType::kINT32);
        // Active requests are mapped to the even batch slots.
        auto batchSlotsPtr = bufferCast<int32_t>(*mBatchSlots);
        for (SizeType32 bi = 0; bi < mBatchSize; ++bi)
        {
            batchSlotsPtr[bi] = 2 * bi;
        }
        initLogitsAndBias(bufferCast<T>(*mLogitsHost), static_cast<T*>(nullptr), mBatchSize, mMaxTokensPerStep,
            mVocabSize, mVocabSizePadded);
        initRandomInt(bufferCast<int32_t>(*mOutputIdsHost), mSequenceLength * mMaxBatchSize, 0, mVocabSize);
        initRandomInt(bufferCast<int32_t>(*mSeqLengthHost), mMaxBatchSize, 1, mSequenceLength);
        // Context length mirrors sequence length (whole sequence is "prompt").
        for (size_t i = 0; i < mMaxBatchSize; ++i)
        {
            bufferCast<int32_t>(*mContextLengthHost)[i] = bufferCast<int32_t>(*mSeqLengthHost)[i];
        }
        // Zero-init reference and output logits: rows past tokensPerStep are
        // never written by either side, so both must agree on zeros there.
        trk::invokeFill(*mLogitsRefHost, T{0.0f}, *mStream);
        trk::invokeFill(*mOutLogitsDevice, T{0.0f}, *mStream);
        auto idsPtrHostPtr = BufferRange<void*>(*mIdsPtrHost);
        auto outputIdsDevicePtr = bufferCast<int32_t>(*mOutputIdsDevice);
        auto tokensPerStepPtr = bufferCast<int32_t>(*mTokensPerStep);
        for (SizeType32 bi = 0; bi < mMaxBatchSize; bi++)
        {
            idsPtrHostPtr[bi] = outputIdsDevicePtr + bi * mSequenceLength;
            tokensPerStepPtr[bi] = (std::rand() % mMaxTokensPerStep) + 1;
        }
        mBufferManager->copy(*mLogitsHost, *mLogitsDevice);
        mBufferManager->copy(*mOutputIdsHost, *mOutputIdsDevice);
        mBufferManager->copy(*mContextLengthHost, *mContextLengthDevice);
        mBufferManager->copy(*mSeqLengthHost, *mSeqLengthDevice);
        mBufferManager->copy(*mIdsPtrHost, *mIdsPtrDevice);
        // Logit pointers are intentionally laid out in reverse batch order;
        // computeReference mirrors this indexing when gathering the input.
        auto logitsPtrs = BufferRange<T*>(*mLogitsPtrs);
        for (SizeType32 bi = 0; bi < mBatchSize; ++bi)
        {
            logitsPtrs[bi]
                = bufferCast<T>(*mLogitsDevice) + (mBatchSize - bi - 1) * mMaxTokensPerStep * mVocabSizePadded;
        }
        ASSERT_EQ(param.repetitionPenaltiesSize, mMaxBatchSize) << "Invalid test configuration.";
        ASSERT_EQ(param.presencePenaltiesSize, mMaxBatchSize) << "Invalid test configuration.";
        ASSERT_EQ(param.frequencyPenaltiesSize, mMaxBatchSize) << "Invalid test configuration.";
        ASSERT_EQ(param.promptIgnoreLengthsSize, mMaxBatchSize) << "Invalid test configuration.";
        mRepetitionPenaltiesDevice
            = mBufferManager->gpu(ITensor::makeShape({param.repetitionPenaltiesSize}), nvinfer1::DataType::kFLOAT);
        mPresencePenaltiesDevice
            = mBufferManager->gpu(ITensor::makeShape({param.presencePenaltiesSize}), nvinfer1::DataType::kFLOAT);
        mFrequencyPenaltiesDevice
            = mBufferManager->gpu(ITensor::makeShape({param.frequencyPenaltiesSize}), nvinfer1::DataType::kFLOAT);
        mPromptIgnoreLengthsDevice
            = mBufferManager->gpu(ITensor::makeShape({param.promptIgnoreLengthsSize}), nvinfer1::DataType::kINT32);
        mBufferManager->copy(*param.repetitionPenalties, *mRepetitionPenaltiesDevice);
        mBufferManager->copy(*param.presencePenalties, *mPresencePenaltiesDevice);
        mBufferManager->copy(*param.frequencyPenalties, *mFrequencyPenaltiesDevice);
        mBufferManager->copy(*param.promptIgnoreLengths, *mPromptIgnoreLengthsDevice);
    }
    // Host reference: gather the (reverse-ordered) input logits, then apply a
    // multiplicative repetition penalty to every token seen in the sequence,
    // and subtractive presence/frequency penalties to tokens seen past the
    // prompt-ignore prefix. Per-slot values are used when a size > 1 is given,
    // otherwise element 0 acts as a broadcast scalar.
    void computeReference(T const* const inLogits, T* const outLogits, int32_t const* const outputIds,
        int32_t const* const sequenceLengths, float const* const repetitionPenalties,
        float const* const presencePenalties, float const* const frequencyPenalties,
        int32_t const* const promptIgnoreLengths, int32_t const repetitionPenaltiesSize,
        int32_t const presencePenaltiesSize, int32_t const frequencyPenaltiesSize,
        int32_t const promptIgnoreLengthsSize)
    {
        std::vector<bool> repetitionPenalized(mVocabSize), presencePenalized(mVocabSize);
        auto const batchSlotsPtr = bufferCast<int32_t>(*mBatchSlots);
        auto const tokensPerStepPtr = bufferCast<int32_t>(*mTokensPerStep);
        // Gather pass: undo the reversed logits-pointer layout.
        for (int32_t bi = 0; bi < mBatchSize; ++bi)
        {
            auto const batchSlot = batchSlotsPtr[bi];
            for (int32_t ti = 0; ti < tokensPerStepPtr[batchSlot]; ++ti)
            {
                for (int32_t vi = 0; vi < mVocabSizePadded; ++vi)
                {
                    auto const srcIdx = ((mBatchSize - bi - 1) * mMaxTokensPerStep + ti) * mVocabSizePadded + vi;
                    auto const dstIdx = (bi * mMaxTokensPerStep + ti) * mVocabSizePadded + vi;
                    outLogits[dstIdx] = inLogits[srcIdx];
                }
            }
        }
        // Penalty pass.
        for (int32_t bi = 0; bi < mBatchSize; ++bi)
        {
            auto const batchSlot = batchSlotsPtr[bi];
            for (int32_t ti = 0; ti < tokensPerStepPtr[batchSlot]; ++ti)
            {
                float repetitionPenalty
                    = repetitionPenaltiesSize > 1 ? repetitionPenalties[batchSlot] : repetitionPenalties[0];
                float presencePenalty = presencePenaltiesSize > 1 ? presencePenalties[batchSlot] : presencePenalties[0];
                float frequencyPenalty
                    = frequencyPenaltiesSize > 1 ? frequencyPenalties[batchSlot] : frequencyPenalties[0];
                int32_t promptIgnoreLength
                    = promptIgnoreLengthsSize > 1 ? promptIgnoreLengths[batchSlot] : promptIgnoreLengths[0];
                std::fill(repetitionPenalized.begin(), repetitionPenalized.end(), false);
                std::fill(presencePenalized.begin(), presencePenalized.end(), false);
                size_t offset = (bi * mMaxTokensPerStep + ti) * mVocabSizePadded;
                auto const step = sequenceLengths[batchSlot];
                // clamping to the inputLength (set to same as sequenceLength)
                promptIgnoreLength = std::min(promptIgnoreLength, step);
                std::vector<int32_t> numOccurences(mVocabSize, 0);
                for (int32_t t = 0; t < step; ++t)
                {
                    auto tokenId = outputIds[batchSlot * mSequenceLength + t];
                    // Repetition penalty is applied at most once per token id:
                    // scale down positive logits, scale up the magnitude of
                    // negative ones.
                    if (!repetitionPenalized[tokenId])
                    {
                        auto logit = static_cast<float>(outLogits[offset + tokenId]);
                        outLogits[offset + tokenId]
                            = static_cast<T>((logit < 0.0f ? logit * repetitionPenalty : logit / repetitionPenalty));
                        repetitionPenalized[tokenId] = true;
                    }
                    // Presence/frequency only count tokens past the ignored
                    // prompt prefix.
                    if (!(t < promptIgnoreLength))
                    {
                        presencePenalized[tokenId] = true;
                        numOccurences[tokenId] += 1;
                    }
                }
                for (int32_t vi = 0; vi < mVocabSize; ++vi)
                {
                    if (presencePenalized[vi])
                    {
                        outLogits[offset + vi] -= presencePenalty;
                    }
                    if (numOccurences[vi] > 0)
                    {
                        outLogits[offset + vi] -= numOccurences[vi] * frequencyPenalty;
                    }
                }
            }
        }
    }
public:
    // Runs the kernel under test and compares its output with the reference.
    void runTest(RepetitionPenaltyTestCase param)
    {
        subsetup(param);
        InvokeBatchApplyPenaltyParams<T> penaltyParams{reinterpret_cast<T**>(bufferCast<int64_t>(*mLogitsPtrs)),
            bufferCast<T>(*mOutLogitsDevice), nullptr, bufferCast<int32_t>(*mPenaltyWorkspaceDevice), nullptr, nullptr,
            bufferCast<float>(*mRepetitionPenaltiesDevice), bufferCast<float>(*mPresencePenaltiesDevice),
            bufferCast<float>(*mFrequencyPenaltiesDevice), bufferCast<int32_t>(*mPromptIgnoreLengthsDevice), mBatchSize,
            1, mSequenceLength, mVocabSize, mVocabSizePadded,
            reinterpret_cast<int32_t const**>(bufferCast<int64_t>(*mIdsPtrDevice)), nullptr,
            bufferCast<int32_t>(*mContextLengthDevice), bufferCast<int32_t>(*mSeqLengthDevice), nullptr, nullptr,
            bufferCast<int32_t>(*mBatchSlots), mMaxTokensPerStep, bufferCast<int32_t>(*mTokensPerStep), nullptr,
            mStream->get()};
        tk::invokeBatchApplyPenalty(penaltyParams);
        auto logitsOutHost = mBufferManager->copyFrom(*mOutLogitsDevice, MemoryType::kCPU);
        computeReference(bufferCast<T>(*mLogitsHost), bufferCast<T>(*mLogitsRefHost),
            bufferCast<int32_t>(*mOutputIdsHost), bufferCast<int32_t>(*mSeqLengthHost),
            bufferCast<float>(*param.repetitionPenalties), bufferCast<float>(*param.presencePenalties),
            bufferCast<float>(*param.frequencyPenalties), bufferCast<int32_t>(*param.promptIgnoreLengths),
            param.repetitionPenaltiesSize, param.presencePenaltiesSize, param.frequencyPenaltiesSize,
            param.promptIgnoreLengthsSize);
        mStream->synchronize();
        bool passed = checkResult(param.toString(), bufferCast<T>(*logitsOutHost), bufferCast<T>(*mLogitsRefHost),
            mBatchSize * mMaxTokensPerStep * mVocabSizePadded);
        EXPECT_TRUE(passed);
    }
};
TYPED_TEST_SUITE(RepetitionPenaltyTest, FloatAndHalfTypes);

// All penalties neutral (repetition=1, presence=0, frequency=0): logits unchanged.
TYPED_TEST(RepetitionPenaltyTest, BatchNoPenalty)
{
    int32_t constexpr batchSize = 6;
    int32_t constexpr maxBatchSize = 2 * batchSize;
    auto const shape = ITensor::makeShape({maxBatchSize});
    TensorPtr repetitionPenaltyHost = BufferManager::pinned(shape, nvinfer1::DataType::kFLOAT);
    TensorPtr presencePenaltyHost = BufferManager::pinned(shape, nvinfer1::DataType::kFLOAT);
    TensorPtr frequencyPenaltyHost = BufferManager::pinned(shape, nvinfer1::DataType::kFLOAT);
    TensorPtr promptIgnoreLengthsHost = BufferManager::pinned(shape, nvinfer1::DataType::kINT32);
    auto* repetitionPtr = bufferCast<float>(*repetitionPenaltyHost);
    auto* presencePtr = bufferCast<float>(*presencePenaltyHost);
    auto* frequencyPtr = bufferCast<float>(*frequencyPenaltyHost);
    auto* ignoreLengthPtr = bufferCast<int32_t>(*promptIgnoreLengthsHost);
    std::fill(repetitionPtr, repetitionPtr + maxBatchSize, 1.0f);
    std::fill(presencePtr, presencePtr + maxBatchSize, 0.0f);
    std::fill(frequencyPtr, frequencyPtr + maxBatchSize, 0.0f);
    std::fill(ignoreLengthPtr, ignoreLengthPtr + maxBatchSize, 0);
    this->runTest(RepetitionPenaltyTestCase()
                      .setBatchSize(batchSize)
                      .setVocabSize(4)
                      .setMaxInputLength(5)
                      .setRepetitionPenalties(repetitionPenaltyHost)
                      .setPresencePenalties(presencePenaltyHost)
                      .setFrequencyPenalties(frequencyPenaltyHost)
                      .setPromptIgnoreLengths(promptIgnoreLengthsHost)
                      .setRepetitionPenaltiesSize(maxBatchSize)
                      .setPresencePenaltiesSize(maxBatchSize)
                      .setFrequencyPenaltiesSize(maxBatchSize)
                      .setPromptIgnoreLengthsSize(maxBatchSize));
}
// Repetition penalty below 1.0; presence/frequency disabled.
TYPED_TEST(RepetitionPenaltyTest, BatchRepetitionLessThanOne)
{
    int32_t constexpr batchSize = 6;
    int32_t constexpr maxBatchSize = 2 * batchSize;
    auto const shape = ITensor::makeShape({maxBatchSize});
    TensorPtr repetitionPenaltyHost = BufferManager::pinned(shape, nvinfer1::DataType::kFLOAT);
    TensorPtr presencePenaltyHost = BufferManager::pinned(shape, nvinfer1::DataType::kFLOAT);
    TensorPtr frequencyPenaltyHost = BufferManager::pinned(shape, nvinfer1::DataType::kFLOAT);
    TensorPtr promptIgnoreLengthsHost = BufferManager::pinned(shape, nvinfer1::DataType::kINT32);
    auto* repetitionPtr = bufferCast<float>(*repetitionPenaltyHost);
    auto* presencePtr = bufferCast<float>(*presencePenaltyHost);
    auto* frequencyPtr = bufferCast<float>(*frequencyPenaltyHost);
    auto* ignoreLengthPtr = bufferCast<int32_t>(*promptIgnoreLengthsHost);
    std::fill(repetitionPtr, repetitionPtr + maxBatchSize, 0.53f);
    std::fill(presencePtr, presencePtr + maxBatchSize, 0.0f);
    std::fill(frequencyPtr, frequencyPtr + maxBatchSize, 0.0f);
    std::fill(ignoreLengthPtr, ignoreLengthPtr + maxBatchSize, 0);
    this->runTest(RepetitionPenaltyTestCase()
                      .setBatchSize(batchSize)
                      .setVocabSize(4)
                      .setMaxInputLength(5)
                      .setRepetitionPenalties(repetitionPenaltyHost)
                      .setPresencePenalties(presencePenaltyHost)
                      .setFrequencyPenalties(frequencyPenaltyHost)
                      .setPromptIgnoreLengths(promptIgnoreLengthsHost)
                      .setRepetitionPenaltiesSize(maxBatchSize)
                      .setPresencePenaltiesSize(maxBatchSize)
                      .setFrequencyPenaltiesSize(maxBatchSize)
                      .setPromptIgnoreLengthsSize(maxBatchSize));
}
// Repetition penalty above 1.0; presence/frequency disabled.
TYPED_TEST(RepetitionPenaltyTest, BatchRepetitionGreaterThaneOne)
{
    int32_t constexpr batchSize = 6;
    int32_t constexpr maxBatchSize = 2 * batchSize;
    auto const shape = ITensor::makeShape({maxBatchSize});
    TensorPtr repetitionPenaltyHost = BufferManager::pinned(shape, nvinfer1::DataType::kFLOAT);
    TensorPtr presencePenaltyHost = BufferManager::pinned(shape, nvinfer1::DataType::kFLOAT);
    TensorPtr frequencyPenaltyHost = BufferManager::pinned(shape, nvinfer1::DataType::kFLOAT);
    TensorPtr promptIgnoreLengthsHost = BufferManager::pinned(shape, nvinfer1::DataType::kINT32);
    auto* repetitionPtr = bufferCast<float>(*repetitionPenaltyHost);
    auto* presencePtr = bufferCast<float>(*presencePenaltyHost);
    auto* frequencyPtr = bufferCast<float>(*frequencyPenaltyHost);
    auto* ignoreLengthPtr = bufferCast<int32_t>(*promptIgnoreLengthsHost);
    std::fill(repetitionPtr, repetitionPtr + maxBatchSize, 2.01f);
    std::fill(presencePtr, presencePtr + maxBatchSize, 0.0f);
    std::fill(frequencyPtr, frequencyPtr + maxBatchSize, 0.0f);
    std::fill(ignoreLengthPtr, ignoreLengthPtr + maxBatchSize, 0);
    this->runTest(RepetitionPenaltyTestCase()
                      .setBatchSize(batchSize)
                      .setVocabSize(4)
                      .setMaxInputLength(5)
                      .setRepetitionPenalties(repetitionPenaltyHost)
                      .setPresencePenalties(presencePenaltyHost)
                      .setFrequencyPenalties(frequencyPenaltyHost)
                      .setPromptIgnoreLengths(promptIgnoreLengthsHost)
                      .setRepetitionPenaltiesSize(maxBatchSize)
                      .setPresencePenaltiesSize(maxBatchSize)
                      .setFrequencyPenaltiesSize(maxBatchSize)
                      .setPromptIgnoreLengthsSize(maxBatchSize));
}
// A different repetition penalty per batch slot; presence/frequency disabled.
TYPED_TEST(RepetitionPenaltyTest, BatchRepetitionMixed)
{
    int32_t constexpr batchSize = 6;
    int32_t constexpr maxBatchSize = 2 * batchSize;
    auto const shape = ITensor::makeShape({maxBatchSize});
    TensorPtr repetitionPenaltyHost = BufferManager::pinned(shape, nvinfer1::DataType::kFLOAT);
    TensorPtr presencePenaltyHost = BufferManager::pinned(shape, nvinfer1::DataType::kFLOAT);
    TensorPtr frequencyPenaltyHost = BufferManager::pinned(shape, nvinfer1::DataType::kFLOAT);
    TensorPtr promptIgnoreLengthsHost = BufferManager::pinned(shape, nvinfer1::DataType::kINT32);
    auto* repetitionPtr = bufferCast<float>(*repetitionPenaltyHost);
    auto* presencePtr = bufferCast<float>(*presencePenaltyHost);
    auto* frequencyPtr = bufferCast<float>(*frequencyPenaltyHost);
    auto* ignoreLengthPtr = bufferCast<int32_t>(*promptIgnoreLengthsHost);
    std::fill(presencePtr, presencePtr + maxBatchSize, 0.0f);
    std::fill(frequencyPtr, frequencyPtr + maxBatchSize, 0.0f);
    std::fill(ignoreLengthPtr, ignoreLengthPtr + maxBatchSize, 0);
    for (int32_t i = 0; i < maxBatchSize; ++i)
    {
        repetitionPtr[i] = 0.53 + i * 0.2f;
    }
    this->runTest(RepetitionPenaltyTestCase()
                      .setBatchSize(batchSize)
                      .setVocabSize(4)
                      .setMaxInputLength(5)
                      .setRepetitionPenalties(repetitionPenaltyHost)
                      .setPresencePenalties(presencePenaltyHost)
                      .setFrequencyPenalties(frequencyPenaltyHost)
                      .setPromptIgnoreLengths(promptIgnoreLengthsHost)
                      .setRepetitionPenaltiesSize(maxBatchSize)
                      .setPresencePenaltiesSize(maxBatchSize)
                      .setFrequencyPenaltiesSize(maxBatchSize)
                      .setPromptIgnoreLengthsSize(maxBatchSize));
}
// A different presence penalty per batch slot; repetition/frequency neutral.
TYPED_TEST(RepetitionPenaltyTest, BatchPresenceMixed)
{
    int32_t constexpr batchSize = 6;
    int32_t constexpr maxBatchSize = 2 * batchSize;
    auto const shape = ITensor::makeShape({maxBatchSize});
    TensorPtr repetitionPenaltyHost = BufferManager::pinned(shape, nvinfer1::DataType::kFLOAT);
    TensorPtr presencePenaltyHost = BufferManager::pinned(shape, nvinfer1::DataType::kFLOAT);
    TensorPtr frequencyPenaltyHost = BufferManager::pinned(shape, nvinfer1::DataType::kFLOAT);
    TensorPtr promptIgnoreLengthsHost = BufferManager::pinned(shape, nvinfer1::DataType::kINT32);
    auto* repetitionPtr = bufferCast<float>(*repetitionPenaltyHost);
    auto* presencePtr = bufferCast<float>(*presencePenaltyHost);
    auto* frequencyPtr = bufferCast<float>(*frequencyPenaltyHost);
    auto* ignoreLengthPtr = bufferCast<int32_t>(*promptIgnoreLengthsHost);
    std::fill(repetitionPtr, repetitionPtr + maxBatchSize, 1.0f);
    std::fill(frequencyPtr, frequencyPtr + maxBatchSize, 0.0f);
    std::fill(ignoreLengthPtr, ignoreLengthPtr + maxBatchSize, 0);
    for (int32_t i = 0; i < maxBatchSize; ++i)
    {
        presencePtr[i] = 0.53 + i * 0.2f;
    }
    this->runTest(RepetitionPenaltyTestCase()
                      .setBatchSize(batchSize)
                      .setVocabSize(4)
                      .setMaxInputLength(5)
                      .setRepetitionPenalties(repetitionPenaltyHost)
                      .setPresencePenalties(presencePenaltyHost)
                      .setFrequencyPenalties(frequencyPenaltyHost)
                      .setPromptIgnoreLengths(promptIgnoreLengthsHost)
                      .setRepetitionPenaltiesSize(maxBatchSize)
                      .setPresencePenaltiesSize(maxBatchSize)
                      .setFrequencyPenaltiesSize(maxBatchSize)
                      .setPromptIgnoreLengthsSize(maxBatchSize));
}
// Presence penalty enabled on even slots only (odd slots keep the 0 default).
TYPED_TEST(RepetitionPenaltyTest, BatchPresenceHasDefaultValueZero2)
{
    int32_t constexpr batchSize = 6;
    int32_t constexpr maxBatchSize = 2 * batchSize;
    auto const shape = ITensor::makeShape({maxBatchSize});
    TensorPtr repetitionPenaltyHost = BufferManager::pinned(shape, nvinfer1::DataType::kFLOAT);
    TensorPtr presencePenaltyHost = BufferManager::pinned(shape, nvinfer1::DataType::kFLOAT);
    TensorPtr frequencyPenaltyHost = BufferManager::pinned(shape, nvinfer1::DataType::kFLOAT);
    TensorPtr promptIgnoreLengthsHost = BufferManager::pinned(shape, nvinfer1::DataType::kINT32);
    auto* repetitionPtr = bufferCast<float>(*repetitionPenaltyHost);
    auto* presencePtr = bufferCast<float>(*presencePenaltyHost);
    auto* frequencyPtr = bufferCast<float>(*frequencyPenaltyHost);
    auto* ignoreLengthPtr = bufferCast<int32_t>(*promptIgnoreLengthsHost);
    std::fill(repetitionPtr, repetitionPtr + maxBatchSize, 1.0f);
    std::fill(frequencyPtr, frequencyPtr + maxBatchSize, 0.0f);
    std::fill(ignoreLengthPtr, ignoreLengthPtr + maxBatchSize, 0);
    for (int32_t i = 0; i < maxBatchSize; ++i)
    {
        presencePtr[i] = i % 2 == 0 ? 1.0f : 0.0f;
    }
    this->runTest(RepetitionPenaltyTestCase()
                      .setBatchSize(batchSize)
                      .setVocabSize(4)
                      .setMaxInputLength(5)
                      .setRepetitionPenalties(repetitionPenaltyHost)
                      .setPresencePenalties(presencePenaltyHost)
                      .setFrequencyPenalties(frequencyPenaltyHost)
                      .setPromptIgnoreLengths(promptIgnoreLengthsHost)
                      .setRepetitionPenaltiesSize(maxBatchSize)
                      .setPresencePenaltiesSize(maxBatchSize)
                      .setFrequencyPenaltiesSize(maxBatchSize)
                      .setPromptIgnoreLengthsSize(maxBatchSize));
}
TYPED_TEST(RepetitionPenaltyTest, BatchFrequencyMixed)
{
    // Exercises per-slot frequency penalties alone: repetition stays at its
    // neutral value (1.0) and presence at its neutral value (0.0), while every
    // batch slot gets a distinct frequency penalty.
    int32_t const batchSize = 6;
    int32_t const maxBatchSize = 2 * batchSize;
    TensorPtr repetitionPenaltyHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    TensorPtr presencePenaltyHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    TensorPtr frequencyPenaltyHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    TensorPtr promptIgnoreLengthsHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kINT32);
    // Hoist the loop-invariant bufferCast calls and use float literals (0.53f,
    // not 0.53) so the arithmetic stays in float instead of being promoted to
    // double and narrowed back.
    auto* const repetitionPtr = bufferCast<float>(*repetitionPenaltyHost);
    auto* const presencePtr = bufferCast<float>(*presencePenaltyHost);
    auto* const frequencyPtr = bufferCast<float>(*frequencyPenaltyHost);
    auto* const promptIgnorePtr = bufferCast<int32_t>(*promptIgnoreLengthsHost);
    for (int32_t i = 0; i < maxBatchSize; ++i)
    {
        repetitionPtr[i] = 1.0f;
        presencePtr[i] = 0.0f;
        frequencyPtr[i] = 0.53f + i * 0.2f;
        promptIgnorePtr[i] = 0;
    }
    this->runTest(RepetitionPenaltyTestCase()
                      .setBatchSize(batchSize)
                      .setVocabSize(4)
                      .setMaxInputLength(5)
                      .setRepetitionPenalties(repetitionPenaltyHost)
                      .setPresencePenalties(presencePenaltyHost)
                      .setFrequencyPenalties(frequencyPenaltyHost)
                      .setPromptIgnoreLengths(promptIgnoreLengthsHost)
                      .setRepetitionPenaltiesSize(maxBatchSize)
                      .setPresencePenaltiesSize(maxBatchSize)
                      .setFrequencyPenaltiesSize(maxBatchSize)
                      .setPromptIgnoreLengthsSize(maxBatchSize));
}
TYPED_TEST(RepetitionPenaltyTest, BatchFrequencyHasDefaultValueZero2)
{
    // Alternate the frequency penalty between an active value (1.0) and its
    // neutral default (0.0) across batch slots, while repetition (1.0) and
    // presence (0.0) penalties stay at their neutral values.
    int32_t const batchSize = 6;
    int32_t const maxBatchSize = 2 * batchSize;
    auto const slotShape = ITensor::makeShape({maxBatchSize});
    TensorPtr repetitionPenaltyHost = BufferManager::pinned(slotShape, nvinfer1::DataType::kFLOAT);
    TensorPtr presencePenaltyHost = BufferManager::pinned(slotShape, nvinfer1::DataType::kFLOAT);
    TensorPtr frequencyPenaltyHost = BufferManager::pinned(slotShape, nvinfer1::DataType::kFLOAT);
    TensorPtr promptIgnoreLengthsHost = BufferManager::pinned(slotShape, nvinfer1::DataType::kINT32);
    for (int32_t slot = 0; slot < maxBatchSize; ++slot)
    {
        bufferCast<float>(*repetitionPenaltyHost)[slot] = 1.0f;
        bufferCast<float>(*presencePenaltyHost)[slot] = 0.0f;
        bufferCast<float>(*frequencyPenaltyHost)[slot] = (slot % 2 == 0) ? 1.0f : 0.0f;
        bufferCast<int32_t>(*promptIgnoreLengthsHost)[slot] = 0;
    }
    auto testCase = RepetitionPenaltyTestCase()
                        .setBatchSize(batchSize)
                        .setVocabSize(4)
                        .setMaxInputLength(5)
                        .setRepetitionPenalties(repetitionPenaltyHost)
                        .setPresencePenalties(presencePenaltyHost)
                        .setFrequencyPenalties(frequencyPenaltyHost)
                        .setPromptIgnoreLengths(promptIgnoreLengthsHost)
                        .setRepetitionPenaltiesSize(maxBatchSize)
                        .setPresencePenaltiesSize(maxBatchSize)
                        .setFrequencyPenaltiesSize(maxBatchSize)
                        .setPromptIgnoreLengthsSize(maxBatchSize);
    this->runTest(testCase);
}
TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeRepetitionPresence)
{
    // Combines per-slot repetition and presence penalties; frequency stays at
    // its neutral value (0.0).
    int32_t const batchSize = 6;
    int32_t const maxBatchSize = 2 * batchSize;
    TensorPtr repetitionPenaltyHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    TensorPtr presencePenaltyHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    TensorPtr frequencyPenaltyHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    TensorPtr promptIgnoreLengthsHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kINT32);
    // Hoist the loop-invariant bufferCast calls and use float literals (0.53f,
    // not 0.53) so the arithmetic stays in float instead of being promoted to
    // double and narrowed back.
    auto* const repetitionPtr = bufferCast<float>(*repetitionPenaltyHost);
    auto* const presencePtr = bufferCast<float>(*presencePenaltyHost);
    auto* const frequencyPtr = bufferCast<float>(*frequencyPenaltyHost);
    auto* const promptIgnorePtr = bufferCast<int32_t>(*promptIgnoreLengthsHost);
    for (int32_t i = 0; i < maxBatchSize; ++i)
    {
        repetitionPtr[i] = 0.53f + i * 0.2f;
        presencePtr[i] = 0.53f + i * 0.2f;
        frequencyPtr[i] = 0.0f;
        promptIgnorePtr[i] = 0;
    }
    this->runTest(RepetitionPenaltyTestCase()
                      .setBatchSize(batchSize)
                      .setVocabSize(4)
                      .setMaxInputLength(5)
                      .setRepetitionPenalties(repetitionPenaltyHost)
                      .setPresencePenalties(presencePenaltyHost)
                      .setFrequencyPenalties(frequencyPenaltyHost)
                      .setPromptIgnoreLengths(promptIgnoreLengthsHost)
                      .setRepetitionPenaltiesSize(maxBatchSize)
                      .setPresencePenaltiesSize(maxBatchSize)
                      .setFrequencyPenaltiesSize(maxBatchSize)
                      .setPromptIgnoreLengthsSize(maxBatchSize));
}
TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeRepetitionFrequency)
{
    // Combines per-slot repetition and frequency penalties; presence stays at
    // its neutral value (0.0).
    int32_t const batchSize = 6;
    int32_t const maxBatchSize = 2 * batchSize;
    TensorPtr repetitionPenaltyHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    TensorPtr presencePenaltyHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    TensorPtr frequencyPenaltyHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    TensorPtr promptIgnoreLengthsHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kINT32);
    // Hoist the loop-invariant bufferCast calls and use float literals (0.53f,
    // not 0.53) so the arithmetic stays in float instead of being promoted to
    // double and narrowed back.
    auto* const repetitionPtr = bufferCast<float>(*repetitionPenaltyHost);
    auto* const presencePtr = bufferCast<float>(*presencePenaltyHost);
    auto* const frequencyPtr = bufferCast<float>(*frequencyPenaltyHost);
    auto* const promptIgnorePtr = bufferCast<int32_t>(*promptIgnoreLengthsHost);
    for (int32_t i = 0; i < maxBatchSize; ++i)
    {
        repetitionPtr[i] = 0.53f + i * 0.2f;
        presencePtr[i] = 0.0f;
        frequencyPtr[i] = 0.53f + i * 0.2f;
        promptIgnorePtr[i] = 0;
    }
    this->runTest(RepetitionPenaltyTestCase()
                      .setBatchSize(batchSize)
                      .setVocabSize(4)
                      .setMaxInputLength(5)
                      .setRepetitionPenalties(repetitionPenaltyHost)
                      .setPresencePenalties(presencePenaltyHost)
                      .setFrequencyPenalties(frequencyPenaltyHost)
                      .setPromptIgnoreLengths(promptIgnoreLengthsHost)
                      .setRepetitionPenaltiesSize(maxBatchSize)
                      .setPresencePenaltiesSize(maxBatchSize)
                      .setFrequencyPenaltiesSize(maxBatchSize)
                      .setPromptIgnoreLengthsSize(maxBatchSize));
}
TYPED_TEST(RepetitionPenaltyTest, PenaltyTypePresenceFrequency)
{
    // Combines per-slot presence and frequency penalties; repetition stays at
    // its neutral value (1.0).
    int32_t const batchSize = 6;
    int32_t const maxBatchSize = 2 * batchSize;
    TensorPtr repetitionPenaltyHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    TensorPtr presencePenaltyHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    TensorPtr frequencyPenaltyHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    TensorPtr promptIgnoreLengthsHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kINT32);
    // Hoist the loop-invariant bufferCast calls and use float literals (0.53f,
    // not 0.53) so the arithmetic stays in float instead of being promoted to
    // double and narrowed back.
    auto* const repetitionPtr = bufferCast<float>(*repetitionPenaltyHost);
    auto* const presencePtr = bufferCast<float>(*presencePenaltyHost);
    auto* const frequencyPtr = bufferCast<float>(*frequencyPenaltyHost);
    auto* const promptIgnorePtr = bufferCast<int32_t>(*promptIgnoreLengthsHost);
    for (int32_t i = 0; i < maxBatchSize; ++i)
    {
        repetitionPtr[i] = 1.0f;
        presencePtr[i] = 0.53f + i * 0.2f;
        frequencyPtr[i] = 0.53f + i * 0.2f;
        promptIgnorePtr[i] = 0;
    }
    this->runTest(RepetitionPenaltyTestCase()
                      .setBatchSize(batchSize)
                      .setVocabSize(4)
                      .setMaxInputLength(5)
                      .setRepetitionPenalties(repetitionPenaltyHost)
                      .setPresencePenalties(presencePenaltyHost)
                      .setFrequencyPenalties(frequencyPenaltyHost)
                      .setPromptIgnoreLengths(promptIgnoreLengthsHost)
                      .setRepetitionPenaltiesSize(maxBatchSize)
                      .setPresencePenaltiesSize(maxBatchSize)
                      .setFrequencyPenaltiesSize(maxBatchSize)
                      .setPromptIgnoreLengthsSize(maxBatchSize));
}
TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeFull)
{
    // All three penalty types are active simultaneously with distinct
    // per-slot values.
    int32_t const batchSize = 6;
    int32_t const maxBatchSize = 2 * batchSize;
    TensorPtr repetitionPenaltyHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    TensorPtr presencePenaltyHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    TensorPtr frequencyPenaltyHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    TensorPtr promptIgnoreLengthsHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kINT32);
    // Hoist the loop-invariant bufferCast calls and use float literals (0.53f,
    // not 0.53) so the arithmetic stays in float instead of being promoted to
    // double and narrowed back.
    auto* const repetitionPtr = bufferCast<float>(*repetitionPenaltyHost);
    auto* const presencePtr = bufferCast<float>(*presencePenaltyHost);
    auto* const frequencyPtr = bufferCast<float>(*frequencyPenaltyHost);
    auto* const promptIgnorePtr = bufferCast<int32_t>(*promptIgnoreLengthsHost);
    for (int32_t i = 0; i < maxBatchSize; ++i)
    {
        repetitionPtr[i] = 0.53f + i * 0.2f;
        presencePtr[i] = 0.53f + i * 0.2f;
        frequencyPtr[i] = 0.53f + i * 0.2f;
        promptIgnorePtr[i] = 0;
    }
    this->runTest(RepetitionPenaltyTestCase()
                      .setBatchSize(batchSize)
                      .setVocabSize(4)
                      .setMaxInputLength(5)
                      .setRepetitionPenalties(repetitionPenaltyHost)
                      .setPresencePenalties(presencePenaltyHost)
                      .setFrequencyPenalties(frequencyPenaltyHost)
                      .setPromptIgnoreLengths(promptIgnoreLengthsHost)
                      .setRepetitionPenaltiesSize(maxBatchSize)
                      .setPresencePenaltiesSize(maxBatchSize)
                      .setFrequencyPenaltiesSize(maxBatchSize)
                      .setPromptIgnoreLengthsSize(maxBatchSize));
}
TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeFullTokensPerStep)
{
    // All three penalty types are active with distinct per-slot values, and
    // the kernel processes up to 4 tokens per step.
    int32_t const batchSize = 6;
    int32_t const maxBatchSize = 2 * batchSize;
    TensorPtr repetitionPenaltyHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    TensorPtr presencePenaltyHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    TensorPtr frequencyPenaltyHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    TensorPtr promptIgnoreLengthsHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kINT32);
    // Hoist the loop-invariant bufferCast calls and use float literals (0.53f,
    // not 0.53) so the arithmetic stays in float instead of being promoted to
    // double and narrowed back.
    auto* const repetitionPtr = bufferCast<float>(*repetitionPenaltyHost);
    auto* const presencePtr = bufferCast<float>(*presencePenaltyHost);
    auto* const frequencyPtr = bufferCast<float>(*frequencyPenaltyHost);
    auto* const promptIgnorePtr = bufferCast<int32_t>(*promptIgnoreLengthsHost);
    for (int32_t i = 0; i < maxBatchSize; ++i)
    {
        repetitionPtr[i] = 0.53f + i * 0.2f;
        presencePtr[i] = 0.53f + i * 0.2f;
        frequencyPtr[i] = 0.53f + i * 0.2f;
        promptIgnorePtr[i] = 0;
    }
    this->runTest(RepetitionPenaltyTestCase()
                      .setBatchSize(batchSize)
                      .setVocabSize(4)
                      .setMaxInputLength(5)
                      .setRepetitionPenalties(repetitionPenaltyHost)
                      .setPresencePenalties(presencePenaltyHost)
                      .setFrequencyPenalties(frequencyPenaltyHost)
                      .setPromptIgnoreLengths(promptIgnoreLengthsHost)
                      .setRepetitionPenaltiesSize(maxBatchSize)
                      .setPresencePenaltiesSize(maxBatchSize)
                      .setFrequencyPenaltiesSize(maxBatchSize)
                      .setPromptIgnoreLengthsSize(maxBatchSize)
                      .setMaxTokensPerStep(4));
}
TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeFullWithPartialPromptIgnore)
{
    // All three penalty types are active, and each slot ignores the first
    // prompt token when counting token occurrences for the penalties.
    int32_t const batchSize = 6;
    int32_t const maxBatchSize = 2 * batchSize;
    TensorPtr repetitionPenaltyHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    TensorPtr presencePenaltyHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    TensorPtr frequencyPenaltyHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    TensorPtr promptIgnoreLengthsHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kINT32);
    // Hoist the loop-invariant bufferCast calls and use float literals (0.53f,
    // not 0.53) so the arithmetic stays in float instead of being promoted to
    // double and narrowed back.
    auto* const repetitionPtr = bufferCast<float>(*repetitionPenaltyHost);
    auto* const presencePtr = bufferCast<float>(*presencePenaltyHost);
    auto* const frequencyPtr = bufferCast<float>(*frequencyPenaltyHost);
    auto* const promptIgnorePtr = bufferCast<int32_t>(*promptIgnoreLengthsHost);
    for (int32_t i = 0; i < maxBatchSize; ++i)
    {
        repetitionPtr[i] = 0.53f + i * 0.2f;
        presencePtr[i] = 0.53f + i * 0.2f;
        frequencyPtr[i] = 0.53f + i * 0.2f;
        promptIgnorePtr[i] = 1; // set to 1 to ignore first prompt token
    }
    this->runTest(RepetitionPenaltyTestCase()
                      .setBatchSize(batchSize)
                      .setVocabSize(4)
                      .setMaxInputLength(5)
                      .setRepetitionPenalties(repetitionPenaltyHost)
                      .setPresencePenalties(presencePenaltyHost)
                      .setFrequencyPenalties(frequencyPenaltyHost)
                      .setPromptIgnoreLengths(promptIgnoreLengthsHost)
                      .setRepetitionPenaltiesSize(maxBatchSize)
                      .setPresencePenaltiesSize(maxBatchSize)
                      .setFrequencyPenaltiesSize(maxBatchSize)
                      .setPromptIgnoreLengthsSize(maxBatchSize));
}
TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeFullTokensPerStepWithFullPromptIgnore)
{
    // All three penalty types are active, up to 4 tokens are processed per
    // step, and the entire prompt (maxInputLength == 5) is ignored when
    // counting token occurrences for the penalties.
    int32_t const batchSize = 6;
    int32_t const maxBatchSize = 2 * batchSize;
    TensorPtr repetitionPenaltyHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    TensorPtr presencePenaltyHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    TensorPtr frequencyPenaltyHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kFLOAT);
    TensorPtr promptIgnoreLengthsHost
        = BufferManager::pinned(ITensor::makeShape({maxBatchSize}), nvinfer1::DataType::kINT32);
    // Hoist the loop-invariant bufferCast calls and use float literals (0.53f,
    // not 0.53) so the arithmetic stays in float instead of being promoted to
    // double and narrowed back.
    auto* const repetitionPtr = bufferCast<float>(*repetitionPenaltyHost);
    auto* const presencePtr = bufferCast<float>(*presencePenaltyHost);
    auto* const frequencyPtr = bufferCast<float>(*frequencyPenaltyHost);
    auto* const promptIgnorePtr = bufferCast<int32_t>(*promptIgnoreLengthsHost);
    for (int32_t i = 0; i < maxBatchSize; ++i)
    {
        repetitionPtr[i] = 0.53f + i * 0.2f;
        presencePtr[i] = 0.53f + i * 0.2f;
        frequencyPtr[i] = 0.53f + i * 0.2f;
        promptIgnorePtr[i] = 5; // set to max input length to ignore full prompt
    }
    this->runTest(RepetitionPenaltyTestCase()
                      .setBatchSize(batchSize)
                      .setVocabSize(4)
                      .setMaxInputLength(5)
                      .setRepetitionPenalties(repetitionPenaltyHost)
                      .setPresencePenalties(presencePenaltyHost)
                      .setFrequencyPenalties(frequencyPenaltyHost)
                      .setPromptIgnoreLengths(promptIgnoreLengthsHost)
                      .setRepetitionPenaltiesSize(maxBatchSize)
                      .setPresencePenaltiesSize(maxBatchSize)
                      .setFrequencyPenaltiesSize(maxBatchSize)
                      .setPromptIgnoreLengthsSize(maxBatchSize)
                      .setMaxTokensPerStep(4));
}
struct MinLengthPenaltyTestParams
{
    // Geometry of a single min-length penalty test run.
    int32_t batchSize;
    int32_t vocabSize;
    int32_t maxSeqLength;
    int32_t maxTokensPerStep{1};

    // Fluent setters so tests can configure a run as one chained expression.
    MinLengthPenaltyTestParams& setBatchSize(int32_t value)
    {
        batchSize = value;
        return *this;
    }

    MinLengthPenaltyTestParams& setVocabSize(int32_t value)
    {
        vocabSize = value;
        return *this;
    }

    MinLengthPenaltyTestParams& setMaxSeqLength(int32_t value)
    {
        maxSeqLength = value;
        return *this;
    }

    MinLengthPenaltyTestParams& setMaxTokensPerStep(int32_t value)
    {
        maxTokensPerStep = value;
        return *this;
    }

    // Human-readable description embedded in failure messages.
    std::string toString() const
    {
        return tc::fmtstr("MinLengthPenaltyTestParams[batch=%d, vocab=%d, maxSeqLen=%d, maxTokensPerStep=%d]",
            batchSize, vocabSize, maxSeqLength, maxTokensPerStep);
    }
};
// Fixture for the min-length part of invokeBatchApplyPenalty: while the
// generated length of a request is below its min length, the logit of the
// request's end id must be masked to -inf so the sequence cannot terminate.
template <typename T>
class MinLengthPenaltyTest : public SamplingKernelTest<T>
{
protected:
    // Set up test
    int32_t mBatchSize;
    int32_t mMaxBatchSize;    // 2 * mBatchSize; per-slot buffers are sized for this
    int32_t mVocabSize;
    int32_t mVocabSizePadded; // mVocabSize rounded up by padVocabSize()
    int32_t mMaxInputLength;
    int32_t mSequenceLength;  // 2 * mMaxInputLength (input + output)
    int32_t mMaxTokensPerStep;

    using SamplingKernelTest<T>::mBufferManager;
    using SamplingKernelTest<T>::mStream;
    using SamplingKernelTest<T>::mLogitsHost;

    TensorPtr mLogitsDevice;           // kernel input logits [batch, maxTokensPerStep, vocabPadded]
    TensorPtr mOutLogitsDevice;        // kernel output logits, same shape
    TensorPtr mLogitsRefHost;          // host-computed reference output
    TensorPtr mLogitsPtrs;             // per-request pointers into mLogitsDevice
    TensorPtr mPenaltyWorkspaceDevice; // int32 scratch buffer for the penalty kernel
    TensorPtr mTokensPerStep;          // per-slot number of tokens processed this step
    TensorPtr mContextLengthHost;
    TensorPtr mContextLengthDevice;
    TensorPtr mSeqLengthHost;
    TensorPtr mSeqLengthDevice;
    TensorPtr mMinLengthHost;
    TensorPtr mMinLengthDevice;
    TensorPtr mEndIdsHost;
    TensorPtr mEndIdsDevice;
    TensorPtr mBatchSlots;             // maps request bi -> slot 2*bi in the per-slot buffers

    // Allocates all buffers and fills them with randomized lengths/end ids
    // derived from `param`.
    void subsetup(MinLengthPenaltyTestParams param)
    {
        auto const dataType = TRTDataType<T>::value;
        auto const ptrType = TRTDataType<T*>::value;

        mBatchSize = param.batchSize;
        mMaxBatchSize = 2 * mBatchSize;
        mVocabSize = param.vocabSize;
        mVocabSizePadded = padVocabSize(mVocabSize);
        mMaxInputLength = param.maxSeqLength;
        mSequenceLength = 2 * mMaxInputLength; // input + output
        mMaxTokensPerStep = param.maxTokensPerStep;

        mLogitsHost
            = BufferManager::pinned(ITensor::makeShape({mBatchSize, mMaxTokensPerStep, mVocabSizePadded}), dataType);
        mLogitsRefHost
            = BufferManager::pinned(ITensor::makeShape({mBatchSize, mMaxTokensPerStep, mVocabSizePadded}), dataType);
        mLogitsDevice
            = mBufferManager->gpu(ITensor::makeShape({mBatchSize, mMaxTokensPerStep, mVocabSizePadded}), dataType);
        mOutLogitsDevice
            = mBufferManager->gpu(ITensor::makeShape({mBatchSize, mMaxTokensPerStep, mVocabSizePadded}), dataType);
        mLogitsPtrs = BufferManager::pinned(ITensor::makeShape({mBatchSize}), ptrType);
        mPenaltyWorkspaceDevice = mBufferManager->gpu(
            ITensor::makeShape({mBatchSize, mMaxTokensPerStep, mVocabSize}), nvinfer1::DataType::kINT32);
        mTokensPerStep = BufferManager::pinned(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32);
        mSeqLengthHost = BufferManager::pinned(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32);
        mSeqLengthDevice = mBufferManager->gpu(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32);
        mContextLengthHost = BufferManager::pinned(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32);
        mContextLengthDevice = mBufferManager->gpu(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32);
        mMinLengthHost = BufferManager::pinned(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32);
        mMinLengthDevice = mBufferManager->gpu(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32);
        mEndIdsHost = BufferManager::pinned(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32);
        mEndIdsDevice = mBufferManager->gpu(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32);

        // Active requests occupy every other slot of the per-slot buffers.
        mBatchSlots = BufferManager::pinned(ITensor::makeShape({mBatchSize}), nvinfer1::DataType::kINT32);
        auto batchSlotsPtr = bufferCast<int32_t>(*mBatchSlots);
        for (SizeType32 bi = 0; bi < mBatchSize; ++bi)
        {
            batchSlotsPtr[bi] = 2 * bi;
        }

        // Random logits (no bias), context lengths, min lengths and end ids.
        initLogitsAndBias(bufferCast<T>(*mLogitsHost), static_cast<T*>(nullptr), mBatchSize, mMaxTokensPerStep,
            mVocabSize, mVocabSizePadded);
        initRandomInt(bufferCast<int32_t>(*mContextLengthHost), mMaxBatchSize, 0, mMaxInputLength);
        initRandomInt(bufferCast<int32_t>(*mMinLengthHost), mMaxBatchSize, 1, mMaxInputLength);
        initRandomInt(bufferCast<int32_t>(*mEndIdsHost), mMaxBatchSize, 0, mVocabSize);

        trk::invokeFill(*mLogitsRefHost, T{0.0f}, *mStream);
        trk::invokeFill(*mOutLogitsDevice, T{0.0f}, *mStream);

        auto seqLengthHostPtr = bufferCast<int32_t>(*mSeqLengthHost);
        auto contextLengthHostPtr = bufferCast<int32_t>(*mContextLengthHost);
        auto minLengthHostPtr = bufferCast<int32_t>(*mMinLengthHost);
        auto tokensPerStepPtr = bufferCast<int32_t>(*mTokensPerStep);
        for (SizeType32 bi = 0; bi < mMaxBatchSize; bi++)
        {
            // Current generated seq len is randomly either smaller than min length or larger
            // (minLength +/- 2, clamped to [0, mMaxInputLength]), so both the
            // masked and unmasked branches of the kernel are exercised.
            auto const generatedSeqLen = std::max(0,
                std::min(
                    static_cast<int32_t>(minLengthHostPtr[bi] + 2 * std::pow(-1, std::rand() % 2)), mMaxInputLength));
            seqLengthHostPtr[bi] = contextLengthHostPtr[bi] + generatedSeqLen;
            tokensPerStepPtr[bi] = (std::rand() % mMaxTokensPerStep) + 1;
        }

        mBufferManager->copy(*mLogitsHost, *mLogitsDevice);
        mBufferManager->copy(*mMinLengthHost, *mMinLengthDevice);
        mBufferManager->copy(*mContextLengthHost, *mContextLengthDevice);
        mBufferManager->copy(*mSeqLengthHost, *mSeqLengthDevice);
        mBufferManager->copy(*mEndIdsHost, *mEndIdsDevice);

        // Hand out the per-request logits pointers in REVERSE row order to
        // verify the kernel follows the pointer table instead of assuming a
        // contiguous layout.
        auto logitsPtrs = BufferRange<T*>(*mLogitsPtrs);
        for (SizeType32 bi = 0; bi < mBatchSize; ++bi)
        {
            logitsPtrs[bi]
                = bufferCast<T>(*mLogitsDevice) + (mBatchSize - bi - 1) * mMaxTokensPerStep * mVocabSizePadded;
        }
    }

    // Builds the expected output on the host: copy the (reversed) input rows
    // into output order, then mask the end-id logit with -inf wherever the
    // generated length has not yet reached the min length.
    void computeReference(T const* const inLogits, T* const outLogits, int32_t const* const minSeqLen,
        int32_t const* const endIds, int32_t const* const sequenceLengths, int32_t const* const contextLengths)
    {
        bool const IS_FP16 = std::is_same<T, half>::value;
        T const MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX;

        auto const batchSlotsPtr = bufferCast<int32_t>(*mBatchSlots);
        auto const tokensPerStepPtr = bufferCast<int32_t>(*mTokensPerStep);

        // Mirror the reversed pointer layout set up in subsetup(): request bi
        // reads its logits from row (mBatchSize - bi - 1).
        for (int32_t bi = 0; bi < mBatchSize; ++bi)
        {
            auto const batchSlot = batchSlotsPtr[bi];
            for (int32_t ti = 0; ti < tokensPerStepPtr[batchSlot]; ++ti)
            {
                for (int32_t vi = 0; vi < mVocabSizePadded; ++vi)
                {
                    auto const srcIdx = ((mBatchSize - bi - 1) * mMaxTokensPerStep + ti) * mVocabSizePadded + vi;
                    auto const dstIdx = (bi * mMaxTokensPerStep + ti) * mVocabSizePadded + vi;
                    outLogits[dstIdx] = inLogits[srcIdx];
                }
            }
        }

        // Apply the min-length mask per request and per token.
        for (int32_t bi = 0; bi < mBatchSize; ++bi)
        {
            auto const batchSlot = batchSlotsPtr[bi];
            for (int32_t ti = 0; ti < tokensPerStepPtr[batchSlot]; ++ti)
            {
                auto const generatedSeqLen = sequenceLengths[batchSlot] - contextLengths[batchSlot];
                auto const endId = endIds[batchSlot];
                if (generatedSeqLen < minSeqLen[batchSlot])
                {
                    outLogits[(bi * mMaxTokensPerStep + ti) * mVocabSizePadded + endId] = -MAX_T_VAL;
                }
            }
        }
    }

public:
    // Runs the kernel with only the min-length penalty enabled (all other
    // penalty inputs are nullptr) and compares against computeReference().
    void runTest(MinLengthPenaltyTestParams param)
    {
        subsetup(param);

        // Positional aggregate init; field labels follow the declaration
        // order of InvokeBatchApplyPenaltyParams (see the labeled initializer
        // in MinLengthPenaltyOOBSafetyTest below).
        InvokeBatchApplyPenaltyParams<T> penaltyParams{
            /*inputLogits=*/reinterpret_cast<T**>(bufferCast<int64_t>(*mLogitsPtrs)),
            /*outputLogits=*/bufferCast<T>(*mOutLogitsDevice), /*biases=*/nullptr,
            /*penaltyWorkspace=*/bufferCast<int32_t>(*mPenaltyWorkspaceDevice), /*penaltyWorkspacePrev=*/nullptr,
            /*temperatures=*/nullptr,
            /*repetitionPenalties=*/nullptr, /*presencePenalties=*/nullptr, /*frequencyPenalties=*/nullptr,
            /*promptIgnoreLengths=*/nullptr, mBatchSize, /*beamWidth=*/1, mSequenceLength, mVocabSize,
            mVocabSizePadded, /*outputIdsPtr=*/nullptr,
            /*parentIdsPtr=*/nullptr, bufferCast<int32_t>(*mContextLengthDevice),
            bufferCast<int32_t>(*mSeqLengthDevice), bufferCast<int32_t>(*mMinLengthDevice),
            bufferCast<int32_t>(*mEndIdsDevice), bufferCast<int32_t>(*mBatchSlots), mMaxTokensPerStep,
            bufferCast<int32_t>(*mTokensPerStep), /*finished=*/nullptr, mStream->get()};
        tk::invokeBatchApplyPenalty(penaltyParams);
        mStream->synchronize();

        computeReference(bufferCast<T>(*mLogitsHost), bufferCast<T>(*mLogitsRefHost),
            bufferCast<int32_t>(*mMinLengthHost), bufferCast<int32_t>(*mEndIdsHost),
            bufferCast<int32_t>(*mSeqLengthHost), bufferCast<int32_t>(*mContextLengthHost));

        auto logitsOutHost = mBufferManager->copyFrom(*mOutLogitsDevice, MemoryType::kCPU);
        mStream->synchronize();

        bool passed = checkResult(param.toString(), bufferCast<T>(*logitsOutHost), bufferCast<T>(*mLogitsRefHost),
            mBatchSize * mMaxTokensPerStep * mVocabSizePadded);
        EXPECT_TRUE(passed);
    }
};
TYPED_TEST_SUITE(MinLengthPenaltyTest, FloatAndHalfTypes);

// Smallest geometry: tiny vocabulary and very short sequences.
TYPED_TEST(MinLengthPenaltyTest, BatchMaxSeqLen2)
{
    auto const params = MinLengthPenaltyTestParams().setBatchSize(16).setVocabSize(10).setMaxSeqLength(2);
    this->runTest(params);
}
// Large vocabulary with longer sequences.
TYPED_TEST(MinLengthPenaltyTest, BatchMaxSeqLen64)
{
    auto const params = MinLengthPenaltyTestParams().setBatchSize(16).setVocabSize(51200).setMaxSeqLength(64);
    this->runTest(params);
}
// Large vocabulary, longer sequences, and multiple tokens per step.
TYPED_TEST(MinLengthPenaltyTest, BatchMaxSeqLen64TokensPerStep)
{
    auto const params = MinLengthPenaltyTestParams()
                            .setBatchSize(16)
                            .setVocabSize(51200)
                            .setMaxSeqLength(64)
                            .setMaxTokensPerStep(4);
    this->runTest(params);
}
// When endId=-1 is not handled properly, minLength penalty kernel accesses out-of-bounds memory just
// prior to mOutLogitsDevice. Input logits are laid out just before mOutLogitsDevice in memory in this test
// to show that last input logit is corrupted when endId=-1 is not handled correctly.
template <typename T>
class MinLengthPenaltyOOBSafetyTest : public SamplingKernelTest<T>
{
protected:
    // Set up test
    int32_t mBatchSize;
    int32_t mMaxBatchSize;
    int32_t mVocabSize;
    int32_t mVocabSizePadded;
    int32_t mMaxInputLength;
    int32_t mSequenceLength;
    int32_t mMaxTokensPerStep;

    using SamplingKernelTest<T>::mBufferManager;
    using SamplingKernelTest<T>::mStream;
    using SamplingKernelTest<T>::mLogitsHost;

    TensorPtr mLogitsDevice;    // double-sized buffer: [0, batch) = input rows, [batch, 2*batch) = output rows
    TensorPtr mOutLogitsDevice; // view of the second half of mLogitsDevice
    TensorPtr mLogitsRefHost;
    TensorPtr mLogitsPtrs;
    TensorPtr mContextLengthHost;
    TensorPtr mContextLengthDevice;
    TensorPtr mSeqLengthHost;
    TensorPtr mSeqLengthDevice;
    TensorPtr mMinLengthHost;
    TensorPtr mMinLengthDevice;
    TensorPtr mEndIdsHost;
    TensorPtr mEndIdsDevice;
    TensorPtr mBatchSlots;

    // Builds a single-request setup where the min-length penalty WOULD fire
    // (generated length 1 < min length 10) but endId is -1: input logits are
    // filled with 0.1, output logits with 0.2, laid out back-to-back so an
    // out-of-bounds write at outputLogits[-1] would corrupt the last input
    // logit.
    void subsetupMemorySafety()
    {
        auto const dataType = TRTDataType<T>::value;
        auto const ptrType = TRTDataType<T*>::value;

        mBatchSize = 1;
        mMaxBatchSize = 1;
        // Set vocabSize to be same as vocabSizePadded so that no mask value is set
        // in output logits because of vocabSize < vocabSizePadded. This means outputs
        // logits are set to input logits as is without any substitution.
        mVocabSize = 16;
        mVocabSizePadded = padVocabSize(mVocabSize);
        mMaxInputLength = 5;
        mSequenceLength = 10; // input + output
        mMaxTokensPerStep = 1;

        // Create a large chunk of memory which hosts both input and output logits contiguously.
        // Goal is to show that last input logit is corrupted when endId=-1 is not handled correctly.
        mLogitsDevice
            = mBufferManager->gpu(ITensor::makeShape({2 * mBatchSize, mMaxTokensPerStep, mVocabSizePadded}), dataType);
        trk::invokeFill(*mLogitsDevice, T{0.1f}, *mStream);
        mOutLogitsDevice = ITensor::slice(mLogitsDevice, mBatchSize);
        trk::invokeFill(*mOutLogitsDevice, T{0.2f}, *mStream);

        // With mBatchSize == 1 this points at row 0, i.e. the input half.
        mLogitsPtrs = BufferManager::pinned(ITensor::makeShape({mBatchSize}), ptrType);
        auto logitsPtrs = BufferRange<T*>(*mLogitsPtrs);
        for (SizeType32 bi = 0; bi < mBatchSize; ++bi)
        {
            logitsPtrs[bi]
                = bufferCast<T>(*mLogitsDevice) + (mBatchSize - bi - 1) * mMaxTokensPerStep * mVocabSizePadded;
        }

        // Defines currentStep.
        mSeqLengthHost = BufferManager::pinned(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32);
        initConstant<int>(bufferCast<int32_t>(*mSeqLengthHost), mMaxBatchSize, 3);
        mSeqLengthDevice = mBufferManager->gpu(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32);
        mBufferManager->copy(*mSeqLengthHost, *mSeqLengthDevice);

        // Defines inputLength.
        mContextLengthHost = BufferManager::pinned(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32);
        initConstant<int>(bufferCast<int32_t>(*mContextLengthHost), mMaxBatchSize, 2);
        mContextLengthDevice = mBufferManager->gpu(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32);
        mBufferManager->copy(*mContextLengthHost, *mContextLengthDevice);

        // Defines minLength. 10 > generated length (3 - 2), so the penalty
        // would be applied if endId were valid.
        mMinLengthHost = BufferManager::pinned(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32);
        initConstant<int>(bufferCast<int32_t>(*mMinLengthHost), mMaxBatchSize, 10);
        mMinLengthDevice = mBufferManager->gpu(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32);
        mBufferManager->copy(*mMinLengthHost, *mMinLengthDevice);

        // Defines endIds. -1 is the case under test.
        mEndIdsHost = BufferManager::pinned(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32);
        initConstant<int>(bufferCast<int32_t>(*mEndIdsHost), mMaxBatchSize, -1);
        mEndIdsDevice = mBufferManager->gpu(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32);
        mBufferManager->copy(*mEndIdsHost, *mEndIdsDevice);

        // Only slot 0 is active (batchSize == 1).
        mBatchSlots = BufferManager::pinned(ITensor::makeShape({mBatchSize}), nvinfer1::DataType::kINT32);
        auto batchSlotsPtr = bufferCast<int32_t>(*mBatchSlots);
        for (SizeType32 bi = 0; bi < mBatchSize; ++bi)
        {
            batchSlotsPtr[bi] = 2 * bi;
        }
    }

public:
    // Invokes the kernel with endId = -1 and verifies that the output logits
    // become a clean copy of the input logits, and that neither the input
    // logits nor any parameter buffer was clobbered.
    void runMemorySafetyTest()
    {
        subsetupMemorySafety();
        InvokeBatchApplyPenaltyParams<T> penaltyParams{
            /*inputLogits=*/reinterpret_cast<T**>(bufferCast<int64_t>(*mLogitsPtrs)),
            /*outputLogits=*/bufferCast<T>(*mOutLogitsDevice),
            /*biases=*/nullptr,
            /*penaltyWorkspace=*/nullptr,
            /*penaltyWorkspacePrev=*/nullptr,
            /*temperatures=*/nullptr,
            /*repetitionPenalties=*/nullptr,
            /*presencePenalties=*/nullptr,
            /*frequencyPenalties=*/nullptr,
            /*promptIgnoreLengths=*/nullptr,
            /*batchSize=*/mBatchSize,
            /*beamWidth=*/1,
            /*maxSeqLen=*/mSequenceLength,
            /*vocabSize=*/mVocabSize,
            /*vocabSizePadded=*/mVocabSizePadded,
            /*outputIdsPtr=*/nullptr,
            /*parentIdsPtr=*/nullptr,
            /*inputLengths=*/bufferCast<int32_t>(*mContextLengthDevice),
            /*sequenceLengths=*/bufferCast<int32_t>(*mSeqLengthDevice),
            /*minLengths=*/bufferCast<int32_t>(*mMinLengthDevice),
            /*endIds=*/bufferCast<int32_t>(*mEndIdsDevice),
            /*batchSlots=*/bufferCast<int32_t>(*mBatchSlots),
            /*maxTokensPerStep=*/mMaxTokensPerStep,
            /*tokensPerStep=*/nullptr,
            /*finished=*/nullptr,
            /*stream=*/mStream->get()};
        tk::invokeBatchApplyPenalty(penaltyParams);
        mStream->synchronize();

        auto logitsOutHost = mBufferManager->copyFrom(*mOutLogitsDevice, MemoryType::kCPU);
        auto logitsInHost = mBufferManager->copyFrom(*mLogitsDevice, MemoryType::kCPU);
        auto minLengthsHost = mBufferManager->copyFrom(*mMinLengthDevice, MemoryType::kCPU);
        auto inputLengthsHost = mBufferManager->copyFrom(*mContextLengthDevice, MemoryType::kCPU);
        auto endIdsHost = mBufferManager->copyFrom(*mEndIdsDevice, MemoryType::kCPU);
        auto seqLengthsHost = mBufferManager->copyFrom(*mSeqLengthDevice, MemoryType::kCPU);
        mStream->synchronize();

        // Verify that output logits were modified to input logits.
        auto logitsOutPtr = bufferCast<T>(*logitsOutHost);
        for (int32_t i = 0; i < mBatchSize * mMaxTokensPerStep * mVocabSizePadded; ++i)
        {
            // All output logits must be updated to be same as input logits.
            // No mask value is set anywhere because vocabSize is same as vocabSizePadded.
            EXPECT_EQ(static_cast<T>(logitsOutPtr[i]), T{0.1f})
                << "Output logits were modified to unexpected value at index " << i;
        }

        // Verify that batch slots were not modified.
        auto batchSlotsPtr = bufferCast<int32_t>(*mBatchSlots);
        for (SizeType32 bi = 0; bi < mBatchSize; ++bi)
        {
            EXPECT_EQ(batchSlotsPtr[bi], 2 * bi) << "Batch slots were modified at index " << bi;
        }

        // Verify that inLogits were not modified. (This loop only covers the
        // first, input half of mLogitsDevice — exactly the region an
        // out-of-bounds endId write would corrupt.)
        auto logitsInPtr = bufferCast<T>(*logitsInHost);
        for (int32_t i = 0; i < mBatchSize * mMaxTokensPerStep * mVocabSizePadded; ++i)
        {
            EXPECT_EQ(static_cast<T>(logitsInPtr[i]), T{0.1f})
                << "Input logits were modified to unexpected value at index " << i;
        }

        // Verify that minLengths were not modified.
        auto minLengthsPtr = bufferCast<int32_t>(*minLengthsHost);
        for (int32_t i = 0; i < mMaxBatchSize; ++i)
        {
            EXPECT_EQ(minLengthsPtr[i], 10) << "MinLengths were modified at index " << i;
        }

        // Verify that inputLengths were not modified.
        auto inputLengthsPtr = bufferCast<int32_t>(*inputLengthsHost);
        for (int32_t i = 0; i < mMaxBatchSize; ++i)
        {
            EXPECT_EQ(inputLengthsPtr[i], 2) << "InputLengths were modified at index " << i;
        }

        // Verify that endIds were not modified.
        auto endIdsPtr = bufferCast<int32_t>(*endIdsHost);
        for (int32_t i = 0; i < mMaxBatchSize; ++i)
        {
            EXPECT_EQ(endIdsPtr[i], -1) << "EndIds were modified at index " << i;
        }

        // Verify that seqLengths were not modified.
        auto seqLengthsPtr = bufferCast<int32_t>(*seqLengthsHost);
        for (int32_t i = 0; i < mMaxBatchSize; ++i)
        {
            EXPECT_EQ(seqLengthsPtr[i], 3) << "SeqLengths were modified at index " << i;
        }
    }
};
TYPED_TEST_SUITE(MinLengthPenaltyOOBSafetyTest, FloatAndHalfTypes);

// Regression test: endId = -1 must not cause a write just before the output
// logits buffer.
TYPED_TEST(MinLengthPenaltyOOBSafetyTest, VerifyMemorySafetyForNegativeEndId)
{
    this->runMemorySafetyTest();
}
} // namespace