mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
* feat: Variable-Beam-Width-Search Part2 Signed-off-by: wili-65535 <wili-65535@user.noreply.github.com> * feat: Variable-Beam-Width-Search Part2 Signed-off-by: wili-65535 <wili-65535@user.noreply.github.com> * feat: Variable-Beam-Width-Search Part2, fix CPP tests Signed-off-by: wili-65535 <wili-65535@user.noreply.github.com> * feat: Variable-Beam-Width-Search Part3, simplify CPP tests Signed-off-by: wili-65535 <wili-65535@user.noreply.github.com> * feat: Variable-Beam-Width-Search Part4, move beam_width_array param Signed-off-by: wili-65535 <wili-65535@user.noreply.github.com> * feat: Variable-Beam-Width-Search, fix CI error Signed-off-by: wili-65535 <wili-65535@user.noreply.github.com> * feat: Variable-Beam-Width-Search part2 Signed-off-by: wili-65535 <wili-65535@user.noreply.github.com> * feat: Variable-Beam-Width-Search part2 Signed-off-by: wili-65535 <wili-65535@user.noreply.github.com> * feat: Variable-Beam-Width-Search part2, fix pre-commit Signed-off-by: wili-65535 <wili-65535@user.noreply.github.com> * feat: Variable-Beam-Width-Search part2, fix review Signed-off-by: wili-65535 <wili-65535@user.noreply.github.com> --------- Signed-off-by: wili-65535 <wili-65535@user.noreply.github.com> Co-authored-by: wili-65535 <wili-65535@user.noreply.github.com>
389 lines
17 KiB
C++
389 lines
17 KiB
C++
/*
|
|
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include "tests/unit_tests/layers/baseSamplingLayerTest.h"
|
|
|
|
namespace tensorrt_llm::tests::layers::sampling
|
|
{
|
|
|
|
using namespace tensorrt_llm::runtime;
|
|
using namespace tensorrt_llm::layers;
|
|
using namespace tensorrt_llm::common;
|
|
|
|
namespace tk = tensorrt_llm::kernels;
|
|
namespace trk = tensorrt_llm::runtime::kernels;
|
|
|
|
template <typename T>
|
|
void BaseSamplingLayerTest<T>::setup(uint64_t seed, TestSamplingParams const& params)
|
|
{
|
|
auto const dataType = TRTDataType<T>::value;
|
|
auto const ptrType = TRTDataType<T*>::value;
|
|
|
|
// clang-format off
|
|
|
|
// logits = (-0.9163, -1.2040, -1.6094, -2.3026) -> prob = (0.4, 0.3, 0.2, 0.1)
|
|
std::vector<T> testLogits = {
|
|
-FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX, -0.9163, -1.2040, -1.6094, -2.3026, // step 0
|
|
-0.9163, -1.2040, -1.6094, -2.3026, -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX, // step 1
|
|
-FLT_MAX, -FLT_MAX, -0.9163, -1.2040, -1.6094, -2.3026, -FLT_MAX, -FLT_MAX, // step 2
|
|
-0.9163, -1.2040, -1.6094, -2.3026, -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX // step 3
|
|
};
|
|
|
|
// clang-format on
|
|
|
|
if (params.beamWidth == 1)
|
|
{
|
|
mTestLogitsInit = testLogits;
|
|
}
|
|
else
|
|
{
|
|
for (int step = 0; step < mMaxSeqLen; ++step)
|
|
{
|
|
auto const& logitsBegin = testLogits.begin() + mVocabSize * step;
|
|
auto const& logitsEnd = testLogits.begin() + mVocabSize * (step + 1);
|
|
for (int bm = 0; bm < params.beamWidth; ++bm)
|
|
{
|
|
mTestLogitsInit.insert(mTestLogitsInit.end(), logitsBegin, logitsEnd);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (mComputeProbs)
|
|
{
|
|
computeProb(mTestLogitsInit.data(), mTestLogitsInit.data(),
|
|
BaseSamplingLayerTest::mMaxOutputLen * params.beamWidth, mVocabSize);
|
|
}
|
|
|
|
mSeqLengthsDevice = mBufferManager->gpu(ITensor::makeShape({maxBatchSize()}), nvinfer1::DataType::kINT32);
|
|
mContextLengthDevice = mBufferManager->gpu(ITensor::makeShape({maxBatchSize()}), nvinfer1::DataType::kINT32);
|
|
mFinishedDevice = params.isExternalDraftTokensLayerTest
|
|
? mBufferManager->gpu(ITensor::makeShape({mMaxTokensPerEngineStep, maxBatchSize()}),
|
|
TRTDataType<tk::FinishedState::UnderlyingType>::value)
|
|
: mBufferManager->gpu(
|
|
ITensor::makeShape({maxBatchSize()}), TRTDataType<tk::FinishedState::UnderlyingType>::value);
|
|
mOutputIdsDevice
|
|
= mBufferManager->gpu(ITensor::makeShape({maxBatchSize(), mBeamWidth, mMaxSeqLen}), nvinfer1::DataType::kINT32);
|
|
mEndIdsDevice = mBufferManager->gpu(ITensor::makeShape({maxBatchSize()}), nvinfer1::DataType::kINT32);
|
|
mIdsPtrHost = mBufferManager->pinned(ITensor::makeShape({maxBatchSize()}), ptrType);
|
|
|
|
mCumLogProbsDevice = mBufferManager->gpu(ITensor::makeShape({maxBatchSize()}), nvinfer1::DataType::kFLOAT);
|
|
mOutputLogProbsDevice
|
|
= mBufferManager->gpu(ITensor::makeShape({maxBatchSize(), mMaxSeqLen}), nvinfer1::DataType::kFLOAT);
|
|
|
|
mBatchSlots
|
|
= mBufferManager->pinned(ITensor::makeShape({mBatchSize + mBatchSizeBadPad}), nvinfer1::DataType::kINT32);
|
|
mCurandStatesDevice
|
|
= mBufferManager->gpu(ITensor::makeShape({maxBatchSize(), sizeof(curandState_t)}), nvinfer1::DataType::kINT8);
|
|
|
|
auto const workspaceSize = mSamplingLayer->getWorkspaceSize();
|
|
|
|
trk::invokeFill(*mSeqLengthsDevice, int32_t{0}, *mStream);
|
|
trk::invokeFill(*mContextLengthDevice, int32_t{0}, *mStream);
|
|
trk::invokeFill(*mFinishedDevice, uint8_t{0}, *mStream);
|
|
trk::invokeFill(*mOutputIdsDevice, int32_t{0}, *mStream);
|
|
trk::invokeFill(*mCumLogProbsDevice, float{0.0f}, *mStream);
|
|
trk::invokeFill(*mOutputLogProbsDevice, float{0.0f}, *mStream);
|
|
trk::invokeFill(*mEndIdsDevice, int32_t{mEndId}, *mStream);
|
|
tk::invokeCurandInitialize(reinterpret_cast<curandState_t*>(bufferCast<int8_t>(*mCurandStatesDevice)), nullptr,
|
|
maxBatchSize(), seed, mStream->get());
|
|
|
|
if (mBeamWidth > 1)
|
|
{
|
|
mSrcCacheIndirection = mBufferManager->gpu(
|
|
ITensor::makeShape({maxBatchSize(), mBeamWidth, mMaxSeqLen}), nvinfer1::DataType::kINT32);
|
|
mTgtCacheIndirection = mBufferManager->gpu(
|
|
ITensor::makeShape({maxBatchSize(), mBeamWidth, mMaxSeqLen}), nvinfer1::DataType::kINT32);
|
|
mParentIds = mBufferManager->gpu(
|
|
ITensor::makeShape({maxBatchSize(), mBeamWidth, mMaxSeqLen}), nvinfer1::DataType::kINT32);
|
|
|
|
auto constexpr nvTokenIdType = TRTDataType<TokenIdType>::value;
|
|
auto constexpr nvSizeType = TRTDataType<SizeType32>::value;
|
|
auto constexpr nvFloatType = TRTDataType<float>::value;
|
|
auto constexpr nvBoolType = TRTDataType<bool>::value;
|
|
mOutputIdsCBA
|
|
= mBufferManager->gpu(ITensor::makeShape({maxBatchSize(), 2 * mBeamWidth, mMaxSeqLen}), nvTokenIdType);
|
|
mLogProbsCBA
|
|
= mBufferManager->gpu(ITensor::makeShape({maxBatchSize(), 2 * mBeamWidth, mMaxSeqLen}), nvFloatType);
|
|
mSequenceLengthsCBA = mBufferManager->gpu(ITensor::makeShape({maxBatchSize(), 2 * mBeamWidth}), nvSizeType);
|
|
mCumLogProbsCBA = mBufferManager->gpu(ITensor::makeShape({maxBatchSize(), 2 * mBeamWidth}), nvFloatType);
|
|
mNormedScoresCBA = mBufferManager->gpu(ITensor::makeShape({maxBatchSize(), 2 * mBeamWidth}), nvFloatType);
|
|
mNumBeamsCBA = mBufferManager->gpu(ITensor::makeShape({maxBatchSize()}), nvSizeType);
|
|
mMinNormedScoresCBA = mBufferManager->gpu(ITensor::makeShape({maxBatchSize()}), nvFloatType);
|
|
mBatchDones = mBufferManager->gpu(ITensor::makeShape({maxBatchSize()}), nvBoolType);
|
|
mOutputIdsPtr = mBufferManager->pinned(ITensor::makeShape({maxBatchSize()}), ptrType);
|
|
mParentIdsPtr = mBufferManager->pinned(ITensor::makeShape({maxBatchSize()}), ptrType);
|
|
|
|
trk::invokeFill(*mSrcCacheIndirection, int32_t{0}, *mStream);
|
|
trk::invokeFill(*mTgtCacheIndirection, int32_t{0}, *mStream);
|
|
trk::invokeFill(*mParentIds, int32_t{0}, *mStream);
|
|
trk::invokeFill(*mOutputIdsCBA, int32_t{0}, *mStream);
|
|
trk::invokeFill(*mLogProbsCBA, float{0}, *mStream);
|
|
trk::invokeFill(*mSequenceLengthsCBA, int32_t{0}, *mStream);
|
|
trk::invokeFill(*mCumLogProbsCBA, float{0}, *mStream);
|
|
trk::invokeFill(*mNormedScoresCBA, float{0}, *mStream);
|
|
trk::invokeFill(*mNumBeamsCBA, int32_t{0}, *mStream);
|
|
trk::invokeFill(*mMinNormedScoresCBA, float{0}, *mStream);
|
|
trk::invokeFill(*mBatchDones, bool{0}, *mStream);
|
|
}
|
|
|
|
auto batchSlotsPtr = bufferCast<int32_t>(*mBatchSlots);
|
|
for (SizeType32 bi = 0; bi < mBatchSize; ++bi)
|
|
{
|
|
batchSlotsPtr[bi] = kDoubleBatchIdx * bi;
|
|
}
|
|
for (SizeType32 bi = 0; bi < mBatchSizeBadPad; ++bi)
|
|
{
|
|
batchSlotsPtr[mBatchSize + bi] = 0xbaadf00d;
|
|
}
|
|
|
|
auto idsPtrHostPtr = BufferRange<void*>(*mIdsPtrHost);
|
|
auto outputIdsDevicePtr = bufferCast<int32_t>(*mOutputIdsDevice);
|
|
for (SizeType32 bi = 0; bi < maxBatchSize(); bi++)
|
|
{
|
|
idsPtrHostPtr[bi] = outputIdsDevicePtr + bi * mMaxSeqLen;
|
|
}
|
|
if (mBeamWidth > 1)
|
|
{
|
|
auto outputIdsPtr = bufferCast<int*>(*mOutputIdsPtr);
|
|
auto parentIdsPtr = bufferCast<int*>(*mParentIdsPtr);
|
|
for (SizeType32 bi = 0; bi < maxBatchSize(); bi++)
|
|
{
|
|
outputIdsPtr[bi] = outputIdsDevicePtr + bi * mMaxSeqLen;
|
|
parentIdsPtr[bi] = outputIdsDevicePtr + bi * mMaxSeqLen;
|
|
}
|
|
}
|
|
|
|
std::shared_ptr<DecodingSetupParams> setupParams;
|
|
if (mBeamWidth > 1)
|
|
{
|
|
auto samplingSetupParams = std::make_shared<BeamSearchSetupParams>();
|
|
|
|
setupParams = samplingSetupParams;
|
|
}
|
|
else if (!params.isExternalDraftTokensLayerTest)
|
|
{
|
|
auto samplingSetupParams = std::make_shared<SamplingSetupParams>();
|
|
samplingSetupParams->randomSeed = std::make_optional<std::vector<uint64_t>>({seed});
|
|
samplingSetupParams->runtimeTopK
|
|
= params.topKs.size() ? std::make_optional<std::vector<SizeType32>>(params.topKs) : std::nullopt;
|
|
samplingSetupParams->runtimeTopP
|
|
= params.topPs.size() ? std::make_optional<std::vector<float>>(params.topPs) : std::nullopt;
|
|
samplingSetupParams->topPDecay
|
|
= params.decay.size() ? std::make_optional<std::vector<float>>(params.decay) : std::nullopt;
|
|
samplingSetupParams->topPMin
|
|
= params.minTopP.size() ? std::make_optional<std::vector<float>>(params.minTopP) : std::nullopt;
|
|
samplingSetupParams->topPResetIds
|
|
= params.topPResetIds.size() ? std::make_optional<std::vector<int32_t>>(params.topPResetIds) : std::nullopt;
|
|
|
|
setupParams = samplingSetupParams;
|
|
}
|
|
else
|
|
{
|
|
auto externalDraftTokensSetupParams = std::make_shared<ExternalDraftTokensSetupParams>();
|
|
externalDraftTokensSetupParams->randomSeed = std::make_optional<std::vector<uint64_t>>({seed});
|
|
externalDraftTokensSetupParams->runtimeTopK
|
|
= params.topKs.size() ? std::make_optional<std::vector<SizeType32>>(params.topKs) : std::nullopt;
|
|
externalDraftTokensSetupParams->runtimeTopP
|
|
= params.topPs.size() ? std::make_optional<std::vector<float>>(params.topPs) : std::nullopt;
|
|
|
|
setupParams = externalDraftTokensSetupParams;
|
|
}
|
|
|
|
mDecodingWorkspace->setDeviceBatchSlots(mBatchSlots);
|
|
mDecodingWorkspace->getDeviceRuntimeLogits()->reshape(ITensor::makeShape({mBatchSize, mBeamWidth, mVocabSize}));
|
|
mSamplingLayer->setup(mBatchSize, mBeamWidth, mBatchSlots, setupParams, mDecodingWorkspace);
|
|
mStream->synchronize();
|
|
}
|
|
|
|
template <typename T>
|
|
std::shared_ptr<DecodingInputs> BaseSamplingLayerTest<T>::createInputTensors(int32_t step)
|
|
{
|
|
constexpr int32_t ite = 0;
|
|
if (mBeamWidth > 1)
|
|
{
|
|
auto decodeInputTensors = std::make_shared<DecodingInputs>(mEndIdsDevice, mBatchSlots, step, ite, mBatchSize);
|
|
decodeInputTensors->logits = mDecodingWorkspace->getDeviceRuntimeLogits();
|
|
decodeInputTensors->inputLengths = mContextLengthDevice;
|
|
decodeInputTensors->finished = mFinishedDevice;
|
|
decodeInputTensors->srcCacheIndirection = mSrcCacheIndirection;
|
|
return decodeInputTensors;
|
|
}
|
|
else
|
|
{
|
|
auto decodeInputTensors = std::make_shared<SamplingInputs>(mEndIdsDevice, mBatchSlots, step, ite, mBatchSize);
|
|
decodeInputTensors->logits = mDecodingWorkspace->getDeviceRuntimeLogits();
|
|
decodeInputTensors->inputLengths = mContextLengthDevice;
|
|
decodeInputTensors->finished = mFinishedDevice;
|
|
decodeInputTensors->probsComputed = mComputeProbs;
|
|
decodeInputTensors->curandStates = reinterpret_cast<curandState_t*>(bufferCast<int8_t>(*mCurandStatesDevice));
|
|
return decodeInputTensors;
|
|
}
|
|
}
|
|
|
|
template <typename T>
|
|
std::shared_ptr<BaseDecodingOutputs> BaseSamplingLayerTest<T>::createOutputTensors()
|
|
{
|
|
// TODO: check log probs and cum_log_probs
|
|
|
|
if (mBeamWidth > 1)
|
|
{
|
|
auto decodeOutputs = std::make_shared<BeamSearchOutputs>(mOutputIdsDevice);
|
|
decodeOutputs->outputIdsPtr = mOutputIdsPtr;
|
|
decodeOutputs->outputIdsPtrHost = mIdsPtrHost;
|
|
decodeOutputs->sequenceLength = mSeqLengthsDevice;
|
|
decodeOutputs->finished = mFinishedDevice;
|
|
decodeOutputs->outputLogProbs = mOutputLogProbsDevice;
|
|
decodeOutputs->cumLogProbs = mCumLogProbsDevice;
|
|
decodeOutputs->tgtCacheIndirection = mTgtCacheIndirection;
|
|
decodeOutputs->parentIds = mParentIds;
|
|
decodeOutputs->parentIdsPtr = mIdsPtrHost;
|
|
|
|
decodeOutputs->beamHypotheses = std::make_unique<tensorrt_llm::kernels::BeamHypotheses>();
|
|
decodeOutputs->beamHypotheses->outputIdsCBA = bufferCast<int>(*mOutputIdsCBA);
|
|
decodeOutputs->beamHypotheses->logProbsCBA = bufferCast<float>(*mLogProbsCBA);
|
|
decodeOutputs->beamHypotheses->sequenceLengthsCBA = bufferCast<int>(*mSequenceLengthsCBA);
|
|
decodeOutputs->beamHypotheses->cumLogProbsCBA = bufferCast<float>(*mCumLogProbsCBA);
|
|
decodeOutputs->beamHypotheses->normedScoresCBA = bufferCast<float>(*mNormedScoresCBA);
|
|
decodeOutputs->beamHypotheses->numBeamsCBA = bufferCast<int>(*mNumBeamsCBA);
|
|
decodeOutputs->beamHypotheses->minNormedScoresCBA = bufferCast<float>(*mMinNormedScoresCBA);
|
|
decodeOutputs->beamHypotheses->batchDones = bufferCast<bool>(*mBatchDones);
|
|
|
|
return decodeOutputs;
|
|
}
|
|
else
|
|
{
|
|
auto decodeOutputs = std::make_shared<BaseDecodingOutputs>(mOutputIdsDevice);
|
|
decodeOutputs->outputIdsPtr = mIdsPtrHost;
|
|
decodeOutputs->outputIdsPtrHost = mIdsPtrHost;
|
|
decodeOutputs->sequenceLength = mSeqLengthsDevice;
|
|
decodeOutputs->finished = mFinishedDevice;
|
|
decodeOutputs->outputLogProbs = mOutputLogProbsDevice;
|
|
decodeOutputs->cumLogProbs = mCumLogProbsDevice;
|
|
return decodeOutputs;
|
|
}
|
|
}
|
|
|
|
template <typename T>
|
|
void BaseSamplingLayerTest<T>::batchCopy(int32_t step)
|
|
{
|
|
auto const logitsHost = ITensor::wrap(mTestLogitsInit.data() + step * mBeamWidth * mVocabSize,
|
|
TRTDataType<T>::value, ITensor::makeShape({mBeamWidth, mVocabSize}));
|
|
|
|
for (int32_t bi = 0; bi < mBatchSize; ++bi)
|
|
{
|
|
auto logitsDeviceView = ITensor::slice(mDecodingWorkspace->getDeviceRuntimeLogits(), bi, 1);
|
|
mBufferManager->copy(*logitsHost, *logitsDeviceView);
|
|
}
|
|
}
|
|
|
|
template <typename T>
|
|
bool BaseSamplingLayerTest<T>::checkResult(int32_t const* outputIds, std::vector<std::set<int32_t>> const& expectedIds)
|
|
{
|
|
assert(expectedIds.size() == mMaxSeqLen * batchBeam());
|
|
int failures = 0;
|
|
auto* const batchSlotsPtr = bufferCast<int32_t>(*mBatchSlots);
|
|
for (int32_t i = 0; i < mMaxSeqLen * mBatchSize; ++i)
|
|
{
|
|
int32_t s = i / mBatchSize;
|
|
int32_t b = i % mBatchSize;
|
|
auto const batchSlot = batchSlotsPtr[b];
|
|
std::set<int32_t> expts = expectedIds.at(i);
|
|
auto const outputId = outputIds[batchSlot * mMaxSeqLen + s];
|
|
if (expts.count(outputId) == 0)
|
|
{
|
|
if (failures < 10)
|
|
{
|
|
std::stringstream ss;
|
|
ss << " - Fail "
|
|
<< " (step=" << s << ", batch=" << b << ") "
|
|
<< "actual=" << outputId << ", expected";
|
|
for (auto const& expt : expts)
|
|
{
|
|
ss << " " << expt;
|
|
}
|
|
TLLM_LOG_DEBUG("%s", ss.str().c_str());
|
|
}
|
|
++failures;
|
|
}
|
|
}
|
|
TLLM_LOG_DEBUG(
|
|
"check...%6s : failures: %d / %d", failures == 0 ? "....OK" : "FAILED", failures, mMaxSeqLen * batchBeam());
|
|
return failures == 0;
|
|
}
|
|
|
|
template <typename T>
|
|
void BaseSamplingLayerTest<T>::runTest(
|
|
std::vector<std::set<int32_t>> const& expectedOutputIds, TestSamplingParams const& params, int32_t endId)
|
|
{
|
|
mBatchSize = params.batchSize;
|
|
if (params.beamWidth > 1)
|
|
{
|
|
mBeamWidth = params.beamWidth;
|
|
mMaxSeed = 1;
|
|
mComputeProbs = true;
|
|
}
|
|
initLayer(params);
|
|
|
|
auto const decoderDomain
|
|
= tensorrt_llm::layers::DecoderDomain(maxBatchSize(), mBeamWidth, mVocabSize, mVocabSizePadded);
|
|
mDecodingWorkspace = std::make_unique<tensorrt_llm::runtime::DecodingLayerWorkspace>(
|
|
mBufferManager, decoderDomain, TRTDataType<T>::value, mSamplingLayer->getWorkspaceSize());
|
|
mEndId = endId;
|
|
for (uint64_t seed = 0; seed < mMaxSeed; ++seed)
|
|
{
|
|
setup(seed, params);
|
|
|
|
int32_t step = mMaxInputLen;
|
|
auto inputTensors = createInputTensors(step);
|
|
auto outputTensors = createOutputTensors();
|
|
|
|
for (step = mMaxInputLen; step < mMaxOutputLen; ++step)
|
|
{
|
|
// Reset by the test value since the sampling layer internally updates the logit buffer.
|
|
batchCopy(step);
|
|
if (params.isExternalDraftTokensLayerTest)
|
|
{
|
|
inputTensors = createInputTensors(step);
|
|
}
|
|
else
|
|
{
|
|
inputTensors->step = step;
|
|
}
|
|
mDecodingWorkspace->setDeviceBatchSlots(mBatchSlots);
|
|
mSamplingLayer->forwardAsync(outputTensors, inputTensors, mDecodingWorkspace);
|
|
mStream->synchronize();
|
|
}
|
|
|
|
auto const outputIdsHost = mBufferManager->copyFrom(*mOutputIdsDevice, tensorrt_llm::runtime::MemoryType::kCPU);
|
|
|
|
mStream->synchronize();
|
|
|
|
bool passed = checkResult(bufferCast<int32_t>(*outputIdsHost), expectedOutputIds);
|
|
EXPECT_TRUE(passed) << "Output ids check failed at seed " << seed;
|
|
if (!passed)
|
|
{
|
|
std::stringstream ss;
|
|
ss << "Actual output ids:" << std::endl << *outputIdsHost;
|
|
TLLM_LOG_DEBUG(ss.str());
|
|
}
|
|
}
|
|
}
|
|
|
|
template class BaseSamplingLayerTest<float>;
|
|
template class BaseSamplingLayerTest<half>;
|
|
|
|
} // namespace tensorrt_llm::tests::layers::sampling
|