TensorRT-LLM/cpp/tensorrt_llm/layers/decodingParams.h

/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/kernels/beamSearchKernels.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include <tensorrt_llm/runtime/common.h>
#include <tensorrt_llm/runtime/speculativeDecodingModule.h>
#include <optional>
#include <utility>
#include <vector>
namespace tensorrt_llm::layers
{
using TensorPtr = runtime::ITensor::SharedPtr;
using TensorConstPtr = runtime::ITensor::SharedConstPtr;
using BufferPtr = runtime::IBuffer::SharedPtr;
using BufferConstPtr = runtime::IBuffer::SharedConstPtr;
//!
//! \brief In a DecodingLayer's life cycle, it is constructed once;
//! `setup` is called repeatedly, once per request; `forward*` is called repeatedly, many times per request.
//! A possible sequence is construct(maxBatchSize) -> setup({1,3}) -> forward({1, 3})
//! -> forward({1, 3}) -> setup({2,4}) -> forward({1, 3, 2, 4}) -> forward({1, 3, 2, 4})
//! -> forward({1, 2, 4}), where {a,b} are batchSlots, and request 3 ends at the last step.
//! As a result, there are three types of batch sizes:
//! 1. `maxBatchSize` for each layer to reserve resources.
//! It is passed through the class constructor, in DecoderDomain.getBatchSize().
//! 2. `setupBatchSize` for setting up layers for a batch of new requests.
//! It is passed through the `setup` method.
//! 3. `forwardBatchSize` for layers forwarding a batch of existing active requests.
//! It is passed through the `forwardAsync` and `forwardSync` methods.
//! `setup` and `forward` always provide `batchSlots` indexed by
//! local batch index ranging in [0, setupBatchSize) or [0, forwardBatchSize),
//! holding the global batch index ranging in [0, maxBatchSize).
//! In case of beam search, maxBatchSize = forwardBatchSize = 1.
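//!
//! A minimal, hedged sketch of this life cycle (the layer factory, call signatures, and sizes below are
//! illustrative assumptions, not definitions from this header):
//! \code
//!   DecoderDomain domain(/* maxBatchSize */ 8, /* beamWidth */ 4, /* vocabSize */ 32000);
//!   auto layer = makeSomeDecodingLayer(domain);        // constructed once, reserves maxBatchSize resources
//!   layer->setup(/* setupBatchSize */ 2, beamWidth, batchSlots13, setupParams, workspace);
//!   layer->forwardAsync(outputs, inputs13);            // forwardBatchSize == 2, batchSlots {1, 3}
//!   layer->forwardAsync(outputs, inputs13);
//!   layer->setup(/* setupBatchSize */ 2, beamWidth, batchSlots24, setupParams, workspace);
//!   layer->forwardAsync(outputs, inputs1324);          // forwardBatchSize == 4, batchSlots {1, 3, 2, 4}
//! \endcode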
class DecoderDomain
{
public:
DecoderDomain(runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth, runtime::SizeType32 vocabSize,
std::optional<runtime::SizeType32> vocabSizePadded = std::nullopt,
std::shared_ptr<runtime::SpeculativeDecodingModule const> speculativeDecodingModule = nullptr,
bool const useVariableBeamWidthSearch = false)
: mBatchSize(batchSize)
, mBeamWidth(beamWidth)
, mVocabSize(vocabSize)
, mVocabSizePadded(vocabSizePadded.value_or(vocabSize))
, mSpeculativeDecodingModule(std::move(speculativeDecodingModule))
, mUseVariableBeamWidthSearch(useVariableBeamWidthSearch)
{
}
[[nodiscard]] runtime::SizeType32 getBatchSize() const
{
return mBatchSize;
}
[[nodiscard]] runtime::SizeType32 getBeamWidth() const
{
return mBeamWidth;
}
[[nodiscard]] runtime::SizeType32 getVocabSize() const
{
return mVocabSize;
}
[[nodiscard]] runtime::SizeType32 getVocabSizePadded() const
{
return mVocabSizePadded;
}
[[nodiscard]] runtime::SizeType32 getMaxDecodingTokens() const
{
return mSpeculativeDecodingModule ? mSpeculativeDecodingModule->getMaxDecodingTokens() : 1;
}
[[nodiscard]] std::shared_ptr<runtime::SpeculativeDecodingModule const> getSpeculativeDecodingModule() const
{
TLLM_CHECK_WITH_INFO(mSpeculativeDecodingModule, "Speculative decoding module is not set to decoder domain");
return mSpeculativeDecodingModule;
}
[[nodiscard]] std::shared_ptr<runtime::SpeculativeDecodingModule const> getSpeculativeDecodingModulePtr() const
{
return mSpeculativeDecodingModule;
}
[[nodiscard]] bool getUseVariableBeamWidthSearch() const
{
return mUseVariableBeamWidthSearch;
}
private:
runtime::SizeType32 mBatchSize;
runtime::SizeType32 mBeamWidth;
runtime::SizeType32 mVocabSize;
runtime::SizeType32 mVocabSizePadded;
std::shared_ptr<runtime::SpeculativeDecodingModule const> mSpeculativeDecodingModule;
bool mUseVariableBeamWidthSearch;
};
class BaseSetupParams
{
public:
virtual ~BaseSetupParams() = default;
};
// Penalty layer
class PenaltySetupParams : public BaseSetupParams
{
public:
std::optional<std::vector<float>> temperature; // [1] or [setupBatchSize] on cpu
std::optional<std::vector<runtime::SizeType32>> minLength; // [1] or [setupBatchSize] on cpu
std::optional<std::vector<float>> repetitionPenalty; // [1] or [setupBatchSize] on cpu
std::optional<std::vector<float>> presencePenalty; // [1] or [setupBatchSize] on cpu
std::optional<std::vector<float>> frequencyPenalty; // [1] or [setupBatchSize] on cpu
};
// Ban words layer
class BanWordsSetupParams : public BaseSetupParams
{
public:
std::optional<std::vector<runtime::SizeType32>> noRepeatNgramSize; // [1] or [setupBatchSize] on cpu
};
class DecodingSetupParams : public BaseSetupParams
{
public:
virtual ~DecodingSetupParams() = default;
std::optional<std::vector<uint64_t>> randomSeed; // [1] or [setupBatchSize] on cpu
std::optional<std::vector<bool>> outputLogProbs; // [setupBatchSize]
std::optional<std::vector<bool>> cumLogProbs; // [setupBatchSize]
};
class SamplingSetupParams : public DecodingSetupParams
{
public:
// baseSamplingLayer
std::optional<std::vector<runtime::SizeType32>> runtimeTopK; // [1] or [setupBatchSize] on cpu
std::optional<std::vector<float>> runtimeTopP; // [1] or [setupBatchSize] on cpu
std::optional<std::vector<float>> runtimeMinP; // [1] or [setupBatchSize] on cpu
// topPSamplingLayer
std::optional<std::vector<float>> topPDecay; // [setupBatchSize], must be in [0, 1]
std::optional<std::vector<float>> topPMin; // [setupBatchSize], must be in [0, 1]
std::optional<std::vector<runtime::TokenIdType>> topPResetIds; // [setupBatchSize]
std::optional<bool> normalizeLogProbs;
};
class BeamSearchSetupParams : public DecodingSetupParams
{
public:
// BeamSearchLayer
std::optional<std::vector<float>> beamSearchDiversityRate; // [setupBatchSize] on cpu
std::optional<std::vector<float>> lengthPenalty; // [setupBatchSize] on cpu
std::optional<std::vector<int>> earlyStopping; // [setupBatchSize] on cpu
std::optional<std::vector<std::vector<runtime::SizeType32>>>
beamWidthArray; // [setupBatchSize, nMaxBeamWidthArray] on cpu
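//! A hedged illustration of filling this field (the values are assumptions for illustration only):
//! \code
//!   BeamSearchSetupParams params;
//!   // One inner vector per request in the setup batch; each lists the beam widths the request
//!   // may use as decoding progresses (Variable-Beam-Width-Search).
//!   params.beamWidthArray = std::vector<std::vector<runtime::SizeType32>>{{2, 4, 4, 8}};
//!   params.lengthPenalty = std::vector<float>{1.0f};
//! \endcode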
bool hasDiffRuntimeArgs{false};
};
class MedusaSetupParams : public DecodingSetupParams
{
public:
// Medusa params
std::optional<std::vector<runtime::SizeType32>> runtimeTopK; // [setupBatchSize] on cpu
std::optional<std::vector<std::vector<runtime::SizeType32>>> runtimeHeadsTopK; // [setupBatchSize, maxMedusaHeads]
};
class ExplicitDraftTokensSetupParams : public DecodingSetupParams
{
public:
std::optional<std::vector<float>> temperature; // [setupBatchSize] on cpu
// Hack to init some data for the context phase in the setup.
TensorPtr randomDataSample; // [maxBatchSize], on gpu
TensorPtr temperatures; // [maxBatchSize], on gpu
nvinfer1::DataType dtype; // [1], on cpu
};
class EagleSetupParams : public DecodingSetupParams
{
public:
std::optional<std::vector<float>> temperature; // [setupBatchSize] on cpu
// Hack to init some data for the context phase in the setup.
TensorPtr randomDataSample; // [maxBatchSize], on gpu
TensorPtr temperatures; // [maxBatchSize], on gpu
nvinfer1::DataType dtype; // [1], on cpu
};
class DynamicDecodeSetupParams : public BaseSetupParams
{
public:
std::shared_ptr<PenaltySetupParams> penaltyParams;
std::shared_ptr<BanWordsSetupParams> banWordsParams;
std::shared_ptr<DecodingSetupParams> decodingParams;
};
struct LookaheadSetupParams : public DecodingSetupParams
{
using TensorPtr = runtime::ITensor::SharedPtr;
std::vector<runtime::ITensor::SharedConstPtr> prompt; // [batchSize][maxSeqLen] on cpu
std::vector<executor::LookaheadDecodingConfig> algoConfigs; // [1 or batchSize] on cpu
//! see LookaheadDecodingOutputs::generationLengths
TensorPtr generationLengths;
//! see LookaheadDecodingOutputs::positionOffsets
TensorPtr positionOffsets;
//! see LookaheadDecodingOutputs::attentionPackedMasks
TensorPtr attentionPackedMasks;
};
class ExternalDraftTokensSetupParams : public DecodingSetupParams
{
public:
std::optional<std::vector<runtime::SizeType32>> runtimeTopK; // [1] or [setupBatchSize] on cpu
std::optional<std::vector<float>> runtimeTopP; // [1] or [setupBatchSize] on cpu
};
class BaseDecodingInputs
{
public:
BaseDecodingInputs(runtime::SizeType32 localBatchSize)
: localBatchSize(localBatchSize)
{
}
virtual ~BaseDecodingInputs() = default;
runtime::SizeType32 localBatchSize;
};
// Ban words inputs
class BanWordsDecodingInputs : public BaseDecodingInputs
{
public:
BanWordsDecodingInputs(runtime::SizeType32 localBatchSize)
: BaseDecodingInputs(localBatchSize)
{
}
runtime::SizeType32 maxBadWordsLen{0};
//! [maxBatchSize][2, bad_words_length], on gpu
std::optional<TensorConstPtr> badWordsPtr;
//! [maxBatchSize], on gpu
std::optional<TensorConstPtr> badWordsLengths;
};
// Stop criteria inputs
class StopCriteriaDecodingInputs : public BaseDecodingInputs
{
public:
StopCriteriaDecodingInputs(runtime::SizeType32 localBatchSize)
: BaseDecodingInputs(localBatchSize)
{
}
runtime::SizeType32 maxStopWordsLen{0};
//! [maxBatchSize], on gpu
std::optional<TensorConstPtr> sequenceLimitLength;
//! [maxBatchSize][2, stop_words_length], pinned
std::optional<TensorConstPtr> stopWordsPtr;
//! [maxBatchSize], pinned
std::optional<TensorConstPtr> stopWordsLengths;
};
class DecodingInputs : public BaseDecodingInputs
{
public:
DecodingInputs(TensorConstPtr endIds, TensorConstPtr batchSlots, runtime::SizeType32 step = 0,
runtime::SizeType32 ite = 0, runtime::SizeType32 localBatchSize = 0, runtime::SizeType32 maxAttentionWindow = 0,
runtime::SizeType32 sinkTokenLength = 0)
: BaseDecodingInputs(localBatchSize)
, endIds{std::move(endIds)}
, step{step}
, ite{ite}
, maxAttentionWindow{maxAttentionWindow}
, sinkTokenLength{sinkTokenLength}
, batchSlots{std::move(batchSlots)}
{
}
//! [maxBatchSize]
TensorConstPtr endIds;
// used only for python runtime
runtime::SizeType32 step;
runtime::SizeType32 ite;
// mandatory parameters
runtime::SizeType32 maxAttentionWindow;
runtime::SizeType32 sinkTokenLength;
//! One of these two fields has to be set;
//! DynamicDecodeLayer::forward checks for it.
//! Both fields are needed to support legacy code during the transition period to the batched decoder.
//! [forwardBatchSize, beamWidth, vocabSizePadded]
std::optional<TensorConstPtr> logits;
//! [forwardBatchSize][beamWidth, vocabSizePadded], on gpu
std::optional<std::vector<TensorConstPtr>> logitsVec;
//! [forwardBatchSize], on pinned memory
TensorConstPtr batchSlots;
// optional parameters
//! the indices of the selected beams, mandatory for beam search, on gpu
//! [forwardBatchSize, maxBeamWidth, maxSeqLen]
std::optional<TensorPtr> srcCacheIndirection;
//! [vocabSizePadded], on gpu
std::optional<TensorConstPtr> embeddingBias;
//! [maxBatchSize, maxBeamWidth], on gpu
std::optional<TensorConstPtr> inputLengths;
//! [maxBatchSize, maxBeamWidth]
std::optional<TensorConstPtr> finished;
//! [maxBatchSize], on gpu
std::optional<TensorPtr> curTokensPerStep;
std::shared_ptr<BanWordsDecodingInputs> banWordsInputs;
std::shared_ptr<StopCriteriaDecodingInputs> stopCriteriaInputs;
//! Step of each request, for Variable-Beam-Width-Search
std::optional<std::vector<runtime::SizeType32>> beamSearchSteps; // [batchSize]
};
class SamplingInputs : public DecodingInputs
{
public:
explicit SamplingInputs(TensorConstPtr endIds, TensorConstPtr batchSlots, runtime::SizeType32 step,
runtime::SizeType32 ite, runtime::SizeType32 localBatchSize)
: DecodingInputs{std::move(endIds), std::move(batchSlots), step, ite, localBatchSize}
{
}
//! optional parameters
//! [localBatchSize]
curandState_t* curandStates{};
//! Flag to mark that logits tensor contains probabilities
bool probsComputed{};
};
class ExternalDraftTokensInputs : public DecodingInputs
{
public:
explicit ExternalDraftTokensInputs(TensorConstPtr endIds, TensorConstPtr batchSlots, runtime::SizeType32 step,
runtime::SizeType32 ite, runtime::SizeType32 localBatchSize)
: DecodingInputs{std::move(endIds), std::move(batchSlots), step, ite, localBatchSize}
{
}
TensorPtr draftLogits;
TensorPtr draftProbs;
TensorPtr targetProbs;
TensorPtr numDraftTokens;
TensorPtr numDraftTokensHost;
TensorPtr draftTokenIds;
TensorPtr useDraftLogits;
TensorPtr useDraftLogitsHost;
runtime::SizeType32 step{};
float constantThreshold{};
bool useRandomAcceptanceThreshold{};
//! optional parameters
//! [localBatchSize]
curandState_t* curandStates{};
//! Flag to mark that logits tensor contains probabilities
bool probsComputed{};
};
// Medusa inputs
class MedusaDecodingInputs : public DecodingInputs
{
public:
explicit MedusaDecodingInputs(TensorConstPtr endIds, TensorConstPtr batchSlots, runtime::SizeType32 localBatchSize)
: DecodingInputs(std::move(endIds), std::move(batchSlots), 0, 0, localBatchSize)
{
}
//! [maxBatchSize], on gpu
TensorConstPtr targetTokensPerStep;
//! [maxBatchSize, maxPathLen, maxPathLen], on gpu
TensorConstPtr paths;
//! [maxBatchSize, maxDecodingTokens], on gpu
TensorConstPtr treeIds;
//! [maxBatchSize][maxDraftPathLen][maxDecodingTokens, vocabSizePadded], on gpu
std::vector<std::vector<TensorPtr>> medusaLogits;
};
// Explicit draft tokens inputs
class ExplicitDraftTokensInputs : public DecodingInputs
{
public:
explicit ExplicitDraftTokensInputs(TensorConstPtr endIds, TensorConstPtr batchSlots, runtime::SizeType32 batchSize)
: DecodingInputs(std::move(endIds), std::move(batchSlots), 0, 0, batchSize)
{
}
//! Draft tokens for the next iteration. The first token in each path is the last accepted token at the current
//! iteration. E.g. if forwardBatchSize == 1, maxNumPaths == 2, maxPathLen == 3, [[[0, 1, 2], [0, 1, 10]]]
TensorConstPtr nextDraftTokens; // [forwardBatchSize, maxNumPaths, maxPathLen], gpu
//! Compressed form of `nextDraftTokens`, where common prefixes are collapsed.
//! Using the example above, [0, 1, 2, 10]
TensorConstPtr nextFlatTokens; // [forwardBatchSize * maxDecodingTokens], gpu
//! Indices of draft tokens in the compressed `nextFlatTokens` for the next iteration.
//! Using example above, [[[0, 1, 2], [0, 1, 3]]]
TensorConstPtr nextDraftIndices; // [forwardBatchSize, maxNumPaths, maxPathLen], gpu
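//! A hedged host-side check of the relationship implied by the examples above (an assumption, for
//! illustration; it presumes the tensors have been copied into nested vectors with these names):
//! \code
//!   for (auto pathIdx = 0; pathIdx < maxNumPaths; ++pathIdx)
//!       for (auto tokenIdx = 0; tokenIdx < maxPathLen; ++tokenIdx)
//!           assert(nextFlatTokens[nextDraftIndices[pathIdx][tokenIdx]] == nextDraftTokens[pathIdx][tokenIdx]);
//! \endcode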
//! Probabilities of the next draft tokens.
TensorConstPtr nextDraftProbs; // [forwardBatchSize, maxNumPaths, maxDraftPathLen, vocabSize], gpu
//! Same as `nextDraftTokens`, but for current iteration.
//! Current accepted tokens obtained as `lastDraftTokens[bi][bestPathIndices[bi]][1:bestPathLengths[bi]]`.
TensorConstPtr lastDraftTokens; // [forwardBatchSize, maxNumPaths, maxPathLen], gpu
//! Same as `nextDraftIndices`, but for current iteration.
TensorConstPtr lastDraftIndices; // [forwardBatchSize, maxNumPaths, maxPathLen], gpu
//! Boolean attention masks.
//! maxDecodingTokens' = generationLengths.max()
TensorConstPtr masks; // [forwardBatchSize, maxDecodingTokens', maxDecodingTokens'], gpu
//! Position ids relative to `positionIdsBase`, packed the same way as `nextFlatTokens`.
//! Using example above, [0, 1, 2, 3]
TensorConstPtr packedPosIds; // [forwardBatchSize * maxDecodingTokens], gpu
//! Lengths of the accepted paths for each request. It is 1 in the context phase (only 1 primary token is accepted).
TensorConstPtr bestPathLengths; // [forwardBatchSize], gpu
//! Indices of the accepted paths for each request. It is 0 for context phase.
TensorConstPtr bestPathIndices; // [forwardBatchSize], gpu
//! Number of the draft tokens for the next iteration.
TensorConstPtr generationLengths; // [forwardBatchSize], gpu
//! Baseline for the position ids.
TensorConstPtr positionIdsBase; // [forwardBatchSize], gpu
//! Generation length for the previous stage.
TensorConstPtr lastGenerationLengths; // [forwardBatchSize], gpu
//! Maximum number of generated tokens for the next step across whole batch
TensorConstPtr maxGenLengthDevice; // [1], on gpu
//! Address map to map from linear indices of the engine outputs to seqSlot.
//! It is not the same as batchSlots because it maps the ordered engine outputs to the respective seqSlot,
//! while batchSlots is just a list of active seqSlots.
TensorConstPtr seqSlots; // [forwardBatchSize], on gpu
};
class LookaheadDecodingInputs : public DecodingInputs
{
public:
explicit LookaheadDecodingInputs(TensorConstPtr endIds, TensorConstPtr batchSlots)
: DecodingInputs{std::move(endIds), std::move(batchSlots)}
{
}
};
// Explicit draft tokens inputs
class EagleInputs : public DecodingInputs
{
public:
explicit EagleInputs(TensorConstPtr endIds, TensorConstPtr batchSlots, runtime::SizeType32 batchSize,
TensorConstPtr nextDraftTokens, TensorConstPtr nextDraftLens, TensorConstPtr nextDraftPaths,
TensorConstPtr lastDraftTokens, TensorConstPtr lastDraftLens, TensorConstPtr lastDraftPaths,
TensorConstPtr acceptedTokens, TensorConstPtr acceptedLens, TensorConstPtr acceptedPathIds,
TensorConstPtr chunkedContextNextTokens, TensorConstPtr seqSlots)
: DecodingInputs(std::move(endIds), std::move(batchSlots), 0, 0, batchSize)
, nextDraftTokens(nextDraftTokens)
, nextDraftLens(nextDraftLens)
, nextDraftPaths(nextDraftPaths)
, lastDraftTokens(lastDraftTokens)
, lastDraftLens(lastDraftLens)
, lastDraftPaths(lastDraftPaths)
, acceptedTokens(acceptedTokens)
, acceptedLens(acceptedLens)
, acceptedPathIds(acceptedPathIds)
, chunkedContextNextTokens(chunkedContextNextTokens)
, seqSlots(seqSlots)
{
}
//! Draft tokens for the next iteration.
TensorConstPtr nextDraftTokens; // [forwardBatchSize, maxDecodingDraftTokens], gpu
//! Number of the draft tokens for the next iteration.
TensorConstPtr nextDraftLens; // [forwardBatchSize], gpu
//! Draft paths for the next iteration.
TensorConstPtr nextDraftPaths; // [forwardBatchSize, maxDecodingTokens, maxPathLen], gpu
//! Same as `nextDraftTokens`, but for current iteration.
TensorConstPtr lastDraftTokens; // [forwardBatchSize, maxNumPaths, maxPathLen], gpu
//! Number of the draft tokens input to the previous TRT iteration.
TensorConstPtr lastDraftLens; // [forwardBatchSize], gpu
//! Same as `nextDraftPaths`, but for current iteration.
TensorConstPtr lastDraftPaths; // [forwardBatchSize, maxDecodingTokens, maxPathLen], gpu
//! The most recently accepted tokens (including the golden token).
TensorConstPtr acceptedTokens; // [forwardBatchSize, maxPathLen]
//! Number of accepted tokens (at least 1).
TensorConstPtr acceptedLens; // [forwardBatchSize]
//! Ids of the accepted path.
TensorConstPtr acceptedPathIds; // [forwardBatchSize]
//! Indicator of whether this is the last chunk of the context request.
TensorConstPtr chunkedContextNextTokens; // [forwardBatchSize]
TensorConstPtr seqSlots; // [forwardBatchSize], on gpu
};
class BaseDecodingOutputs
{
public:
explicit BaseDecodingOutputs(TensorPtr outputIds)
: outputIds{std::move(outputIds)}
{
}
virtual ~BaseDecodingOutputs() = default;
// mandatory parameters
TensorPtr outputIds; // [maxBatchSize, maxSeqLen]
// optional parameters
//! [maxBatchSize * maxBeamWidth], optional
std::optional<TensorPtr> finished;
//! [maxBatchSize * maxBeamWidth], optional
std::optional<TensorPtr> sequenceLength;
//! [maxBatchSize * maxBeamWidth], necessary in beam search
std::optional<TensorPtr> cumLogProbs;
//! [maxBatchSize, maxBeamWidth, maxSeqLen], must be float*, optional
std::optional<TensorPtr> outputLogProbs;
//! [maxBatchSize, maxBeamWidth, maxSeqLen], necessary in beam search
std::optional<TensorPtr> parentIds;
//! [maxBatchSize] int* (2-d array), each int* has [maxBeamWidth, maxSeqLen]
TensorPtr outputIdsPtr;
TensorPtr outputIdsPtrHost;
//! [maxBatchSize] int* (2-d array), each int* has [maxBeamWidth, maxSeqLen]
TensorPtr parentIdsPtr;
// Tokens predicted at current iteration.
TensorPtr newTokens; // [maxBatchSize, maxBeamWidth]
// optional parameters
//! Number of tokens predicted at current iteration.
//! [maxBatchSize]
std::optional<TensorPtr> numNewTokens;
//! [1] in pinned host memory
std::optional<TensorPtr> finishedSum;
//! [maxSeqLen, maxBatchSize, maxBeamWidth], must be float*
std::optional<TensorPtr> outputLogProbsTiled;
};
class BeamSearchOutputs : public BaseDecodingOutputs
{
public:
explicit BeamSearchOutputs(TensorPtr outputIds)
: BaseDecodingOutputs{std::move(outputIds)}
{
}
//! the k/v cache index for beam search
//! [forwardBatchSize, maxBeamWidth, maxSeqLen]
TensorPtr tgtCacheIndirection;
//! Structure that maintains some pointers used by beam search
std::unique_ptr<kernels::BeamHypotheses> beamHypotheses;
};
//!
//! \brief SpeculativeDecodingOutputs outputs.
//!
//! For one example sequence [a, b] [c] <x, y, z>, where [a, b, c] is the accepted sequence,
//! [c] is the last accepted token, and <x, y, z> are the draft tokens from `nextDraftTokens` saved at the last step.
//! [c]'s position id is known; only position ids for <x, y, z> need to be provided in `nextDraftPosIds`.
//! LLM inputs {c, x, y, z} and generates {c', x', y', z'}.
//!
//! {c'} is always accepted and {x', z'} are supposed to be accepted.
//! The accepted tokens [c', x', z'] are saved in `outputIds` in-place, starting from `sequenceLength`.
//! The `acceptedLength` is 3, and the accepted draft tokens length is 2.
//! `sequenceLength` is also increased by `acceptedLength` in-place.
//! The pathsOffset is {0, 1, 3} for {c', x', z'}.
//! [] for accepted, <> for draft, {} for input/output.
//!
//! For batchSlots {1, 3}, `numNewTokensCumSum` is an exclusive sum of `numNewTokens` over the batch:
//! if `numNewTokens` is {3, 5}, `numNewTokensCumSum` is {0, 3, 8}.
//!
//! `nextDraftLengths` and `prevDraftLengths` are needed for methods that support variable
//! draft length. `nextDraftLengths` must contain the number of draft tokens per request for the next iteration.
//! `prevDraftLengths` must contain the number of draft tokens used in the current iteration.
//!
//! `pathsOffsets` is needed for KV cache rewind. It contains the positions of the accepted draft tokens in the
//! flattened tensor of draft tokens. E.g. if for sequence {c, x, y, z} only `y` and `z` were accepted,
//! `pathsOffsets` contains [1, 2]. `pathsOffsets` is flattened tensor for whole batch.
//!
//! The order of `pathsOffsets` and `numNewTokensCumSum` must be aligned, such that
//! `pathsOffsets[numNewTokensCumSum[bi]:numNewTokensCumSum[bi+1]]` is the slice of offsets for the `bi`th request.
//! Furthermore, the order of requests is important and must be aligned with sorted `RuntimeBuffers::seqSlots`
//! such that the request with smaller `seqSlot` stays earlier in the tensors.
//! However, this condition usually holds if the method does not expect anything from the engine other than logits.
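//!
//! A hedged host-side sketch of consuming `numNewTokensCumSum` and `pathsOffsets` as described above
//! (plain vectors and variable names are assumptions for illustration):
//! \code
//!   // numNewTokens       = {3, 5}     -> new tokens per active request
//!   // numNewTokensCumSum = {0, 3, 8}  -> exclusive sum plus the total
//!   for (runtime::SizeType32 bi = 0; bi < forwardBatchSize; ++bi)
//!   {
//!       auto const begin = numNewTokensCumSum[bi];
//!       auto const end = numNewTokensCumSum[bi + 1];
//!       // Offsets of the accepted draft tokens for the bi-th request, e.g. for KV cache rewind.
//!       std::vector<runtime::SizeType32> offsets(pathsOffsets.begin() + begin, pathsOffsets.begin() + end);
//!   }
//! \endcode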
class SpeculativeDecodingOutputs : public BaseDecodingOutputs
{
public:
explicit SpeculativeDecodingOutputs(TensorPtr outputIds)
: BaseDecodingOutputs{std::move(outputIds)}
{
}
//! Draft tokens for the next step
// [maxBatchSize, maxDecodingDraftTokens]
TensorPtr nextDraftTokens;
//! Draft token position IDs
//! [maxBatchSize, maxDecodingDraftTokens]
TensorPtr nextDraftPosIds;
//! Prev step draft tokens lengths, should be filled only for variable draft length speculative decoding mode
//! [maxBatchSize]
TensorPtr prevDraftLengths;
//! Next step draft tokens lengths, should be filled only for variable draft length speculative decoding mode
//! [maxBatchSize]
TensorPtr nextDraftLengths;
//! Exclusive cumulative sum of `numNewTokens` along batchSlots.
//! [maxBatchSize + 1]
TensorPtr numNewTokensCumSum;
//! [maxBatchSize * maxPathLen]
TensorPtr pathsOffsets;
//! [maxBatchSize, maxDecodingTokens, divUp(maxDecodingTokens, 32)]
TensorPtr packedMasks;
};
class LookaheadDecodingOutputs : public SpeculativeDecodingOutputs
{
using TensorPtr = runtime::ITensor::SharedPtr;
public:
explicit LookaheadDecodingOutputs(TensorPtr outputIds)
: SpeculativeDecodingOutputs{std::move(outputIds)}
{
}
//! For TLLM engine input "spec_decoding_generation_lengths", indicating how many tokens are to be generated.
//! Currently, the value is 1 for the 1st generation step (set at `setup`) and maxDecodingTokens for later steps (set at `forward`).
//! [maxBatchSize]
TensorPtr generationLengths;
//! For TLLM engine input "spec_decoding_position_offsets",
//! indicating each token's position offset, based on the last golden token = 0.
//! ABC<D>efgxyz--- // sequence tokens, ABCD: golden; efg, xyz: draft; ---: padding.
//! ***<0>123123--- // positionOffsets.
//! 012<3>456456--- // positionIds.
//! [maxBatchSize, maxDecodingTokens]
TensorPtr positionOffsets;
//! [maxBatchSize, maxDecodingTokens]
TensorPtr positionIds;
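//! A hedged note on the relationship between `positionOffsets` and `positionIds` suggested by the diagram
//! above (host-side pseudo-code; `lastGoldenPositionId` is an illustrative name, not a member):
//! \code
//!   for (auto i = 0; i < maxDecodingTokens; ++i)
//!       assert(positionIds[i] == lastGoldenPositionId + positionOffsets[i]);
//! \endcode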
};
class ExplicitDraftTokensOutputs : public SpeculativeDecodingOutputs
{
public:
explicit ExplicitDraftTokensOutputs(TensorPtr outputIds)
: SpeculativeDecodingOutputs{std::move(outputIds)}
{
}
//! Draft tokens for the next iteration. The first token in each path is the last accepted token at the current
//! iteration. E.g. if batchSize == 1, maxNumPaths == 2, maxPathLen == 3, [[[0, 1, 2], [0, 1, 10]]]
TensorPtr unpackedNextDraftTokens; // [maxBatchSize, maxNumPaths, maxPathLen] on gpu
//! Indices of draft tokens in the compressed `nextFlatTokens` for the next iteration.
//! Using example above, [[[0, 1, 2], [0, 1, 3]]]
TensorPtr unpackedNextDraftIndices; // [maxBatchSize, maxNumPaths, maxPathLen] on gpu
//! Probabilities of the next draft tokens.
TensorPtr nextDraftProbs; // [maxBatchSize, maxNumPaths, maxPathDraftLen, vocabSize] on gpu
//! Baseline for the position ids.
TensorPtr positionIdsBase; // [maxBatchSize] on gpu
//! Randomly sampled data (between 0.f and 1.f)
TensorPtr randomDataSample; // [maxBatchSize] on gpu
//! Randomly sampled data (between 0.f and 1.f)
TensorPtr randomDataValidation; // [maxBatchSize, maxNumPaths, maxDraftPathLen] on gpu
//! Sampling temperature.
TensorPtr temperatures; // [maxBatchSize] on gpu
//! Next generation lengths.
TensorPtr generationLengths; // [maxBatchSize] on gpu
//! Next generation lengths on host.
TensorPtr generationLengthsHost; // [maxBatchSize] on pinned
//! Maximum number of generated tokens for the next step across whole batch
TensorPtr maxGenLengthHost; // [1] on pinned
};
class EagleOutputs : public SpeculativeDecodingOutputs
{
public:
explicit EagleOutputs(TensorPtr outputIds)
: SpeculativeDecodingOutputs{std::move(outputIds)}
{
}
//! Unpacked draft tokens
TensorPtr unpackedNextDraftTokens; // [maxBatchSize, maxDecodingDraftTokens] on gpu
//! Draft paths for the next iteration.
TensorPtr nextDraftPaths; // [maxBatchSize, maxDecodingTokens, maxPathLen] on gpu
//! Randomly sampled data (between 0.f and 1.f)
TensorPtr randomDataSample; // [maxBatchSize] on gpu
//! Randomly sampled data (between 0.f and 1.f)
TensorPtr randomDataValidation; // [maxBatchSize] on gpu
//! Sampling temperature.
TensorPtr temperatures; // [maxBatchSize] on gpu
//! Next generation lengths.
TensorPtr generationLengths; // [maxBatchSize] on gpu
//! Next generation lengths.
TensorPtr generationLengthsHost; // [maxBatchSize] on pinned
//! Request types for ctx stage of the EagleNet0 (filled with 0s).
TensorPtr eagleNetCtxRequestTypesHost; // [maxBatchSize] on pinned
//! Context lengths of the context EagleNet0.
TensorPtr eagleNetCtxContextLengthsHost; // [maxBatchSize] on pinned
//! Past kv lengths of the context EagleNet0.
TensorPtr eagleNetCtxPastKeyValueLengthsHost; // [maxBatchSize] on pinned
//! Request types for gen stage of the EagleNetX (filled with 1s).
TensorPtr eagleNetGenRequestTypesHost; // [maxBatchSize] on pinned
//! Context lengths of the generation EagleNetX.
TensorPtr eagleNetGenContextLengthsHost; // [maxBatchSize] on pinned
//! Past kv lengths of the generation EagleNetX.
TensorPtr eagleNetGenPastKeyValueLengthsHost; // [maxBatchSize] on pinned
};
} // namespace tensorrt_llm::layers