/*
 * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/tensor.h"
#include "tensorrt_llm/kernels/beamSearchKernels.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/speculativeDecodingModule.h"

#include <cstdint>
#include <memory>
#include <optional>
#include <utility>
#include <vector>

namespace tc = tensorrt_llm::common;

namespace tensorrt_llm::layers
{

//!
//! \brief In a DecodingLayer's life cycle, it is constructed once; `setup` is called repeatedly,
//! once per request; `forward*` is called repeatedly, many times per request.
//! A possible sequence is construct(maxBatchSize) -> setup({1, 3}) -> forward({1, 3})
//! -> forward({1, 3}) -> setup({2, 4}) -> forward({1, 3, 2, 4}) -> forward({1, 3, 2, 4})
//! -> forward({1, 2, 4}), where {a, b} are batchSlots and the request at slot {3} ended at the last step.
//! As a result, there are three kinds of batch size.
//! 1. `maxBatchSize`, used by each layer to reserve resources.
//!    It is passed through the class constructor and exposed by DecoderDomain.getBatchSize().
//! 2. `setupBatchSize`, used to set up layers for a batch of new requests.
//!    It is passed through the `setup` method.
//! 3. `forwardBatchSize`, used when layers forward a batch of existing active requests.
//!    It is passed through the `forwardAsync` and `forwardSync` methods.
//! `setup` and `forward` always provide `batch_slots`, indexed by the local batch index
//! ranging in [0, setupBatchSize) or [0, forwardBatchSize),
//! and holding the global batch index ranging in [0, maxBatchSize).
//! In case of beam search, maxBatchSize = forwardBatchSize = 1.
class DecoderDomain
{
public:
    DecoderDomain(runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth, runtime::SizeType32 vocabSize,
        std::optional<runtime::SizeType32> vocabSizePadded = std::nullopt,
        std::shared_ptr<runtime::SpeculativeDecodingModule const> speculativeDecodingModule = nullptr)
        : mBatchSize(batchSize)
        , mBeamWidth(beamWidth)
        , mVocabSize(vocabSize)
        , mVocabSizePadded(vocabSizePadded.value_or(vocabSize))
        , mSpeculativeDecodingModule(speculativeDecodingModule)
    {
    }

    [[nodiscard]] runtime::SizeType32 getBatchSize() const
    {
        return mBatchSize;
    }

    [[nodiscard]] runtime::SizeType32 getBeamWidth() const
    {
        return mBeamWidth;
    }

    [[nodiscard]] runtime::SizeType32 getVocabSize() const
    {
        return mVocabSize;
    }

    [[nodiscard]] runtime::SizeType32 getVocabSizePadded() const
    {
        return mVocabSizePadded;
    }

    [[nodiscard]] runtime::SizeType32 getMaxDecodingTokens() const
    {
        return mSpeculativeDecodingModule ? mSpeculativeDecodingModule->getMaxDecodingTokens() : 1;
    }

    [[nodiscard]] std::shared_ptr<runtime::SpeculativeDecodingModule const> getSpeculativeDecodingModule() const
    {
        TLLM_CHECK_WITH_INFO(mSpeculativeDecodingModule, "Speculative decoding module is not set to decoder domain");
        return mSpeculativeDecodingModule;
    }

private:
    runtime::SizeType32 mBatchSize;
    runtime::SizeType32 mBeamWidth;
    runtime::SizeType32 mVocabSize;
    runtime::SizeType32 mVocabSizePadded;
    std::shared_ptr<runtime::SpeculativeDecodingModule const> mSpeculativeDecodingModule;
};
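
// A minimal usage sketch for DecoderDomain (the numeric values are illustrative only;
// `vocabSizePadded` defaults to `vocabSize`, and `getMaxDecodingTokens()` returns 1 when no
// speculative decoding module is attached):
//
//     DecoderDomain domain(/*batchSize=*/8, /*beamWidth=*/1, /*vocabSize=*/32000);
//     auto const padded = domain.getVocabSizePadded();       // 32000
//     auto const maxTokens = domain.getMaxDecodingTokens();  // 1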

class BaseSetupParams
{
public:
    virtual ~BaseSetupParams() {}
};

class DynamicDecodeSetupParams : public BaseSetupParams
{
public:
    // Penalty layer
    struct PenaltyParams
    {
        std::optional<std::vector<float>> temperature;                     // [1] or [setupBatchSize] on cpu
        std::optional<std::vector<runtime::SizeType32>> minLength;         // [1] or [setupBatchSize] on cpu
        std::optional<std::vector<float>> repetitionPenalty;               // [1] or [setupBatchSize] on cpu
        std::optional<std::vector<float>> presencePenalty;                 // [1] or [setupBatchSize] on cpu
        std::optional<std::vector<float>> frequencyPenalty;                // [1] or [setupBatchSize] on cpu
        std::optional<std::vector<runtime::SizeType32>> noRepeatNgramSize; // [1] or [setupBatchSize] on cpu
    };

    struct SamplingParams
    {
        // baseSamplingLayer
        std::optional<std::vector<runtime::SizeType32>> runtime_top_k; // [1] or [setupBatchSize] on cpu
        std::optional<std::vector<float>> runtime_top_p;               // [1] or [setupBatchSize] on cpu

        // topPSamplingLayer
        std::optional<std::vector<float>> top_p_decay;                     // [setupBatchSize], must be in [0, 1]
        std::optional<std::vector<float>> top_p_min;                       // [setupBatchSize], must be in [0, 1]
        std::optional<std::vector<runtime::TokenIdType>> top_p_reset_ids;  // [setupBatchSize]
        std::optional<bool> normalize_log_probs;

        std::optional<std::vector<bool>> outputLogProbs; // [setupBatchSize]
        std::optional<std::vector<bool>> cumLogProbs;    // [setupBatchSize]
    };

    struct BeamSearchParams
    {
        // BeamSearchLayer
        std::optional<std::vector<float>> beam_search_diversity_rate;   // [setupBatchSize] on cpu
        std::optional<std::vector<float>> length_penalty;               // [setupBatchSize] on cpu
        std::optional<std::vector<runtime::SizeType32>> early_stopping; // [setupBatchSize] on cpu
    };

    struct MedusaParams
    {
        // Medusa params
        std::optional<std::vector<std::vector<runtime::SizeType32>>> topKMedusaHeads; // [setupBatchSize, maxMedusaHeads]
    };

    std::optional<std::vector<uint64_t>> randomSeed; // [1] or [setupBatchSize] on cpu

    PenaltyParams penaltyParams;
    SamplingParams samplingParams;
    BeamSearchParams beamSearchParams;
    MedusaParams medusaParams;
};
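
// A minimal sketch of filling setup parameters for two new requests (all values are illustrative;
// each vector is indexed by the local batch index of the corresponding `setup` call):
//
//     DynamicDecodeSetupParams setupParams;
//     setupParams.penaltyParams.temperature = std::vector<float>{0.7f, 1.0f};
//     setupParams.samplingParams.runtime_top_k = std::vector<runtime::SizeType32>{40, 1};
//     setupParams.samplingParams.runtime_top_p = std::vector<float>{0.9f, 1.0f};
//     setupParams.randomSeed = std::vector<uint64_t>{42ULL, 43ULL};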

class BaseInputParams
{
public:
    explicit BaseInputParams(runtime::SizeType32 step, runtime::SizeType32 ite, tc::Tensor endIds)
        : step{step}
        , ite{ite}
        , end_ids{std::move(endIds)}
    {
    }

    virtual ~BaseInputParams() {}

    // mandatory parameters
    runtime::SizeType32 step;
    runtime::SizeType32 ite;
    tc::Tensor end_ids;                    // [maxBatchSize]
    std::optional<tc::Tensor> batch_slots; // [forwardBatchSize], on pinned memory
    std::optional<tc::Tensor> finished;    // [maxBatchSize, maxBeamWidth]
};

class DynamicDecodeInputParams : public BaseInputParams
{
public:
    DynamicDecodeInputParams(runtime::SizeType32 step, runtime::SizeType32 ite, runtime::SizeType32 maxInputLength,
        runtime::SizeType32 maxAttentionWindow, runtime::SizeType32 sinkTokenLength,
        runtime::SizeType32 localBatchSize, tc::Tensor endIds)
        : BaseInputParams(step, ite, std::move(endIds))
        , max_input_length{maxInputLength}
        , max_attention_window{maxAttentionWindow}
        , sink_token_length{sinkTokenLength}
        , local_batch_size{localBatchSize}
        , max_stop_words_len{0}
        , max_bad_words_len{0}
    {
    }

    // mandatory parameters
    runtime::SizeType32 max_input_length;
    runtime::SizeType32 max_attention_window;
    runtime::SizeType32 sink_token_length;
    runtime::SizeType32 local_batch_size;
    runtime::SizeType32 max_stop_words_len;
    runtime::SizeType32 max_bad_words_len;

    // One of these two fields has to be set; DynamicDecodeLayer::forward checks for it.
    // Both fields are needed to support legacy code during the transition period to the batched decoder.
    std::optional<tc::Tensor> logits;                  // [maxBatchSize, beamWidth, vocabSizePadded]
    std::optional<std::vector<tc::Tensor>> logits_vec; // [forwardBatchSize][beamWidth, vocabSizePadded], on gpu

    // optional parameters
    std::optional<tc::Tensor> src_cache_indirection; // [forwardBatchSize, maxBeamWidth, maxSeqLen] - the k/v cache
                                                     // index for beam search, mandatory for beam search, on gpu
    std::optional<tc::Tensor> sequence_limit_length; // [maxBatchSize], on gpu
    std::optional<tc::Tensor> embedding_bias;        // [vocabSizePadded], on gpu
    std::optional<tc::Tensor> input_lengths;         // [maxBatchSize, maxBeamWidth], on gpu
    std::optional<tc::Tensor> bad_words_ptr;         // [maxBatchSize][2, bad_words_length], on gpu
    std::optional<tc::Tensor> bad_words_lengths;     // [maxBatchSize], on gpu
    std::optional<tc::Tensor> stop_words_ptr;        // [maxBatchSize][2, stop_words_length], on gpu
    std::optional<tc::Tensor> stop_words_lengths;    // [maxBatchSize], on gpu

    // Medusa inputs
    class MedusaInputs
    {
    public:
        tc::Tensor medusaCurTokensPerStep;    // [maxBatchSize], optional, on gpu
        tc::Tensor medusaTargetTokensPerStep; // [maxBatchSize], optional, on gpu
        tc::Tensor medusaPaths;               // [maxBatchSize, maxPathLen, maxPathLen], optional, on gpu
        tc::Tensor medusaTreeIds;             // [maxBatchSize, maxDecodingTokens], optional, on gpu
        std::vector<std::vector<tc::Tensor>> medusaLogits; // [maxBatchSize][maxDraftPathLen]
                                                           // [maxDecodingTokens, vocabSizePadded], optional, on gpu
    };

    // Explicit draft tokens inputs
    // FIXME(nkorobov): this should be ExplicitDraftTokensBuffers?
    class ExplicitDraftTokensInputs
    {
    public:
    };

    std::optional<MedusaInputs> medusaInputs;

    std::optional<ExplicitDraftTokensInputs> explicitDraftTokensInputs;
};
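
// A minimal sketch of building forward inputs (endIdsTensor, logitsTensor and batchSlots are
// illustrative tc::Tensor placeholders created elsewhere; exactly one of `logits` / `logits_vec`
// must be set):
//
//     DynamicDecodeInputParams inputs(/*step=*/0, /*ite=*/0, /*maxInputLength=*/128,
//         /*maxAttentionWindow=*/2048, /*sinkTokenLength=*/0, /*localBatchSize=*/2, endIdsTensor);
//     inputs.logits = logitsTensor;     // [maxBatchSize, beamWidth, vocabSizePadded]
//     inputs.batch_slots = batchSlots;  // [forwardBatchSize], on pinned memory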

class BaseOutputParams
{
public:
    explicit BaseOutputParams(tc::Tensor outputIds)
        : output_ids{std::move(outputIds)}
    {
    }

    virtual ~BaseOutputParams() {}

    // mandatory parameters
    tc::Tensor output_ids; // [maxBatchSize, maxSeqLen]

    // optional parameters
    std::optional<tc::Tensor> finished;         // [maxBatchSize * maxBeamWidth], optional
    std::optional<tc::Tensor> sequence_length;  // [maxBatchSize * maxBeamWidth], optional
    std::optional<tc::Tensor> cum_log_probs;    // [maxBatchSize * maxBeamWidth], necessary in beam search
    std::optional<tc::Tensor> output_log_probs; // [maxBatchSize, maxBeamWidth, maxSeqLen], must be float*, optional
    std::optional<tc::Tensor> parent_ids;       // [maxBatchSize, maxBeamWidth, maxSeqLen], necessary in beam search

    tc::Tensor output_ids_ptr; // [maxBatchSize] int* (2-d array), each int* points to [maxBeamWidth, maxSeqLen]

    //!
    //! \brief SpeculativeDecodingOutputs outputs.
    //!
    //! Notation: [] for accepted tokens, <> for draft tokens, {} for LLM inputs/outputs.
    //! For one example sequence [a, b] [c] <x, y, z>, [a, b, c] is the accepted sequence,
    //! [c] is the last accepted token, and <x, y, z> are the draft tokens from `nextDraftTokens` saved at the
    //! last step. [c]'s position id is known, so only position ids for <x, y, z> need to be provided in
    //! `nextDraftPosIds`. The LLM takes {c, x, y, z} as input and generates {c', x', y', z'}.
    //!
    //! {c'} is always accepted, and suppose {x', z'} are accepted as well.
    //! The accepted tokens [c', x', z'] are saved into `output_ids` in-place, starting from `sequence_length`.
    //! The `acceptedLength` is 3, and the accepted draft tokens length is 2.
    //! `sequence_length` is also increased by `acceptedLength` in-place.
    //! The pathsOffset is {0, 1, 3} for {c', x', z'}.
    //!
    //! For batchSlots {1, 3}, `acceptedLengthsCumSum` is an exclusive prefix sum of `acceptedLength` over the
    //! batch, with the total appended: if `acceptedLengths` is {3, 5}, `acceptedLengthsCumSum` is {0, 3, 8}.
    class SpeculativeDecodingOutputs
    {
    public:
        tc::Tensor nextDraftTokens;       // [maxBatchSize, maxDecodingDraftTokens], draft tokens for the next step
        tc::Tensor nextDraftPosIds;       // [maxBatchSize, maxDecodingDraftTokens], draft token position IDs
        tc::Tensor nextDraftLengths;      // [maxBatchSize], lengths of the draft tokens for the next step
        tc::Tensor acceptedLengths;       // [maxBatchSize], lengths of the accepted draft tokens + 1
        tc::Tensor acceptedLengthsCumSum; // [maxBatchSize + 1], cumulative sum along batchSlots
        tc::Tensor pathsOffsets;          // [maxBatchSize, maxPathLen]
        tc::Tensor packedMasks;           // [maxBatchSize, maxDecodingTokens, divUp(maxDecodingTokens, 32)]
    };

    class ExplicitDraftTokensOutputs : public SpeculativeDecodingOutputs
    {
    public:
        //! Draft tokens for the next iteration. The first token in each path is the last accepted token at the
        //! current iteration. E.g. if batchSize == 1, maxNumPaths == 2, maxPathLen == 3: [[[0, 1, 2], [0, 1, 10]]]
        tc::Tensor unpackedNextDraftTokens; // [maxBatchSize, maxNumPaths, maxPathLen], on gpu
        //! Indices of draft tokens in the compressed `nextFlatTokens` for the next iteration.
        //! Using the example above: [[[0, 1, 2], [0, 1, 3]]]
        tc::Tensor unpackedNextDraftIndices; // [maxBatchSize, maxNumPaths, maxPathLen], on gpu
        //! Probabilities of the next draft tokens.
        tc::Tensor nextDraftProbs; // [maxBatchSize, maxNumPaths, maxPathDraftLen, vocabSize], on gpu
        //! Baseline for the position ids.
        tc::Tensor positionIdsBase; // [maxBatchSize], on gpu
        //! Randomly sampled data (between 0.f and 1.f).
        tc::Tensor randomDataSample; // [maxBatchSize], on gpu
        //! Randomly sampled data (between 0.f and 1.f).
        tc::Tensor randomDataValidation; // [maxBatchSize, maxNumPaths, maxDraftPathLen], on gpu
        //! Sampling temperature.
        tc::Tensor temperatures; // [maxBatchSize], on gpu
    };

    std::optional<SpeculativeDecodingOutputs> speculativeDecodingOutputs;

    std::optional<ExplicitDraftTokensOutputs> explicitDraftTokensOutputs;
};

class DynamicDecodeOutputParams : public BaseOutputParams
{
public:
    explicit DynamicDecodeOutputParams(tc::Tensor outputIds)
        : BaseOutputParams{std::move(outputIds)}
    {
    }

    // mandatory parameters
    tc::Tensor newTokens; // [maxBatchSize, maxBeamWidth]

    // optional parameters
    std::optional<tc::Tensor> finished_sum;           // [1], in pinned host memory
    std::optional<tc::Tensor> output_log_probs_tiled; // [maxSeqLen, maxBatchSize, maxBeamWidth], must be float*
    std::optional<tc::Tensor>
        tgt_cache_indirection; // [forwardBatchSize, maxBeamWidth, maxSeqLen], the k/v cache index for beam search

    std::unique_ptr<kernels::BeamHypotheses> beamHypotheses; // structure that maintains beam search pointers

    tc::Tensor parent_ids_ptr; // [maxBatchSize] int* (2-d array), each int* points to [maxBeamWidth, maxSeqLen]
};

} // namespace tensorrt_llm::layers
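
// A minimal usage sketch for the output parameters above (outputIdsTensor, newTokensTensor and
// sequenceLengthTensor are illustrative tc::Tensor placeholders allocated elsewhere; the decoding
// layer updates `output_ids` and `sequence_length` in-place during forward):
//
//     tensorrt_llm::layers::DynamicDecodeOutputParams outputs(outputIdsTensor); // [maxBatchSize, maxSeqLen]
//     outputs.newTokens = newTokensTensor;                                      // [maxBatchSize, maxBeamWidth]
//     outputs.sequence_length = sequenceLengthTensor;                           // [maxBatchSize * maxBeamWidth]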