/* * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "eagleDecodeDraftTokensPlugin.h" #include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/dataType.h" #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/kernels/samplingTopKKernels.h" #include "tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.h" #include "tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.h" #include "tensorrt_llm/runtime/common.h" #include "tensorrt_llm/runtime/iTensor.h" using namespace nvinfer1; using tensorrt_llm::plugins::EagleDecodeDraftTokensPluginCreator; using tensorrt_llm::plugins::EagleDecodeDraftTokensPlugin; using namespace tensorrt_llm::kernels; using namespace tensorrt_llm::kernels::speculative_decoding; using namespace tensorrt_llm::runtime; namespace tc = tensorrt_llm::common; static char const* EAGLE_DECODE_DRAFT_TOKENS_PLUGIN_VERSION{"1"}; static char const* EAGLE_DECODE_DRAFT_TOKENS_PLUGIN_NAME{"EagleDecodeDraftTokens"}; PluginFieldCollection EagleDecodeDraftTokensPluginCreator::mFC{}; std::vector EagleDecodeDraftTokensPluginCreator::mPluginAttributes; EagleDecodeDraftTokensPlugin::EagleDecodeDraftTokensPlugin( nvinfer1::DataType type, int32_t layerIdx, int32_t numEagleLayers, bool topKSampling) : mDtype(type) , mLayerIdx(layerIdx) , mNumEagleLayers(numEagleLayers) , mTopKSampling(topKSampling) { TLLM_CHECK_WITH_INFO(mTopKSampling, "Multinomial sampling is not supported yet."); } // Parameterized constructor EagleDecodeDraftTokensPlugin::EagleDecodeDraftTokensPlugin(void const* data, size_t length) { char const *d = reinterpret_cast(data), *a = d; read(d, mDtype); read(d, mLayerIdx); read(d, mNumEagleLayers); read(d, mTopKSampling); TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). This is often " "caused by using different TensorRT LLM version to build " "engine and run engine.", static_cast(length), static_cast(d - a)); } // IPluginV2DynamicExt Methods nvinfer1::IPluginV2DynamicExt* EagleDecodeDraftTokensPlugin::clone() const noexcept { auto* plugin = new EagleDecodeDraftTokensPlugin(*this); plugin->setPluginNamespace(mNamespace.c_str()); return plugin; } nvinfer1::DimsExprs EagleDecodeDraftTokensPlugin::getOutputDimensions( int outputIndex, nvinfer1::DimsExprs const* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) noexcept { TLLM_CHECK(outputIndex < getNbOutputs()); TLLM_CHECK(nbInputs == 12); auto const batchSizeExpr = inputs[getIdx(InputIdxEntry::PATHS)].d[0]; auto const maxDecodingTokensExpr = inputs[getIdx(InputIdxEntry::PATHS)].d[1]; auto const maxPathLengthExpr = inputs[getIdx(InputIdxEntry::PATHS)].d[2]; auto const maxDecodingDraftTokensExpr = exprBuilder.operation(DimensionOperation::kSUB, *maxDecodingTokensExpr, *exprBuilder.constant(1)); auto const numEagleLayersExpr = exprBuilder.operation(DimensionOperation::kSUB, *maxPathLengthExpr, *exprBuilder.constant(1)); auto const maxDecodingDraftTokensSquareExpr = exprBuilder.operation(DimensionOperation::kPROD, *maxDecodingDraftTokensExpr, *maxDecodingDraftTokensExpr); // maxDecodingDraftTokensExpr * maxDecodingDraftTokensExpr nvinfer1::DimsExprs ret; if (outputIndex == getIdx(OutputIdxEntry::OUTPUT_DRAFT_TOKEN_IDS)) { // output_draft_token_ids: [batch_size, max_decoding_draft_tokens] ret.nbDims = 2; ret.d[0] = batchSizeExpr; ret.d[1] = maxDecodingDraftTokensExpr; } else if (outputIndex == getIdx(OutputIdxEntry::OUTPUT_DRAFT_LENS)) { // output_draft_lens: [batch_size] ret.nbDims = 1; ret.d[0] = batchSizeExpr; } else if (outputIndex == getIdx(OutputIdxEntry::OUTPUT_PATHS)) { // output_path: [batch_size, max_decoding_tokens, max_path_len] ret.nbDims = 3; ret.d[0] = batchSizeExpr; ret.d[1] = maxDecodingTokensExpr; ret.d[2] = maxPathLengthExpr; } else if (outputIndex == getIdx(OutputIdxEntry::OUTPUT_CURRENT_SCORES)) { // output_current_scores: [batch_size, max_decoding_draft_tokens] ret.nbDims = 2; ret.d[0] = batchSizeExpr; ret.d[1] = maxDecodingDraftTokensExpr; } else if (outputIndex == getIdx(OutputIdxEntry::OUTPUT_NEXT_EXPAND_INDICES)) { // output_next_expand_index ret.nbDims = 2; ret.d[0] = batchSizeExpr; ret.d[1] = maxDecodingDraftTokensExpr; } else if (outputIndex == getIdx(OutputIdxEntry::OUTPUT_ALL_LAYERS_SCORES)) { // output_all_layers_scores: // [batch_size, num_eagle_layers, max_decoding_draft_tokens x max_decoding_draft_tokens] ret.nbDims = 3; ret.d[0] = batchSizeExpr; ret.d[1] = numEagleLayersExpr; ret.d[2] = maxDecodingDraftTokensSquareExpr; } else if (outputIndex == getIdx(OutputIdxEntry::OUTPUT_ALL_LAYERS_DRAFT_TOKEN_IDS)) { // output_all_layers_draft_token_ids: // [batch_size, num_eagle_layers, max_decoding_draft_tokens x max_decoding_draft_tokens] ret.nbDims = 3; ret.d[0] = batchSizeExpr; ret.d[1] = numEagleLayersExpr; ret.d[2] = maxDecodingDraftTokensSquareExpr; } else if (outputIndex == getIdx(OutputIdxEntry::OUTPUT_ALL_LAYERS_DRAFT_TOKEN_IDS_PREDECESSOR)) { // output_all_layers_draft_token_ids_predecessor // [batch_size, num_eagle_layers, max_decoding_draft_tokens x max_decoding_draft_tokens] ret.nbDims = 3; ret.d[0] = batchSizeExpr; ret.d[1] = numEagleLayersExpr; ret.d[2] = maxDecodingDraftTokensSquareExpr; } else { TLLM_CHECK_WITH_INFO( false, "Wrong outputIndex %d in EagleDecodeDraftTokensPlugin::getOutputDimensions", outputIndex); } return ret; } bool EagleDecodeDraftTokensPlugin::supportsFormatCombination( int pos, nvinfer1::PluginTensorDesc const* inOut, int nbInputs, int nbOutputs) noexcept { TLLM_CHECK(nbInputs == 12 && nbOutputs == getNbOutputs()); TLLM_CHECK(pos < nbInputs + nbOutputs); if (pos == getIdx(InputIdxEntry::LOGITS)) { // input: logits // output: output_all_layers_scores return (inOut[pos].type == mDtype) && (inOut[pos].format == TensorFormat::kLINEAR); } else if (pos == getIdx(InputIdxEntry::INPUT_ALL_LAYERS_SCORES) || pos == getIdx(InputIdxEntry::INPUT_PREV_SCORES) || pos == nbInputs + getIdx(OutputIdxEntry::OUTPUT_ALL_LAYERS_SCORES) || pos == nbInputs + getIdx(OutputIdxEntry::OUTPUT_CURRENT_SCORES)) { // input: rand_sample, input_all_layers_scores, input_prev_scores // output: output_all_layers_scores, output_current_scores return (inOut[pos].type == nvinfer1::DataType::kFLOAT) && (inOut[pos].format == TensorFormat::kLINEAR); } else { // input: path, num_valid_logits, use_dynamic_tree, dynamic_tree_max_topK, input_draft_token_ids, // input_draft_lens, input_current_expand_index, input_all_layers_draft_token_ids // output: output_draft_token_ids, output_draft_lens, output_path, output_next_expand_index // output_all_layers_draft_token_ids, output_all_alyers_draft_token_predecessor return (inOut[pos].type == nvinfer1::DataType::kINT32) && (inOut[pos].format == TensorFormat::kLINEAR); } } void EagleDecodeDraftTokensPlugin::configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int nbInputs, nvinfer1::DynamicPluginTensorDesc const* out, int nbOutputs) noexcept { } template size_t EagleDecodeDraftTokensPlugin::getWorkspaceSizeType(nvinfer1::PluginTensorDesc const* inputs, int nbInputs, nvinfer1::PluginTensorDesc const* outputs, int nbOutputs) const noexcept { size_t workspaceSize{0}; auto const numInputLogits = inputs[getIdx(InputIdxEntry::LOGITS)].dims.d[0]; auto const batchSize = inputs[getIdx(InputIdxEntry::PATHS)].dims.d[0]; auto const vocabSizePadded = inputs[getIdx(InputIdxEntry::LOGITS)].dims.d[1]; auto const maxDecodingTokens = inputs[getIdx(InputIdxEntry::PATHS)].dims.d[1]; auto const maxDecodingDraftTokens = maxDecodingTokens - 1; auto const maxTopK = maxDecodingDraftTokens; auto const mNumEagleLayers = inputs[getIdx(InputIdxEntry::INPUT_ALL_LAYERS_SCORES)].dims.d[1]; // Greedy sampling if (mTopKSampling) { // 0. The first topK sampling workspace auto const draftTokenSamplingWorkspaceSize = getTopKWorkspaceSize(numInputLogits, /* maxTokensPerStep */ 1, /* maxTopK */ maxTopK, vocabSizePadded); // 1. The first TopKs [numInputLogits] auto const topKsSize = numInputLogits * sizeof(SizeType32); // 2. Topks offset [batchSize] // Each request will have different number of logits that need to be sampled // This tensor will record the start offset of the topK for each request auto const topKOffsetSize = batchSize * sizeof(SizeType32); // 3. Logits ptrs [numInputLogits] auto const logitsPtrsSize = numInputLogits * sizeof(T*); // 4. The first topK sampling's output ids ptrs [numInputLogits][maxDecodingDraftTokens] auto const firstTopKOutputIdsPtrsSize = numInputLogits * sizeof(TokenIdType*); // 5. The first topK sampling's output ids (temporary buffer) [numInputLogits * maxDecodingDraftTokens] auto const firstTopKOutputIdsSize = numInputLogits * maxDecodingDraftTokens * sizeof(TokenIdType); // 6. Number of successors for each nodes, extract from the paths and layerId // [batchSize * maxDecodingTokens] auto const numSuccessorsForEachNodeSize = batchSize * maxDecodingTokens * sizeof(SizeType32); // 7. Flag whether to do decoding or not. SamplingTopK is done for numInputLogits tokens. // But only sum(numValidLogitsPerRequest[:]) of them are valid. // [batchSize * maxDecodingTokens] auto const skipDecodeSize = numInputLogits * sizeof(bool); // 8. The first topK sampling's logprobs [batchSize * maxDecodingDraftTokens] auto const firstTopKOutputLogProbsSize = numInputLogits * maxDecodingDraftTokens * sizeof(float); // 9. Eagle-2, the second topK sampling workspace // Sampling from [batchSize, maxTopK * maxTopK] to [batchSize, maxTopK] auto const secondTopKSamplingWorkspaceSize = getTopKWorkspaceSize( batchSize, /* maxTokensPerStep */ 1, /* maxTopK */ maxTopK, maxTopK * maxTopK); // 10. Eagle-2, the outputIds of the second topK sampling, shape [batchSize, maxDecodingTokens] auto const secondTopKOutputIdsSize = batchSize * maxDecodingTokens * sizeof(TokenIdType); // 11. Eagle-2, the outputIdsPtr of the second topK sampling, shape [batchSize] auto const secondTopKOutputIdsPtrSize = batchSize * sizeof(TokenIdType*); // 12. Eagle-2, the inputScoresPtrs of the second topK sampling, shape [batchSize] auto const secondTopKInputScoresPtrsSize = batchSize * sizeof(float*); // 13. Eagle-2, the outpuLogProbs of the second topK samplig, shape [batchSize, maxDecodingDraftTokens] auto const secondTopKOutputLogProbsSize = batchSize * maxDecodingDraftTokens * sizeof(float); // 14. Eagle-2, the input scores pointers of the third topK sampling, shape [batchSize] // Each points to a vocabSize = '(mNumEagleLayers - 1) * dynamicTreeMaxTopK * dynamicTreeMaxTopK + // dynamicTreeMaxTopK' auto const thirdTopKInputScoresPtrsSize = batchSize * sizeof(float*); // 15. Eagle-2, the output of the third topK sampling, shape [batchSize, maxDecodingDraftTokens] auto const thirdTopKOutputIdsSize = batchSize * maxDecodingDraftTokens * sizeof(TokenIdType); // 16. Eagle-2, the output pointers of the third topK sampling, shape [batchSize] auto const thirdTopKOutputIdsPtrsSize = batchSize * sizeof(TokenIdType*); // 17. Eagle-2, the workspace of the third topK sampling // Sampling from [batchSize, '(mNumEagleLayers - 1) * dynamicTreeMaxTopK * dynamicTreeMaxTopK + // dynamicTreeMaxTopK'] to [batchSize, maxDecodingDraftTokens] We over-set the vocabsize here. auto const thridTopKSamplingWorkspaceSize = getTopKWorkspaceSize(batchSize, /* maxTokensPerStep */ 1, /* maxTopK */ maxDecodingDraftTokens, mNumEagleLayers * maxDecodingDraftTokens * maxDecodingDraftTokens); // 18. Eagle-2, the topKs for each request in the third topK sampling // The real topK value is min(maxDecodingDraftTokens, totalNumDraftTokensForAllLayers) auto const thirdTopKsSize = batchSize * sizeof(SizeType32); SizeType32 constexpr NUM_BUFFERS{19}; size_t workspaces[NUM_BUFFERS]; workspaces[0] = draftTokenSamplingWorkspaceSize; workspaces[1] = topKsSize; workspaces[2] = topKOffsetSize; workspaces[3] = logitsPtrsSize; workspaces[4] = firstTopKOutputIdsPtrsSize; workspaces[5] = firstTopKOutputIdsSize; workspaces[6] = numSuccessorsForEachNodeSize; workspaces[7] = skipDecodeSize; workspaces[8] = firstTopKOutputLogProbsSize; workspaces[9] = secondTopKSamplingWorkspaceSize; workspaces[10] = secondTopKOutputIdsSize; workspaces[11] = secondTopKOutputIdsPtrSize; workspaces[12] = secondTopKInputScoresPtrsSize; workspaces[13] = secondTopKOutputLogProbsSize; workspaces[14] = thirdTopKInputScoresPtrsSize; workspaces[15] = thirdTopKOutputIdsSize; workspaces[16] = thirdTopKOutputIdsPtrsSize; workspaces[17] = thridTopKSamplingWorkspaceSize; workspaces[18] = thirdTopKsSize; workspaceSize = tc::calculateTotalWorkspaceSize(workspaces, NUM_BUFFERS); } else { // TODO fill me // Multinomial sampling TLLM_CHECK_WITH_INFO(false, "Multinomial sampling is not supported yet."); } return workspaceSize; } size_t EagleDecodeDraftTokensPlugin::getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int nbInputs, nvinfer1::PluginTensorDesc const* outputs, int nbOutputs) const noexcept { auto const logitsType = inputs[getIdx(InputIdxEntry::LOGITS)].type; if (logitsType == nvinfer1::DataType::kFLOAT) { return getWorkspaceSizeType(inputs, nbInputs, outputs, nbOutputs); } else if (logitsType == nvinfer1::DataType::kHALF) { return getWorkspaceSizeType<__half>(inputs, nbInputs, outputs, nbOutputs); } else { TLLM_CHECK_WITH_INFO(false, "Unsupported logits type"); } return 0; } template void EagleDecodeDraftTokensPlugin::doTopKSampling(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); // We allocate many buffers with 'numInputLogits' size, but the input logits will include some padding logits. // So only 'batchSize' or 'numValidLogits' size will be actually used. auto const numInputLogits = inputDesc[getIdx(InputIdxEntry::LOGITS)].dims.d[0]; auto const vocabSizePadded = inputDesc[getIdx(InputIdxEntry::LOGITS)].dims.d[1]; auto const batchSize = inputDesc[getIdx(InputIdxEntry::PATHS)].dims.d[0]; auto const maxDecodingTokens = inputDesc[getIdx(InputIdxEntry::PATHS)].dims.d[1]; auto const maxPathLen = inputDesc[getIdx(InputIdxEntry::PATHS)].dims.d[2]; auto const maxDecodingDraftTokens = maxDecodingTokens - 1; auto const maxTopK = maxDecodingDraftTokens; ////////////////////////////////////////// Get plugin inputs ////////////////////////////////////////// // Plugin inputs // Input logits for sampling, shape: [numInputLogits, vocabSizePadded] auto pluginInputLogits = static_cast(inputs[getIdx(InputIdxEntry::LOGITS)]); // Input paths, shape: [batchSize, maxDecodingTokens, maxPathLen] auto pluginInputPaths = static_cast(inputs[getIdx(InputIdxEntry::PATHS)]); auto numValidLogits = static_cast(inputs[getIdx(InputIdxEntry::NUM_VALID_LOGITS)]); // For Eagle-2 // Whether to use dynamic tree (i.e., Eagle-2) auto useDynamicTree = *(static_cast(inputs[getIdx(InputIdxEntry::USE_DYNAMIC_TREE)])); // The max topK for dynamic tree. All the requests have the same expand topK. // In Eagle-2, dynamicTreeMaxTopK is equal to maxNonLeavesPerLayer in the internal EagleNets. auto dynamicTreeMaxTopK = *(static_cast(inputs[getIdx(InputIdxEntry::DYNAMIC_TREE_MAX_TOPK)])); // All layer's draft tokenIds, shape: [batchSize, maxDecodingDraftTokens] auto pluginInputDraftTokenIds = reinterpret_cast(inputs[getIdx(InputIdxEntry::INPUT_DRAFT_TOKEN_IDS)]); // The number of all layer's draft tokenIds, shape: [batchSize] auto pluginInputDraftLens = reinterpret_cast(inputs[getIdx(InputIdxEntry::INPUT_DRAFT_LENS)]); // The previous EagleNet's scores, shape: [batchSize, maxDecodingDraftTokens] auto pluginInputPrevScores = static_cast(inputs[getIdx(InputIdxEntry::INPUT_PREV_SCORES)]); // The indices of the nodes that will be expand in this layer, shape: [batchSize, maxDecodingDraftTokens] // The index is related to the final output tree, which has max_decoding_draft_tokens draft tokens. auto pluginInputCurrentExpandIndices = reinterpret_cast(inputs[getIdx(InputIdxEntry::INPUT_CURRENT_EXPAND_INDICES)]); // The scores from all previous EagleNets, // shape: [batchSize, mNumEagleLayers, maxDecodingDraftTokens x maxDecodingDraftTokens] auto pluginInputAllLayersScores = static_cast(inputs[getIdx(InputIdxEntry::INPUT_ALL_LAYERS_SCORES)]); // The draft tokens from all previous EagleNets, // shape: [batchSize, mNumEagleLayers, maxDecodingDraftTokens x maxDecodingDraftTokens] auto pluginInputAllLayersDraftTokenIds = reinterpret_cast(inputs[getIdx(InputIdxEntry::INPUT_ALL_LAYERS_DRAFT_TOKEN_IDS)]); // The predecessor of all the draft tokens, // shape: [batchSize, mNumEagleLayers, maxDecodingDraftTokens x maxDecodingDraftTokens] auto pluginInputAllLayersDraftTokenIdsPredecessor = reinterpret_cast( inputs[getIdx(InputIdxEntry::INPUT_ALL_LAYERS_DRAFT_TOKEN_IDS_PREDECESSOR)]); ////////////////////////////////////////// Get plugin outputs ////////////////////////////////////////// // Plugin outputs // All layer's draft tokenIds, shape: [batchSize, maxDecodingDraftTokens] auto pluginOutputDraftTokenIds = reinterpret_cast(outputs[getIdx(OutputIdxEntry::OUTPUT_DRAFT_TOKEN_IDS)]); // The number of all layer's draft tokenIds, shape: [batchSize] auto pluginOutputDraftLens = reinterpret_cast(outputs[getIdx(OutputIdxEntry::OUTPUT_DRAFT_LENS)]); // For Eagle-2 // Updated paths base on this layer's sampling result, shape: [batchSize, maxDecodingTokens, maxPathLen] auto pluginOutputPaths = reinterpret_cast(outputs[getIdx(OutputIdxEntry::OUTPUT_PATHS)]); // This layer's scores, which will be used in next layers [batchSize, maxDecodingDraftTokens] auto pluginOutputCurrentScores = static_cast(outputs[getIdx(OutputIdxEntry::OUTPUT_CURRENT_SCORES)]); // The indices of the nodes that will be expand in next layer, shape: [batchSize, maxDecodingDraftTokens] // The index is related to the final output tree, which has max_decoding_draft_tokens draft tokens. auto pluginOutputNextExpandIndices = reinterpret_cast(outputs[getIdx(OutputIdxEntry::OUTPUT_NEXT_EXPAND_INDICES)]); // Updated scores, shape: [batchSize, mNumEagleLayers, maxDecodingDraftTokens x maxDecodingDraftTokens] auto pluginOutputAllLayersScores = static_cast(outputs[getIdx(OutputIdxEntry::OUTPUT_ALL_LAYERS_SCORES)]); // Updated draft tokens, shape: [batchSize, mNumEagleLayers, maxDecodingDraftTokens x maxDecodingDraftTokens] auto pluginOutputAllLayersDraftTokenIds = reinterpret_cast(outputs[getIdx(OutputIdxEntry::OUTPUT_ALL_LAYERS_DRAFT_TOKEN_IDS)]); // Update the predecessor of the draft tokens, shape: [batchSize, mNumEagleLayers, maxDecodingDraftTokens x // maxDecodingDraftTokens] auto pluginOutputAllLayersDraftTokenIdsPredecessor = reinterpret_cast(outputs[getIdx(OutputIdxEntry::OUTPUT_ALL_LAYERS_DRAFT_TOKEN_IDS_PREDECESSOR)]); ////////////////////////////////////////// Get workspaces ////////////////////////////////////////// int8_t* workspaceBytePtr = reinterpret_cast(workspace); size_t offset{0}; // Workspace 0: Sampling workspace. // Treat numInputLogits as batchSize auto const samplingWorkspaceSize = getTopKWorkspaceSize(numInputLogits, /* maxTokensPerStep */ 1, /* maxTopK */ maxTopK, vocabSizePadded); void* workspaceSampling = reinterpret_cast(tc::nextWorkspacePtr(workspaceBytePtr, offset, samplingWorkspaceSize)); // Workspace 1: Topks tensor: shape [numInputLogits] SizeType32* topKs = reinterpret_cast( tc::nextWorkspacePtr(workspaceBytePtr, offset, numInputLogits * sizeof(SizeType32))); // Workspace 2: topKOffset tensor: shape: [batchSize], number of nodes that have successors for each requests SizeType32* topKOffset = reinterpret_cast(tc::nextWorkspacePtr(workspaceBytePtr, offset, batchSize * sizeof(SizeType32))); // Workspace 3: logits pointers tensor: shape: [numInputLogits] T const** logitsPtrs = reinterpret_cast(tc::nextWorkspacePtr(workspaceBytePtr, offset, numInputLogits * sizeof(T*))); // Workspace 4: outputIds pointers tensor: shape [numInputLogits], each points to a [maxDecodingDraftTokens] buffer TokenIdType** firstTopKOutputIdsPtrs = reinterpret_cast( tc::nextWorkspacePtr(workspaceBytePtr, offset, numInputLogits * sizeof(TokenIdType*))); // Workspace 5: outputIds tensor: flatten outputIds, shape [numInputLogits * maxDecodingDraftTokens] TokenIdType* firstTopKOutputIdsFlatten = reinterpret_cast( tc::nextWorkspacePtr(workspaceBytePtr, offset, numInputLogits * maxDecodingDraftTokens * sizeof(TokenIdType))); // Workspace 6: number of successors for each nodes tensor: shape [batchSize * maxDecodingTokens] SizeType32* numSuccessorsForEachNode = reinterpret_cast( tc::nextWorkspacePtr(workspaceBytePtr, offset, batchSize * maxDecodingTokens * sizeof(SizeType32))); // Workspace 7: skip decoding mask [numInputLogits] bool* skipDecode = reinterpret_cast(tc::nextWorkspacePtr(workspaceBytePtr, offset, numInputLogits * sizeof(bool))); // In Eagle-1, we do not need to return logProbs float* firstTopKOutputLogProbs = nullptr; if (useDynamicTree) { // Workspace 8. The output logProbs of the first topK sampling. // Which will be updated with the previous layer's scores (i.e., pluginInputPrevScores), and will be treat as // the input of the second topK sampling. For mLayerIdx == 0, shape: [numInputLogits(batchSize), // maxDecodingDraftTokens] For mLayerIdx > 0, shape: [numInputLogits(batchSize * dynamicTreeMaxTopK), // maxDecodingDraftTokens] firstTopKOutputLogProbs = reinterpret_cast( tc::nextWorkspacePtr(workspaceBytePtr, offset, numInputLogits * maxDecodingDraftTokens * sizeof(float))); } SizeType32 const secondTopKVocabSize = dynamicTreeMaxTopK * maxDecodingDraftTokens; // Workspace 9: Sampling from [batchSize, dynamicTreeMaxTopK * maxDecodingDraftTokens] to [batchSize, // dynamicTreeMaxTopK] auto const secondTopKSamplingWorkspaceSize = getTopKWorkspaceSize(batchSize, /* maxTokensPerStep */ 1, /* maxTopK */ maxTopK, secondTopKVocabSize); void* workspaceScoresSampling = reinterpret_cast(tc::nextWorkspacePtr(workspaceBytePtr, offset, secondTopKSamplingWorkspaceSize)); // Workspace 10: the second (scores) sampling's outputIds, shape: [batchSize, maxDecodingDraftTokens] TokenIdType* secondTopKOutputIdsFlatten = reinterpret_cast( tc::nextWorkspacePtr(workspaceBytePtr, offset, batchSize * maxDecodingDraftTokens * sizeof(TokenIdType))); // Workspace 11: the second (scores) sampling's outputIdsPtrs TokenIdType** secondTopKOutputIdsPtrs = reinterpret_cast( tc::nextWorkspacePtr(workspaceBytePtr, offset, batchSize * sizeof(TokenIdType*))); // Workspace 12: input scores pointers float** secondTopKInputScoresPtrs = reinterpret_cast(tc::nextWorkspacePtr(workspaceBytePtr, offset, batchSize * sizeof(float*))); // Workspace 13: the second sampling's outputLogProbs float* secondTopKOutputLogProbs = reinterpret_cast( tc::nextWorkspacePtr(workspaceBytePtr, offset, batchSize * maxDecodingDraftTokens * sizeof(float))); // Workspace 14: The input scores pointers of the third topK sampling, shape [batchSize] float** thirdTopKInputScoresPtrs = reinterpret_cast(tc::nextWorkspacePtr(workspaceBytePtr, offset, batchSize * sizeof(float*))); // Workspace 15: The output of the third topK sampling, shape [batchSize, maxDecodingDraftTokens] TokenIdType* thirdTopKOutputIds = reinterpret_cast( tc::nextWorkspacePtr(workspaceBytePtr, offset, batchSize * maxDecodingDraftTokens * sizeof(TokenIdType))); // Workspace 16: The output pointers of the third topK sampling, shape [batchSize] TokenIdType** thirdTopKOutputIdsPtrs = reinterpret_cast( tc::nextWorkspacePtr(workspaceBytePtr, offset, batchSize * sizeof(TokenIdType*))); // The number of draft tokens among all layers long const totalNumDraftTokensForAllLayers = (mNumEagleLayers - 1) * dynamicTreeMaxTopK * dynamicTreeMaxTopK + dynamicTreeMaxTopK; auto const thridTopKSamplingWorkspaceSize = getTopKWorkspaceSize( batchSize, /* maxTokensPerStep */ 1, /* maxTopK */ maxDecodingDraftTokens, totalNumDraftTokensForAllLayers); // Workspace 17: The workspace of the third topK sampling void* workspaceThirdTopKSampling = reinterpret_cast(tc::nextWorkspacePtr(workspaceBytePtr, offset, thridTopKSamplingWorkspaceSize)); // Workspace 18. Eagle-2, the topKs for each request in the third topK sampling, shape [batchSize] // The real topK value is min(maxDecodingDraftTokens, totalNumDraftTokensForAllLayers) SizeType32* thirdTopKs = reinterpret_cast(tc::nextWorkspacePtr(workspaceBytePtr, offset, batchSize * sizeof(SizeType32))); ////////////////////////////////////////// Main logic ////////////////////////////////////////// // Fill logitsPtrs from plugin input logits // And fill firstTopKOutputIdsPtrs from firstTopKOutputIdsFlatten invokeAssembleDraftLogitsOffsets(logitsPtrs, pluginInputLogits, firstTopKOutputIdsPtrs, firstTopKOutputIdsFlatten, skipDecode, numValidLogits, numInputLogits, batchSize, maxDecodingDraftTokens, vocabSizePadded, stream); sync_check_cuda_error(stream); if (useDynamicTree) { // For Eagle-2, the topK value between different requests are the same, all set to 'dynamicTreeMaxTopK'. invokeSetTopKsFromDyanmicTreeMaxTopK( mLayerIdx, batchSize, numInputLogits, topKs, topKOffset, dynamicTreeMaxTopK, numValidLogits, stream); sync_check_cuda_error(stream); // Do softmax for the input logits // We set the 'batchSize' and 'maxBatchSize' to 'numInputLogits', while 'numInputLogits' logits may contain // some padding logits, which do not need to be calculated. // We use 'skipDecode' list to skip these padding logits. This could avoid redundant calculations. BiasSoftmaxParams biasSoftmaxParams; biasSoftmaxParams.logits = const_cast(pluginInputLogits); biasSoftmaxParams.logitsPtrs = nullptr; biasSoftmaxParams.probs = const_cast(pluginInputLogits); biasSoftmaxParams.maxBeamWidth = 1; biasSoftmaxParams.batchSlots = nullptr; biasSoftmaxParams.batchSize = numInputLogits; biasSoftmaxParams.maxBatchSize = numInputLogits; biasSoftmaxParams.vocabSize = vocabSizePadded; biasSoftmaxParams.vocabSizePadded = vocabSizePadded; biasSoftmaxParams.skipSoftMax = false; biasSoftmaxParams.batchSlotsLogits = false; biasSoftmaxParams.skipDecode = skipDecode; biasSoftmaxParams.checkParams(); invokeAddBiasSoftMax(biasSoftmaxParams, stream); sync_check_cuda_error(stream); } else { // For Eagle-1, extract topK value from input path. invokeExtractTopKsFromPath(pluginInputPaths, topKs, topKOffset, numSuccessorsForEachNode, mLayerIdx, batchSize, maxDecodingTokens, maxPathLen, stream); sync_check_cuda_error(stream); } TopKSamplingKernelParams params{}; params.logProbsPtrs = logitsPtrs; // [numInputLogits][vocabSizePadded] params.outputIdsPtrs = firstTopKOutputIdsPtrs; // [numInputLogits][maxDecodingDraftTokens] params.workspace = workspaceSampling; params.maxTopK = maxTopK; params.topKs = topKs; // [numInputLogits] params.batchSize = numInputLogits; params.maxBatchSize = numInputLogits; params.maxTokensPerStep = 1; params.vocabSizePadded = vocabSizePadded; params.returnAllSelectedTokens = true; params.strictTopPBoundary = false; params.skipDecode = skipDecode; params.outputLogProbs = firstTopKOutputLogProbs; // [numInputLogits * maxDecodingDraftTokens] params.logitsHasProbs = true; invokeBatchTopKSampling(params, stream); sync_check_cuda_error(stream); if (useDynamicTree) { // When mLayerIdx == 0, we do not need to update scores. // We take the outputLogProbs of the first topK sampling as the scores directly. if (mLayerIdx != 0) { // Update firstTopKOutputLogProbs with pluginInputPrevScores, which is the scores from the previous layer invokeUpdateScores(batchSize, dynamicTreeMaxTopK, maxDecodingDraftTokens, firstTopKOutputLogProbs, pluginInputPrevScores, stream); sync_check_cuda_error(stream); // Do the second top-dynamicTreeMaxTopK sampling among this dynamicTreeMaxTopK x dynamicTreeMaxTopK draft // tokens. Through the second topK sampling, we obtain the dynamicTreeMaxTopK output draft tokens of this // layer. // Although theoretically we only need to select 'dynamicTreeMaxTopK' draft tokens from 'dynamicTreeMaxTopK // * dynamicTreeMaxTopK' draft tokens, we over-set vocabSize here. This is because when we write the scores // into firstTopKOutputLogProbs, we store it in the form of [batchSize * dynamicTreeMaxTopK, // maxDecodingDraftTokens]. For each request, these 'dynamicTreeMaxTopK * dynamicTreeMaxTopK' scores are not // saved continuously, but in the format of [dynamicTreeMaxTopK, maxDecodingDraftTokens]. For unused // positions, we set '-inf' to ensure that they will not be sampled. Examples: For a request, // dynamicTreeMaxTopK == 3, the scores in its buffer ([dynamicTreeMaxTopK, maxDecodingDraftTokens]) are as // follow: // [[1.1, 2.2, 3.3, -inf, -inf, ...], // [4.4, 5.5, 6.6, -inf, -inf, ...], // [7.7, 8.8, 9.9, -inf, -inf, ...]] // Prepare the input of the second topK sampling. invokeAssembleSecondTopKSamplingInputs(batchSize, dynamicTreeMaxTopK, maxDecodingDraftTokens, firstTopKOutputLogProbs, secondTopKInputScoresPtrs, secondTopKOutputIdsFlatten, secondTopKOutputIdsPtrs, stream); sync_check_cuda_error(stream); TopKSamplingKernelParams params{}; params.logProbsPtrs = secondTopKInputScoresPtrs; params.outputIdsPtrs = secondTopKOutputIdsPtrs; params.workspace = workspaceScoresSampling; params.maxTopK = maxTopK; // Same to maxDecodingTokens params.topKs = topKs; // [batchSize], all set to dynamicTreeMaxTopK params.batchSize = batchSize; params.maxBatchSize = batchSize; params.maxTokensPerStep = 1; params.vocabSizePadded = secondTopKVocabSize; params.returnAllSelectedTokens = true; params.strictTopPBoundary = false; invokeBatchTopKSampling(params, stream); sync_check_cuda_error(stream); } // Copy this layer's scores and draft tokensId: // 1) Copy this layer's scores to pluginOutputAllLayersScores // 2) Copy dynamicTreeMaxTopK (or dynamicTreeMaxTopK * dynamicTreeMaxTopK) draft tokens to // pluginOutputAllLayersDraftTokenIds 3) Set the predecessors of these draft tokens and save to // pluginOutputAllLayersDraftTokenIdsPredecessor, // which will be used to reconstruct the final output tree at the last layer invokeCopyScoresAndDraftTokenIds(mLayerIdx, mNumEagleLayers, maxDecodingDraftTokens, batchSize, dynamicTreeMaxTopK, pluginInputCurrentExpandIndices, // The indices of the nodes that expand in this layer (i.e., the input // logits). The index is related to the final tree. pluginInputAllLayersScores, pluginInputAllLayersDraftTokenIds, pluginInputAllLayersDraftTokenIdsPredecessor, pluginOutputAllLayersScores, pluginOutputAllLayersDraftTokenIds, pluginOutputAllLayersDraftTokenIdsPredecessor, firstTopKOutputLogProbs, // This layer's scores firstTopKOutputIdsFlatten, // This layer's draft tokens stream); sync_check_cuda_error(stream); // Update Path // For mLayerIdx == 0, the output of the first topK sampling are the output draft tokens of this layers. The // update logic is simple. For mLayerIdx > 0, the output of the second topK sampling are the output draft tokens // of this layers. 'secondTopKOutputIdsPtrs' contains the top-dynamicTreeMaxTopK selected from the second topK // sampling. 'pluginOutputNextExpandIndices' record the selected the top-dynamicTreeMaxTopK draft token's Id of // this layer, // which will be used in the next layer to compute the predecessors. // The last layer will completely reconstruct the paths, so there is no need to update the paths here. if (mLayerIdx != mNumEagleLayers - 1) { invokeUpdatePath(mLayerIdx, batchSize, dynamicTreeMaxTopK, maxDecodingTokens, maxPathLen, pluginInputPaths, pluginOutputPaths, secondTopKOutputIdsPtrs, // if mLayerIdx == 0, secondTopKOutputIdsPtrs == nullptr, and it's useless // during update paths pluginOutputNextExpandIndices, stream); sync_check_cuda_error(stream); } if (mLayerIdx != 0) { // We will extract the real draft tokenIds and scores from 'firstTopKOutputIdsFlatten' and // 'secondTopKInputScoresPtrs' according to the 'secondTopKOutputIdsPtrs'. And store them into // 'secondTopKOutputIdsPtrs' and 'secondTopKOutputLogProbs' (reuse these buffers). // secondTopKInputScoresPtrs: shape [batchSize * dynamicTreeMaxTopK, maxDecodingDraftTokens] // The original scores, which were used to do the second TopK sampling // secondTopKOutputIdsPtrs: shape [batchSize], each points to a [maxDecodingDraftTokens] buffer // The output of the second TopK sampling, which are the indices of the top-dynamicTreeMaxTopK among // 'dynamicTreeMaxTopK * dynamicTreeMaxTopK'. We need to figure out what these top-dynamicTreeMaxTopK // draft tokens' real tokenIds. // firstTopKOutputIdsFlatten: shape [batchSize * dynamicTreeMaxTopK, maxDecodingDraftTokens] // The value are related to the vocabSize, which is the real tokenIds. invokeExtractScoresAndRealDraftTokensIds(batchSize, dynamicTreeMaxTopK, maxDecodingDraftTokens, secondTopKInputScoresPtrs, secondTopKOutputIdsPtrs, firstTopKOutputIdsFlatten, secondTopKOutputLogProbs, stream); sync_check_cuda_error(stream); } // Copy this layer's output draft tokens and scores. // This layer's output scores is next layer's previous scores. // if mLayerIdx == 0, directly use the first topK's outputIds / logProbs as this layer's output draft tokens / // scores if mLayerIdx > 0, we use the second topK's outputIds / logProbs, // which is updated with the real draft tokenIds / logprobs in 'invokeExtractScoresAndRealDraftTokensIds' invokeUpdateDraftTokensAndLensAndCurScores(mLayerIdx, batchSize, dynamicTreeMaxTopK, maxDecodingDraftTokens, mLayerIdx == 0 ? firstTopKOutputIdsPtrs : secondTopKOutputIdsPtrs, pluginInputDraftTokenIds, pluginInputDraftLens, pluginOutputDraftTokenIds, pluginOutputDraftLens, mLayerIdx == 0 ? firstTopKOutputLogProbs : secondTopKOutputLogProbs, pluginOutputCurrentScores, stream); sync_check_cuda_error(stream); if (mLayerIdx == mNumEagleLayers - 1) { // The maximum number of nodes on the final tree (exclude the root node) auto const maxNodesOnFinalTree = std::min(maxDecodingDraftTokens, totalNumDraftTokensForAllLayers); // When reach the last EagleNet, we need to do the third sampling, which take all layers' draft tokens and // scores as input, and then select top-maxDecodingDraftTokens draft tokens among them. We need to // reconstruct the path/tree after the third topK sampling. invokeAssembleThridTopKSamplingInputs(batchSize, maxDecodingDraftTokens, mNumEagleLayers, maxNodesOnFinalTree, thirdTopKs, pluginOutputAllLayersScores, thirdTopKInputScoresPtrs, thirdTopKOutputIds, thirdTopKOutputIdsPtrs, stream); sync_check_cuda_error(stream); // 1) Do topK sampling among all previous draft tokens TopKSamplingKernelParams params{}; params.logProbsPtrs = thirdTopKInputScoresPtrs; params.outputIdsPtrs = thirdTopKOutputIdsPtrs; params.workspace = workspaceThirdTopKSampling; params.topKs = thirdTopKs; // All set to 'maxNodesOnFinalTree' params.maxTopK = maxDecodingDraftTokens; // We set maxTopK to 'maxDecodingDraftTokens' to align the // outputIdsPtrs offsets when written back. params.batchSize = batchSize; params.maxBatchSize = batchSize; params.maxTokensPerStep = 1; params.vocabSizePadded = totalNumDraftTokensForAllLayers; params.returnAllSelectedTokens = true; params.strictTopPBoundary = false; // Make sure to select topK tokens. invokeBatchTopKSampling(params, stream); sync_check_cuda_error(stream); // 2) Reconstruct the Path invokeReconstructFinalPath(batchSize, dynamicTreeMaxTopK, maxDecodingDraftTokens, maxDecodingTokens, maxPathLen, mNumEagleLayers, maxNodesOnFinalTree, thirdTopKOutputIdsPtrs, pluginOutputAllLayersDraftTokenIdsPredecessor, pluginOutputPaths, stream); sync_check_cuda_error(stream); // 3) Copy this layer's outputIds to outputDraftTokenIds invokeCopyFinalDraftTokens(batchSize, maxDecodingDraftTokens, mNumEagleLayers, maxNodesOnFinalTree, thirdTopKOutputIdsPtrs, pluginOutputAllLayersDraftTokenIds, pluginOutputDraftTokenIds, pluginOutputDraftLens, stream); sync_check_cuda_error(stream); } } else { // Eagle-1: Copy output token id from outputIdsPtrs to the plugin output buffer invokeCopyOutputTokensIds(firstTopKOutputIdsPtrs, topKs, topKOffset, pluginInputDraftTokenIds, pluginInputDraftLens, numValidLogits, pluginOutputDraftTokenIds, pluginOutputDraftLens, mLayerIdx, batchSize, maxDecodingDraftTokens, pluginInputPaths, pluginOutputPaths, maxPathLen, stream); sync_check_cuda_error(stream); } TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } template void EagleDecodeDraftTokensPlugin::enqueueType(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); // TODO split batch into greedy and non-greedy and execute both paths if (mTopKSampling) { doTopKSampling(inputDesc, outputDesc, inputs, outputs, workspace, stream); } else { // TODO fill me TLLM_CHECK_WITH_INFO(false, "Multinomial sampling is not supported yet"); } TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } int EagleDecodeDraftTokensPlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept { auto const logitsType = inputDesc[getIdx(InputIdxEntry::LOGITS)].type; if (logitsType == nvinfer1::DataType::kFLOAT) { enqueueType(inputDesc, outputDesc, inputs, outputs, workspace, stream); } else if (logitsType == nvinfer1::DataType::kHALF) { enqueueType<__half>(inputDesc, outputDesc, inputs, outputs, workspace, stream); } else { TLLM_CHECK_WITH_INFO(false, "Unsupported logits type"); } return 0; } // IPluginV2Ext Methods nvinfer1::DataType EagleDecodeDraftTokensPlugin::getOutputDataType( int index, nvinfer1::DataType const* inputTypes, int nbInputs) const noexcept { TLLM_CHECK(index < getNbOutputs()); TLLM_CHECK(index < getNbOutputs()); if (index == getIdx(OutputIdxEntry::OUTPUT_ALL_LAYERS_SCORES) || index == getIdx(OutputIdxEntry::OUTPUT_CURRENT_SCORES)) { // Only output_prev_socres are float return inputTypes[getIdx(InputIdxEntry::INPUT_ALL_LAYERS_SCORES)]; } else { // output_draft_token_ids, output_draft_lens, output_paths, output_next_expand_index, // output_all_layers_draft_token_ids, output_all_layers_draft_token_ids_predecessor // are all int32 type, same as path return inputTypes[getIdx(InputIdxEntry::PATHS)]; } } // IPluginV2 Methods char const* EagleDecodeDraftTokensPlugin::getPluginType() const noexcept { return EAGLE_DECODE_DRAFT_TOKENS_PLUGIN_NAME; } char const* EagleDecodeDraftTokensPlugin::getPluginVersion() const noexcept { return EAGLE_DECODE_DRAFT_TOKENS_PLUGIN_VERSION; } int EagleDecodeDraftTokensPlugin::getNbOutputs() const noexcept { return 8; } int EagleDecodeDraftTokensPlugin::initialize() noexcept { return 0; } void EagleDecodeDraftTokensPlugin::terminate() noexcept {} size_t EagleDecodeDraftTokensPlugin::getSerializationSize() const noexcept { return sizeof(mDtype) + sizeof(mLayerIdx) + sizeof(mNumEagleLayers) + sizeof(mTopKSampling); } void EagleDecodeDraftTokensPlugin::serialize(void* buffer) const noexcept { char *d = static_cast(buffer), *a = d; write(d, mDtype); write(d, mLayerIdx); write(d, mNumEagleLayers); write(d, mTopKSampling); TLLM_CHECK(d == a + getSerializationSize()); } void EagleDecodeDraftTokensPlugin::destroy() noexcept { // This gets called when the network containing plugin is destroyed delete this; } /////////////// EagleDecodeDraftTokensPluginCreator::EagleDecodeDraftTokensPluginCreator() { // Fill PluginFieldCollection with PluginField arguments metadata mPluginAttributes.clear(); mPluginAttributes.emplace_back(PluginField("type_id", nullptr, PluginFieldType::kINT32)); mPluginAttributes.emplace_back(PluginField("layer_idx", nullptr, PluginFieldType::kINT32)); mPluginAttributes.emplace_back(PluginField("num_eagle_layers", nullptr, PluginFieldType::kINT32)); mPluginAttributes.emplace_back(PluginField("top_k_sampling", nullptr, PluginFieldType::kINT32)); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } char const* EagleDecodeDraftTokensPluginCreator::getPluginName() const noexcept { return EAGLE_DECODE_DRAFT_TOKENS_PLUGIN_NAME; } char const* EagleDecodeDraftTokensPluginCreator::getPluginVersion() const noexcept { return EAGLE_DECODE_DRAFT_TOKENS_PLUGIN_VERSION; } PluginFieldCollection const* EagleDecodeDraftTokensPluginCreator::getFieldNames() noexcept { return &mFC; } IPluginV2* EagleDecodeDraftTokensPluginCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept { PluginField const* fields = fc->fields; int32_t layerIdx{}; int32_t numEagleLayers{}; nvinfer1::DataType type{}; bool topKSampling{}; // Read configurations from each fields for (int i = 0; i < fc->nbFields; ++i) { char const* attrName = fields[i].name; if (!strcmp(attrName, "layer_idx")) { TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); layerIdx = *static_cast(fields[i].data); } else if (!strcmp(attrName, "num_eagle_layers")) { TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); numEagleLayers = *static_cast(fields[i].data); } else if (!strcmp(attrName, "type_id")) { TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "top_k_sampling")) { TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); topKSampling = static_cast(*static_cast(fields[i].data)); } } try { auto* obj = new EagleDecodeDraftTokensPlugin(type, layerIdx, numEagleLayers, topKSampling); obj->setPluginNamespace(mNamespace.c_str()); return obj; } catch (std::exception const& e) { caughtError(e); } return nullptr; } IPluginV2* EagleDecodeDraftTokensPluginCreator::deserializePlugin( char const* name, void const* serialData, size_t serialLength) noexcept { // This object will be deleted when the network is destroyed, which will // call EagleDecodeDraftTokensPlugin::destroy() try { auto* obj = new EagleDecodeDraftTokensPlugin(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); return obj; } catch (std::exception const& e) { caughtError(e); } return nullptr; }