[None] [refactor] Minor cleanup and improvements (#7619)

Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
Robin Kobus 2025-10-03 11:40:06 +02:00 committed by GitHub
parent ba3dbb6c94
commit e2f69c5c23
9 changed files with 38 additions and 49 deletions


@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,7 +20,7 @@
#include "tensorrt_llm/batch_manager/common.h"
#include "tensorrt_llm/common/algorithm.h"
#include "tensorrt_llm/common/optionalRef.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/modelConfig.h"
@@ -28,11 +28,7 @@
namespace tensorrt_llm::runtime
{
class DecodingInput;
class DecodingOutput;
class GptDecoderBatched;
class SamplingConfig;
class SpeculativeDecodingMode;
namespace decoder
{
@@ -56,10 +52,6 @@ public:
using CudaStream = tensorrt_llm::runtime::CudaStream;
using TensorPtr = runtime::ITensor::SharedPtr;
using SharedConstPtr = runtime::ITensor::SharedConstPtr;
using DecodingInput = runtime::DecodingInput;
using DecodingOutput = runtime::DecodingOutput;
using SpeculativeDecodingMode = runtime::SpeculativeDecodingMode;
using GptDecoderBatched = runtime::GptDecoderBatched;
template <typename T>
using OptionalRef = tensorrt_llm::common::OptionalRef<T>;
@@ -70,7 +62,7 @@ public:
{
}
std::tuple<TensorPtr, std::vector<runtime::SamplingConfig>, std::vector<runtime::ITensor::SharedConstPtr>,
[[nodiscard]] std::tuple<TensorPtr, std::vector<SamplingConfig>, std::vector<SharedConstPtr>,
std::vector<executor::LookaheadDecodingConfig>>
operator()(runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
executor::DecodingConfig const& decodingConfig, RequestVector const& contextRequests,
@@ -78,8 +70,7 @@ public:
CudaStream const& runtimeStream, CudaStream const& decoderStream, SizeType32 maxSequenceLength,
SizeType32 beamWidth, OptionalRef<MedusaBuffers const> medusaBuffers) const;
[[nodiscard]] std::tuple<std::vector<runtime::ITensor::SharedConstPtr>,
std::vector<executor::LookaheadDecodingConfig>>
[[nodiscard]] std::tuple<std::vector<SharedConstPtr>, std::vector<executor::LookaheadDecodingConfig>>
createDecoderRequests(RequestVector const& finishedContextRequests, TensorPtr const& inputIds,
executor::DecodingConfig const& decodingConfig, runtime::decoder::DecoderState& decoderState,
nvinfer1::DataType logitsType, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,


@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -29,6 +29,8 @@
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstring>
#include <list>
#include <memory>
#include <optional>
#include <utility>
@@ -56,9 +58,9 @@ enum class LlmRequestState : int32_t
/// used in layer-wise transmission
kDISAGG_GENERATION_TRANS_COMPLETE = 12, ///< Kv cache transmission are finished
kGENERATION_IN_PROGRESS = 13, ///< Generation phase is in progress
kGENERATION_TO_COMPLETE = 14, ///< Generation phase is to be completed
// schedulable states ends
kGENERATION_TO_COMPLETE = 14, ///< Generation phase is to be completed
kGENERATION_COMPLETE = 20, ///< Generation phase completed
kDISAGG_CONTEXT_TRANS_IN_PROGRESS = 21, ///< Waiting context-only request transmitting the kv cache,
/// after computation finished
@@ -1075,7 +1077,6 @@ public:
TLLM_CHECK_WITH_INFO(prepopulatedPromptLen < promptLen,
"Invalid state: prepopulatedPromptLen (%d) >= promptLen (%d) for request %lu", prepopulatedPromptLen,
promptLen, mRequestId);
TLLM_CHECK(prepopulatedPromptLen < promptLen);
auto& prePromptLen = mUseDraftModel ? mPrepopulatedPromptLenDraft : mPrepopulatedPromptLenTarget;
auto& contextCurrentPosition = mUseDraftModel ? mContextCurrentPositionDraft : mContextCurrentPositionTarget;
@@ -1116,9 +1117,9 @@ public:
mDraftLogits = draftLogits;
}
[[nodiscard]] SizeType32 getNumDraftTokens() const
[[nodiscard]] SizeType32 getNumDraftTokens() const noexcept
{
return hasDraftTokens() ? mDraftTokens->size() : 0;
return hasDraftTokens() ? static_cast<SizeType32>(mDraftTokens->size()) : 0;
}
void discardDraftTokens(SizeType32 numTokensToDiscard)
@@ -1379,17 +1380,17 @@ public:
mGenerationLogitsFragments.push_back(genLogits);
}
SizeType32 getGenerationLogitsFragmentsSize()
[[nodiscard]] SizeType32 getGenerationLogitsFragmentsSize() const noexcept
{
return mGenerationLogitsFragments.size();
return static_cast<SizeType32>(mGenerationLogitsFragments.size());
}
void clearGenerationLogitsFragments()
void clearGenerationLogitsFragments() noexcept
{
mGenerationLogitsFragments.clear();
}
bool hasAdditionalOutputs()
[[nodiscard]] bool hasAdditionalOutputs() const noexcept
{
return !mAdditionalContextOutputTensors.empty() || !mAdditionalGenerationOutputTensors.empty();
}
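
A pattern runs through the two hunks above: cheap observers are marked [[nodiscard]], const and noexcept, and the size_t-to-SizeType32 narrowing is made explicit rather than implicit. A minimal stand-alone sketch of that pattern follows; the class and member names are illustrative and not part of TensorRT-LLM.

#include <cstdint>
#include <vector>

using SizeType32 = std::int32_t;

class FragmentStore // illustrative only, not a TensorRT-LLM type
{
public:
    // const + noexcept: a pure observer that cannot throw; [[nodiscard]] turns an
    // accidentally ignored result into a compiler warning.
    [[nodiscard]] SizeType32 size() const noexcept
    {
        // The explicit cast documents the size_t -> int32 narrowing instead of
        // leaving it to an implicit, warning-prone conversion.
        return static_cast<SizeType32>(mItems.size());
    }

    [[nodiscard]] bool empty() const noexcept
    {
        return mItems.empty();
    }

private:
    std::vector<int> mItems;
};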


@@ -1478,7 +1478,8 @@ private:
class ExecutorConfig
{
public:
static constexpr uint64_t kDefaultMaxSeqIdleMicroseconds = 180000000;
static constexpr uint64_t kDefaultMaxSeqIdleMicroseconds
= std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::minutes(3)).count();
static constexpr SizeType32 kDefaultIterStatsMaxIterations = 1000;
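
The replacement above spells the three-minute default out with <chrono> instead of a raw microsecond count. A small sketch, outside the patch, confirming at compile time that the expression equals the former literal:

#include <chrono>
#include <cstdint>

static constexpr uint64_t kDefaultMaxSeqIdleMicroseconds
    = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::minutes(3)).count();

// 3 min = 180 s = 180,000,000 us, i.e. exactly the old magic number.
static_assert(kDefaultMaxSeqIdleMicroseconds == 180000000ULL, "3 minutes in microseconds");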


@@ -19,7 +19,6 @@
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/speculativeDecodingModule.h"
#include <memory>
namespace tensorrt_llm::runtime
{
@@ -29,7 +28,6 @@ class LookaheadModule : public SpeculativeDecodingModule
public:
explicit LookaheadModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens) noexcept
: SpeculativeDecodingModule(maxDraftPathLen, maxDecodingDraftTokens, maxDecodingDraftTokens)
, mExecutionConfig()
{
}
@@ -43,7 +41,7 @@ public:
mExecutionConfig = config;
}
executor::LookaheadDecodingConfig const getExecutionConfig() const
[[nodiscard]] executor::LookaheadDecodingConfig const& getExecutionConfig() const
{
return mExecutionConfig;
}
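
Returning the config as a const value still copies it; the new signature hands out a const reference to the stored member and adds [[nodiscard]]. A generic sketch of that getter shape (class and member names are illustrative, not from the library), with the usual caveat that the returned reference must not outlive the owning object:

#include <string>
#include <utility>

class ModuleSketch // illustrative only
{
public:
    void setConfig(std::string config)
    {
        mConfig = std::move(config);
    }

    // No copy on access; [[nodiscard]] flags call sites that drop the value.
    [[nodiscard]] std::string const& getConfig() const
    {
        return mConfig;
    }

private:
    // Default-initialized on its own, which is why an explicit ", mConfig()" in a
    // constructor init list (as removed above for mExecutionConfig) adds nothing.
    std::string mConfig;
};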


@@ -21,6 +21,7 @@
#include "tensorrt_llm/runtime/lookaheadModule.h"
#include "tensorrt_llm/runtime/loraModule.h"
#include "tensorrt_llm/runtime/speculativeDecodingMode.h"
#include "tensorrt_llm/runtime/speculativeDecodingModule.h"
#include <NvInferRuntime.h>
#include <array>


@@ -39,7 +39,6 @@ using namespace tensorrt_llm::runtime;
namespace tc = tensorrt_llm::common;
namespace te = tensorrt_llm::executor;
namespace tk = tensorrt_llm::kernels;
namespace tr = tensorrt_llm::runtime;
namespace tensorrt_llm::batch_manager


@@ -39,8 +39,8 @@ public:
if (offset + size > mBuffer->getSize())
{
throw std::out_of_range(std::string("slice ") + std::to_string(offset + size) + " exceeds buffer size "
+ std::to_string(mBuffer->getSize()));
throw std::out_of_range(std::string("offset ") + std::to_string(offset) + std::string(" + size ")
+ std::to_string(size) + " exceeds buffer size " + std::to_string(mBuffer->getSize()));
}
}
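
The reworded exception above reports offset and size separately, so a failing slice can be reconstructed from the message alone. A stand-alone sketch of the same kind of bounds check, using a hypothetical helper rather than the BufferView class itself:

#include <cstddef>
#include <stdexcept>
#include <string>

// Hypothetical helper mirroring the check above: the message names offset,
// size and capacity individually instead of only their sum.
inline void checkSlice(std::size_t offset, std::size_t size, std::size_t capacity)
{
    if (offset + size > capacity)
    {
        throw std::out_of_range("offset " + std::to_string(offset) + " + size " + std::to_string(size)
            + " exceeds buffer size " + std::to_string(capacity));
    }
}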


@@ -284,8 +284,8 @@ void verifyOutput(RequestList const& finishedRequestList,
}
// Pick a different endId at random from one of the expected tokens
std::vector<TokenIdType> pickRandomEndIds(TestData const& testData, TrtGptModelType const& modelType,
std::vector<SizeType32> const& givenInputLengths, SizeType32 const maxNewTokens, bool replaceLogits)
std::vector<TokenIdType> pickRandomEndIds(TestData const& testData, std::vector<SizeType32> const& givenInputLengths,
SizeType32 const maxNewTokens, bool replaceLogits)
{
auto const nbGivenInputs = testData.nbGivenInputs;
auto const beamWidth = testData.beamWidth;
@@ -328,9 +328,9 @@ std::vector<TokenIdType> pickRandomEndIds(TestData const& testData, TrtGptModelT
return endIds;
}
TestData loadTestData(ModelSpec const& modelSpec, TrtGptModelType const& modelType, ModelIds const modelIds,
BeamResult const& beamResult, ITensor const& givenInput, SizeType32 const maxBeamWidth, bool const useRandomEndId,
bool const replaceLogits, BufferManager& manager)
TestData loadTestData(ModelSpec const& modelSpec, ModelIds const modelIds, BeamResult const& beamResult,
ITensor const& givenInput, SizeType32 const maxBeamWidth, bool const useRandomEndId, bool const replaceLogits,
BufferManager& manager)
{
auto const [givenInputLengths, nbGivenInputs, maxInputLength] = getGivenInputLengths(givenInput, modelIds.padId);
auto const& [beamWidth, resultsFile, contextLogitsFile, genLogitsFile, cumLogProbsFile, logProbsFile] = beamResult;
@@ -353,7 +353,7 @@ TestData loadTestData(ModelSpec const& modelSpec, TrtGptModelType const& modelTy
if (useRandomEndId)
{
testData.endIds = pickRandomEndIds(testData, modelType, givenInputLengths, maxNewTokens, replaceLogits);
testData.endIds = pickRandomEndIds(testData, givenInputLengths, maxNewTokens, replaceLogits);
}
else
{
@@ -409,9 +409,8 @@ TestData loadTestData(ModelSpec const& modelSpec, TrtGptModelType const& modelTy
}
std::tuple<std::vector<SizeType32>, std::unordered_map<SizeType32, TestData>> loadTestData(ModelSpec const& modelSpec,
TrtGptModelType const& modelType, ModelIds const modelIds, BeamResults const& resultsFilesBeamWidths,
ITensor const& givenInput, SizeType32 const maxBeamWidth, bool const useRandomEndId, bool const replaceLogits,
BufferManager& manager)
ModelIds const modelIds, BeamResults const& resultsFilesBeamWidths, ITensor const& givenInput,
SizeType32 const maxBeamWidth, bool const useRandomEndId, bool const replaceLogits, BufferManager& manager)
{
// Map between beam width, and expected results for that beam width
std::unordered_map<SizeType32, TestData> beamWidthTestData;
@@ -424,8 +423,8 @@ std::tuple<std::vector<SizeType32>, std::unordered_map<SizeType32, TestData>> lo
EXPECT_EQ(std::find(beamWidths.begin(), beamWidths.end(), beamWidth), beamWidths.end());
beamWidths.push_back(beamWidth);
auto testData = loadTestData(modelSpec, modelType, modelIds, beamResult, givenInput, maxBeamWidth,
useRandomEndId, replaceLogits, manager);
auto testData = loadTestData(
modelSpec, modelIds, beamResult, givenInput, maxBeamWidth, useRandomEndId, replaceLogits, manager);
beamWidthTestData.emplace(beamWidth, std::move(testData));
}
@@ -435,9 +434,8 @@ std::tuple<std::vector<SizeType32>, std::unordered_map<SizeType32, TestData>> lo
RequestList runGptModelInference(std::shared_ptr<TrtGptModel>& trtGptModel, std::vector<SizeType32> const& beamWidths,
std::unordered_map<SizeType32, TestData> const& beamWidthTestData, SizeType32 batchSize, SizeType32 nbGivenInputs,
SizeType32 maxInputLength, SizeType32 padId, std::vector<SizeType32> const& givenInputLengths,
TokenIdType const* givenInputData, ModelSpec const& modelSpec, TrtGptModelIfbTestType testType,
TrtGptModelType modelType, int maxReqPerStep, bool prepopulateKVCache, bool enableStreamingMode,
bool enableBlockReuse)
TokenIdType const* givenInputData, ModelSpec const& modelSpec, TrtGptModelIfbTestType testType, int maxReqPerStep,
bool prepopulateKVCache, bool enableStreamingMode, bool enableBlockReuse)
{
// Fill the requests using givenInput
// requestList will have batchSize requests
@@ -641,8 +639,8 @@ void runIfbTest(fs::path const& modelPath, ModelSpec const& modelSpec, ModelIds
auto const maxBeamWidth = executorConfig.getMaxBeamWidth();
// Load expected outputs for each beam width value
auto [beamWidths, beamWidthTestData] = loadTestData(modelSpec, modelType, modelIds, resultsFilesBeamWidths,
*givenInput, maxBeamWidth, useRandomEndId, modelSpec.mReplaceLogits, manager);
auto [beamWidths, beamWidthTestData] = loadTestData(modelSpec, modelIds, resultsFilesBeamWidths, *givenInput,
maxBeamWidth, useRandomEndId, modelSpec.mReplaceLogits, manager);
int const worldSize = modelSpec.mTPSize * modelSpec.mPPSize * modelSpec.mCPSize;
auto const worldConfig = WorldConfig::mpi(worldSize, modelSpec.mTPSize, modelSpec.mPPSize, modelSpec.mCPSize);
@@ -663,14 +661,14 @@ void runIfbTest(fs::path const& modelPath, ModelSpec const& modelSpec, ModelIds
// Prepopulate KV cache for speculative decoding test
bool const prepopulateKVCache = modelSpec.mMaxDraftTokens > 0;
auto finishedRequestList = runGptModelInference(trtGptModel, beamWidths, beamWidthTestData, batchSize,
nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType, modelType,
maxReqPerStep, prepopulateKVCache, enableStreamingMode, modelSpec.mKVCacheReuse);
nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType, maxReqPerStep,
prepopulateKVCache, enableStreamingMode, modelSpec.mKVCacheReuse);
if (prepopulateKVCache)
{
// Call the 2nd time with prefilled KV cache
finishedRequestList = runGptModelInference(trtGptModel, beamWidths, beamWidthTestData, batchSize,
nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType, modelType,
nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType,
maxReqPerStep, false, enableStreamingMode, modelSpec.mKVCacheReuse);
}


@@ -56,7 +56,7 @@ TEST_F(LlmRequestTest, fromExecutorRequest)
EXPECT_EQ(llmReq.getState(), tb::LlmRequestState::kCONTEXT_INIT);
EXPECT_FALSE(llmReq.mSeqSlot);
// No speculative decoding config, draft tokens should be empty
EXPECT_EQ(llmReq.getDraftTokens()->size(), 0);
EXPECT_EQ(llmReq.getNumDraftTokens(), 0);
EXPECT_FALSE(llmReq.getEmbeddingBias().has_value());
EXPECT_FALSE(llmReq.getBadWordsList().has_value());
EXPECT_FALSE(llmReq.getStopWordsList().has_value());