Update TensorRT-LLM (#2215)

* Update TensorRT-LLM

---------

Co-authored-by: Sherlock Xu <65327072+Sherlock113@users.noreply.github.com>
Kaiyu Xie 2024-09-10 18:21:22 +08:00 committed by GitHub
parent 78f5c2936b
commit 31ac30e928
272 changed files with 764351 additions and 84677 deletions

.gitmodules

@ -11,3 +11,6 @@
[submodule "3rdparty/NVTX"]
path = 3rdparty/NVTX
url = https://github.com/NVIDIA/NVTX.git
[submodule "3rdparty/ucxx"]
path = 3rdparty/ucxx
url = https://github.com/GuanLuo/ucxx.git

3rdparty/ucxx (new submodule)

@ -0,0 +1 @@
Subproject commit b99181779672965c6f325a95a29eb433b6e9cbbd


@ -17,6 +17,9 @@ TensorRT-LLM
<div align="left">
## Latest News
* [2024/09/04] 🏎️🏎️🏎️ Best Practices for Tuning TensorRT-LLM for Optimal Serving with BentoML
[➡️ link](https://www.bentoml.com/blog/tuning-tensor-rt-llm-for-optimal-serving-with-bentoml)
* [2024/08/20] 🏎SDXL with #TensorRT Model Optimizer ⏱️⚡ 🏁 cache diffusion 🏁 quantization aware training 🏁 QLoRA 🏁 #Python 3.12
[➡️ link](https://developer.nvidia.com/blog/nvidia-tensorrt-model-optimizer-v0-15-boosts-inference-performance-and-expands-model-support/)


@ -160,6 +160,10 @@ or manually set a max sequence length that you plan to run with specifically:
trtllm-bench --model meta-llama/Llama-2-7b-hf build --max_seq_len 256 --quantization FP8
```
> [!NOTE] `trtllm-bench build` reproduces benchmark engines for performance study. These engine
configurations are not guaranteed to be optimal for all cases and should be viewed as reproducers
for the benchmark data we provide on our [Performance Overview](../docs/source/performance/perf-overview.md).
Looking a little closer, the `build` sub-command
will perform a lookup and build an engine using those reference settings. The
lookup table directly corresponds to the performance table found in our


@ -157,7 +157,7 @@ struct BenchmarkParams
int randomSeed = 430;
std::optional<std::vector<int>> maxAttentionWindowVec{std::nullopt};
std::optional<int> sinkTokenLength{std::nullopt};
bool multiBlockMode{false};
bool multiBlockMode{true};
bool enableContextFMHAFP32Acc{false};
// lora / peft params
@ -1943,7 +1943,7 @@ int main(int argc, char* argv[])
options.add_options()("multi_block_mode",
"Distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel",
cxxopts::value<bool>()->default_value("false"));
cxxopts::value<bool>()->default_value("true"));
options.add_options()(
"encoder_engine_dir", "Directory that store the engines of the encoder models.", cxxopts::value<std::string>());


@ -381,7 +381,7 @@ endif()
# set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -G")
set(CMAKE_CXX_FLAGS
"${CMAKE_CXX_FLAGS} -DBUILD_SYSTEM=cmake_oss -DENABLE_MULTI_DEVICE=${ENABLE_MULTI_DEVICE}"
"${CMAKE_CXX_FLAGS} -DBUILD_SYSTEM=cmake_oss -DENABLE_MULTI_DEVICE=${ENABLE_MULTI_DEVICE} -DENABLE_UCX=${ENABLE_UCX}"
)
# Fix linking issue with TRT 10, the detailed description about `--mcmodel` can
@ -538,6 +538,30 @@ elseif(NOT WIN32)
message(STATUS "Build without PyTorch, USE_CXX11_ABI=${USE_CXX11_ABI}")
endif()
# Defer UCX/UCXX setup until after USE_CXX11_ABI is well defined, as UCXX will
# need to be built to have aligned symbols
set_ifndef(ENABLE_UCX 0)
if(ENABLE_UCX)
# Only enable UCX related features if the system has UCX library
find_package(ucx)
if(NOT ${ucx_FOUND})
set(ENABLE_UCX 0)
else()
# Installing ucxx via add_subdirectory results in a strange cudart linking
# error, so its installation script is used to isolate the build until the
# issue is understood. The build is always triggered so that a change in
# USE_CXX11_ABI is not ignored.
execute_process(
COMMAND
${3RDPARTY_DIR}/ucxx/build.sh libucxx -n
--cmake-args=\"-DBUILD_SHARED_LIBS=OFF
-DCMAKE_CXX_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=${USE_CXX11_ABI}\"
COMMAND_ECHO STDOUT)
find_package(ucxx REQUIRED PATHS ${3RDPARTY_DIR}/ucxx/cpp/build
NO_DEFAULT_PATH)
endif()
endif()
file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" VERSION_STRINGS
REGEX "#define NV_TENSORRT_.*")
foreach(TYPE MAJOR MINOR PATCH BUILD)


@ -41,6 +41,7 @@ auto constexpr kDraftInputIdsTensorName = "draft_input_ids";
auto constexpr kDraftLogitsTensorName = "draft_logits";
auto constexpr kMaxNewTokensTensorName = "request_output_len";
auto constexpr kBeamWidthTensorName = "beam_width";
auto constexpr kNumReturnSequencesTensorName = "num_return_sequences";
auto constexpr kEndIdTensorName = "end_id";
auto constexpr kPadIdTensorName = "pad_id";
auto constexpr kBadWordsListTensorName = "bad_words_list";
@ -194,6 +195,7 @@ public:
inference_request::kDraftLogitsTensorName,
inference_request::kMaxNewTokensTensorName,
inference_request::kBeamWidthTensorName,
inference_request::kNumReturnSequencesTensorName,
inference_request::kEndIdTensorName,
inference_request::kPadIdTensorName,
inference_request::kBadWordsListTensorName,
@ -263,6 +265,7 @@ public:
TENSOR_GETTER_SETTER(DraftLogits, inference_request::kDraftLogitsTensorName)
TENSOR_GETTER_SETTER(MaxNewTokens, inference_request::kMaxNewTokensTensorName)
TENSOR_GETTER_SETTER(BeamWidth, inference_request::kBeamWidthTensorName)
TENSOR_GETTER_SETTER(NumReturnSequences, inference_request::kNumReturnSequencesTensorName)
TENSOR_GETTER_SETTER(EndId, inference_request::kEndIdTensorName)
TENSOR_GETTER_SETTER(PadId, inference_request::kPadIdTensorName)
TENSOR_GETTER_SETTER(BadWordsList, inference_request::kBadWordsListTensorName)


@ -85,6 +85,7 @@ public:
using TensorPtr = TTensor;
using LogitsPostProcessor = std::function<void(
RequestIdType, TensorPtr&, BeamTokens const&, TStream const&, std::optional<RequestIdType>)>;
using RequestPtr = std::shared_ptr<GenericLlmRequest>;
GenericLlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::shared_ptr<VecTokens> inputTokens,
runtime::SamplingConfig const& samplingConfig, bool isStreaming, std::optional<SizeType32> endId = std::nullopt,
@ -107,7 +108,8 @@ public:
std::optional<TensorPtr> encoderInputFeatures = std::nullopt,
std::optional<SizeType32> encoderOutputLength = std::nullopt,
LlmRequestType llmRequestType = LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION,
std::optional<std::shared_ptr<VecTokenExtraIds>> inputTokenExtraIds = std::nullopt)
std::optional<std::shared_ptr<VecTokenExtraIds>> inputTokenExtraIds = std::nullopt,
SizeType32 numReturnSequences = 1)
: mRequestId(requestId)
, mPromptLen(inputTokens->size())
, mMaxNewTokens(maxNewTokens)
@ -152,11 +154,14 @@ public:
, mEncoderOutputLength(encoderOutputLength)
, mLlmRequestType(llmRequestType)
, mInputTokenExtraIds(std::move(inputTokenExtraIds))
, mNumReturnSequences(numReturnSequences)
, mSequenceIndex(0)
{
if (mEncoderTokens.has_value() || encoderInputFeatures.has_value())
{
mState = REQUEST_STATE_ENCODER_INIT;
}
initialize(*inputTokens, returnLogProbs);
}
@ -202,6 +207,8 @@ public:
, mEncoderOutputLength(req.getEncoderOutputLength())
, mContextPhaseParams(req.getContextPhaseParams())
, mInputTokenExtraIds(std::nullopt)
, mNumReturnSequences(req.getNumReturnSequences())
, mSequenceIndex(0)
{
if (req.getRequestType() == executor::RequestType::REQUEST_TYPE_GENERATION_ONLY)
{
@ -217,6 +224,7 @@ public:
"length).");
mReturnAllGeneratedTokens = true;
}
if (mIsStreaming && mSamplingConfig.beamWidth > 1 && mReturnGenerationLogits == true)
{
TLLM_LOG_WARNING(
@ -276,13 +284,15 @@ public:
mLoraTaskId = loraConfig->getTaskId();
if (loraConfig.value().getWeights())
{
mLoraWeights = executor::detail::toITensor(loraConfig.value().getWeights().value());
mLoraWeights = tensorrt_llm::runtime::ITensor::view(
executor::detail::toITensor(loraConfig.value().getWeights().value()));
mLoraWeights.value()->unsqueeze(0);
}
if (loraConfig.value().getConfig())
{
mLoraConfig = executor::detail::toITensor(loraConfig.value().getConfig().value());
mLoraConfig = tensorrt_llm::runtime::ITensor::view(
executor::detail::toITensor(loraConfig.value().getConfig().value()));
mLoraConfig.value()->unsqueeze(0);
}
}
@ -429,6 +439,20 @@ public:
return mTokens.at(beam).size() - mNumPreDecodedTokens[beam];
}
/// @brief Get number of return sequences for this req.
/// @return The number of sequences to return.
[[nodiscard]] SizeType32 getNumReturnSequences() const
{
return mNumReturnSequences;
}
/// @brief Get child requests spawned by this req.
/// @return A vector of child requests.
[[nodiscard]] std::vector<RequestPtr> const& getChildRequests() const
{
return mChildRequests;
}
/// @brief Get max number of tokens across all beams
/// @return The number of tokens
[[nodiscard]] SizeType32 getMaxBeamNumTokens() const
@ -618,6 +642,25 @@ public:
}
}
/// @brief Sets the number of return sequences.
/// @param numReturnSequences The number of return sequences.
void setNumReturnSequences(SizeType32 const& numReturnSequences)
{
TLLM_CHECK_WITH_INFO(!isChild(), "A child request cannot change numReturnSequences.");
TLLM_CHECK_WITH_INFO(
numReturnSequences > 0, "numReturnSequences should be a positive integer, got %d.", numReturnSequences);
TLLM_CHECK_WITH_INFO(mChildRequests.size() <= static_cast<size_t>(numReturnSequences),
"Cannot set numReturnSequences %d smaller than the number %ld of child requests that have already created.",
numReturnSequences, mChildRequests.size());
mNumReturnSequences = numReturnSequences;
mSequenceFinalVec->resize(mNumReturnSequences);
}
[[nodiscard]] bool constexpr isChild() const noexcept
{
return mSequenceIndex > 0;
}
/// @brief Return a vector of the last-generated tokens of shape [num_beams]
[[nodiscard]] VecTokens const& getLastTokens()
{
@ -886,6 +929,11 @@ public:
mEncoderOutputHost = std::move(encoderOutputHost);
}
void setEncoderOutput(TensorPtr encoderOutput)
{
mEncoderOutput = std::move(encoderOutput);
}
void allocEncoderOutputHost(SizeType32 encoderHiddenSize, nvinfer1::DataType dataType)
{
mEncoderOutputHost = runtime::BufferManager::pinned(
@ -1204,7 +1252,14 @@ public:
TLLM_LOG_DEBUG("Creating response for request %lu", mRequestId);
executor::Result result;
result.isFinal = isGenerationCompleteState() || isDisaggContextTransmissionState();
result.sequenceIndex = mSequenceIndex;
result.isSequenceFinal = isGenerationCompleteState() || isDisaggContextTransmissionState();
mSequenceFinalVec->at(mSequenceIndex) = result.isSequenceFinal;
result.isFinal = std::all_of(mSequenceFinalVec->begin(), mSequenceFinalVec->end(),
[](bool isSequenceFinal) { return isSequenceFinal; });
auto const nbBeams = mSamplingConfig.beamWidth;
auto const maxNbTokens = getMaxBeamNumTokens();
@ -1295,7 +1350,9 @@ public:
// Update position of last sent response
setMaxSentTokenLen(maxNbTokens);
auto response = executor::Response(mRequestId, std::move(result));
auto requestId = isChild() ? mParentRequestId : mRequestId;
auto response = executor::Response(requestId, std::move(result));
return response;
}
}
@ -1413,6 +1470,12 @@ protected:
// TODO: add real extra id for encoder tokens
std::optional<std::shared_ptr<VecUniqueTokens>> mEncoderUniqueTokens;
SizeType32 mNumReturnSequences;
SizeType32 mSequenceIndex;
std::vector<RequestPtr> mChildRequests;
RequestIdType mParentRequestId;
std::shared_ptr<std::vector<bool>> mSequenceFinalVec; // Indicates whether each sibling sequence has finished generation.
private:
void initialize(VecTokens const& inputTokens, bool outputLogProbs)
{
@ -1475,6 +1538,12 @@ private:
}
setReturnLogProbs(outputLogProbs);
if (!isChild())
{
// Initialize result states unless this is a child request; a child shares its parent's states.
mSequenceFinalVec = std::make_shared<std::vector<bool>>(getNumReturnSequences(), false);
}
}
TensorPtr createListTensor(std::list<VecTokens> const& wordsList)
@ -1540,7 +1609,8 @@ public:
std::optional<TensorPtr> encoderInputFeatures = std::nullopt,
std::optional<SizeType32> encoderOutputLength = std::nullopt,
LlmRequestType llmRequestType = LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION,
std::optional<std::shared_ptr<VecTokenExtraIds>> inputTokenExtraIds = std::nullopt)
std::optional<std::shared_ptr<VecTokenExtraIds>> inputTokenExtraIds = std::nullopt,
SizeType32 numReturnSequences = 1)
: Base(requestId, maxNewTokens, std::move(inputTokens), samplingConfig, isStreaming, endId, padId,
std::move(embeddingBias), std::move(badWordsList), std::move(stopWordsList), std::move(positionIds),
std::move(promptEmbeddingTable), promptVocabSize, loraTaskId, std::move(loraWeights), std::move(loraConfig),
@ -1548,18 +1618,49 @@ public:
std::move(draftTokens), std::move(draftLogits), excludeInputFromOutput, std::move(logitsPostProcessor),
applyLogitsPostProcessorBatched, std::move(encoderInputTokens), returnEncoderOutput, clientId, priority,
std::move(encoderInputFeatures), std::move(encoderOutputLength), llmRequestType,
std::move(inputTokenExtraIds))
std::move(inputTokenExtraIds), numReturnSequences)
{
}
LlmRequest(RequestIdType requestId, executor::Request const& Request,
LlmRequest(RequestIdType requestId, executor::Request const& request,
std::optional<Base::LogitsPostProcessor> logitsPostProcessor = std::nullopt,
bool applyLogitsPostProcessorBatched = false)
: Base(requestId, Request)
: Base(requestId, request)
{
mLogitsPostProcessor = std::move(logitsPostProcessor);
mApplyLogitsPostProcessorBatched = applyLogitsPostProcessorBatched;
mLookaheadConfig = Request.getLookaheadConfig();
mLookaheadConfig = request.getLookaheadConfig();
}
std::shared_ptr<LlmRequest> createChildRequest(RequestIdType requestId)
{
TLLM_CHECK_WITH_INFO(!isChild(), "A child request cannot create its own child.");
TLLM_CHECK_WITH_INFO(mChildRequests.size() + 1 < static_cast<size_t>(getNumReturnSequences()),
"Cannot create child requests more than the number of return sequences (%d)", getNumReturnSequences());
auto childReq = std::make_shared<LlmRequest>(*this);
childReq->mRequestId = requestId;
childReq->mSequenceIndex = mChildRequests.size() + 1;
childReq->mParentRequestId = this->mRequestId;
childReq->mSequenceFinalVec = this->mSequenceFinalVec;
childReq->mSeqSlot.reset();
// To ensure different randomness across children, assign a unique random seed to each child
// by adding its sequence index to the base seed. If no seed is provided, the parent's seed defaults to 0.
using RandomSeedType = tensorrt_llm::executor::RandomSeedType;
if (childReq->mSamplingConfig.randomSeed.has_value())
{
childReq->mSamplingConfig.randomSeed->at(0) += static_cast<RandomSeedType>(childReq->mSequenceIndex);
}
else
{
RandomSeedType defaultSeed{0};
mSamplingConfig.randomSeed = std::vector<RandomSeedType>(1, defaultSeed);
childReq->mSamplingConfig.randomSeed
= std::vector<RandomSeedType>(1, defaultSeed + static_cast<RandomSeedType>(childReq->mSequenceIndex));
}
mChildRequests.push_back(childReq);
return childReq;
}
void movePromptEmbeddingTableToGpu(runtime::BufferManager const& manager)
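A caller is expected to expand a parent request into its siblings through this API. As a rough sketch (not code from this commit; the header path, namespace, and the way fresh request IDs are obtained are assumptions):

```cpp
// Rough sketch, not part of this commit: expand a parent LlmRequest into one
// entry per return sequence via the new child-request API. The header path,
// namespace, and request-id allocation are assumptions.
#include "tensorrt_llm/batch_manager/llmRequest.h"

#include <memory>
#include <vector>

using tensorrt_llm::batch_manager::LlmRequest;

std::vector<std::shared_ptr<LlmRequest>> expandRequest(
    std::shared_ptr<LlmRequest> const& parent, LlmRequest::RequestIdType& nextRequestId)
{
    std::vector<std::shared_ptr<LlmRequest>> sequences{parent};
    // The parent itself produces sequence index 0; each child gets the next
    // index and a random seed offset by that index (see createChildRequest above).
    for (int seq = 1; seq < parent->getNumReturnSequences(); ++seq)
    {
        sequences.push_back(parent->createChildRequest(nextRequestId++));
    }
    return sequences;
}
```

Responses from the children are reported under the parent's request id (see the `createResponse` change above), so `sequenceIndex` is what distinguishes the siblings downstream.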


@ -32,7 +32,7 @@ using runtime::SizeType32;
struct PeftCacheManagerConfig
{
static float constexpr kDefaultDeviceCachePercent = 0.05;
static float constexpr kDefaultDeviceCachePercent = 0.02;
static size_t constexpr kDefaultHostCacheSize = 1024 * 1024 * 1024;
explicit PeftCacheManagerConfig(SizeType32 numHostModuleLayer = 0, SizeType32 numDeviceModuleLayer = 0,


@ -78,15 +78,6 @@ public:
{
}
// Copy constructor
TrtGptModelOptionalParams(TrtGptModelOptionalParams const& other)
: TrtGptModelOptionalParams(other.kvCacheConfig, other.enableTrtOverlap, other.deviceIds,
other.normalizeLogProbs, other.enableChunkedContext, other.peftCacheManagerConfig, other.decodingConfig,
other.gpuWeightsPercent, other.maxBeamWidth, other.maxBatchSize, other.maxNumTokens, other.schedulerConfig,
other.extendedRuntimePerfKnobConfig, other.debugConfig, other.maxSeqIdleMicroseconds)
{
}
bool operator==(TrtGptModelOptionalParams const& other) const
{
return kvCacheConfig == other.kvCacheConfig //


@ -0,0 +1,38 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <string>
#ifndef _WIN32
#include <pthread.h>
#endif
namespace tensorrt_llm::common
{
inline bool setThreadName(std::string const& name)
{
#ifdef _WIN32
return false;
#else
auto const ret = pthread_setname_np(pthread_self(), name.c_str());
return !ret;
#endif
}
} // namespace tensorrt_llm::common
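A minimal usage sketch for the new helper (the header path and thread name are assumptions; on Linux, `pthread_setname_np` rejects names longer than 15 characters with ERANGE):

```cpp
// Usage sketch only; the header path and thread name are arbitrary.
#include "tensorrt_llm/common/threadUtils.h" // assumed location of setThreadName

#include <thread>

int main()
{
    std::thread worker(
        []()
        {
            // Returns false on Windows or when pthread_setname_np reports an error.
            tensorrt_llm::common::setThreadName("trtllmWorker");
        });
    worker.join();
    return 0;
}
```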


@ -24,6 +24,7 @@
#include <deque>
#include <filesystem>
#include <list>
#include <map>
#include <memory>
#include <optional>
#include <string>
@ -343,6 +344,7 @@ public:
/// convolution down-sampling, etc.)
/// @param type Indicate the request type for disaggregated serving mode.
/// @param contextPhaseParams Generated token ID from context only executor.
/// @param numReturnSequences The number of sequences to return.
Request(VecTokens inputTokenIds, SizeType32 maxTokens, bool streaming = false,
SamplingConfig const& samplingConfig = SamplingConfig(), OutputConfig const& outputConfig = OutputConfig(),
std::optional<SizeType32> const& endId = std::nullopt, std::optional<SizeType32> const& padId = std::nullopt,
@ -360,7 +362,7 @@ public:
RequestType type = RequestType::REQUEST_TYPE_CONTEXT_AND_GENERATION,
std::optional<ContextPhaseParams> contextPhaseParams = std::nullopt,
std::optional<Tensor> encoderInputFeatures = std::nullopt,
std::optional<SizeType32> encoderOutputLength = std::nullopt);
std::optional<SizeType32> encoderOutputLength = std::nullopt, SizeType32 numReturnSequences = 1);
/// @brief This logits postprocessor name will dispatch to the batched logits postprocessor
static auto constexpr kBatchedPostProcessorName = "batched";
@ -396,6 +398,7 @@ public:
[[nodiscard]] std::optional<Tensor> getEncoderInputFeatures() const;
[[nodiscard]] std::optional<SizeType32> getEncoderOutputLength() const;
[[nodiscard]] RequestType getRequestType() const;
[[nodiscard]] SizeType32 getNumReturnSequences() const;
void setStreaming(bool streaming);
void setSamplingConfig(SamplingConfig const& config);
@ -419,6 +422,7 @@ public:
void setContextPhaseParams(ContextPhaseParams contextPhaseParams);
void setEncoderInputFeatures(Tensor encoderInputFeatures);
void setEncoderOutputLength(SizeType32 encoderOutputLength);
void setNumReturnSequences(SizeType32 numReturnSequences);
private:
friend class Serialization;
@ -461,6 +465,12 @@ struct Result
/// @brief The decoding iterations it takes.
SizeType32 decodingIter{0};
/// @brief The index of the output sequence where 0 <= sequenceIndex < numReturnSequences
SizeType32 sequenceIndex{0};
/// @brief Indicates if this is the final result for a given sequence in the request
bool isSequenceFinal;
};
/// @brief Class that holds either an error or a result
@ -583,7 +593,7 @@ private:
class ExtendedRuntimePerfKnobConfig
{
public:
explicit ExtendedRuntimePerfKnobConfig(bool multiBlockMode = false, bool enableContextFMHAFP32Acc = false);
explicit ExtendedRuntimePerfKnobConfig(bool multiBlockMode = true, bool enableContextFMHAFP32Acc = false);
bool operator==(ExtendedRuntimePerfKnobConfig const& other) const
{
@ -612,27 +622,33 @@ class DebugConfig
using StringVec = std::vector<std::string>;
public:
explicit DebugConfig(bool dumpInputTensors = false, bool dumpOuputTensors = false, StringVec debugTensorNames = {});
explicit DebugConfig(bool debugInputTensors = false, bool debugOutputTensors = false,
StringVec debugTensorNames = {}, SizeType32 debugTensorsMaxIterations = 0);
bool operator==(DebugConfig const& other) const;
[[nodiscard]] bool getDumpInputTensors() const;
[[nodiscard]] bool getDumpOutputTensors() const;
[[nodiscard]] bool getDebugInputTensors() const;
[[nodiscard]] bool getDebugOutputTensors() const;
[[nodiscard]] StringVec const& getDebugTensorNames() const;
[[nodiscard]] SizeType32 getDebugTensorsMaxIterations() const;
void setDumpInputTensors(bool dumpInputTensors);
void setDumpOuputTensors(bool dumpOuputTensors);
void setDebugInputTensors(bool debugInputTensors);
void setDebugOutputTensors(bool debugOutputTensors);
void setDebugTensorNames(StringVec const& debugTensorNames);
void setDebugTensorsMaxIterations(SizeType32 debugTensorsMaxIterations);
private:
friend class Serialization;
/// @brief If true, dump all input tensors.
bool mDumpInputTensors;
/// @brief If true, dump all output tensors.
bool mDumpOuputTensors;
/// @brief If not empty, only dump tensors in this list.
/// @brief If true, debug all input tensors.
bool mDebugInputTensors;
/// @brief If true, debug all output tensors.
bool mDebugOutputTensors;
/// @brief If not empty, only debug tensors in this list.
StringVec mDebugTensorNames;
/// @brief If > 0, provide debug tensors for at most debugTensorsMaxIterations past iterations,
/// else dump them to files.
SizeType32 mDebugTensorsMaxIterations;
};
SizeType32 const kDefaultIterStatsMaxIterations = 1000;
@ -960,7 +976,8 @@ public:
ModelType modelType, ExecutorConfig const& executorConfig);
Executor(BufferView const& engineBuffer, std::string const& jsonConfigStr, ModelType modelType,
ExecutorConfig const& executorConfig);
ExecutorConfig const& executorConfig,
std::optional<std::map<std::string, Tensor>> const& managedWeights = std::nullopt);
Executor(BufferView const& encoderEngineBuffer, std::string const& encoderJsonConfigStr,
BufferView const& decoderEngineBuffer, std::string const& decoderJsonConfigStr, ModelType modelType,
@ -1021,20 +1038,25 @@ public:
/// @param id The request id for which to cancel the response
void cancelRequest(IdType requestId);
/// @brief Signals the server to shutdown
/// This call is blocking. Only returns when all requests have terminated or timeout has been reached
/// @brief Signals the server to shutdown.
/// @details This call is blocking. Only returns when all requests have terminated or timeout has been reached
void shutdown();
/// @brief Returns the per-iterations statistics computed since last call to getLatestIterationStats
/// Contains at most iterStatsMaxIterations iterations
/// @brief Returns the per-iterations statistics computed since last call to getLatestIterationStats.
/// Contains at most iterStatsMaxIterations iterations.
/// @return Iteration stats
std::deque<IterationStats> getLatestIterationStats();
/// @brief Returns the request stats of each iteration computed since last call to getLatestRequestStats
/// Contains at most requestStatsMaxIterations iterations
/// @brief Returns the request stats of each iteration computed since last call to getLatestRequestStats.
/// Contains at most requestStatsMaxIterations iterations.
/// @return Request stats grouped by iterations
std::deque<RequestStatsPerIteration> getLatestRequestStats();
/// @brief Returns the debug tensors of each iteration computed since last call to getLatestDebugTensors.
/// Contains at most debugTensorsMaxIterations iterations.
/// @return Request debug tensors grouped by iterations
std::deque<DebugTensorsPerIteration> getLatestDebugTensors();
/// @brief Indicates if the current process is allowed to enqueueRequests
[[nodiscard]] bool canEnqueueRequests() const;
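Putting the new `numReturnSequences` pieces together on the client side, a hedged sketch (prompt tokens and executor setup are placeholders, not from this commit):

```cpp
// Client-side sketch, not from this commit: ask for three sequences from a
// single prompt and drain responses. Prompt tokens and executor setup are
// placeholders.
#include "tensorrt_llm/executor/executor.h"

namespace tle = tensorrt_llm::executor;

void generateThreeSequences(tle::Executor& executor)
{
    tle::Request request(/*inputTokenIds=*/{1, 2, 3, 4}, /*maxTokens=*/32);
    request.setNumReturnSequences(3);

    auto const requestId = executor.enqueueRequest(request);

    bool requestDone = false;
    while (!requestDone)
    {
        for (auto const& response : executor.awaitResponses(requestId))
        {
            auto const& result = response.getResult();
            // sequenceIndex identifies the sibling in [0, numReturnSequences);
            // isSequenceFinal closes that sibling, isFinal closes the request.
            requestDone = requestDone || result.isFinal;
        }
    }
}
```

All sibling responses arrive under the single id returned by `enqueueRequest`, so `sequenceIndex` is what tells the siblings apart on the client side.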


@ -25,6 +25,13 @@
namespace tensorrt_llm::executor
{
namespace kv_cache
{
class CommState;
class CacheState;
class SocketState;
} // namespace kv_cache
class Serialization
{
public:
@ -53,6 +60,21 @@ public:
static void serialize(LoraConfig const& config, std::ostream& os);
[[nodiscard]] static size_t serializedSize(LoraConfig const& config);
// CommState
[[nodiscard]] static kv_cache::CommState deserializeCommState(std::istream& is);
static void serialize(kv_cache::CommState const& state, std::ostream& os);
[[nodiscard]] static size_t serializedSize(kv_cache::CommState const& state);
// SocketState
[[nodiscard]] static kv_cache::SocketState deserializeSocketState(std::istream& is);
static void serialize(kv_cache::SocketState const& state, std::ostream& os);
[[nodiscard]] static size_t serializedSize(kv_cache::SocketState const& state);
// CacheState
[[nodiscard]] static kv_cache::CacheState deserializeCacheState(std::istream& is);
static void serialize(kv_cache::CacheState const& state, std::ostream& os);
[[nodiscard]] static size_t serializedSize(kv_cache::CacheState const& state);
// ContextPhaseState
[[nodiscard]] static ContextPhaseState deserializeContextPhaseState(std::istream& is);
static void serialize(ContextPhaseState const& contextPhaseState, std::ostream& os);
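A small sketch of how the new kv_cache serialization hooks might be driven (header path assumed; how a `CommState` is obtained is outside this diff):

```cpp
// Sketch: pack a kv_cache::CommState into bytes with the new hooks; the header
// path is assumed and constructing a CommState is outside this diff.
#include "tensorrt_llm/executor/serialization.h"

#include <sstream>
#include <string>

namespace tle = tensorrt_llm::executor;

std::string packCommState(tle::kv_cache::CommState const& state)
{
    std::ostringstream os;
    tle::Serialization::serialize(state, os);
    // serializedSize(state) should equal the number of bytes written above.
    return os.str();
}
```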


@ -18,6 +18,7 @@
#include <cstdint>
#include <functional>
#include <map>
#include <memory>
#include <optional>
#include <string>
@ -361,6 +362,15 @@ struct RequestStatsPerIteration
std::vector<RequestStats> requestStats;
};
/// @brief Struct that holds the debug tensors in an iteration
struct DebugTensorsPerIteration
{
/// @brief The iteration id for these tensors
IterationType iter;
/// @brief The debug tensors for this iteration
std::map<std::string, Tensor> debugTensors;
};
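The new struct pairs with `Executor::getLatestDebugTensors()` and the reworked `DebugConfig`; a hedged sketch with arbitrary knob values:

```cpp
// Sketch with arbitrary knob values: keep debug tensors for the last few
// iterations in memory instead of dumping them to files, then read them back.
#include "tensorrt_llm/executor/executor.h"

namespace tle = tensorrt_llm::executor;

tle::DebugConfig makeDebugConfig()
{
    // debugTensorsMaxIterations > 0 retains tensors for that many past iterations.
    return tle::DebugConfig(/*debugInputTensors=*/true, /*debugOutputTensors=*/true,
        /*debugTensorNames=*/{}, /*debugTensorsMaxIterations=*/4);
}

void inspectDebugTensors(tle::Executor& executor)
{
    for (auto const& iteration : executor.getLatestDebugTensors())
    {
        // debugTensors maps tensor names to their values for iteration `iteration.iter`.
        for (auto const& [name, tensor] : iteration.debugTensors)
        {
            (void) name;
            (void) tensor;
        }
    }
}
```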
/// @brief The reason why the model stopped generating tokens for a request.
enum class FinishReason
{


@ -32,7 +32,7 @@ public:
using BufferPtr = IBuffer::SharedPtr;
// MAX_ALL_REDUCE_BLOCKS for block_barrier, 1 for multi_gpu_barrier
size_t static constexpr FLAGS_SIZE = (kernels::MAX_ALL_REDUCE_BLOCKS + 1) * sizeof(uint32_t);
size_t static constexpr FLAGS_SIZE = (tensorrt_llm::kernels::MAX_ALL_REDUCE_BLOCKS + 1) * sizeof(uint32_t);
IpcMemory(
std::size_t bufferSize, BufferManager const& manager, WorldConfig const& worldConfig, bool openIpc = true);


@ -17,9 +17,11 @@
#pragma once
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/executor/tensor.h"
#include <NvInferRuntime.h>
#include <filesystem>
#include <map>
#include <optional>
namespace tensorrt_llm::runtime
@ -75,6 +77,17 @@ public:
mEnginePath = std::move(enginePath);
}
[[nodiscard]] std::optional<std::map<std::string, tensorrt_llm::executor::Tensor>> const&
getManagedWeightsMapOpt() const
{
return mManagedWeightsMap;
}
void setManagedWeightsMap(std::map<std::string, tensorrt_llm::executor::Tensor> managedWeightsMap)
{
mManagedWeightsMap = std::move(managedWeightsMap);
}
[[nodiscard]] void const* getAddress() const
{
TLLM_CHECK(mType == AddressWithSize);
@ -104,6 +117,7 @@ private:
};
nvinfer1::IHostMemory const* mEngineBuffer{};
std::optional<std::map<std::string, tensorrt_llm::executor::Tensor>> mManagedWeightsMap;
};
} // namespace tensorrt_llm::runtime


@ -186,6 +186,16 @@ find_package(Threads REQUIRED)
target_link_libraries(${BATCH_MANAGER_TARGET} INTERFACE Threads::Threads)
target_link_libraries(${EXECUTOR_TARGET} INTERFACE Threads::Threads)
if(ENABLE_UCX)
find_package(ucx REQUIRED)
find_package(ucxx REQUIRED)
if(BUILD_BATCH_MANAGER)
target_include_directories(
${BATCH_MANAGER_TARGET}
PRIVATE $<TARGET_PROPERTY:ucxx::ucxx,INTERFACE_INCLUDE_DIRECTORIES>)
endif()
endif()
if(NOT WIN32)
if(USE_CXX11_ABI)
add_custom_command(
@ -331,6 +341,10 @@ if(ENABLE_MULTI_DEVICE)
set(TRTLLM_LINK_LIBS ${TRTLLM_LINK_LIBS} ${MPI_C_LIBRARIES} ${NCCL_LIB})
endif()
if(ENABLE_UCX)
set(TRTLLM_LINK_LIBS ${TRTLLM_LINK_LIBS} ucxx::ucxx ucx::ucs)
endif()
if(NOT WIN32) # Unix-like compilers
set(UNDEFINED_FLAG "-Wl,--no-undefined")
set(AS_NEEDED_FLAG "-Wl,--as-needed")
@ -366,6 +380,9 @@ target_link_libraries(${SHARED_TARGET} PUBLIC ${TRTLLM_LINK_LIBS})
link_whole_archive(${SHARED_TARGET} ${BATCH_MANAGER_TARGET})
link_whole_archive(${SHARED_TARGET} ${EXECUTOR_TARGET})
link_whole_archive(${SHARED_TARGET} ${INTERNAL_CUTLASS_KERNELS_TARGET})
if(ENABLE_UCX)
link_whole_archive(${SHARED_TARGET} ucxx::ucxx)
endif()
# Cyclic dependency of batch manager on TRT-LLM
target_link_libraries(${BATCH_MANAGER_TARGET} INTERFACE ${SHARED_TARGET})


@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5837518b278aa82cfdaeb3279bfe396de8e0638d31c3447f2eaa7443c22fa3f7
size 4459926
oid sha256:1ce35a0714ef753c5328aa982b1fefa58b90994bd87a6739634ec47ec9373f9e
size 4565552


@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ff7c6c16dc4755cfcf398d6b92ddd056ad6ae40f1dd830deeccd37ceb795edb6
size 4567634
oid sha256:1aa2a508d865410915b9ae4b21b11062a07d6143d19bc84fa53145da7911aa2a
size 4667530


@ -1,3 +1,3 @@
fe9c16bd1eed122234ece7f9afeea382 libtensorrt_llm_batch_manager_static.a
040e15c175f987c30ebfdbcc8a9c2021 libtensorrt_llm_batch_manager_static.pre_cxx11.a
052edd4c2bca0a186eed2169a9681d317f67a712 commit
7a30229eedc22a924052cd5440c5adb4 libtensorrt_llm_batch_manager_static.a
e46c1e13209f90acdcc8b5f0c9e8a15c libtensorrt_llm_batch_manager_static.pre_cxx11.a
0e09af04945b8cb0eaf65780a7fa7907a2baee1a commit


@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1a7311134408b791dbd4f11ad3144f67314a8e6a288d14f12767004d79a82ac2
size 4318978
oid sha256:106cab5936a2ac034785050804890aa4deb1436983215439462c244475ebb90c
size 4422078


@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8181b7a634293d981caf0f5011c2619f541c02e818aae8fbff4bf4e72cea6cab
size 4291752
oid sha256:cb9ad53702b1fbe66dbb989a50422ab4cf01c53b4943d48296cc96f92742c363
size 4388652


@ -1,3 +1,3 @@
3869999ed0175550deb0d73d0ab0fd08 libtensorrt_llm_batch_manager_static.a
e6e05e4c36d868dfb1f9c93c77993cbd libtensorrt_llm_batch_manager_static.pre_cxx11.a
052edd4c2bca0a186eed2169a9681d317f67a712 commit
251ebd85cff41a2af7f6dcca8489f8fb libtensorrt_llm_batch_manager_static.a
801b1b6ffd0ab4ec3a66afeb010d97c4 libtensorrt_llm_batch_manager_static.pre_cxx11.a
0e09af04945b8cb0eaf65780a7fa7907a2baee1a commit


@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ce95b46f9ae1ff46967984f0c956a0b51bc7c57cebbf0ed6e553729ce84fe8b6
size 26424318
oid sha256:291630c536d8262087c9dd5f3bbd4c9b301aea8afbae4af9bc2cdce4db4e8f23
size 27510016


@ -1,2 +1,2 @@
6c8dbccd4cde7ca451e8e99ecb480f55 tensorrt_llm_batch_manager_static.lib
052edd4c2bca0a186eed2169a9681d317f67a712 commit
8873c98ec05794c5ebaf05c8da73dd65 tensorrt_llm_batch_manager_static.lib
0e09af04945b8cb0eaf65780a7fa7907a2baee1a commit


@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:356deeea43040cf5529f89bd880028b8cf0a0600967df382b4107e796d9a301c
size 1630654
oid sha256:1f7087b56c34700e048ee9d40086b34d65952e66507ea36986ab11260e0a3300
size 1759444


@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:115bf7be63c22bb7fd69cafa600ebe9785b104e78e1a7e5b475bcbe1d2069037
size 1655182
oid sha256:3288603c000a6eecb7a28319c69536ac0b37b2337474330b61442b9940e1d988
size 1787862


@ -1,3 +1,3 @@
aae0acac4fab096666be84b6e630bd71 libtensorrt_llm_executor_static.a
a174142f8f74f1c1a439cc3d040b0b5f libtensorrt_llm_executor_static.pre_cxx11.a
052edd4c2bca0a186eed2169a9681d317f67a712 commit
cfe12cab670a58d56b0a2d881c218015 libtensorrt_llm_executor_static.a
b177ad21ac5636091ee267a5a550aa77 libtensorrt_llm_executor_static.pre_cxx11.a
0e09af04945b8cb0eaf65780a7fa7907a2baee1a commit


@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e5beefedcd2309d37550bac7e9d9ec0cf7f7c18d2fbf9a2d9bf9c7625954b6c8
size 1694400
oid sha256:f565a5225dec3f1f88df931d8f8a0718f2ab24f705a639ae4ebf358b37b4555d
size 1824992


@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:89acdd3e6b904f9dc86bcf8e73fe03c43b82ceab5d325357ea1f3ed1ef797aaf
size 1615086
oid sha256:21aaf66b1435d4fec2f41ce0029522da71509cb7c6a856bbf9411c88c105cd5c
size 1735024


@ -1,3 +1,3 @@
5bebbd31919ac2b34579c8653295dfd2 libtensorrt_llm_executor_static.a
2332b6ea1e0b8683844168949f9dfb9c libtensorrt_llm_executor_static.pre_cxx11.a
052edd4c2bca0a186eed2169a9681d317f67a712 commit
4dcc8b42ff4afe11178bc2f145394b41 libtensorrt_llm_executor_static.a
ead139d0835f7e86d2bf7ecd41ad0999 libtensorrt_llm_executor_static.pre_cxx11.a
0e09af04945b8cb0eaf65780a7fa7907a2baee1a commit


@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6fceeee8a39af844a5cfefd32d3b2b024b659f5d7dceacc0f5dd0b69b5d37b7c
size 17485396
oid sha256:9763058bec6c637ec101384b9e89681ff315ddd514fb1e37fde8ef5c51de540a
size 19341056


@ -1,2 +1,2 @@
ac61b12b5aa440f5f8f0e05511a12d17 tensorrt_llm_executor_static.lib
052edd4c2bca0a186eed2169a9681d317f67a712 commit
94894f3c80436b5dfbb6864dbe686baa tensorrt_llm_executor_static.lib
0e09af04945b8cb0eaf65780a7fa7907a2baee1a commit

Some files were not shown because too many files have changed in this diff.