feat: Integrate GPUDirect Storage (GDS) into Executor API (#3582)

* feat: Integrate GPUDirect Storage (GDS) into Executor API

Squash of several dev commits

Signed-off-by: Dom Brown <3886319+DomBrown@users.noreply.github.com>
Dom Brown 2025-04-18 08:59:21 +01:00 committed by GitHub
parent 90a28b917f
commit dbd9a83b0d
GPG Key ID: B5690EEEBB952194
23 changed files with 410 additions and 82 deletions
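
For orientation, a minimal usage sketch of the new knob at the Executor API level. This is not part of the diff; the include path, engine directory, and Executor construction are assumptions based on the existing public API.

// Hedged sketch: enable GDS engine loading through ExecutorConfig.
#include "tensorrt_llm/executor/executor.h"

namespace tle = tensorrt_llm::executor;

int main()
{
    tle::ExecutorConfig config;
    // New in this change; defaults to false, so existing behaviour is unchanged.
    config.setUseGpuDirectStorage(true);

    // "/path/to/engine_dir" is illustrative.
    tle::Executor executor("/path/to/engine_dir", tle::ModelType::kDECODER_ONLY, config);
    // ... enqueue requests as usual ...
    return 0;
}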

View File

@ -74,15 +74,17 @@ std::string engineFilename(
}
void benchmarkBert(std::string const& modelName, std::filesystem::path const& dataPath,
std::vector<int> const& batchSizes, std::vector<int> const& inLens, std::vector<float> const& gpuWeightsPercents,
std::shared_ptr<nvinfer1::ILogger> const& logger, int warmUp, int numRuns, int duration)
std::vector<int> const& batchSizes, std::vector<int> const& inLens, bool useGpuDirectStorage,
std::vector<float> const& gpuWeightsPercents, std::shared_ptr<nvinfer1::ILogger> const& logger, int warmUp,
int numRuns, int duration)
{
auto const worldConfig = WorldConfig::mpi();
auto const enginePath = dataPath / engineFilename(dataPath, worldConfig, modelName);
for (float gpuWeightsPercent : gpuWeightsPercents)
{
auto rt = std::make_shared<TllmRuntime>(RawEngine(enginePath), logger.get(), gpuWeightsPercent);
auto rt = std::make_shared<TllmRuntime>(
RawEngine(enginePath), logger.get(), useGpuDirectStorage, gpuWeightsPercent);
rt->addContext(0);
for (auto inLen : inLens)
{
@ -174,6 +176,8 @@ int main(int argc, char* argv[])
"by \";\", "
"example: \"0.0;0.5;1.0\".",
cxxopts::value<std::string>()->default_value("1.0"));
options.add_options()("use_gpu_direct_storage", "Enable GPUDirect Storage (GDS) for loading engine.",
cxxopts::value<bool>()->default_value("false"));
auto result = options.parse(argc, argv);
@ -258,8 +262,8 @@ int main(int argc, char* argv[])
try
{
benchmarkBert(result["model"].as<std::string>(), result["engine_dir"].as<std::string>(), batchSizes, inLens,
gpuWeightsPercents, logger, result["warm_up"].as<int>(), result["num_runs"].as<int>(),
result["duration"].as<int>());
result["use_gpu_direct_storage"].as<bool>(), gpuWeightsPercents, logger, result["warm_up"].as<int>(),
result["num_runs"].as<int>(), result["duration"].as<int>());
}
catch (std::exception const& e)
{
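
With the flag wired through, the BERT benchmark can presumably be launched with --use_gpu_direct_storage (cxxopts treats a bare boolean flag as true) to exercise the GDS loading path end to end; omitting it keeps the existing ifstream-based loading.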

View File

@ -41,9 +41,9 @@ public:
std::optional<std::vector<SizeType32>> deviceIds = std::nullopt, bool normalizeLogProbs = true,
bool enableChunkedContext = true,
PeftCacheManagerConfig const& peftCacheManagerConfig = PeftCacheManagerConfig{},
executor::DecodingConfig decodingConfig = executor::DecodingConfig{}, float gpuWeightsPercent = 1,
std::optional<SizeType32> maxBeamWidth = std::nullopt, std::optional<SizeType32> maxBatchSize = std::nullopt,
std::optional<SizeType32> maxNumTokens = std::nullopt,
executor::DecodingConfig decodingConfig = executor::DecodingConfig{}, bool useGpuDirectStorage = false,
float gpuWeightsPercent = 1, std::optional<SizeType32> maxBeamWidth = std::nullopt,
std::optional<SizeType32> maxBatchSize = std::nullopt, std::optional<SizeType32> maxNumTokens = std::nullopt,
executor::SchedulerConfig schedulerConfig = executor::SchedulerConfig{},
executor::ExtendedRuntimePerfKnobConfig const& extendedRuntimePerfKnobConfig
= executor::ExtendedRuntimePerfKnobConfig{},
@ -61,6 +61,7 @@ public:
, enableChunkedContext{enableChunkedContext}
, peftCacheManagerConfig(peftCacheManagerConfig)
, decodingConfig(std::move(decodingConfig))
, useGpuDirectStorage(useGpuDirectStorage)
, gpuWeightsPercent(gpuWeightsPercent)
, maxBeamWidth(maxBeamWidth)
, maxBatchSize(maxBatchSize)
@ -87,12 +88,12 @@ public:
executorConfig.getNormalizeLogProbs(), executorConfig.getEnableChunkedContext(),
PeftCacheManagerConfig(executorConfig.getPeftCacheConfig().value_or(executor::PeftCacheConfig())),
executorConfig.getDecodingConfig().value_or(executor::DecodingConfig{}),
executorConfig.getGpuWeightsPercent(), executorConfig.getMaxBeamWidth(), executorConfig.getMaxBatchSize(),
executorConfig.getMaxNumTokens(), executorConfig.getSchedulerConfig(),
executorConfig.getExtendedRuntimePerfKnobConfig(), executorConfig.getDebugConfig(),
executorConfig.getMaxSeqIdleMicroseconds(), executorConfig.getSpecDecConfig(),
executorConfig.getGuidedDecodingConfig(), isLeaderInOrchMode, executorConfig.getAdditionalModelOutputs(),
executorConfig.getGatherGenerationLogits())
executorConfig.getUseGpuDirectStorage(), executorConfig.getGpuWeightsPercent(),
executorConfig.getMaxBeamWidth(), executorConfig.getMaxBatchSize(), executorConfig.getMaxNumTokens(),
executorConfig.getSchedulerConfig(), executorConfig.getExtendedRuntimePerfKnobConfig(),
executorConfig.getDebugConfig(), executorConfig.getMaxSeqIdleMicroseconds(),
executorConfig.getSpecDecConfig(), executorConfig.getGuidedDecodingConfig(), isLeaderInOrchMode,
executorConfig.getAdditionalModelOutputs(), executorConfig.getGatherGenerationLogits())
{
}
@ -106,6 +107,8 @@ public:
bool enableChunkedContext;
PeftCacheManagerConfig peftCacheManagerConfig;
executor::DecodingConfig decodingConfig;
// Use GDS to load the engines?
bool useGpuDirectStorage;
// Percentage of weights on the gpu at runtime
float gpuWeightsPercent;
std::optional<SizeType32> maxBeamWidth;
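
The flag is carried on TrtGptModelOptionalParams so the batching backends further down (TrtEncoderModel, TrtGptModelInflightBatching) can forward it into TllmRuntime; GptSession's Config gains the same field below but keeps it hard-coded to false.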

View File

@ -1400,8 +1400,8 @@ public:
std::optional<ParallelConfig> parallelConfig = std::nullopt,
std::optional<PeftCacheConfig> const& peftCacheConfig = std::nullopt,
std::optional<LogitsPostProcessorConfig> logitsPostProcessorConfig = std::nullopt,
std::optional<DecodingConfig> decodingConfig = std::nullopt, float gpuWeightsPercent = 1,
std::optional<SizeType32> maxQueueSize = std::nullopt,
std::optional<DecodingConfig> decodingConfig = std::nullopt, bool useGpuDirectStorage = false,
float gpuWeightsPercent = 1, std::optional<SizeType32> maxQueueSize = std::nullopt,
ExtendedRuntimePerfKnobConfig const& extendedRuntimePerfKnobConfig = ExtendedRuntimePerfKnobConfig(),
std::optional<DebugConfig> debugConfig = std::nullopt, SizeType32 recvPollPeriodMs = 0,
uint64_t maxSeqIdleMicroseconds = kDefaultMaxSeqIdleMicroseconds,
@ -1429,6 +1429,7 @@ public:
[[nodiscard]] std::optional<PeftCacheConfig> getPeftCacheConfig() const;
[[nodiscard]] std::optional<LogitsPostProcessorConfig> getLogitsPostProcessorConfig() const;
[[nodiscard]] std::optional<DecodingConfig> getDecodingConfig() const;
[[nodiscard]] bool getUseGpuDirectStorage() const;
[[nodiscard]] float getGpuWeightsPercent() const;
[[nodiscard]] std::optional<SizeType32> getMaxQueueSize() const;
[[nodiscard]] ExtendedRuntimePerfKnobConfig getExtendedRuntimePerfKnobConfig() const;
@ -1455,6 +1456,7 @@ public:
void setPeftCacheConfig(PeftCacheConfig const& peftCacheConfig);
void setLogitsPostProcessorConfig(LogitsPostProcessorConfig const& logitsPostProcessorConfig);
void setDecodingConfig(DecodingConfig const& decodingConfig);
void setUseGpuDirectStorage(bool const& useGpuDirectStorage);
void setGpuWeightsPercent(float const& gpuWeightsPercent);
void setMaxQueueSize(std::optional<SizeType32> const& maxQueueSize);
void setExtendedRuntimePerfKnobConfig(ExtendedRuntimePerfKnobConfig const& extendedRuntimePerfKnobConfig);
@ -1510,6 +1512,9 @@ private:
/// @brief Decoding configuration.
std::optional<DecodingConfig> mDecodingConfig;
/// @brief Enable/disable use of GPU Direct Storage (GDS) to load engines.
bool mUseGpuDirectStorage;
/// @brief GPU weights percent for weight streaming.
float mGpuWeightsPercent;

View File

@ -99,6 +99,9 @@ public:
SizeType32 maxBeamWidth;
// The length of the longest input sequence
SizeType32 maxSequenceLength;
// Enable/disable GPUDirectStorage
// Not supported by GptSession so hard-coded as false
bool useGpuDirectStorage{false};
// Percentage of weights on the gpu at runtime
float gpuWeightsPercent;
// Whether the session will use a different decoder per request.

View File

@ -45,7 +45,8 @@ TrtEncoderModel::TrtEncoderModel(runtime::ModelConfig const& modelConfig, WorldC
, mWorldConfig{worldConfig}
, mDevice{runtime::utils::initDevice(worldConfig)}
, mLogger{logger ? std::move(logger) : std::make_shared<TllmLogger>()}
, mRuntime{std::make_shared<TllmRuntime>(rawEngine, mLogger.get(), optionalParams.gpuWeightsPercent)}
, mRuntime{std::make_shared<TllmRuntime>(
rawEngine, mLogger.get(), optionalParams.useGpuDirectStorage, optionalParams.gpuWeightsPercent)}
, mMicroBatchId(0)
, mCopyBufferManager{std::make_shared<CudaStream>()}
{

View File

@ -138,8 +138,8 @@ TrtGptModelInflightBatching::TrtGptModelInflightBatching(std::shared_ptr<nvinfer
, mDebugConfig{optionalParams.debugConfig}
, mAdditionalModelOutputs{optionalParams.additionalModelOutputs}
, mLogger{logger ? std::move(logger) : std::make_shared<TllmLogger>()}
, mRuntime{std::make_shared<TllmRuntime>(
rawEngine, mLogger.get(), optionalParams.gpuWeightsPercent, modelConfig.useShapeInference())}
, mRuntime{std::make_shared<TllmRuntime>(rawEngine, mLogger.get(), optionalParams.useGpuDirectStorage,
optionalParams.gpuWeightsPercent, modelConfig.useShapeInference())}
, mCopyBufferManager{std::make_shared<CudaStream>()}
, mCtxGenFusion(ctxGenFusion)
, mOperatingBeamWidth{getMaxBeamWidth()}

View File

@ -28,7 +28,7 @@ ExecutorConfig::ExecutorConfig(SizeType32 maxBeamWidth, SchedulerConfig schedule
std::optional<SizeType32> maxNumTokens, std::optional<ParallelConfig> parallelConfig,
std::optional<PeftCacheConfig> const& peftCacheConfig,
std::optional<LogitsPostProcessorConfig> logitsPostProcessorConfig, std::optional<DecodingConfig> decodingConfig,
float gpuWeightPercent, std::optional<SizeType32> maxQueueSize,
bool useGpuDirectStorage, float gpuWeightPercent, std::optional<SizeType32> maxQueueSize,
ExtendedRuntimePerfKnobConfig const& extendedRuntimePerfKnobConfig, std::optional<DebugConfig> debugConfig,
SizeType32 recvPollPeriodMs, uint64_t maxSeqIdleMicroseconds,
std::optional<SpeculativeDecodingConfig> specDecConfig, std::optional<GuidedDecodingConfig> guidedDecodingConfig,
@ -48,6 +48,7 @@ ExecutorConfig::ExecutorConfig(SizeType32 maxBeamWidth, SchedulerConfig schedule
, mPeftCacheConfig(peftCacheConfig)
, mLogitsPostProcessorConfig(std::move(logitsPostProcessorConfig))
, mDecodingConfig(std::move(decodingConfig))
, mUseGpuDirectStorage((useGpuDirectStorage))
, mGpuWeightsPercent(gpuWeightPercent)
, mMaxQueueSize(maxQueueSize)
, mExtendedRuntimePerfKnobConfig(extendedRuntimePerfKnobConfig)
@ -146,6 +147,11 @@ std::optional<DecodingConfig> ExecutorConfig::getDecodingConfig() const
return mDecodingConfig;
}
bool ExecutorConfig::getUseGpuDirectStorage() const
{
return mUseGpuDirectStorage;
}
float ExecutorConfig::getGpuWeightsPercent() const
{
return mGpuWeightsPercent;
@ -276,6 +282,11 @@ void ExecutorConfig::setDecodingConfig(DecodingConfig const& decodingConfig)
mDecodingConfig = decodingConfig;
}
void ExecutorConfig::setUseGpuDirectStorage(bool const& useGpuDirectStorage)
{
mUseGpuDirectStorage = useGpuDirectStorage;
}
void ExecutorConfig::setGpuWeightsPercent(float const& gpuWeightsPercent)
{
mGpuWeightsPercent = gpuWeightsPercent;

View File

@ -978,6 +978,7 @@ ExecutorConfig Serialization::deserializeExecutorConfig(std::istream& is)
auto parallelConfig = su::deserializeWithGetterType<decltype(&ExecutorConfig::getParallelConfig)>(is);
auto peftCacheConfig = su::deserializeWithGetterType<decltype(&ExecutorConfig::getPeftCacheConfig)>(is);
auto decodingConfig = su::deserializeWithGetterType<decltype(&ExecutorConfig::getDecodingConfig)>(is);
auto useGpuDirectStorage = su::deserializeWithGetterType<decltype(&ExecutorConfig::getUseGpuDirectStorage)>(is);
auto gpuWeightsPercent = su::deserializeWithGetterType<decltype(&ExecutorConfig::getGpuWeightsPercent)>(is);
auto maxQueueSize = su::deserializeWithGetterType<decltype(&ExecutorConfig::getMaxQueueSize)>(is);
auto extendedRuntimePerfKnobConfig
@ -995,9 +996,9 @@ ExecutorConfig Serialization::deserializeExecutorConfig(std::istream& is)
return ExecutorConfig{maxBeamWidth, schedulerConfig, kvCacheConfig, enableChunkedContext, normalizeLogProbs,
iterStatsMaxIterations, requestStatsMaxIterations, batchingType, maxBatchSize, maxNumTokens, parallelConfig,
peftCacheConfig, std::nullopt, decodingConfig, gpuWeightsPercent, maxQueueSize, extendedRuntimePerfKnobConfig,
debugConfig, recvPollPeriodMs, maxSeqIdleMicroseconds, specDecConfig, guidedDecodingConfig,
additionalModelOutputs, gatherGenerationLogits};
peftCacheConfig, std::nullopt, decodingConfig, useGpuDirectStorage, gpuWeightsPercent, maxQueueSize,
extendedRuntimePerfKnobConfig, debugConfig, recvPollPeriodMs, maxSeqIdleMicroseconds, specDecConfig,
guidedDecodingConfig, additionalModelOutputs, gatherGenerationLogits};
}
size_t Serialization::serializedSize(ExecutorConfig const& executorConfig)
@ -1020,6 +1021,7 @@ size_t Serialization::serializedSize(ExecutorConfig const& executorConfig)
totalSize += su::serializedSize(executorConfig.getParallelConfig());
totalSize += su::serializedSize(executorConfig.getPeftCacheConfig());
totalSize += su::serializedSize(executorConfig.getDecodingConfig());
totalSize += su::serializedSize(executorConfig.getUseGpuDirectStorage());
totalSize += su::serializedSize(executorConfig.getGpuWeightsPercent());
totalSize += su::serializedSize(executorConfig.getMaxQueueSize());
totalSize += su::serializedSize(executorConfig.getExtendedRuntimePerfKnobConfig());
@ -1052,6 +1054,7 @@ void Serialization::serialize(ExecutorConfig const& executorConfig, std::ostream
su::serialize(executorConfig.getParallelConfig(), os);
su::serialize(executorConfig.getPeftCacheConfig(), os);
su::serialize(executorConfig.getDecodingConfig(), os);
su::serialize(executorConfig.getUseGpuDirectStorage(), os);
su::serialize(executorConfig.getGpuWeightsPercent(), os);
su::serialize(executorConfig.getMaxQueueSize(), os);
su::serialize(executorConfig.getExtendedRuntimePerfKnobConfig(), os);
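
Note that the flag is serialized and deserialized at a fixed position between the decoding config and the GPU weights percent, so ExecutorConfig blobs written by builds without this change will presumably not round-trip through the updated deserializer.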

View File

@ -527,6 +527,7 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m)
.def_readwrite("enable_chunked_context", &tb::TrtGptModelOptionalParams::enableChunkedContext)
.def_readwrite("normalize_log_probs", &tb::TrtGptModelOptionalParams::normalizeLogProbs)
.def_readwrite("decoding_config", &tb::TrtGptModelOptionalParams::decodingConfig)
.def_readwrite("use_gpu_direct_storage", &tb::TrtGptModelOptionalParams::useGpuDirectStorage)
.def_readwrite("gpu_weights_percent", &tb::TrtGptModelOptionalParams::gpuWeightsPercent)
.def_readwrite("max_beam_width", &tb::TrtGptModelOptionalParams::maxBeamWidth)
.def_readwrite("scheduler_config", &tb::TrtGptModelOptionalParams::schedulerConfig)

View File

@ -414,8 +414,9 @@ void initConfigBindings(pybind11::module_& m)
c.getEnableChunkedContext(), c.getNormalizeLogProbs(), c.getIterStatsMaxIterations(),
c.getRequestStatsMaxIterations(), c.getBatchingType(), c.getMaxBatchSize(), c.getMaxNumTokens(),
c.getParallelConfig(), c.getPeftCacheConfig(), c.getLogitsPostProcessorConfig(), c.getDecodingConfig(),
c.getGpuWeightsPercent(), c.getMaxQueueSize(), c.getExtendedRuntimePerfKnobConfig(), c.getDebugConfig(),
c.getRecvPollPeriodMs(), c.getMaxSeqIdleMicroseconds(), c.getSpecDecConfig(), c.getGuidedDecodingConfig(),
c.getUseGpuDirectStorage(), c.getGpuWeightsPercent(), c.getMaxQueueSize(),
c.getExtendedRuntimePerfKnobConfig(), c.getDebugConfig(), c.getRecvPollPeriodMs(),
c.getMaxSeqIdleMicroseconds(), c.getSpecDecConfig(), c.getGuidedDecodingConfig(),
c.getAdditionalModelOutputs(), c.getGatherGenerationLogits(), c.getUseVariableBeamWidthSearch());
auto pickle_tuple = py::make_tuple(cpp_states, py::getattr(self, "__dict__"));
return pickle_tuple;
@ -429,7 +430,7 @@ void initConfigBindings(pybind11::module_& m)
// Restore C++ data
auto cpp_states = state[0].cast<py::tuple>();
if (cpp_states.size() != 25)
if (cpp_states.size() != 26)
{
throw std::runtime_error("Invalid cpp_states!");
}
@ -449,17 +450,18 @@ void initConfigBindings(pybind11::module_& m)
cpp_states[11].cast<std::optional<tle::PeftCacheConfig>>(), // PeftCacheConfig
cpp_states[12].cast<std::optional<tle::LogitsPostProcessorConfig>>(), // LogitsPostProcessorConfig
cpp_states[13].cast<std::optional<tle::DecodingConfig>>(), // DecodingConfig
cpp_states[14].cast<float>(), // GpuWeightsPercent
cpp_states[15].cast<std::optional<SizeType32>>(), // MaxQueueSize
cpp_states[16].cast<tle::ExtendedRuntimePerfKnobConfig>(), // ExtendedRuntimePerfKnobConfig
cpp_states[17].cast<std::optional<tle::DebugConfig>>(), // DebugConfig
cpp_states[18].cast<SizeType32>(), // RecvPollPeriodMs
cpp_states[19].cast<uint64_t>(), // MaxSeqIdleMicroseconds
cpp_states[20].cast<std::optional<tle::SpeculativeDecodingConfig>>(), // SpecDecConfig
cpp_states[21].cast<std::optional<tle::GuidedDecodingConfig>>(), // GuidedDecodingConfig
cpp_states[22].cast<std::optional<std::vector<tle::AdditionalModelOutput>>>(), // AdditionalModelOutputs
cpp_states[23].cast<bool>(), // GatherGenerationLogits
cpp_states[24].cast<bool>() // UseVariableBeamWidthSearch
cpp_states[14].cast<bool>(), // UseGpuDirectStorage
cpp_states[15].cast<float>(), // GpuWeightsPercent
cpp_states[16].cast<std::optional<SizeType32>>(), // MaxQueueSize
cpp_states[17].cast<tle::ExtendedRuntimePerfKnobConfig>(), // ExtendedRuntimePerfKnobConfig
cpp_states[18].cast<std::optional<tle::DebugConfig>>(), // DebugConfig
cpp_states[19].cast<SizeType32>(), // RecvPollPeriodMs
cpp_states[20].cast<uint64_t>(), // MaxSeqIdleMicroseconds
cpp_states[21].cast<std::optional<tle::SpeculativeDecodingConfig>>(), // SpecDecConfig
cpp_states[22].cast<std::optional<tle::GuidedDecodingConfig>>(), // GuidedDecodingConfig
cpp_states[23].cast<std::optional<std::vector<tle::AdditionalModelOutput>>>(), // AdditionalModelOutputs
cpp_states[24].cast<bool>(), // GatherGenerationLogits
cpp_states[25].cast<bool>() // UseVariableBeamWidthSearch
);
auto py_state = state[1].cast<py::dict>();
@ -483,6 +485,7 @@ void initConfigBindings(pybind11::module_& m)
tle::PeftCacheConfig const&, // PeftCacheConfig
std::optional<tle::LogitsPostProcessorConfig>, // LogitsPostProcessorConfig
std::optional<tle::DecodingConfig>, // DecodingConfig
bool, // UseGpuDirectStorage
float, // GpuWeightsPercent
std::optional<SizeType32>, // MaxQueueSize
tle::ExtendedRuntimePerfKnobConfig const&, // ExtendedRuntimePerfKnobConfig
@ -505,7 +508,8 @@ void initConfigBindings(pybind11::module_& m)
py::arg("parallel_config") = py::none(),
py::arg_v("peft_cache_config", tle::PeftCacheConfig(), "PeftCacheConfig()"),
py::arg("logits_post_processor_config") = py::none(), py::arg("decoding_config") = py::none(),
py::arg("gpu_weights_percent") = 1.0, py::arg("max_queue_size") = py::none(),
py::arg("use_gpu_direct_storage") = false, py::arg("gpu_weights_percent") = 1.0,
py::arg("max_queue_size") = py::none(),
py::arg_v("extended_runtime_perf_knob_config", tle::ExtendedRuntimePerfKnobConfig(),
"ExtendedRuntimePerfKnobConfig()"),
py::arg("debug_config") = py::none(), py::arg("recv_poll_period_ms") = 0,
@ -537,6 +541,8 @@ void initConfigBindings(pybind11::module_& m)
&tle::ExecutorConfig::setLogitsPostProcessorConfig)
.def_property(
"decoding_config", &tle::ExecutorConfig::getDecodingConfig, &tle::ExecutorConfig::setDecodingConfig)
.def_property("use_gpu_direct_storage", &tle::ExecutorConfig::getUseGpuDirectStorage,
&tle::ExecutorConfig::setUseGpuDirectStorage)
.def_property("gpu_weights_percent", &tle::ExecutorConfig::getGpuWeightsPercent,
&tle::ExecutorConfig::setGpuWeightsPercent)
.def_property("max_queue_size", &tle::ExecutorConfig::getMaxQueueSize, &tle::ExecutorConfig::setMaxQueueSize)

View File

@ -53,6 +53,7 @@ set(SRCS
statefulGptDecoderBatched.cpp
tllmBuffers.cpp
tllmRuntime.cpp
tllmStreamReaders.cpp
tllmLogger.cpp
transformerBuffers.cpp
workerPool.cpp

View File

@ -83,7 +83,8 @@ GptSession::GptSession(Config const& sessionConfig, ModelConfig const& modelConf
, mWorldConfig{worldConfig}
, mDevice{utils::initDevice(worldConfig)}
, mLogger{logger ? std::move(logger) : std::make_shared<TllmLogger>()}
, mRuntime{std::make_shared<TllmRuntime>(rawEngine, mLogger.get(), sessionConfig.gpuWeightsPercent)}
, mRuntime{std::make_shared<TllmRuntime>(
rawEngine, mLogger.get(), sessionConfig.useGpuDirectStorage, sessionConfig.gpuWeightsPercent)}
, mGatherGenerationLogits{sessionConfig.gatherGenerationLogits}
{
TLLM_LOG_WARNING(

View File

@ -23,6 +23,7 @@
#include "tensorrt_llm/kernels/userbuffers/ub_interface.h"
#include "tensorrt_llm/runtime/utils/mpiUtils.h"
#include "tllmLogger.h"
#include "tllmStreamReaders.h"
#include "nlohmann/json.hpp"
#include <NvInferRuntime.h>
@ -73,36 +74,6 @@ std::vector<std::size_t> dimsToShape(nvinfer1::Dims const& dims)
tensorrt_llm::runtime::TllmLogger defaultLogger{};
class StreamReader final : public nvinfer1::IStreamReader
{
public:
StreamReader(std::filesystem::path fp)
{
mFile.open(fp.string(), std::ios::binary | std::ios::in);
TLLM_CHECK_WITH_INFO(mFile.good(), std::string("Error opening engine file: " + fp.string()));
}
virtual ~StreamReader()
{
if (mFile.is_open())
{
mFile.close();
}
}
int64_t read(void* destination, int64_t nbBytes) final
{
if (!mFile.good())
{
return -1;
}
mFile.read(static_cast<char*>(destination), nbBytes);
return mFile.gcount();
}
std::ifstream mFile;
};
void setWeightStreaming(nvinfer1::ICudaEngine& engine, float const gpuWeightsPercent)
{
if (gpuWeightsPercent < 1)
@ -211,22 +182,34 @@ void assessLikelihoodOfRuntimeAllocation(
numWarnings);
}
}
} // namespace
TllmRuntime::TllmRuntime(
RawEngine const& rawEngine, nvinfer1::ILogger* logger, float gpuWeightsPercent, bool useShapeInference)
TllmRuntime::TllmRuntime(RawEngine const& rawEngine, nvinfer1::ILogger* logger, bool useGpuDirectStorage,
float gpuWeightsPercent, bool useShapeInference)
: mStream(std::make_shared<CudaStream>())
, mBufferManager{mStream, true} // Ensure to trim the memory pool on destruction.
, mRuntime{nvinfer1::createInferRuntime(static_cast<bool>(logger) ? *logger : defaultLogger)}
, mUseShapeInference{useShapeInference}
, mUserBufferEnabled{false}
{
auto const startTime = std::chrono::high_resolution_clock::now();
switch (rawEngine.getType())
{
case RawEngine::Type::FilePath:
{
auto reader = StreamReader(rawEngine.getPath());
mEngine.reset(mRuntime->deserializeCudaEngine(reader));
if (useGpuDirectStorage)
{
TLLM_LOG_INFO("GDS is used to load the engine!");
auto reader = GDSStreamReader(rawEngine.getPath());
mEngine.reset(mRuntime->deserializeCudaEngine(reader));
}
else
{
auto reader = StreamReader(rawEngine.getPath());
mEngine.reset(mRuntime->deserializeCudaEngine(reader));
}
break;
}
case RawEngine::Type::AddressWithSize:
@ -239,6 +222,11 @@ TllmRuntime::TllmRuntime(
default: TLLM_THROW("Unsupported raw engine type.");
}
auto const elapsedMs
= std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - startTime);
TLLM_LOG_INFO("Engine load time %lld ms", elapsedMs);
TLLM_CHECK_WITH_INFO(mEngine != nullptr, "Failed to deserialize cuda engine.");
mEngineInspector.reset(mEngine->createEngineInspector());
assessLikelihoodOfRuntimeAllocation(*mEngine, *mEngineInspector);
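
Engine loading now branches on the flag: the default path keeps the original file-based reader (an nvinfer1::IStreamReader doing host-side ifstream reads), while the GDS path constructs the new GDSStreamReader, an nvinfer1::IStreamReaderV2 whose read() receives the destination pointer and a CUDA stream so cuFile can move engine bytes from storage directly into device memory. Both readers now live in the new tllmStreamReaders.cpp/h added below, and the constructor additionally logs the total engine load time.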

View File

@ -36,8 +36,8 @@ class TllmRuntime
public:
using TensorMap = StringPtrMap<ITensor>;
explicit TllmRuntime(RawEngine const& rawEngine, nvinfer1::ILogger* logger, float gpuWeightsPercent = 1.0f,
bool useShapeInference = true);
explicit TllmRuntime(RawEngine const& rawEngine, nvinfer1::ILogger* logger, bool useGpuDirectStorage = false,
float gpuWeightsPercent = 1.0f, bool useShapeInference = true);
SizeType32 getNbContexts() const
{

View File

@ -0,0 +1,217 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tllmStreamReaders.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/logger.h"
#include <cufile.h>
#include <dlfcn.h>
#include <fcntl.h>
#include <filesystem>
#include <fstream>
#include <string>
#include <unistd.h>
// Non-GDS StreamReader
StreamReader::StreamReader(std::filesystem::path fp)
{
mFile.open(fp.string(), std::ios::binary | std::ios::in);
TLLM_CHECK_WITH_INFO(mFile.good(), std::string("Error opening engine file: " + fp.string()));
}
StreamReader::~StreamReader()
{
if (mFile.is_open())
{
mFile.close();
}
}
int64_t StreamReader::read(void* destination, int64_t nbBytes)
{
if (!mFile.good())
{
return -1;
}
mFile.read(static_cast<char*>(destination), nbBytes);
return mFile.gcount();
}
// StreamReader using GDS
GDSStreamReader::GDSStreamReader(std::filesystem::path const& filePath)
{
auto const start_time = std::chrono::high_resolution_clock::now();
initializeDriver();
auto const elapsed_ms
= std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - start_time);
TLLM_LOG_INFO("GDS driver initialization time %lld ms", elapsed_ms);
open(filePath);
}
bool GDSStreamReader::open(std::string const& filepath)
{
if (!initializeDriver())
{
TLLM_LOG_INFO("Failed to initialize cuFile driver");
return false;
}
int32_t const ret = ::open(filepath.c_str(), O_CREAT | O_RDWR | O_DIRECT, 0664);
if (ret < 0)
{
TLLM_LOG_INFO("Failed to open engine file");
return false;
}
mFd = ret;
mFileSize = lseek(mFd, 0, SEEK_END);
lseek(mFd, 0, SEEK_SET);
CUfileDescr_t fileDescr;
memset((void*) &fileDescr, 0, sizeof(fileDescr));
fileDescr.handle.fd = mFd;
fileDescr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
CUfileError_t gdsStatus = cuFileHandleRegister(&mFileHandle, &fileDescr);
if (gdsStatus.err != CU_FILE_SUCCESS)
{
TLLM_LOG_INFO("Failed to cuFileHandleRegister");
::close(mFd);
return false;
}
return true;
}
void GDSStreamReader::close()
{
if (mFd >= 0)
{
::close(mFd);
mFd = -1;
}
}
GDSStreamReader::~GDSStreamReader()
{
if (mFileHandle)
{
cuFileHandleDeregister(mFileHandle);
mFileHandle = nullptr;
}
if (mDriverInitialized)
{
cuFileDriverClose();
}
}
bool GDSStreamReader::seek(int64_t offset, nvinfer1::SeekPosition where) noexcept
{
switch (where)
{
case nvinfer1::SeekPosition::kSET: mCursor = offset; return true;
case nvinfer1::SeekPosition::kCUR: mCursor += offset; return true;
case nvinfer1::SeekPosition::kEND: mCursor = mFileSize + offset; return true;
default: return false;
}
return true;
}
int64_t GDSStreamReader::read(void* dest, int64_t bytes, cudaStream_t stream) noexcept
{
cudaPointerAttributes attributes{};
if (cudaPointerGetAttributes(&attributes, dest) != cudaSuccess)
{
TLLM_LOG_INFO("cudaPointerGetAttributes failed");
}
off_t destOffset = 0;
void* destBase = dest;
if (attributes.type == cudaMemoryTypeDevice)
{
CUdeviceptr cuDest = reinterpret_cast<CUdeviceptr>(dest);
CUdeviceptr cuBufBase = 0;
size_t cuBufSize = 0;
cuMemGetAddressRange(&cuBufBase, &cuBufSize, cuDest);
destOffset += cuDest - cuBufBase;
destBase = reinterpret_cast<void*>(cuBufBase);
}
cuFileRead(this->mFileHandle, destBase, bytes, mCursor, destOffset);
mCursor += bytes;
return bytes;
}
void GDSStreamReader::reset()
{
lseek(mFd, 0, SEEK_SET);
mCursor = 0;
}
[[nodiscard]] bool GDSStreamReader::isOpen() const
{
bool open = mFd >= 0;
return open;
}
bool GDSStreamReader::initializeDriver()
{
if (mDriverInitialized)
{
return true;
}
mCuFileLibHandle = dlopen("libcufile.so", RTLD_LAZY | RTLD_GLOBAL);
if (!mCuFileLibHandle)
{
TLLM_LOG_INFO("Failed to dlopen libcufile.so");
return false;
}
// Load the required functions
*reinterpret_cast<void**>(&cuFileDriverOpen) = dlsym(mCuFileLibHandle, "cuFileDriverOpen");
*reinterpret_cast<void**>(&cuFileHandleRegister) = dlsym(mCuFileLibHandle, "cuFileHandleRegister");
*reinterpret_cast<void**>(&cuFileHandleDeregister) = dlsym(mCuFileLibHandle, "cuFileHandleDeregister");
*reinterpret_cast<void**>(&cuFileDriverClose) = dlsym(mCuFileLibHandle, "cuFileDriverClose");
*reinterpret_cast<void**>(&cuFileRead) = dlsym(mCuFileLibHandle, "cuFileRead");
if (!cuFileDriverOpen || !cuFileHandleRegister || !cuFileHandleDeregister || !cuFileDriverClose || !cuFileRead)
{
TLLM_LOG_INFO("Failed to dlsym libcufile.so");
return false;
}
CUfileError_t gdsStatus = cuFileDriverOpen();
if (gdsStatus.err != CU_FILE_SUCCESS)
{
TLLM_LOG_INFO("cuFileDriverOpen failed");
return false;
}
mDriverInitialized = true;
return true;
}
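
Two implementation details worth noting: the cuFile symbols are resolved at runtime via dlopen, so libcufile is not a hard link-time dependency, and because cuFileRead takes the registered allocation's base device pointer plus a separate destination offset, read() resolves the base with cuMemGetAddressRange and passes the remainder as devPtr_offset. For reference, a self-contained sketch of the underlying cuFile call sequence; it is not part of this diff, links libcufile directly instead of using dlopen, and uses an illustrative path and read size.

// Hedged sketch of the cuFile flow the GDSStreamReader wraps.
#include <cufile.h>
#include <cuda_runtime.h>
#include <fcntl.h>
#include <unistd.h>
#include <cstdio>

int main()
{
    char const* path = "/path/to/engine"; // illustrative
    int fd = ::open(path, O_RDONLY | O_DIRECT); // GDS requires O_DIRECT
    if (fd < 0)
    {
        std::perror("open");
        return 1;
    }
    if (cuFileDriverOpen().err != CU_FILE_SUCCESS)
    {
        ::close(fd);
        return 1;
    }
    CUfileDescr_t descr{};
    descr.handle.fd = fd;
    descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
    CUfileHandle_t handle{};
    if (cuFileHandleRegister(&handle, &descr).err != CU_FILE_SUCCESS)
    {
        ::close(fd);
        return 1;
    }
    void* devBuf = nullptr;
    size_t const nbytes = 1 << 20; // read the first 1 MiB as an example
    cudaMalloc(&devBuf, nbytes);
    // File bytes land directly in device memory; no host staging buffer.
    ssize_t got = cuFileRead(handle, devBuf, nbytes, /*file_offset=*/0, /*devPtr_offset=*/0);
    std::printf("cuFileRead returned %zd bytes\n", got);
    cudaFree(devBuf);
    cuFileHandleDeregister(handle);
    cuFileDriverClose();
    ::close(fd);
    return 0;
}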

View File

@ -0,0 +1,71 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <NvInferRuntime.h>
#include <cufile.h>
#include <filesystem>
#include <fstream>
class StreamReader final : public nvinfer1::IStreamReader
{
public:
StreamReader(std::filesystem::path fp);
virtual ~StreamReader();
int64_t read(void* destination, int64_t nbBytes) final;
private:
std::ifstream mFile;
};
class GDSStreamReader final : public nvinfer1::IStreamReaderV2
{
public:
explicit GDSStreamReader(std::filesystem::path const& filePath);
virtual ~GDSStreamReader();
void close();
[[nodiscard]] bool isOpen() const;
bool open(std::string const& filepath);
int64_t read(void* dest, int64_t bytes, cudaStream_t stream) noexcept final;
void reset();
bool seek(int64_t offset, nvinfer1::SeekPosition where) noexcept final;
private:
bool initializeDriver();
void* mCuFileLibHandle{};
CUfileHandle_t mFileHandle{nullptr};
bool mDriverInitialized{false};
int32_t mFd{-1};
int64_t mCursor{0};
int64_t mFileSize{0};
CUfileError_t (*cuFileDriverOpen)(){};
CUfileError_t (*cuFileHandleRegister)(CUfileHandle_t*, CUfileDescr_t*){};
CUfileError_t (*cuFileHandleDeregister)(CUfileHandle_t){};
CUfileError_t (*cuFileDriverClose)(){};
ssize_t (*cuFileRead)(CUfileHandle_t, void*, size_t, int64_t, int64_t){};
};

View File

@ -101,7 +101,7 @@ std::unique_ptr<DecoderTestShared<TLogits>> SetupDecoderTest(TrivialConstantDeco
logger, modelConfig, worldConfig, engine, false, optionalParams);
auto const executorConfig = tensorrt_llm::executor::ExecutorConfig(params.maxBeamWidth, executor::SchedulerConfig(),
executor::KvCacheConfig{}, true, true, 1, 1, executor::BatchingType::kINFLIGHT, params.maxBatchSize,
params.maxNumTokens, std::nullopt, std::nullopt, std::nullopt, std::nullopt, 1, std::nullopt,
params.maxNumTokens, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, 1, std::nullopt,
executor::ExtendedRuntimePerfKnobConfig(), std::nullopt, 0,
executor::ExecutorConfig::kDefaultMaxSeqIdleMicroseconds, std::nullopt, std::nullopt);
return std::make_unique<DecoderTestShared<TLogits>>(

View File

@ -123,8 +123,8 @@ std::unique_ptr<DecoderTestShared<TLogits>> SetupDecoderTest(
auto const executorConfig
= executor::ExecutorConfig(params.maxBeamWidth, executor::SchedulerConfig(), kvCacheConfig, true, true, 1, 1,
executor::BatchingType::kINFLIGHT, params.maxBatchSize, params.maxNumTokens, std::nullopt, std::nullopt,
std::nullopt, std::nullopt, 1, std::nullopt, executor::ExtendedRuntimePerfKnobConfig(), std::nullopt, 0,
executor::ExecutorConfig::kDefaultMaxSeqIdleMicroseconds, std::nullopt, std::nullopt,
std::nullopt, std::nullopt, false, 1, std::nullopt, executor::ExtendedRuntimePerfKnobConfig(), std::nullopt,
0, executor::ExecutorConfig::kDefaultMaxSeqIdleMicroseconds, std::nullopt, std::nullopt,
std::vector<executor::AdditionalModelOutput>{
executor::AdditionalModelOutput{DecoderTestShared<TLogits>::kTopKTensorName, params.gatherContext}});

View File

@ -766,8 +766,8 @@ TEST(SerializeUtilsTest, ExecutorConfig)
texec::KvCacheConfig(true), true, false, 500, 200, texec::BatchingType::kSTATIC, 128, 64,
texec::ParallelConfig(texec::CommunicationType::kMPI, texec::CommunicationMode::kORCHESTRATOR),
texec::PeftCacheConfig(10), std::nullopt,
texec::DecodingConfig(texec::DecodingMode::Lookahead(), texec::LookaheadDecodingConfig(3, 5, 7)), 0.5f, 8,
texec::ExtendedRuntimePerfKnobConfig(true), texec::DebugConfig(true), 60000000, 180000000,
texec::DecodingConfig(texec::DecodingMode::Lookahead(), texec::LookaheadDecodingConfig(3, 5, 7)), false, 0.5f,
8, texec::ExtendedRuntimePerfKnobConfig(true), texec::DebugConfig(true), 60000000, 180000000,
texec::SpeculativeDecodingConfig(true),
texec::GuidedDecodingConfig(
texec::GuidedDecodingConfig::GuidedDecodingBackend::kXGRAMMAR, std::initializer_list<std::string>{"eos"}));
@ -788,6 +788,7 @@ TEST(SerializeUtilsTest, ExecutorConfig)
executorConfig2.getParallelConfig().value().getCommunicationMode());
EXPECT_EQ(executorConfig.getPeftCacheConfig(), executorConfig2.getPeftCacheConfig());
EXPECT_EQ(executorConfig.getDecodingConfig(), executorConfig2.getDecodingConfig());
EXPECT_EQ(executorConfig.getUseGpuDirectStorage(), executorConfig2.getUseGpuDirectStorage());
EXPECT_EQ(executorConfig.getGpuWeightsPercent(), executorConfig2.getGpuWeightsPercent());
EXPECT_EQ(executorConfig.getMaxQueueSize(), executorConfig2.getMaxQueueSize());
EXPECT_EQ(executorConfig.getExtendedRuntimePerfKnobConfig(), executorConfig2.getExtendedRuntimePerfKnobConfig());

View File

@ -91,7 +91,7 @@ protected:
TEST_F(TllmRuntimeTest, SinglePass)
{
EXPECT_TRUE(mSerializedEngine);
TllmRuntime rt{RawEngine(mSerializedEngine.get()), &mLogger, 1.0F};
TllmRuntime rt{RawEngine(mSerializedEngine.get()), &mLogger, false, 1.0F};
auto& engine = rt.getEngine();
EXPECT_FALSE(engine.hasImplicitBatchDimension());
EXPECT_EQ(rt.getNbProfiles(), engine.getNbOptimizationProfiles());

View File

@ -536,7 +536,8 @@ def main(args):
enable_chunked_context=args.enable_chunked_context,
multi_block_mode=args.multi_block_mode,
cuda_graph_mode=args.cuda_graph_mode,
gather_generation_logits=args.eval_ppl)
gather_generation_logits=args.eval_ppl,
use_gpu_direct_storage=args.use_gpu_direct_storage)
runner_kwargs.update(
enable_context_fmha_fp32_acc=args.enable_context_fmha_fp32_acc)
if args.prompt_lookup_config is not None:
@ -867,6 +868,10 @@ if __name__ == '__main__':
help=
"evaluate.load('rouge') will attempt to pull rouge package from HF. Use cached rouge can avoid network outage of host or HF."
)
parser.add_argument("--use_gpu_direct_storage",
default=False,
action="store_true",
help="Use GPUDirect Storage (GDS) to load the engine")
parser = add_common_args(parser)
args = parser.parse_args()

View File

@ -108,6 +108,7 @@ class ModelRunnerCpp(ModelRunnerMixin):
lookahead_config: list[int] | None = None,
debug_mode: bool = False,
lora_ckpt_source: str = "hf",
use_gpu_direct_storage: bool = False,
gpu_weights_percent: float = 1,
max_tokens_in_paged_kv_cache: int | None = None,
kv_cache_enable_block_reuse: bool = False,
@ -385,6 +386,7 @@ class ModelRunnerCpp(ModelRunnerMixin):
decoding_config=decoding_config,
peft_cache_config=peft_cache_config,
debug_config=debug_config,
use_gpu_direct_storage=use_gpu_direct_storage,
gpu_weights_percent=gpu_weights_percent,
gather_generation_logits=gather_generation_logits,
use_variable_beam_width_search=use_variable_beam_width_search,

View File

@ -1526,6 +1526,7 @@ def test_executor_config():
assert config.additional_model_outputs is None
assert config.gather_generation_logits is False
assert config.use_variable_beam_width_search is False
assert config.use_gpu_direct_storage is False
kwargs = {
"max_beam_width":
@ -1575,6 +1576,8 @@ def test_executor_config():
"gather_generation_logits":
True,
"use_variable_beam_width_search":
True,
"use_gpu_direct_storage":
True
}
config = trtllm.ExecutorConfig(**kwargs)
@ -1599,6 +1602,7 @@ def test_executor_config():
assert config.additional_model_outputs[0].gather_context is False
assert config.gather_generation_logits is True
assert config.use_variable_beam_width_search is True
assert config.use_gpu_direct_storage is True
def test_parallel_config():
@ -2354,6 +2358,7 @@ def test_executor_config_pickle():
assert config.max_seq_idle_microseconds == config_copy.max_seq_idle_microseconds
assert config.backend == config_copy.backend
assert config.spec_dec_config.fast_logits == config_copy.spec_dec_config.fast_logits
assert config.use_gpu_direct_storage == config_copy.use_gpu_direct_storage
def test_return_full_tokens():