feat: Integrate GPUDirect Storage (GDS) into Executor API (#3582)

* feat: Integrate GPUDirect Storage (GDS) into Executor API

Squash of several dev commits

Signed-off-by: Dom Brown <3886319+DomBrown@users.noreply.github.com>
Dom Brown 2025-04-18 08:59:21 +01:00 committed by GitHub
parent 90a28b917f
commit dbd9a83b0d
GPG Key ID: B5690EEEBB952194
23 changed files with 410 additions and 82 deletions
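
For orientation, a minimal usage sketch of the new knob at the Executor API level. This is not part of the diff; the include path, engine directory, and Executor construction are assumptions based on the existing public API.

// Hedged sketch: enable GDS engine loading through ExecutorConfig.
#include "tensorrt_llm/executor/executor.h"

namespace tle = tensorrt_llm::executor;

int main()
{
    tle::ExecutorConfig config;
    // New in this change; defaults to false, so existing behaviour is unchanged.
    config.setUseGpuDirectStorage(true);

    // "/path/to/engine_dir" is illustrative.
    tle::Executor executor("/path/to/engine_dir", tle::ModelType::kDECODER_ONLY, config);
    // ... enqueue requests as usual ...
    return 0;
}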

View File

@ -74,15 +74,17 @@ std::string engineFilename(
}
void benchmarkBert(std::string const& modelName, std::filesystem::path const& dataPath,
std::vector<int> const& batchSizes, std::vector<int> const& inLens, std::vector<float> const& gpuWeightsPercents,
std::shared_ptr<nvinfer1::ILogger> const& logger, int warmUp, int numRuns, int duration)
std::vector<int> const& batchSizes, std::vector<int> const& inLens, bool useGpuDirectStorage,
std::vector<float> const& gpuWeightsPercents, std::shared_ptr<nvinfer1::ILogger> const& logger, int warmUp,
int numRuns, int duration)
{
auto const worldConfig = WorldConfig::mpi();
auto const enginePath = dataPath / engineFilename(dataPath, worldConfig, modelName);
for (float gpuWeightsPercent : gpuWeightsPercents)
{
auto rt = std::make_shared<TllmRuntime>(RawEngine(enginePath), logger.get(), gpuWeightsPercent);
auto rt = std::make_shared<TllmRuntime>(
RawEngine(enginePath), logger.get(), useGpuDirectStorage, gpuWeightsPercent);
rt->addContext(0);
for (auto inLen : inLens)
{
@ -174,6 +176,8 @@ int main(int argc, char* argv[])
"by \";\", "
"example: \"0.0;0.5;1.0\".",
cxxopts::value<std::string>()->default_value("1.0"));
options.add_options()("use_gpu_direct_storage", "Enable GPUDirect Storage (GDS) for loading engine.",
cxxopts::value<bool>()->default_value("false"));
auto result = options.parse(argc, argv);
@ -258,8 +262,8 @@ int main(int argc, char* argv[])
try
{
benchmarkBert(result["model"].as<std::string>(), result["engine_dir"].as<std::string>(), batchSizes, inLens,
gpuWeightsPercents, logger, result["warm_up"].as<int>(), result["num_runs"].as<int>(),
result["duration"].as<int>());
result["use_gpu_direct_storage"].as<bool>(), gpuWeightsPercents, logger, result["warm_up"].as<int>(),
result["num_runs"].as<int>(), result["duration"].as<int>());
}
catch (std::exception const& e)
{
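
With the flag wired through, the BERT benchmark can presumably be launched with --use_gpu_direct_storage (cxxopts treats a bare boolean flag as true) to exercise the GDS loading path end to end; omitting it keeps the existing ifstream-based loading.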

View File

@ -41,9 +41,9 @@ public:
std::optional<std::vector<SizeType32>> deviceIds = std::nullopt, bool normalizeLogProbs = true,
bool enableChunkedContext = true,
PeftCacheManagerConfig const& peftCacheManagerConfig = PeftCacheManagerConfig{},
executor::DecodingConfig decodingConfig = executor::DecodingConfig{}, float gpuWeightsPercent = 1,
std::optional<SizeType32> maxBeamWidth = std::nullopt, std::optional<SizeType32> maxBatchSize = std::nullopt,
std::optional<SizeType32> maxNumTokens = std::nullopt,
executor::DecodingConfig decodingConfig = executor::DecodingConfig{}, bool useGpuDirectStorage = false,
float gpuWeightsPercent = 1, std::optional<SizeType32> maxBeamWidth = std::nullopt,
std::optional<SizeType32> maxBatchSize = std::nullopt, std::optional<SizeType32> maxNumTokens = std::nullopt,
executor::SchedulerConfig schedulerConfig = executor::SchedulerConfig{},
executor::ExtendedRuntimePerfKnobConfig const& extendedRuntimePerfKnobConfig
= executor::ExtendedRuntimePerfKnobConfig{},
@ -61,6 +61,7 @@ public:
, enableChunkedContext{enableChunkedContext}
, peftCacheManagerConfig(peftCacheManagerConfig)
, decodingConfig(std::move(decodingConfig))
, useGpuDirectStorage(useGpuDirectStorage)
, gpuWeightsPercent(gpuWeightsPercent)
, maxBeamWidth(maxBeamWidth)
, maxBatchSize(maxBatchSize)
@ -87,12 +88,12 @@ public:
executorConfig.getNormalizeLogProbs(), executorConfig.getEnableChunkedContext(),
PeftCacheManagerConfig(executorConfig.getPeftCacheConfig().value_or(executor::PeftCacheConfig())),
executorConfig.getDecodingConfig().value_or(executor::DecodingConfig{}),
executorConfig.getGpuWeightsPercent(), executorConfig.getMaxBeamWidth(), executorConfig.getMaxBatchSize(),
executorConfig.getMaxNumTokens(), executorConfig.getSchedulerConfig(),
executorConfig.getExtendedRuntimePerfKnobConfig(), executorConfig.getDebugConfig(),
executorConfig.getMaxSeqIdleMicroseconds(), executorConfig.getSpecDecConfig(),
executorConfig.getGuidedDecodingConfig(), isLeaderInOrchMode, executorConfig.getAdditionalModelOutputs(),
executorConfig.getGatherGenerationLogits())
executorConfig.getUseGpuDirectStorage(), executorConfig.getGpuWeightsPercent(),
executorConfig.getMaxBeamWidth(), executorConfig.getMaxBatchSize(), executorConfig.getMaxNumTokens(),
executorConfig.getSchedulerConfig(), executorConfig.getExtendedRuntimePerfKnobConfig(),
executorConfig.getDebugConfig(), executorConfig.getMaxSeqIdleMicroseconds(),
executorConfig.getSpecDecConfig(), executorConfig.getGuidedDecodingConfig(), isLeaderInOrchMode,
executorConfig.getAdditionalModelOutputs(), executorConfig.getGatherGenerationLogits())
{
}
@ -106,6 +107,8 @@ public:
bool enableChunkedContext;
PeftCacheManagerConfig peftCacheManagerConfig;
executor::DecodingConfig decodingConfig;
// Use GDS to load the engines?
bool useGpuDirectStorage;
// Percentage of weights on the gpu at runtime
float gpuWeightsPercent;
std::optional<SizeType32> maxBeamWidth;
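
The flag is carried on TrtGptModelOptionalParams so the batching backends further down (TrtEncoderModel, TrtGptModelInflightBatching) can forward it into TllmRuntime; GptSession's Config gains the same field below but keeps it hard-coded to false.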

View File

@ -1400,8 +1400,8 @@ public:
std::optional<ParallelConfig> parallelConfig = std::nullopt,
std::optional<PeftCacheConfig> const& peftCacheConfig = std::nullopt,
std::optional<LogitsPostProcessorConfig> logitsPostProcessorConfig = std::nullopt,
std::optional<DecodingConfig> decodingConfig = std::nullopt, float gpuWeightsPercent = 1,
std::optional<SizeType32> maxQueueSize = std::nullopt,
std::optional<DecodingConfig> decodingConfig = std::nullopt, bool useGpuDirectStorage = false,
float gpuWeightsPercent = 1, std::optional<SizeType32> maxQueueSize = std::nullopt,
ExtendedRuntimePerfKnobConfig const& extendedRuntimePerfKnobConfig = ExtendedRuntimePerfKnobConfig(),
std::optional<DebugConfig> debugConfig = std::nullopt, SizeType32 recvPollPeriodMs = 0,
uint64_t maxSeqIdleMicroseconds = kDefaultMaxSeqIdleMicroseconds,
@ -1429,6 +1429,7 @@ public:
[[nodiscard]] std::optional<PeftCacheConfig> getPeftCacheConfig() const;
[[nodiscard]] std::optional<LogitsPostProcessorConfig> getLogitsPostProcessorConfig() const;
[[nodiscard]] std::optional<DecodingConfig> getDecodingConfig() const;
[[nodiscard]] bool getUseGpuDirectStorage() const;
[[nodiscard]] float getGpuWeightsPercent() const;
[[nodiscard]] std::optional<SizeType32> getMaxQueueSize() const;
[[nodiscard]] ExtendedRuntimePerfKnobConfig getExtendedRuntimePerfKnobConfig() const;
@ -1455,6 +1456,7 @@ public:
void setPeftCacheConfig(PeftCacheConfig const& peftCacheConfig);
void setLogitsPostProcessorConfig(LogitsPostProcessorConfig const& logitsPostProcessorConfig);
void setDecodingConfig(DecodingConfig const& decodingConfig);
void setUseGpuDirectStorage(bool const& useGpuDirectStorage);
void setGpuWeightsPercent(float const& gpuWeightsPercent);
void setMaxQueueSize(std::optional<SizeType32> const& maxQueueSize);
void setExtendedRuntimePerfKnobConfig(ExtendedRuntimePerfKnobConfig const& extendedRuntimePerfKnobConfig);
@ -1510,6 +1512,9 @@ private:
/// @brief Decoding configuration.
std::optional<DecodingConfig> mDecodingConfig;
/// @brief Enable/disable use of GPU Direct Storage (GDS) to load engines.
bool mUseGpuDirectStorage;
/// @brief GPU weights percent for weight streaming.
float mGpuWeightsPercent;

View File

@ -99,6 +99,9 @@ public:
SizeType32 maxBeamWidth;
// The length of the longest input sequence
SizeType32 maxSequenceLength;
// Enable/disable GPUDirectStorage
// Not supported by GptSession so hard-coded as false
bool useGpuDirectStorage{false};
// Percentage of weights on the gpu at runtime
float gpuWeightsPercent;
// Whether the session will use a different decoder per request.

View File

@ -45,7 +45,8 @@ TrtEncoderModel::TrtEncoderModel(runtime::ModelConfig const& modelConfig, WorldC
, mWorldConfig{worldConfig}
, mDevice{runtime::utils::initDevice(worldConfig)}
, mLogger{logger ? std::move(logger) : std::make_shared<TllmLogger>()}
, mRuntime{std::make_shared<TllmRuntime>(rawEngine, mLogger.get(), optionalParams.gpuWeightsPercent)}
, mRuntime{std::make_shared<TllmRuntime>(
rawEngine, mLogger.get(), optionalParams.useGpuDirectStorage, optionalParams.gpuWeightsPercent)}
, mMicroBatchId(0)
, mCopyBufferManager{std::make_shared<CudaStream>()}
{

View File

@ -138,8 +138,8 @@ TrtGptModelInflightBatching::TrtGptModelInflightBatching(std::shared_ptr<nvinfer
, mDebugConfig{optionalParams.debugConfig}
, mAdditionalModelOutputs{optionalParams.additionalModelOutputs}
, mLogger{logger ? std::move(logger) : std::make_shared<TllmLogger>()}
, mRuntime{std::make_shared<TllmRuntime>(
rawEngine, mLogger.get(), optionalParams.gpuWeightsPercent, modelConfig.useShapeInference())}
, mRuntime{std::make_shared<TllmRuntime>(rawEngine, mLogger.get(), optionalParams.useGpuDirectStorage,
optionalParams.gpuWeightsPercent, modelConfig.useShapeInference())}
, mCopyBufferManager{std::make_shared<CudaStream>()}
, mCtxGenFusion(ctxGenFusion)
, mOperatingBeamWidth{getMaxBeamWidth()}

View File

@ -28,7 +28,7 @@ ExecutorConfig::ExecutorConfig(SizeType32 maxBeamWidth, SchedulerConfig schedule
std::optional<SizeType32> maxNumTokens, std::optional<ParallelConfig> parallelConfig,
std::optional<PeftCacheConfig> const& peftCacheConfig,
std::optional<LogitsPostProcessorConfig> logitsPostProcessorConfig, std::optional<DecodingConfig> decodingConfig,
float gpuWeightPercent, std::optional<SizeType32> maxQueueSize,
bool useGpuDirectStorage, float gpuWeightPercent, std::optional<SizeType32> maxQueueSize,
ExtendedRuntimePerfKnobConfig const& extendedRuntimePerfKnobConfig, std::optional<DebugConfig> debugConfig,
SizeType32 recvPollPeriodMs, uint64_t maxSeqIdleMicroseconds,
std::optional<SpeculativeDecodingConfig> specDecConfig, std::optional<GuidedDecodingConfig> guidedDecodingConfig,
@ -48,6 +48,7 @@ ExecutorConfig::ExecutorConfig(SizeType32 maxBeamWidth, SchedulerConfig schedule
, mPeftCacheConfig(peftCacheConfig)
, mLogitsPostProcessorConfig(std::move(logitsPostProcessorConfig))
, mDecodingConfig(std::move(decodingConfig))
, mUseGpuDirectStorage((useGpuDirectStorage))
, mGpuWeightsPercent(gpuWeightPercent)
, mMaxQueueSize(maxQueueSize)
, mExtendedRuntimePerfKnobConfig(extendedRuntimePerfKnobConfig)
@ -146,6 +147,11 @@ std::optional<DecodingConfig> ExecutorConfig::getDecodingConfig() const
return mDecodingConfig;
}
bool ExecutorConfig::getUseGpuDirectStorage() const
{
return mUseGpuDirectStorage;
}
float ExecutorConfig::getGpuWeightsPercent() const
{
return mGpuWeightsPercent;
@ -276,6 +282,11 @@ void ExecutorConfig::setDecodingConfig(DecodingConfig const& decodingConfig)
mDecodingConfig = decodingConfig;
}
void ExecutorConfig::setUseGpuDirectStorage(bool const& useGpuDirectStorage)
{
mUseGpuDirectStorage = useGpuDirectStorage;
}
void ExecutorConfig::setGpuWeightsPercent(float const& gpuWeightsPercent)
{
mGpuWeightsPercent = gpuWeightsPercent;

View File

@ -978,6 +978,7 @@ ExecutorConfig Serialization::deserializeExecutorConfig(std::istream& is)
auto parallelConfig = su::deserializeWithGetterType<decltype(&ExecutorConfig::getParallelConfig)>(is);
auto peftCacheConfig = su::deserializeWithGetterType<decltype(&ExecutorConfig::getPeftCacheConfig)>(is);
auto decodingConfig = su::deserializeWithGetterType<decltype(&ExecutorConfig::getDecodingConfig)>(is);
auto useGpuDirectStorage = su::deserializeWithGetterType<decltype(&ExecutorConfig::getUseGpuDirectStorage)>(is);
auto gpuWeightsPercent = su::deserializeWithGetterType<decltype(&ExecutorConfig::getGpuWeightsPercent)>(is);
auto maxQueueSize = su::deserializeWithGetterType<decltype(&ExecutorConfig::getMaxQueueSize)>(is);
auto extendedRuntimePerfKnobConfig
@ -995,9 +996,9 @@ ExecutorConfig Serialization::deserializeExecutorConfig(std::istream& is)
return ExecutorConfig{maxBeamWidth, schedulerConfig, kvCacheConfig, enableChunkedContext, normalizeLogProbs,
iterStatsMaxIterations, requestStatsMaxIterations, batchingType, maxBatchSize, maxNumTokens, parallelConfig,
peftCacheConfig, std::nullopt, decodingConfig, gpuWeightsPercent, maxQueueSize, extendedRuntimePerfKnobConfig,
debugConfig, recvPollPeriodMs, maxSeqIdleMicroseconds, specDecConfig, guidedDecodingConfig,
additionalModelOutputs, gatherGenerationLogits};
peftCacheConfig, std::nullopt, decodingConfig, useGpuDirectStorage, gpuWeightsPercent, maxQueueSize,
extendedRuntimePerfKnobConfig, debugConfig, recvPollPeriodMs, maxSeqIdleMicroseconds, specDecConfig,
guidedDecodingConfig, additionalModelOutputs, gatherGenerationLogits};
}
size_t Serialization::serializedSize(ExecutorConfig const& executorConfig)
@ -1020,6 +1021,7 @@ size_t Serialization::serializedSize(ExecutorConfig const& executorConfig)
totalSize += su::serializedSize(executorConfig.getParallelConfig());
totalSize += su::serializedSize(executorConfig.getPeftCacheConfig());
totalSize += su::serializedSize(executorConfig.getDecodingConfig());
totalSize += su::serializedSize(executorConfig.getUseGpuDirectStorage());
totalSize += su::serializedSize(executorConfig.getGpuWeightsPercent());
totalSize += su::serializedSize(executorConfig.getMaxQueueSize());
totalSize += su::serializedSize(executorConfig.getExtendedRuntimePerfKnobConfig());
@ -1052,6 +1054,7 @@ void Serialization::serialize(ExecutorConfig const& executorConfig, std::ostream
su::serialize(executorConfig.getParallelConfig(), os);
su::serialize(executorConfig.getPeftCacheConfig(), os);
su::serialize(executorConfig.getDecodingConfig(), os);
su::serialize(executorConfig.getUseGpuDirectStorage(), os);
su::serialize(executorConfig.getGpuWeightsPercent(), os);
su::serialize(executorConfig.getMaxQueueSize(), os);
su::serialize(executorConfig.getExtendedRuntimePerfKnobConfig(), os);
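
Note that the flag is serialized and deserialized at a fixed position between the decoding config and the GPU weights percent, so ExecutorConfig blobs written by builds without this change will presumably not round-trip through the updated deserializer.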

View File

@ -527,6 +527,7 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m)
.def_readwrite("enable_chunked_context", &tb::TrtGptModelOptionalParams::enableChunkedContext)
.def_readwrite("normalize_log_probs", &tb::TrtGptModelOptionalParams::normalizeLogProbs)
.def_readwrite("decoding_config", &tb::TrtGptModelOptionalParams::decodingConfig)
.def_readwrite("use_gpu_direct_storage", &tb::TrtGptModelOptionalParams::useGpuDirectStorage)
.def_readwrite("gpu_weights_percent", &tb::TrtGptModelOptionalParams::gpuWeightsPercent)
.def_readwrite("max_beam_width", &tb::TrtGptModelOptionalParams::maxBeamWidth)
.def_readwrite("scheduler_config", &tb::TrtGptModelOptionalParams::schedulerConfig)

View File

@ -414,8 +414,9 @@ void initConfigBindings(pybind11::module_& m)
c.getEnableChunkedContext(), c.getNormalizeLogProbs(), c.getIterStatsMaxIterations(),
c.getRequestStatsMaxIterations(), c.getBatchingType(), c.getMaxBatchSize(), c.getMaxNumTokens(),
c.getParallelConfig(), c.getPeftCacheConfig(), c.getLogitsPostProcessorConfig(), c.getDecodingConfig(),
c.getGpuWeightsPercent(), c.getMaxQueueSize(), c.getExtendedRuntimePerfKnobConfig(), c.getDebugConfig(),
c.getRecvPollPeriodMs(), c.getMaxSeqIdleMicroseconds(), c.getSpecDecConfig(), c.getGuidedDecodingConfig(),
c.getUseGpuDirectStorage(), c.getGpuWeightsPercent(), c.getMaxQueueSize(),
c.getExtendedRuntimePerfKnobConfig(), c.getDebugConfig(), c.getRecvPollPeriodMs(),
c.getMaxSeqIdleMicroseconds(), c.getSpecDecConfig(), c.getGuidedDecodingConfig(),
c.getAdditionalModelOutputs(), c.getGatherGenerationLogits(), c.getUseVariableBeamWidthSearch());
auto pickle_tuple = py::make_tuple(cpp_states, py::getattr(self, "__dict__"));
return pickle_tuple;
@ -429,7 +430,7 @@ void initConfigBindings(pybind11::module_& m)
// Restore C++ data
auto cpp_states = state[0].cast<py::tuple>();
if (cpp_states.size() != 25)
if (cpp_states.size() != 26)
{
throw std::runtime_error("Invalid cpp_states!");
}
@ -449,17 +450,18 @@ void initConfigBindings(pybind11::module_& m)
cpp_states[11].cast<std::optional<tle::PeftCacheConfig>>(), // PeftCacheConfig
cpp_states[12].cast<std::optional<tle::LogitsPostProcessorConfig>>(), // LogitsPostProcessorConfig
cpp_states[13].cast<std::optional<tle::DecodingConfig>>(), // DecodingConfig
cpp_states[14].cast<float>(), // GpuWeightsPercent
cpp_states[15].cast<std::optional<SizeType32>>(), // MaxQueueSize
cpp_states[16].cast<tle::ExtendedRuntimePerfKnobConfig>(), // ExtendedRuntimePerfKnobConfig
cpp_states[17].cast<std::optional<tle::DebugConfig>>(), // DebugConfig
cpp_states[18].cast<SizeType32>(), // RecvPollPeriodMs
cpp_states[19].cast<uint64_t>(), // MaxSeqIdleMicroseconds
cpp_states[20].cast<std::optional<tle::SpeculativeDecodingConfig>>(), // SpecDecConfig
cpp_states[21].cast<std::optional<tle::GuidedDecodingConfig>>(), // GuidedDecodingConfig
cpp_states[22].cast<std::optional<std::vector<tle::AdditionalModelOutput>>>(), // AdditionalModelOutputs
cpp_states[23].cast<bool>(), // GatherGenerationLogits
cpp_states[24].cast<bool>() // UseVariableBeamWidthSearch
cpp_states[14].cast<bool>(), // UseGpuDirectStorage
cpp_states[15].cast<float>(), // GpuWeightsPercent
cpp_states[16].cast<std::optional<SizeType32>>(), // MaxQueueSize
cpp_states[17].cast<tle::ExtendedRuntimePerfKnobConfig>(), // ExtendedRuntimePerfKnobConfig
cpp_states[18].cast<std::optional<tle::DebugConfig>>(), // DebugConfig
cpp_states[19].cast<SizeType32>(), // RecvPollPeriodMs
cpp_states[20].cast<uint64_t>(), // MaxSeqIdleMicroseconds
cpp_states[21].cast<std::optional<tle::SpeculativeDecodingConfig>>(), // SpecDecConfig
cpp_states[22].cast<std::optional<tle::GuidedDecodingConfig>>(), // GuidedDecodingConfig
cpp_states[23].cast<std::optional<std::vector<tle::AdditionalModelOutput>>>(), // AdditionalModelOutputs
cpp_states[24].cast<bool>(), // GatherGenerationLogits
cpp_states[25].cast<bool>() // UseVariableBeamWidthSearch
);
auto py_state = state[1].cast<py::dict>();
@ -483,6 +485,7 @@ void initConfigBindings(pybind11::module_& m)
tle::PeftCacheConfig const&, // PeftCacheConfig
std::optional<tle::LogitsPostProcessorConfig>, // LogitsPostProcessorConfig
std::optional<tle::DecodingConfig>, // DecodingConfig
bool, // UseGpuDirectStorage
float, // GpuWeightsPercent
std::optional<SizeType32>, // MaxQueueSize
tle::ExtendedRuntimePerfKnobConfig const&, // ExtendedRuntimePerfKnobConfig
@ -505,7 +508,8 @@ void initConfigBindings(pybind11::module_& m)
py::arg("parallel_config") = py::none(),
py::arg_v("peft_cache_config", tle::PeftCacheConfig(), "PeftCacheConfig()"),
py::arg("logits_post_processor_config") = py::none(), py::arg("decoding_config") = py::none(),
py::arg("gpu_weights_percent") = 1.0, py::arg("max_queue_size") = py::none(),
py::arg("use_gpu_direct_storage") = false, py::arg("gpu_weights_percent") = 1.0,
py::arg("max_queue_size") = py::none(),
py::arg_v("extended_runtime_perf_knob_config", tle::ExtendedRuntimePerfKnobConfig(),
"ExtendedRuntimePerfKnobConfig()"),
py::arg("debug_config") = py::none(), py::arg("recv_poll_period_ms") = 0,
@ -537,6 +541,8 @@ void initConfigBindings(pybind11::module_& m)
&tle::ExecutorConfig::setLogitsPostProcessorConfig)
.def_property(
"decoding_config", &tle::ExecutorConfig::getDecodingConfig, &tle::ExecutorConfig::setDecodingConfig)
.def_property("use_gpu_direct_storage", &tle::ExecutorConfig::getUseGpuDirectStorage,
&tle::ExecutorConfig::setUseGpuDirectStorage)
.def_property("gpu_weights_percent", &tle::ExecutorConfig::getGpuWeightsPercent,
&tle::ExecutorConfig::setGpuWeightsPercent)
.def_property("max_queue_size", &tle::ExecutorConfig::getMaxQueueSize, &tle::ExecutorConfig::setMaxQueueSize)

View File

@ -53,6 +53,7 @@ set(SRCS
statefulGptDecoderBatched.cpp
tllmBuffers.cpp
tllmRuntime.cpp
tllmStreamReaders.cpp
tllmLogger.cpp
transformerBuffers.cpp
workerPool.cpp

View File

@ -83,7 +83,8 @@ GptSession::GptSession(Config const& sessionConfig, ModelConfig const& modelConf
, mWorldConfig{worldConfig}
, mDevice{utils::initDevice(worldConfig)}
, mLogger{logger ? std::move(logger) : std::make_shared<TllmLogger>()}
, mRuntime{std::make_shared<TllmRuntime>(rawEngine, mLogger.get(), sessionConfig.gpuWeightsPercent)}
, mRuntime{std::make_shared<TllmRuntime>(
rawEngine, mLogger.get(), sessionConfig.useGpuDirectStorage, sessionConfig.gpuWeightsPercent)}
, mGatherGenerationLogits{sessionConfig.gatherGenerationLogits}
{
TLLM_LOG_WARNING(

View File

@ -23,6 +23,7 @@
#include "tensorrt_llm/kernels/userbuffers/ub_interface.h"
#include "tensorrt_llm/runtime/utils/mpiUtils.h"
#include "tllmLogger.h"
#include "tllmStreamReaders.h"
#include "nlohmann/json.hpp"
#include <NvInferRuntime.h>
@ -73,36 +74,6 @@ std::vector<std::size_t> dimsToShape(nvinfer1::Dims const& dims)
tensorrt_llm::runtime::TllmLogger defaultLogger{};
class StreamReader final : public nvinfer1::IStreamReader
{
public:
StreamReader(std::filesystem::path fp)
{
mFile.open(fp.string(), std::ios::binary | std::ios::in);
TLLM_CHECK_WITH_INFO(mFile.good(), std::string("Error opening engine file: " + fp.string()));
}
virtual ~StreamReader()
{
if (mFile.is_open())
{
mFile.close();
}
}
int64_t read(void* destination, int64_t nbBytes) final
{
if (!mFile.good())
{
return -1;
}
mFile.read(static_cast<char*>(destination), nbBytes);
return mFile.gcount();
}
std::ifstream mFile;
};
void setWeightStreaming(nvinfer1::ICudaEngine& engine, float const gpuWeightsPercent)
{
if (gpuWeightsPercent < 1)
@ -211,22 +182,34 @@ void assessLikelihoodOfRuntimeAllocation(
numWarnings);
}
}
} // namespace
TllmRuntime::TllmRuntime(
RawEngine const& rawEngine, nvinfer1::ILogger* logger, float gpuWeightsPercent, bool useShapeInference)
TllmRuntime::TllmRuntime(RawEngine const& rawEngine, nvinfer1::ILogger* logger, bool useGpuDirectStorage,
float gpuWeightsPercent, bool useShapeInference)
: mStream(std::make_shared<CudaStream>())
, mBufferManager{mStream, true} // Ensure to trim the memory pool on destruction.
, mRuntime{nvinfer1::createInferRuntime(static_cast<bool>(logger) ? *logger : defaultLogger)}
, mUseShapeInference{useShapeInference}
, mUserBufferEnabled{false}
{
auto const startTime = std::chrono::high_resolution_clock::now();
switch (rawEngine.getType())
{
case RawEngine::Type::FilePath:
{
auto reader = StreamReader(rawEngine.getPath());
mEngine.reset(mRuntime->deserializeCudaEngine(reader));
if (useGpuDirectStorage)
{
TLLM_LOG_INFO("GDS is used to load the engine!");
auto reader = GDSStreamReader(rawEngine.getPath());
mEngine.reset(mRuntime->deserializeCudaEngine(reader));
}
else
{
auto reader = StreamReader(rawEngine.getPath());
mEngine.reset(mRuntime->deserializeCudaEngine(reader));
}
break;
}
case RawEngine::Type::AddressWithSize:
@ -239,6 +222,11 @@ TllmRuntime::TllmRuntime(
default: TLLM_THROW("Unsupported raw engine type.");
}
auto const elapsedMs
= std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - startTime);
TLLM_LOG_INFO("Engine load time %lld ms", elapsedMs);
TLLM_CHECK_WITH_INFO(mEngine != nullptr, "Failed to deserialize cuda engine.");
mEngineInspector.reset(mEngine->createEngineInspector());
assessLikelihoodOfRuntimeAllocation(*mEngine, *mEngineInspector);
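
Engine loading now branches on the flag: the default path keeps the original file-based reader (an nvinfer1::IStreamReader doing host-side ifstream reads), while the GDS path constructs the new GDSStreamReader, an nvinfer1::IStreamReaderV2 whose read() receives the destination pointer and a CUDA stream so cuFile can move engine bytes from storage directly into device memory. Both readers now live in the new tllmStreamReaders.cpp/h added below, and the constructor additionally logs the total engine load time.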

View File

@ -36,8 +36,8 @@ class TllmRuntime
public:
using TensorMap = StringPtrMap<ITensor>;
explicit TllmRuntime(RawEngine const& rawEngine, nvinfer1::ILogger* logger, float gpuWeightsPercent = 1.0f,
bool useShapeInference = true);
explicit TllmRuntime(RawEngine const& rawEngine, nvinfer1::ILogger* logger, bool useGpuDirectStorage = false,
float gpuWeightsPercent = 1.0f, bool useShapeInference = true);
SizeType32 getNbContexts() const
{

View File

@ -0,0 +1,217 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tllmStreamReaders.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/logger.h"
#include <cufile.h>
#include <dlfcn.h>
#include <fcntl.h>
#include <filesystem>
#include <fstream>
#include <string>
#include <unistd.h>
// Non-GDS StreamReader
StreamReader::StreamReader(std::filesystem::path fp)
{
mFile.open(fp.string(), std::ios::binary | std::ios::in);
TLLM_CHECK_WITH_INFO(mFile.good(), std::string("Error opening engine file: " + fp.string()));
}
StreamReader::~StreamReader()
{
if (mFile.is_open())
{
mFile.close();
}
}
int64_t StreamReader::read(void* destination, int64_t nbBytes)
{
if (!mFile.good())
{
return -1;
}
mFile.read(static_cast<char*>(destination), nbBytes);
return mFile.gcount();
}
// StreamReader using GDS
GDSStreamReader::GDSStreamReader(std::filesystem::path const& filePath)
{
auto const start_time = std::chrono::high_resolution_clock::now();
initializeDriver();
auto const elapsed_ms
= std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - start_time);
TLLM_LOG_INFO("GDS driver initialization time %lld ms", elapsed_ms);
open(filePath);
}
bool GDSStreamReader::open(std::string const& filepath)
{
if (!initializeDriver())
{
TLLM_LOG_INFO("Failed to initialize cuFile driver");
return false;
}
int32_t const ret = ::open(filepath.c_str(), O_CREAT | O_RDWR | O_DIRECT, 0664);
if (ret < 0)
{
TLLM_LOG_INFO("Failed to open engine file");
return false;
}
mFd = ret;
mFileSize = lseek(mFd, 0, SEEK_END);
lseek(mFd, 0, SEEK_SET);
CUfileDescr_t fileDescr;
memset((void*) &fileDescr, 0, sizeof(fileDescr));
fileDescr.handle.fd = mFd;
fileDescr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
CUfileError_t gdsStatus = cuFileHandleRegister(&mFileHandle, &fileDescr);
if (gdsStatus.err != CU_FILE_SUCCESS)
{
TLLM_LOG_INFO("Failed to cuFileHandleRegister");
::close(mFd);
return false;
}
return true;
}
void GDSStreamReader::close()
{
if (mFd >= 0)
{
::close(mFd);
mFd = -1;
}
}
GDSStreamReader::~GDSStreamReader()
{
if (mFileHandle)
{
cuFileHandleDeregister(mFileHandle);
mFileHandle = nullptr;
}
if (mDriverInitialized)
{
cuFileDriverClose();
}
}
bool GDSStreamReader::seek(int64_t offset, nvinfer1::SeekPosition where) noexcept
{
switch (where)
{
case nvinfer1::SeekPosition::kSET: mCursor = offset; return true;
case nvinfer1::SeekPosition::kCUR: mCursor += offset; return true;
case nvinfer1::SeekPosition::kEND: mCursor = mFileSize + offset; return true;
default: return false;
}
return true;
}
int64_t GDSStreamReader::read(void* dest, int64_t bytes, cudaStream_t stream) noexcept
{
cudaPointerAttributes attributes{};
if (cudaPointerGetAttributes(&attributes, dest) != cudaSuccess)
{
TLLM_LOG_INFO("cudaPointerGetAttributes failed");
}
off_t destOffset = 0;
void* destBase = dest;
if (attributes.type == cudaMemoryTypeDevice)
{
CUdeviceptr cuDest = reinterpret_cast<CUdeviceptr>(dest);
CUdeviceptr cuBufBase = 0;
size_t cuBufSize = 0;
cuMemGetAddressRange(&cuBufBase, &cuBufSize, cuDest);
destOffset += cuDest - cuBufBase;
destBase = reinterpret_cast<void*>(cuBufBase);
}
cuFileRead(this->mFileHandle, destBase, bytes, mCursor, destOffset);
mCursor += bytes;
return bytes;
}
void GDSStreamReader::reset()
{
lseek(mFd, 0, SEEK_SET);
mCursor = 0;
}
[[nodiscard]] bool GDSStreamReader::isOpen() const
{
bool open = mFd >= 0;
return open;
}
bool GDSStreamReader::initializeDriver()
{
if (mDriverInitialized)
{
return true;
}
mCuFileLibHandle = dlopen("libcufile.so", RTLD_LAZY | RTLD_GLOBAL);
if (!mCuFileLibHandle)
{
TLLM_LOG_INFO("Failed to dlopen libcufile.so");
return false;
}
// Load the required functions
*reinterpret_cast<void**>(&cuFileDriverOpen) = dlsym(mCuFileLibHandle, "cuFileDriverOpen");
*reinterpret_cast<void**>(&cuFileHandleRegister) = dlsym(mCuFileLibHandle, "cuFileHandleRegister");
*reinterpret_cast<void**>(&cuFileHandleDeregister) = dlsym(mCuFileLibHandle, "cuFileHandleDeregister");
*reinterpret_cast<void**>(&cuFileDriverClose) = dlsym(mCuFileLibHandle, "cuFileDriverClose");
*reinterpret_cast<void**>(&cuFileRead) = dlsym(mCuFileLibHandle, "cuFileRead");
if (!cuFileDriverOpen || !cuFileHandleRegister || !cuFileHandleDeregister || !cuFileDriverClose || !cuFileRead)
{
TLLM_LOG_INFO("Failed to dlsym libcufile.so");
return false;
}
CUfileError_t gdsStatus = cuFileDriverOpen();
if (gdsStatus.err != CU_FILE_SUCCESS)
{
TLLM_LOG_INFO("cuFileDriverOpen failed");
return false;
}
mDriverInitialized = true;
return true;
}
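
Two implementation details worth noting: the cuFile symbols are resolved at runtime via dlopen, so libcufile is not a hard link-time dependency, and because cuFileRead takes the registered allocation's base device pointer plus a separate destination offset, read() resolves the base with cuMemGetAddressRange and passes the remainder as devPtr_offset. For reference, a self-contained sketch of the underlying cuFile call sequence; it is not part of this diff, links libcufile directly instead of using dlopen, and uses an illustrative path and read size.

// Hedged sketch of the cuFile flow the GDSStreamReader wraps.
#include <cufile.h>
#include <cuda_runtime.h>
#include <fcntl.h>
#include <unistd.h>
#include <cstdio>

int main()
{
    char const* path = "/path/to/engine"; // illustrative
    int fd = ::open(path, O_RDONLY | O_DIRECT); // GDS requires O_DIRECT
    if (fd < 0)
    {
        std::perror("open");
        return 1;
    }
    if (cuFileDriverOpen().err != CU_FILE_SUCCESS)
    {
        ::close(fd);
        return 1;
    }
    CUfileDescr_t descr{};
    descr.handle.fd = fd;
    descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
    CUfileHandle_t handle{};
    if (cuFileHandleRegister(&handle, &descr).err != CU_FILE_SUCCESS)
    {
        ::close(fd);
        return 1;
    }
    void* devBuf = nullptr;
    size_t const nbytes = 1 << 20; // read the first 1 MiB as an example
    cudaMalloc(&devBuf, nbytes);
    // File bytes land directly in device memory; no host staging buffer.
    ssize_t got = cuFileRead(handle, devBuf, nbytes, /*file_offset=*/0, /*devPtr_offset=*/0);
    std::printf("cuFileRead returned %zd bytes\n", got);
    cudaFree(devBuf);
    cuFileHandleDeregister(handle);
    cuFileDriverClose();
    ::close(fd);
    return 0;
}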

View File

@ -0,0 +1,71 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <NvInferRuntime.h>
#include <cufile.h>
#include <filesystem>
#include <fstream>
class StreamReader final : public nvinfer1::IStreamReader
{
public:
StreamReader(std::filesystem::path fp);
virtual ~StreamReader();
int64_t read(void* destination, int64_t nbBytes) final;
private:
std::ifstream mFile;
};
class GDSStreamReader final : public nvinfer1::IStreamReaderV2
{
public:
explicit GDSStreamReader(std::filesystem::path const& filePath);
virtual ~GDSStreamReader();
void close();
[[nodiscard]] bool isOpen() const;
bool open(std::string const& filepath);
int64_t read(void* dest, int64_t bytes, cudaStream_t stream) noexcept final;
void reset();
bool seek(int64_t offset, nvinfer1::SeekPosition where) noexcept final;
private:
bool initializeDriver();
void* mCuFileLibHandle{};
CUfileHandle_t mFileHandle{nullptr};
bool mDriverInitialized{false};
int32_t mFd{-1};
int64_t mCursor{0};
int64_t mFileSize{0};
CUfileError_t (*cuFileDriverOpen)(){};
CUfileError_t (*cuFileHandleRegister)(CUfileHandle_t*, CUfileDescr_t*){};
CUfileError_t (*cuFileHandleDeregister)(CUfileHandle_t){};
CUfileError_t (*cuFileDriverClose)(){};
ssize_t (*cuFileRead)(CUfileHandle_t, void*, size_t, int64_t, int64_t){};
};

View File

@ -101,7 +101,7 @@ std::unique_ptr<DecoderTestShared<TLogits>> SetupDecoderTest(TrivialConstantDeco
logger, modelConfig, worldConfig, engine, false, optionalParams);
auto const executorConfig = tensorrt_llm::executor::ExecutorConfig(params.maxBeamWidth, executor::SchedulerConfig(),
executor::KvCacheConfig{}, true, true, 1, 1, executor::BatchingType::kINFLIGHT, params.maxBatchSize,
params.maxNumTokens, std::nullopt, std::nullopt, std::nullopt, std::nullopt, 1, std::nullopt,
params.maxNumTokens, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, 1, std::nullopt,
executor::ExtendedRuntimePerfKnobConfig(), std::nullopt, 0,
executor::ExecutorConfig::kDefaultMaxSeqIdleMicroseconds, std::nullopt, std::nullopt);
return std::make_unique<DecoderTestShared<TLogits>>(

View File

@ -123,8 +123,8 @@ std::unique_ptr<DecoderTestShared<TLogits>> SetupDecoderTest(
auto const executorConfig
= executor::ExecutorConfig(params.maxBeamWidth, executor::SchedulerConfig(), kvCacheConfig, true, true, 1, 1,
executor::BatchingType::kINFLIGHT, params.maxBatchSize, params.maxNumTokens, std::nullopt, std::nullopt,
std::nullopt, std::nullopt, 1, std::nullopt, executor::ExtendedRuntimePerfKnobConfig(), std::nullopt, 0,
executor::ExecutorConfig::kDefaultMaxSeqIdleMicroseconds, std::nullopt, std::nullopt,
std::nullopt, std::nullopt, false, 1, std::nullopt, executor::ExtendedRuntimePerfKnobConfig(), std::nullopt,
0, executor::ExecutorConfig::kDefaultMaxSeqIdleMicroseconds, std::nullopt, std::nullopt,
std::vector<executor::AdditionalModelOutput>{
executor::AdditionalModelOutput{DecoderTestShared<TLogits>::kTopKTensorName, params.gatherContext}});

View File

@ -766,8 +766,8 @@ TEST(SerializeUtilsTest, ExecutorConfig)
texec::KvCacheConfig(true), true, false, 500, 200, texec::BatchingType::kSTATIC, 128, 64,
texec::ParallelConfig(texec::CommunicationType::kMPI, texec::CommunicationMode::kORCHESTRATOR),
texec::PeftCacheConfig(10), std::nullopt,
texec::DecodingConfig(texec::DecodingMode::Lookahead(), texec::LookaheadDecodingConfig(3, 5, 7)), 0.5f, 8,
texec::ExtendedRuntimePerfKnobConfig(true), texec::DebugConfig(true), 60000000, 180000000,
texec::DecodingConfig(texec::DecodingMode::Lookahead(), texec::LookaheadDecodingConfig(3, 5, 7)), false, 0.5f,
8, texec::ExtendedRuntimePerfKnobConfig(true), texec::DebugConfig(true), 60000000, 180000000,
texec::SpeculativeDecodingConfig(true),
texec::GuidedDecodingConfig(
texec::GuidedDecodingConfig::GuidedDecodingBackend::kXGRAMMAR, std::initializer_list<std::string>{"eos"}));
@ -788,6 +788,7 @@ TEST(SerializeUtilsTest, ExecutorConfig)
executorConfig2.getParallelConfig().value().getCommunicationMode());
EXPECT_EQ(executorConfig.getPeftCacheConfig(), executorConfig2.getPeftCacheConfig());
EXPECT_EQ(executorConfig.getDecodingConfig(), executorConfig2.getDecodingConfig());
EXPECT_EQ(executorConfig.getUseGpuDirectStorage(), executorConfig2.getUseGpuDirectStorage());
EXPECT_EQ(executorConfig.getGpuWeightsPercent(), executorConfig2.getGpuWeightsPercent());
EXPECT_EQ(executorConfig.getMaxQueueSize(), executorConfig2.getMaxQueueSize());
EXPECT_EQ(executorConfig.getExtendedRuntimePerfKnobConfig(), executorConfig2.getExtendedRuntimePerfKnobConfig());

View File

@ -91,7 +91,7 @@ protected:
TEST_F(TllmRuntimeTest, SinglePass)
{
EXPECT_TRUE(mSerializedEngine);
TllmRuntime rt{RawEngine(mSerializedEngine.get()), &mLogger, 1.0F};
TllmRuntime rt{RawEngine(mSerializedEngine.get()), &mLogger, false, 1.0F};
auto& engine = rt.getEngine();
EXPECT_FALSE(engine.hasImplicitBatchDimension());
EXPECT_EQ(rt.getNbProfiles(), engine.getNbOptimizationProfiles());

View File

@ -536,7 +536,8 @@ def main(args):
enable_chunked_context=args.enable_chunked_context,
multi_block_mode=args.multi_block_mode,
cuda_graph_mode=args.cuda_graph_mode,
gather_generation_logits=args.eval_ppl)
gather_generation_logits=args.eval_ppl,
use_gpu_direct_storage=args.use_gpu_direct_storage)
runner_kwargs.update(
enable_context_fmha_fp32_acc=args.enable_context_fmha_fp32_acc)
if args.prompt_lookup_config is not None:
@ -867,6 +868,10 @@ if __name__ == '__main__':
help=
"evaluate.load('rouge') will attempt to pull rouge package from HF. Use cached rouge can avoid network outage of host or HF."
)
parser.add_argument("--use_gpu_direct_storage",
default=False,
action="store_true",
help="Use GPUDirect Storage (GDS) to load the engine")
parser = add_common_args(parser)
args = parser.parse_args()

View File

@ -108,6 +108,7 @@ class ModelRunnerCpp(ModelRunnerMixin):
lookahead_config: list[int] | None = None,
debug_mode: bool = False,
lora_ckpt_source: str = "hf",
use_gpu_direct_storage: bool = False,
gpu_weights_percent: float = 1,
max_tokens_in_paged_kv_cache: int | None = None,
kv_cache_enable_block_reuse: bool = False,
@ -385,6 +386,7 @@ class ModelRunnerCpp(ModelRunnerMixin):
decoding_config=decoding_config,
peft_cache_config=peft_cache_config,
debug_config=debug_config,
use_gpu_direct_storage=use_gpu_direct_storage,
gpu_weights_percent=gpu_weights_percent,
gather_generation_logits=gather_generation_logits,
use_variable_beam_width_search=use_variable_beam_width_search,

View File

@ -1526,6 +1526,7 @@ def test_executor_config():
assert config.additional_model_outputs is None
assert config.gather_generation_logits is False
assert config.use_variable_beam_width_search is False
assert config.use_gpu_direct_storage is False
kwargs = {
"max_beam_width":
@ -1575,6 +1576,8 @@ def test_executor_config():
"gather_generation_logits":
True,
"use_variable_beam_width_search":
True,
"use_gpu_direct_storage":
True
}
config = trtllm.ExecutorConfig(**kwargs)
@ -1599,6 +1602,7 @@ def test_executor_config():
assert config.additional_model_outputs[0].gather_context is False
assert config.gather_generation_logits is True
assert config.use_variable_beam_width_search is True
assert config.use_gpu_direct_storage is True
def test_parallel_config():
@ -2354,6 +2358,7 @@ def test_executor_config_pickle():
assert config.max_seq_idle_microseconds == config_copy.max_seq_idle_microseconds
assert config.backend == config_copy.backend
assert config.spec_dec_config.fast_logits == config_copy.spec_dec_config.fast_logits
assert config.use_gpu_direct_storage == config_copy.use_gpu_direct_storage
def test_return_full_tokens():