feat: Integrate GPUDirect Storage (GDS) into Executor API (#3582)

Squash of several dev commits

Signed-off-by: Dom Brown <3886319+DomBrown@users.noreply.github.com>

parent 90a28b917f
commit dbd9a83b0d
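
The change threads one flag, useGpuDirectStorage, from ExecutorConfig through TrtGptModelOptionalParams down to TllmRuntime, which then deserializes the engine through a cuFile-backed reader instead of an std::ifstream. A minimal usage sketch, not part of this commit (the engine directory is a placeholder, and it assumes the usual Executor constructor taking a model path, ModelType, and ExecutorConfig):

    #include "tensorrt_llm/executor/executor.h"

    namespace tle = tensorrt_llm::executor;

    int main()
    {
        tle::ExecutorConfig config;
        config.setUseGpuDirectStorage(true); // new setter; the matching getter defaults to false

        // Placeholder engine dir; GDS also needs libcufile.so and a GDS-capable filesystem.
        tle::Executor executor{"/path/to/engine_dir", tle::ModelType::kDECODER_ONLY, config};
        return 0;
    }

From Python, the same knob is exposed as use_gpu_direct_storage on trtllm.ExecutorConfig and ModelRunnerCpp, and as --use_gpu_direct_storage on the example scripts (see the bindings and argparse hunks below).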
@@ -74,15 +74,17 @@ std::string engineFilename(
 }
 
 void benchmarkBert(std::string const& modelName, std::filesystem::path const& dataPath,
-    std::vector<int> const& batchSizes, std::vector<int> const& inLens, std::vector<float> const& gpuWeightsPercents,
-    std::shared_ptr<nvinfer1::ILogger> const& logger, int warmUp, int numRuns, int duration)
+    std::vector<int> const& batchSizes, std::vector<int> const& inLens, bool useGpuDirectStorage,
+    std::vector<float> const& gpuWeightsPercents, std::shared_ptr<nvinfer1::ILogger> const& logger, int warmUp,
+    int numRuns, int duration)
 {
     auto const worldConfig = WorldConfig::mpi();
     auto const enginePath = dataPath / engineFilename(dataPath, worldConfig, modelName);
 
     for (float gpuWeightsPercent : gpuWeightsPercents)
     {
-        auto rt = std::make_shared<TllmRuntime>(RawEngine(enginePath), logger.get(), gpuWeightsPercent);
+        auto rt = std::make_shared<TllmRuntime>(
+            RawEngine(enginePath), logger.get(), useGpuDirectStorage, gpuWeightsPercent);
         rt->addContext(0);
         for (auto inLen : inLens)
         {
@@ -174,6 +176,8 @@ int main(int argc, char* argv[])
         "by \";\", "
         "example: \"0.0;0.5;1.0\".",
         cxxopts::value<std::string>()->default_value("1.0"));
+    options.add_options()("use_gpu_direct_storage", "Enable GPUDirect Storage (GDS) for loading engine.",
+        cxxopts::value<bool>()->default_value("false"));
 
     auto result = options.parse(argc, argv);
 
@@ -258,8 +262,8 @@ int main(int argc, char* argv[])
     try
     {
         benchmarkBert(result["model"].as<std::string>(), result["engine_dir"].as<std::string>(), batchSizes, inLens,
-            gpuWeightsPercents, logger, result["warm_up"].as<int>(), result["num_runs"].as<int>(),
-            result["duration"].as<int>());
+            result["use_gpu_direct_storage"].as<bool>(), gpuWeightsPercents, logger, result["warm_up"].as<int>(),
+            result["num_runs"].as<int>(), result["duration"].as<int>());
     }
     catch (std::exception const& e)
     {
@@ -41,9 +41,9 @@ public:
        std::optional<std::vector<SizeType32>> deviceIds = std::nullopt, bool normalizeLogProbs = true,
        bool enableChunkedContext = true,
        PeftCacheManagerConfig const& peftCacheManagerConfig = PeftCacheManagerConfig{},
-       executor::DecodingConfig decodingConfig = executor::DecodingConfig{}, float gpuWeightsPercent = 1,
-       std::optional<SizeType32> maxBeamWidth = std::nullopt, std::optional<SizeType32> maxBatchSize = std::nullopt,
-       std::optional<SizeType32> maxNumTokens = std::nullopt,
+       executor::DecodingConfig decodingConfig = executor::DecodingConfig{}, bool useGpuDirectStorage = false,
+       float gpuWeightsPercent = 1, std::optional<SizeType32> maxBeamWidth = std::nullopt,
+       std::optional<SizeType32> maxBatchSize = std::nullopt, std::optional<SizeType32> maxNumTokens = std::nullopt,
        executor::SchedulerConfig schedulerConfig = executor::SchedulerConfig{},
        executor::ExtendedRuntimePerfKnobConfig const& extendedRuntimePerfKnobConfig
            = executor::ExtendedRuntimePerfKnobConfig{},
@@ -61,6 +61,7 @@ public:
        , enableChunkedContext{enableChunkedContext}
        , peftCacheManagerConfig(peftCacheManagerConfig)
        , decodingConfig(std::move(decodingConfig))
+       , useGpuDirectStorage(useGpuDirectStorage)
        , gpuWeightsPercent(gpuWeightsPercent)
        , maxBeamWidth(maxBeamWidth)
        , maxBatchSize(maxBatchSize)
@@ -87,12 +88,12 @@ public:
            executorConfig.getNormalizeLogProbs(), executorConfig.getEnableChunkedContext(),
            PeftCacheManagerConfig(executorConfig.getPeftCacheConfig().value_or(executor::PeftCacheConfig())),
            executorConfig.getDecodingConfig().value_or(executor::DecodingConfig{}),
-           executorConfig.getGpuWeightsPercent(), executorConfig.getMaxBeamWidth(), executorConfig.getMaxBatchSize(),
-           executorConfig.getMaxNumTokens(), executorConfig.getSchedulerConfig(),
-           executorConfig.getExtendedRuntimePerfKnobConfig(), executorConfig.getDebugConfig(),
-           executorConfig.getMaxSeqIdleMicroseconds(), executorConfig.getSpecDecConfig(),
-           executorConfig.getGuidedDecodingConfig(), isLeaderInOrchMode, executorConfig.getAdditionalModelOutputs(),
-           executorConfig.getGatherGenerationLogits())
+           executorConfig.getUseGpuDirectStorage(), executorConfig.getGpuWeightsPercent(),
+           executorConfig.getMaxBeamWidth(), executorConfig.getMaxBatchSize(), executorConfig.getMaxNumTokens(),
+           executorConfig.getSchedulerConfig(), executorConfig.getExtendedRuntimePerfKnobConfig(),
+           executorConfig.getDebugConfig(), executorConfig.getMaxSeqIdleMicroseconds(),
+           executorConfig.getSpecDecConfig(), executorConfig.getGuidedDecodingConfig(), isLeaderInOrchMode,
+           executorConfig.getAdditionalModelOutputs(), executorConfig.getGatherGenerationLogits())
    {
    }
 
@@ -106,6 +107,8 @@ public:
    bool enableChunkedContext;
    PeftCacheManagerConfig peftCacheManagerConfig;
    executor::DecodingConfig decodingConfig;
+   // Use GDS to load the engines?
+   bool useGpuDirectStorage;
    // Percentage of weights on the gpu at runtime
    float gpuWeightsPercent;
    std::optional<SizeType32> maxBeamWidth;
@@ -1400,8 +1400,8 @@ public:
        std::optional<ParallelConfig> parallelConfig = std::nullopt,
        std::optional<PeftCacheConfig> const& peftCacheConfig = std::nullopt,
        std::optional<LogitsPostProcessorConfig> logitsPostProcessorConfig = std::nullopt,
-       std::optional<DecodingConfig> decodingConfig = std::nullopt, float gpuWeightsPercent = 1,
-       std::optional<SizeType32> maxQueueSize = std::nullopt,
+       std::optional<DecodingConfig> decodingConfig = std::nullopt, bool useGpuDirectStorage = false,
+       float gpuWeightsPercent = 1, std::optional<SizeType32> maxQueueSize = std::nullopt,
        ExtendedRuntimePerfKnobConfig const& extendedRuntimePerfKnobConfig = ExtendedRuntimePerfKnobConfig(),
        std::optional<DebugConfig> debugConfig = std::nullopt, SizeType32 recvPollPeriodMs = 0,
        uint64_t maxSeqIdleMicroseconds = kDefaultMaxSeqIdleMicroseconds,
@@ -1429,6 +1429,7 @@ public:
    [[nodiscard]] std::optional<PeftCacheConfig> getPeftCacheConfig() const;
    [[nodiscard]] std::optional<LogitsPostProcessorConfig> getLogitsPostProcessorConfig() const;
    [[nodiscard]] std::optional<DecodingConfig> getDecodingConfig() const;
+   [[nodiscard]] bool getUseGpuDirectStorage() const;
    [[nodiscard]] float getGpuWeightsPercent() const;
    [[nodiscard]] std::optional<SizeType32> getMaxQueueSize() const;
    [[nodiscard]] ExtendedRuntimePerfKnobConfig getExtendedRuntimePerfKnobConfig() const;
@@ -1455,6 +1456,7 @@ public:
    void setPeftCacheConfig(PeftCacheConfig const& peftCacheConfig);
    void setLogitsPostProcessorConfig(LogitsPostProcessorConfig const& logitsPostProcessorConfig);
    void setDecodingConfig(DecodingConfig const& decodingConfig);
+   void setUseGpuDirectStorage(bool const& useGpuDirectStorage);
    void setGpuWeightsPercent(float const& gpuWeightsPercent);
    void setMaxQueueSize(std::optional<SizeType32> const& maxQueueSize);
    void setExtendedRuntimePerfKnobConfig(ExtendedRuntimePerfKnobConfig const& extendedRuntimePerfKnobConfig);
@@ -1510,6 +1512,9 @@ private:
    /// @brief Decoding configuration.
    std::optional<DecodingConfig> mDecodingConfig;
 
+   /// @brief Enable/disable use of GPU Direct Storage (GDS) to load engines.
+   bool mUseGpuDirectStorage;
+
    /// @brief GPU weights percent for weight streaming.
    float mGpuWeightsPercent;
 
@@ -99,6 +99,9 @@ public:
    SizeType32 maxBeamWidth;
    // The length of the longest input sequence
    SizeType32 maxSequenceLength;
+   // Enable/disable GPUDirectStorage
+   // Not supported by GptSession so hard-coded as false
+   bool useGpuDirectStorage{false};
    // Percentage of weights on the gpu at runtime
    float gpuWeightsPercent;
    // Whether the session will use a different decoder per request.
@@ -45,7 +45,8 @@ TrtEncoderModel::TrtEncoderModel(runtime::ModelConfig const& modelConfig, WorldC
    , mWorldConfig{worldConfig}
    , mDevice{runtime::utils::initDevice(worldConfig)}
    , mLogger{logger ? std::move(logger) : std::make_shared<TllmLogger>()}
-   , mRuntime{std::make_shared<TllmRuntime>(rawEngine, mLogger.get(), optionalParams.gpuWeightsPercent)}
+   , mRuntime{std::make_shared<TllmRuntime>(
+         rawEngine, mLogger.get(), optionalParams.useGpuDirectStorage, optionalParams.gpuWeightsPercent)}
    , mMicroBatchId(0)
    , mCopyBufferManager{std::make_shared<CudaStream>()}
 {
@@ -138,8 +138,8 @@ TrtGptModelInflightBatching::TrtGptModelInflightBatching(std::shared_ptr<nvinfer
    , mDebugConfig{optionalParams.debugConfig}
    , mAdditionalModelOutputs{optionalParams.additionalModelOutputs}
    , mLogger{logger ? std::move(logger) : std::make_shared<TllmLogger>()}
-   , mRuntime{std::make_shared<TllmRuntime>(
-         rawEngine, mLogger.get(), optionalParams.gpuWeightsPercent, modelConfig.useShapeInference())}
+   , mRuntime{std::make_shared<TllmRuntime>(rawEngine, mLogger.get(), optionalParams.useGpuDirectStorage,
+         optionalParams.gpuWeightsPercent, modelConfig.useShapeInference())}
    , mCopyBufferManager{std::make_shared<CudaStream>()}
    , mCtxGenFusion(ctxGenFusion)
    , mOperatingBeamWidth{getMaxBeamWidth()}
@@ -28,7 +28,7 @@ ExecutorConfig::ExecutorConfig(SizeType32 maxBeamWidth, SchedulerConfig schedule
    std::optional<SizeType32> maxNumTokens, std::optional<ParallelConfig> parallelConfig,
    std::optional<PeftCacheConfig> const& peftCacheConfig,
    std::optional<LogitsPostProcessorConfig> logitsPostProcessorConfig, std::optional<DecodingConfig> decodingConfig,
-   float gpuWeightPercent, std::optional<SizeType32> maxQueueSize,
+   bool useGpuDirectStorage, float gpuWeightPercent, std::optional<SizeType32> maxQueueSize,
    ExtendedRuntimePerfKnobConfig const& extendedRuntimePerfKnobConfig, std::optional<DebugConfig> debugConfig,
    SizeType32 recvPollPeriodMs, uint64_t maxSeqIdleMicroseconds,
    std::optional<SpeculativeDecodingConfig> specDecConfig, std::optional<GuidedDecodingConfig> guidedDecodingConfig,
@@ -48,6 +48,7 @@ ExecutorConfig::ExecutorConfig(SizeType32 maxBeamWidth, SchedulerConfig schedule
    , mPeftCacheConfig(peftCacheConfig)
    , mLogitsPostProcessorConfig(std::move(logitsPostProcessorConfig))
    , mDecodingConfig(std::move(decodingConfig))
+   , mUseGpuDirectStorage((useGpuDirectStorage))
    , mGpuWeightsPercent(gpuWeightPercent)
    , mMaxQueueSize(maxQueueSize)
    , mExtendedRuntimePerfKnobConfig(extendedRuntimePerfKnobConfig)
@@ -146,6 +147,11 @@ std::optional<DecodingConfig> ExecutorConfig::getDecodingConfig() const
    return mDecodingConfig;
 }
 
+bool ExecutorConfig::getUseGpuDirectStorage() const
+{
+    return mUseGpuDirectStorage;
+}
+
 float ExecutorConfig::getGpuWeightsPercent() const
 {
     return mGpuWeightsPercent;
@@ -276,6 +282,11 @@ void ExecutorConfig::setDecodingConfig(DecodingConfig const& decodingConfig)
    mDecodingConfig = decodingConfig;
 }
 
+void ExecutorConfig::setUseGpuDirectStorage(bool const& useGpuDirectStorage)
+{
+    mUseGpuDirectStorage = useGpuDirectStorage;
+}
+
 void ExecutorConfig::setGpuWeightsPercent(float const& gpuWeightsPercent)
 {
     mGpuWeightsPercent = gpuWeightsPercent;
|
||||
@ -978,6 +978,7 @@ ExecutorConfig Serialization::deserializeExecutorConfig(std::istream& is)
|
||||
auto parallelConfig = su::deserializeWithGetterType<decltype(&ExecutorConfig::getParallelConfig)>(is);
|
||||
auto peftCacheConfig = su::deserializeWithGetterType<decltype(&ExecutorConfig::getPeftCacheConfig)>(is);
|
||||
auto decodingConfig = su::deserializeWithGetterType<decltype(&ExecutorConfig::getDecodingConfig)>(is);
|
||||
auto useGpuDirectStorage = su::deserializeWithGetterType<decltype(&ExecutorConfig::getUseGpuDirectStorage)>(is);
|
||||
auto gpuWeightsPercent = su::deserializeWithGetterType<decltype(&ExecutorConfig::getGpuWeightsPercent)>(is);
|
||||
auto maxQueueSize = su::deserializeWithGetterType<decltype(&ExecutorConfig::getMaxQueueSize)>(is);
|
||||
auto extendedRuntimePerfKnobConfig
|
||||
@ -995,9 +996,9 @@ ExecutorConfig Serialization::deserializeExecutorConfig(std::istream& is)
|
||||
|
||||
return ExecutorConfig{maxBeamWidth, schedulerConfig, kvCacheConfig, enableChunkedContext, normalizeLogProbs,
|
||||
iterStatsMaxIterations, requestStatsMaxIterations, batchingType, maxBatchSize, maxNumTokens, parallelConfig,
|
||||
peftCacheConfig, std::nullopt, decodingConfig, gpuWeightsPercent, maxQueueSize, extendedRuntimePerfKnobConfig,
|
||||
debugConfig, recvPollPeriodMs, maxSeqIdleMicroseconds, specDecConfig, guidedDecodingConfig,
|
||||
additionalModelOutputs, gatherGenerationLogits};
|
||||
peftCacheConfig, std::nullopt, decodingConfig, useGpuDirectStorage, gpuWeightsPercent, maxQueueSize,
|
||||
extendedRuntimePerfKnobConfig, debugConfig, recvPollPeriodMs, maxSeqIdleMicroseconds, specDecConfig,
|
||||
guidedDecodingConfig, additionalModelOutputs, gatherGenerationLogits};
|
||||
}
|
||||
|
||||
size_t Serialization::serializedSize(ExecutorConfig const& executorConfig)
|
||||
@ -1020,6 +1021,7 @@ size_t Serialization::serializedSize(ExecutorConfig const& executorConfig)
|
||||
totalSize += su::serializedSize(executorConfig.getParallelConfig());
|
||||
totalSize += su::serializedSize(executorConfig.getPeftCacheConfig());
|
||||
totalSize += su::serializedSize(executorConfig.getDecodingConfig());
|
||||
totalSize += su::serializedSize(executorConfig.getUseGpuDirectStorage());
|
||||
totalSize += su::serializedSize(executorConfig.getGpuWeightsPercent());
|
||||
totalSize += su::serializedSize(executorConfig.getMaxQueueSize());
|
||||
totalSize += su::serializedSize(executorConfig.getExtendedRuntimePerfKnobConfig());
|
||||
@ -1052,6 +1054,7 @@ void Serialization::serialize(ExecutorConfig const& executorConfig, std::ostream
|
||||
su::serialize(executorConfig.getParallelConfig(), os);
|
||||
su::serialize(executorConfig.getPeftCacheConfig(), os);
|
||||
su::serialize(executorConfig.getDecodingConfig(), os);
|
||||
su::serialize(executorConfig.getUseGpuDirectStorage(), os);
|
||||
su::serialize(executorConfig.getGpuWeightsPercent(), os);
|
||||
su::serialize(executorConfig.getMaxQueueSize(), os);
|
||||
su::serialize(executorConfig.getExtendedRuntimePerfKnobConfig(), os);
|
||||
|
||||
@@ -527,6 +527,7 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m)
        .def_readwrite("enable_chunked_context", &tb::TrtGptModelOptionalParams::enableChunkedContext)
        .def_readwrite("normalize_log_probs", &tb::TrtGptModelOptionalParams::normalizeLogProbs)
        .def_readwrite("decoding_config", &tb::TrtGptModelOptionalParams::decodingConfig)
+       .def_readwrite("use_gpu_direct_storage", &tb::TrtGptModelOptionalParams::useGpuDirectStorage)
        .def_readwrite("gpu_weights_percent", &tb::TrtGptModelOptionalParams::gpuWeightsPercent)
        .def_readwrite("max_beam_width", &tb::TrtGptModelOptionalParams::maxBeamWidth)
        .def_readwrite("scheduler_config", &tb::TrtGptModelOptionalParams::schedulerConfig)
@@ -414,8 +414,9 @@ void initConfigBindings(pybind11::module_& m)
                c.getEnableChunkedContext(), c.getNormalizeLogProbs(), c.getIterStatsMaxIterations(),
                c.getRequestStatsMaxIterations(), c.getBatchingType(), c.getMaxBatchSize(), c.getMaxNumTokens(),
                c.getParallelConfig(), c.getPeftCacheConfig(), c.getLogitsPostProcessorConfig(), c.getDecodingConfig(),
-               c.getGpuWeightsPercent(), c.getMaxQueueSize(), c.getExtendedRuntimePerfKnobConfig(), c.getDebugConfig(),
-               c.getRecvPollPeriodMs(), c.getMaxSeqIdleMicroseconds(), c.getSpecDecConfig(), c.getGuidedDecodingConfig(),
+               c.getUseGpuDirectStorage(), c.getGpuWeightsPercent(), c.getMaxQueueSize(),
+               c.getExtendedRuntimePerfKnobConfig(), c.getDebugConfig(), c.getRecvPollPeriodMs(),
+               c.getMaxSeqIdleMicroseconds(), c.getSpecDecConfig(), c.getGuidedDecodingConfig(),
                c.getAdditionalModelOutputs(), c.getGatherGenerationLogits(), c.getUseVariableBeamWidthSearch());
            auto pickle_tuple = py::make_tuple(cpp_states, py::getattr(self, "__dict__"));
            return pickle_tuple;
@@ -429,7 +430,7 @@ void initConfigBindings(pybind11::module_& m)
 
            // Restore C++ data
            auto cpp_states = state[0].cast<py::tuple>();
-           if (cpp_states.size() != 25)
+           if (cpp_states.size() != 26)
            {
                throw std::runtime_error("Invalid cpp_states!");
            }
@@ -449,17 +450,18 @@ void initConfigBindings(pybind11::module_& m)
                cpp_states[11].cast<std::optional<tle::PeftCacheConfig>>(), // PeftCacheConfig
                cpp_states[12].cast<std::optional<tle::LogitsPostProcessorConfig>>(), // LogitsPostProcessorConfig
                cpp_states[13].cast<std::optional<tle::DecodingConfig>>(), // DecodingConfig
-               cpp_states[14].cast<float>(), // GpuWeightsPercent
-               cpp_states[15].cast<std::optional<SizeType32>>(), // MaxQueueSize
-               cpp_states[16].cast<tle::ExtendedRuntimePerfKnobConfig>(), // ExtendedRuntimePerfKnobConfig
-               cpp_states[17].cast<std::optional<tle::DebugConfig>>(), // DebugConfig
-               cpp_states[18].cast<SizeType32>(), // RecvPollPeriodMs
-               cpp_states[19].cast<uint64_t>(), // MaxSeqIdleMicroseconds
-               cpp_states[20].cast<std::optional<tle::SpeculativeDecodingConfig>>(), // SpecDecConfig
-               cpp_states[21].cast<std::optional<tle::GuidedDecodingConfig>>(), // GuidedDecodingConfig
-               cpp_states[22].cast<std::optional<std::vector<tle::AdditionalModelOutput>>>(), // AdditionalModelOutputs
-               cpp_states[23].cast<bool>(), // GatherGenerationLogits
-               cpp_states[24].cast<bool>() // UseVariableBeamWidthSearch
+               cpp_states[14].cast<bool>(), // UseGpuDirectStorage
+               cpp_states[15].cast<float>(), // GpuWeightsPercent
+               cpp_states[16].cast<std::optional<SizeType32>>(), // MaxQueueSize
+               cpp_states[17].cast<tle::ExtendedRuntimePerfKnobConfig>(), // ExtendedRuntimePerfKnobConfig
+               cpp_states[18].cast<std::optional<tle::DebugConfig>>(), // DebugConfig
+               cpp_states[19].cast<SizeType32>(), // RecvPollPeriodMs
+               cpp_states[20].cast<uint64_t>(), // MaxSeqIdleMicroseconds
+               cpp_states[21].cast<std::optional<tle::SpeculativeDecodingConfig>>(), // SpecDecConfig
+               cpp_states[22].cast<std::optional<tle::GuidedDecodingConfig>>(), // GuidedDecodingConfig
+               cpp_states[23].cast<std::optional<std::vector<tle::AdditionalModelOutput>>>(), // AdditionalModelOutputs
+               cpp_states[24].cast<bool>(), // GatherGenerationLogits
+               cpp_states[25].cast<bool>() // UseVariableBeamWidthSearch
            );
 
            auto py_state = state[1].cast<py::dict>();
@@ -483,6 +485,7 @@ void initConfigBindings(pybind11::module_& m)
                tle::PeftCacheConfig const&, // PeftCacheConfig
                std::optional<tle::LogitsPostProcessorConfig>, // LogitsPostProcessorConfig
                std::optional<tle::DecodingConfig>, // DecodingConfig
+               bool, // UseGpuDirectStorage
                float, // GpuWeightsPercent
                std::optional<SizeType32>, // MaxQueueSize
                tle::ExtendedRuntimePerfKnobConfig const&, // ExtendedRuntimePerfKnobConfig
@@ -505,7 +508,8 @@ void initConfigBindings(pybind11::module_& m)
            py::arg("parallel_config") = py::none(),
            py::arg_v("peft_cache_config", tle::PeftCacheConfig(), "PeftCacheConfig()"),
            py::arg("logits_post_processor_config") = py::none(), py::arg("decoding_config") = py::none(),
-           py::arg("gpu_weights_percent") = 1.0, py::arg("max_queue_size") = py::none(),
+           py::arg("use_gpu_direct_storage") = false, py::arg("gpu_weights_percent") = 1.0,
+           py::arg("max_queue_size") = py::none(),
            py::arg_v("extended_runtime_perf_knob_config", tle::ExtendedRuntimePerfKnobConfig(),
                "ExtendedRuntimePerfKnobConfig()"),
            py::arg("debug_config") = py::none(), py::arg("recv_poll_period_ms") = 0,
@@ -537,6 +541,8 @@ void initConfigBindings(pybind11::module_& m)
                &tle::ExecutorConfig::setLogitsPostProcessorConfig)
            .def_property(
                "decoding_config", &tle::ExecutorConfig::getDecodingConfig, &tle::ExecutorConfig::setDecodingConfig)
+           .def_property("use_gpu_direct_storage", &tle::ExecutorConfig::getUseGpuDirectStorage,
+               &tle::ExecutorConfig::setUseGpuDirectStorage)
            .def_property("gpu_weights_percent", &tle::ExecutorConfig::getGpuWeightsPercent,
                &tle::ExecutorConfig::setGpuWeightsPercent)
            .def_property("max_queue_size", &tle::ExecutorConfig::getMaxQueueSize, &tle::ExecutorConfig::setMaxQueueSize)
@@ -53,6 +53,7 @@ set(SRCS
    statefulGptDecoderBatched.cpp
    tllmBuffers.cpp
    tllmRuntime.cpp
+   tllmStreamReaders.cpp
    tllmLogger.cpp
    transformerBuffers.cpp
    workerPool.cpp
@@ -83,7 +83,8 @@ GptSession::GptSession(Config const& sessionConfig, ModelConfig const& modelConf
    , mWorldConfig{worldConfig}
    , mDevice{utils::initDevice(worldConfig)}
    , mLogger{logger ? std::move(logger) : std::make_shared<TllmLogger>()}
-   , mRuntime{std::make_shared<TllmRuntime>(rawEngine, mLogger.get(), sessionConfig.gpuWeightsPercent)}
+   , mRuntime{std::make_shared<TllmRuntime>(
+         rawEngine, mLogger.get(), sessionConfig.useGpuDirectStorage, sessionConfig.gpuWeightsPercent)}
    , mGatherGenerationLogits{sessionConfig.gatherGenerationLogits}
 {
     TLLM_LOG_WARNING(
@@ -23,6 +23,7 @@
 #include "tensorrt_llm/kernels/userbuffers/ub_interface.h"
 #include "tensorrt_llm/runtime/utils/mpiUtils.h"
 #include "tllmLogger.h"
+#include "tllmStreamReaders.h"
 
 #include "nlohmann/json.hpp"
 #include <NvInferRuntime.h>
@@ -73,36 +74,6 @@ std::vector<std::size_t> dimsToShape(nvinfer1::Dims const& dims)
 
 tensorrt_llm::runtime::TllmLogger defaultLogger{};
 
-class StreamReader final : public nvinfer1::IStreamReader
-{
-public:
-    StreamReader(std::filesystem::path fp)
-    {
-        mFile.open(fp.string(), std::ios::binary | std::ios::in);
-        TLLM_CHECK_WITH_INFO(mFile.good(), std::string("Error opening engine file: " + fp.string()));
-    }
-
-    virtual ~StreamReader()
-    {
-        if (mFile.is_open())
-        {
-            mFile.close();
-        }
-    }
-
-    int64_t read(void* destination, int64_t nbBytes) final
-    {
-        if (!mFile.good())
-        {
-            return -1;
-        }
-        mFile.read(static_cast<char*>(destination), nbBytes);
-        return mFile.gcount();
-    }
-
-    std::ifstream mFile;
-};
-
 void setWeightStreaming(nvinfer1::ICudaEngine& engine, float const gpuWeightsPercent)
 {
     if (gpuWeightsPercent < 1)
@@ -211,22 +182,34 @@ void assessLikelihoodOfRuntimeAllocation(
            numWarnings);
    }
 }
 
 } // namespace
 
-TllmRuntime::TllmRuntime(
-    RawEngine const& rawEngine, nvinfer1::ILogger* logger, float gpuWeightsPercent, bool useShapeInference)
+TllmRuntime::TllmRuntime(RawEngine const& rawEngine, nvinfer1::ILogger* logger, bool useGpuDirectStorage,
+    float gpuWeightsPercent, bool useShapeInference)
    : mStream(std::make_shared<CudaStream>())
    , mBufferManager{mStream, true} // Ensure to trim the memory pool on destruction.
    , mRuntime{nvinfer1::createInferRuntime(static_cast<bool>(logger) ? *logger : defaultLogger)}
    , mUseShapeInference{useShapeInference}
    , mUserBufferEnabled{false}
 {
     auto const startTime = std::chrono::high_resolution_clock::now();
 
     switch (rawEngine.getType())
     {
     case RawEngine::Type::FilePath:
     {
-        auto reader = StreamReader(rawEngine.getPath());
-        mEngine.reset(mRuntime->deserializeCudaEngine(reader));
+        if (useGpuDirectStorage)
+        {
+            TLLM_LOG_INFO("GDS is used to load the engine!");
+            auto reader = GDSStreamReader(rawEngine.getPath());
+            mEngine.reset(mRuntime->deserializeCudaEngine(reader));
+        }
+        else
+        {
+            auto reader = StreamReader(rawEngine.getPath());
+            mEngine.reset(mRuntime->deserializeCudaEngine(reader));
+        }
        break;
    }
    case RawEngine::Type::AddressWithSize:
@@ -239,6 +222,11 @@ TllmRuntime::TllmRuntime(
    default: TLLM_THROW("Unsupported raw engine type.");
    }
 
+   auto const elapsedMs
+       = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - startTime);
+
+   TLLM_LOG_INFO("Engine load time %lld ms", elapsedMs);
+
    TLLM_CHECK_WITH_INFO(mEngine != nullptr, "Failed to deserialize cuda engine.");
    mEngineInspector.reset(mEngine->createEngineInspector());
    assessLikelihoodOfRuntimeAllocation(*mEngine, *mEngineInspector);
@@ -36,8 +36,8 @@ class TllmRuntime
 public:
    using TensorMap = StringPtrMap<ITensor>;
 
-   explicit TllmRuntime(RawEngine const& rawEngine, nvinfer1::ILogger* logger, float gpuWeightsPercent = 1.0f,
-       bool useShapeInference = true);
+   explicit TllmRuntime(RawEngine const& rawEngine, nvinfer1::ILogger* logger, bool useGpuDirectStorage = false,
+       float gpuWeightsPercent = 1.0f, bool useShapeInference = true);
 
    SizeType32 getNbContexts() const
    {
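With the pieces above in place, engine loading from a file path is a two-way branch inside the TllmRuntime constructor. A condensed sketch of that selection (error handling elided; assumes TensorRT 10's deserializeCudaEngine overloads for IStreamReader and IStreamReaderV2, which the constructor above relies on):

    #include "tllmStreamReaders.h"

    #include <NvInferRuntime.h>

    #include <filesystem>
    #include <memory>

    std::unique_ptr<nvinfer1::ICudaEngine> loadEngine(
        nvinfer1::IRuntime& runtime, std::filesystem::path const& path, bool useGpuDirectStorage)
    {
        if (useGpuDirectStorage)
        {
            // IStreamReaderV2 path: cuFile can move file contents directly into device memory.
            auto reader = GDSStreamReader(path);
            return std::unique_ptr<nvinfer1::ICudaEngine>(runtime.deserializeCudaEngine(reader));
        }
        // Default path: plain ifstream-backed IStreamReader, now defined in tllmStreamReaders.
        auto reader = StreamReader(path);
        return std::unique_ptr<nvinfer1::ICudaEngine>(runtime.deserializeCudaEngine(reader));
    }

The two reader classes themselves move into the new files below.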
cpp/tensorrt_llm/runtime/tllmStreamReaders.cpp (new file, 217 lines)
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tllmStreamReaders.h"
+#include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/logger.h"
+
+#include <cufile.h>
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <filesystem>
+#include <fstream>
+#include <string>
+#include <unistd.h>
+
+// Non-GDS StreamReader
+
+StreamReader::StreamReader(std::filesystem::path fp)
+{
+    mFile.open(fp.string(), std::ios::binary | std::ios::in);
+    TLLM_CHECK_WITH_INFO(mFile.good(), std::string("Error opening engine file: " + fp.string()));
+}
+
+StreamReader::~StreamReader()
+{
+    if (mFile.is_open())
+    {
+        mFile.close();
+    }
+}
+
+int64_t StreamReader::read(void* destination, int64_t nbBytes)
+{
+    if (!mFile.good())
+    {
+        return -1;
+    }
+
+    mFile.read(static_cast<char*>(destination), nbBytes);
+
+    return mFile.gcount();
+}
+
+// StreamReader using GDS
+
+GDSStreamReader::GDSStreamReader(std::filesystem::path const& filePath)
+{
+    auto const start_time = std::chrono::high_resolution_clock::now();
+    initializeDriver();
+    auto const elapsed_ms
+        = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - start_time);
+
+    TLLM_LOG_INFO("GDS driver initialization time %lld ms", elapsed_ms);
+
+    open(filePath);
+}
+
+bool GDSStreamReader::open(std::string const& filepath)
+{
+    if (!initializeDriver())
+    {
+        TLLM_LOG_INFO("Failed to initialize cuFile driver");
+        return false;
+    }
+
+    int32_t const ret = ::open(filepath.c_str(), O_CREAT | O_RDWR | O_DIRECT, 0664);
+
+    if (ret < 0)
+    {
+        TLLM_LOG_INFO("Failed to open engine file");
+        return false;
+    }
+
+    mFd = ret;
+    mFileSize = lseek(mFd, 0, SEEK_END);
+    lseek(mFd, 0, SEEK_SET);
+
+    CUfileDescr_t fileDescr;
+    memset((void*) &fileDescr, 0, sizeof(fileDescr));
+    fileDescr.handle.fd = mFd;
+    fileDescr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
+
+    CUfileError_t gdsStatus = cuFileHandleRegister(&mFileHandle, &fileDescr);
+
+    if (gdsStatus.err != CU_FILE_SUCCESS)
+    {
+        TLLM_LOG_INFO("Failed to cuFileHandleRegister");
+        ::close(mFd);
+        return false;
+    }
+    return true;
+}
+
+void GDSStreamReader::close()
+{
+    if (mFd >= 0)
+    {
+        ::close(mFd);
+        mFd = -1;
+    }
+}
+
+GDSStreamReader::~GDSStreamReader()
+{
+    if (mFileHandle)
+    {
+        cuFileHandleDeregister(mFileHandle);
+        mFileHandle = nullptr;
+    }
+
+    if (mDriverInitialized)
+    {
+        cuFileDriverClose();
+    }
+}
+
+bool GDSStreamReader::seek(int64_t offset, nvinfer1::SeekPosition where) noexcept
+{
+    switch (where)
+    {
+    case nvinfer1::SeekPosition::kSET: mCursor = offset; return true;
+    case nvinfer1::SeekPosition::kCUR: mCursor += offset; return true;
+    case nvinfer1::SeekPosition::kEND: mCursor = -offset; return true;
+    default: return false;
+    }
+    return true;
+}
+
+int64_t GDSStreamReader::read(void* dest, int64_t bytes, cudaStream_t stream) noexcept
+{
+    cudaPointerAttributes attributes{};
+    if (cudaPointerGetAttributes(&attributes, dest) != cudaSuccess)
+    {
+        TLLM_LOG_INFO("cudaPointerGetAttributes failed");
+    }
+
+    off_t destOffset = 0;
+    void* destBase = dest;
+
+    if (attributes.type == cudaMemoryTypeDevice)
+    {
+        CUdeviceptr cuDest = reinterpret_cast<CUdeviceptr>(dest);
+        CUdeviceptr cuBufBase = 0;
+        size_t cuBufSize = 0;
+
+        cuMemGetAddressRange(&cuBufBase, &cuBufSize, cuDest);
+        destOffset += cuDest - cuBufBase;
+        destBase = reinterpret_cast<void*>(cuBufBase);
+    }
+    cuFileRead(this->mFileHandle, destBase, bytes, mCursor, destOffset);
+
+    mCursor += bytes;
+    return bytes;
+}
+
+void GDSStreamReader::reset()
+{
+    lseek(mFd, 0, SEEK_SET);
+    mCursor = 0;
+}
+
+[[nodiscard]] bool GDSStreamReader::isOpen() const
+{
+    bool open = mFd >= 0;
+    return open;
+}
+
+bool GDSStreamReader::initializeDriver()
+{
+    if (mDriverInitialized)
+    {
+        return true;
+    }
+
+    mCuFileLibHandle = dlopen("libcufile.so", RTLD_LAZY | RTLD_GLOBAL);
+    if (!mCuFileLibHandle)
+    {
+        TLLM_LOG_INFO("Failed to dlopen libcufile.so");
+        return false;
+    }
+
+    // Load the required functions
+    *reinterpret_cast<void**>(&cuFileDriverOpen) = dlsym(mCuFileLibHandle, "cuFileDriverOpen");
+    *reinterpret_cast<void**>(&cuFileHandleRegister) = dlsym(mCuFileLibHandle, "cuFileHandleRegister");
+    *reinterpret_cast<void**>(&cuFileHandleDeregister) = dlsym(mCuFileLibHandle, "cuFileHandleDeregister");
+    *reinterpret_cast<void**>(&cuFileDriverClose) = dlsym(mCuFileLibHandle, "cuFileDriverClose");
+    *reinterpret_cast<void**>(&cuFileRead) = dlsym(mCuFileLibHandle, "cuFileRead");
+
+    if (!cuFileDriverOpen || !cuFileHandleRegister || !cuFileHandleDeregister || !cuFileDriverClose || !cuFileRead)
+    {
+        TLLM_LOG_INFO("Failed to dlsym libcufile.so");
+        return false;
+    }
+
+    CUfileError_t gdsStatus = cuFileDriverOpen();
+    if (gdsStatus.err != CU_FILE_SUCCESS)
+    {
+        TLLM_LOG_INFO("cuFileDriverOpen failed");
+        return false;
+    }
+
+    mDriverInitialized = true;
+    return true;
+}
cpp/tensorrt_llm/runtime/tllmStreamReaders.h (new file, 71 lines)
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <NvInferRuntime.h>
+
+#include <cufile.h>
+#include <filesystem>
+#include <fstream>
+
+class StreamReader final : public nvinfer1::IStreamReader
+{
+public:
+    StreamReader(std::filesystem::path fp);
+
+    virtual ~StreamReader();
+
+    int64_t read(void* destination, int64_t nbBytes) final;
+
+private:
+    std::ifstream mFile;
+};
+
+class GDSStreamReader final : public nvinfer1::IStreamReaderV2
+{
+public:
+    explicit GDSStreamReader(std::filesystem::path const& filePath);
+
+    virtual ~GDSStreamReader();
+
+    void close();
+
+    [[nodiscard]] bool isOpen() const;
+
+    bool open(std::string const& filepath);
+
+    int64_t read(void* dest, int64_t bytes, cudaStream_t stream) noexcept final;
+
+    void reset();
+
+    bool seek(int64_t offset, nvinfer1::SeekPosition where) noexcept final;
+
+private:
+    bool initializeDriver();
+
+    void* mCuFileLibHandle{};
+    CUfileHandle_t mFileHandle{nullptr};
+    bool mDriverInitialized{false};
+    int32_t mFd{-1};
+    int64_t mCursor{0};
+    int64_t mFileSize{0};
+
+    CUfileError_t (*cuFileDriverOpen)(){};
+    CUfileError_t (*cuFileHandleRegister)(CUfileHandle_t*, CUfileDescr_t*){};
+    CUfileError_t (*cuFileHandleDeregister)(CUfileHandle_t){};
+    CUfileError_t (*cuFileDriverClose)(){};
+    ssize_t (*cuFileRead)(CUfileHandle_t, void*, size_t, int64_t, int64_t){};
+};
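Nothing ties GDSStreamReader to TllmRuntime, so it can be exercised on its own. A hypothetical sketch (placeholder engine path; assumes an initialized CUDA context, libcufile.so on the library path, and a GDS-capable mount):

    #include "tllmStreamReaders.h"

    #include <cuda_runtime_api.h>

    int main()
    {
        GDSStreamReader reader{"/path/to/engine.plan"}; // placeholder path
        if (!reader.isOpen())
        {
            return 1; // driver init or file open failed; callers should fall back to StreamReader
        }

        void* devBuf = nullptr;
        cudaMalloc(&devBuf, 1 << 20);

        // read() detects the device pointer via cudaPointerGetAttributes and issues a cuFileRead.
        auto const got = reader.read(devBuf, 1 << 20, /*stream=*/nullptr);

        reader.close();
        cudaFree(devBuf);
        return got == (1 << 20) ? 0 : 1;
    }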
@@ -101,7 +101,7 @@ std::unique_ptr<DecoderTestShared<TLogits>> SetupDecoderTest(TrivialConstantDeco
        logger, modelConfig, worldConfig, engine, false, optionalParams);
    auto const executorConfig = tensorrt_llm::executor::ExecutorConfig(params.maxBeamWidth, executor::SchedulerConfig(),
        executor::KvCacheConfig{}, true, true, 1, 1, executor::BatchingType::kINFLIGHT, params.maxBatchSize,
-       params.maxNumTokens, std::nullopt, std::nullopt, std::nullopt, std::nullopt, 1, std::nullopt,
+       params.maxNumTokens, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, 1, std::nullopt,
        executor::ExtendedRuntimePerfKnobConfig(), std::nullopt, 0,
        executor::ExecutorConfig::kDefaultMaxSeqIdleMicroseconds, std::nullopt, std::nullopt);
    return std::make_unique<DecoderTestShared<TLogits>>(
@@ -123,8 +123,8 @@ std::unique_ptr<DecoderTestShared<TLogits>> SetupDecoderTest(
    auto const executorConfig
        = executor::ExecutorConfig(params.maxBeamWidth, executor::SchedulerConfig(), kvCacheConfig, true, true, 1, 1,
            executor::BatchingType::kINFLIGHT, params.maxBatchSize, params.maxNumTokens, std::nullopt, std::nullopt,
-           std::nullopt, std::nullopt, 1, std::nullopt, executor::ExtendedRuntimePerfKnobConfig(), std::nullopt, 0,
-           executor::ExecutorConfig::kDefaultMaxSeqIdleMicroseconds, std::nullopt, std::nullopt,
+           std::nullopt, std::nullopt, false, 1, std::nullopt, executor::ExtendedRuntimePerfKnobConfig(), std::nullopt,
+           0, executor::ExecutorConfig::kDefaultMaxSeqIdleMicroseconds, std::nullopt, std::nullopt,
            std::vector<executor::AdditionalModelOutput>{
                executor::AdditionalModelOutput{DecoderTestShared<TLogits>::kTopKTensorName, params.gatherContext}});
 
@@ -766,8 +766,8 @@ TEST(SerializeUtilsTest, ExecutorConfig)
        texec::KvCacheConfig(true), true, false, 500, 200, texec::BatchingType::kSTATIC, 128, 64,
        texec::ParallelConfig(texec::CommunicationType::kMPI, texec::CommunicationMode::kORCHESTRATOR),
        texec::PeftCacheConfig(10), std::nullopt,
-       texec::DecodingConfig(texec::DecodingMode::Lookahead(), texec::LookaheadDecodingConfig(3, 5, 7)), 0.5f, 8,
-       texec::ExtendedRuntimePerfKnobConfig(true), texec::DebugConfig(true), 60000000, 180000000,
+       texec::DecodingConfig(texec::DecodingMode::Lookahead(), texec::LookaheadDecodingConfig(3, 5, 7)), false, 0.5f,
+       8, texec::ExtendedRuntimePerfKnobConfig(true), texec::DebugConfig(true), 60000000, 180000000,
        texec::SpeculativeDecodingConfig(true),
        texec::GuidedDecodingConfig(
            texec::GuidedDecodingConfig::GuidedDecodingBackend::kXGRAMMAR, std::initializer_list<std::string>{"eos"}));
@@ -788,6 +788,7 @@ TEST(SerializeUtilsTest, ExecutorConfig)
        executorConfig2.getParallelConfig().value().getCommunicationMode());
    EXPECT_EQ(executorConfig.getPeftCacheConfig(), executorConfig2.getPeftCacheConfig());
    EXPECT_EQ(executorConfig.getDecodingConfig(), executorConfig2.getDecodingConfig());
+   EXPECT_EQ(executorConfig.getUseGpuDirectStorage(), executorConfig2.getUseGpuDirectStorage());
    EXPECT_EQ(executorConfig.getGpuWeightsPercent(), executorConfig2.getGpuWeightsPercent());
    EXPECT_EQ(executorConfig.getMaxQueueSize(), executorConfig2.getMaxQueueSize());
    EXPECT_EQ(executorConfig.getExtendedRuntimePerfKnobConfig(), executorConfig2.getExtendedRuntimePerfKnobConfig());
@@ -91,7 +91,7 @@ protected:
 TEST_F(TllmRuntimeTest, SinglePass)
 {
     EXPECT_TRUE(mSerializedEngine);
-    TllmRuntime rt{RawEngine(mSerializedEngine.get()), &mLogger, 1.0F};
+    TllmRuntime rt{RawEngine(mSerializedEngine.get()), &mLogger, false, 1.0F};
     auto& engine = rt.getEngine();
     EXPECT_FALSE(engine.hasImplicitBatchDimension());
     EXPECT_EQ(rt.getNbProfiles(), engine.getNbOptimizationProfiles());
@@ -536,7 +536,8 @@ def main(args):
        enable_chunked_context=args.enable_chunked_context,
        multi_block_mode=args.multi_block_mode,
        cuda_graph_mode=args.cuda_graph_mode,
-       gather_generation_logits=args.eval_ppl)
+       gather_generation_logits=args.eval_ppl,
+       use_gpu_direct_storage=args.use_gpu_direct_storage)
    runner_kwargs.update(
        enable_context_fmha_fp32_acc=args.enable_context_fmha_fp32_acc)
    if args.prompt_lookup_config is not None:
@@ -867,6 +868,10 @@ if __name__ == '__main__':
        help=
        "evaluate.load('rouge') will attempt to pull rouge package from HF. Use cached rouge can avoid network outage of host or HF."
    )
+   parser.add_argument("--use_gpu_direct_storage",
+                       default=False,
+                       action="store_true",
+                       help="Use GPUDirect Storage (GDS) to load the engine")
    parser = add_common_args(parser)
    args = parser.parse_args()
 
@@ -108,6 +108,7 @@ class ModelRunnerCpp(ModelRunnerMixin):
                 lookahead_config: list[int] | None = None,
                 debug_mode: bool = False,
                 lora_ckpt_source: str = "hf",
+                use_gpu_direct_storage: bool = False,
                 gpu_weights_percent: float = 1,
                 max_tokens_in_paged_kv_cache: int | None = None,
                 kv_cache_enable_block_reuse: bool = False,
@@ -385,6 +386,7 @@ class ModelRunnerCpp(ModelRunnerMixin):
            decoding_config=decoding_config,
            peft_cache_config=peft_cache_config,
            debug_config=debug_config,
+           use_gpu_direct_storage=use_gpu_direct_storage,
            gpu_weights_percent=gpu_weights_percent,
            gather_generation_logits=gather_generation_logits,
            use_variable_beam_width_search=use_variable_beam_width_search,
@@ -1526,6 +1526,7 @@ def test_executor_config():
    assert config.additional_model_outputs is None
    assert config.gather_generation_logits is False
    assert config.use_variable_beam_width_search is False
+   assert config.use_gpu_direct_storage is False
 
    kwargs = {
        "max_beam_width":
@@ -1575,6 +1576,8 @@ def test_executor_config():
        "gather_generation_logits":
        True,
        "use_variable_beam_width_search":
        True,
+       "use_gpu_direct_storage":
+       True
    }
    config = trtllm.ExecutorConfig(**kwargs)
@@ -1599,6 +1602,7 @@ def test_executor_config():
    assert config.additional_model_outputs[0].gather_context is False
    assert config.gather_generation_logits is True
    assert config.use_variable_beam_width_search is True
+   assert config.use_gpu_direct_storage is True
 
 
 def test_parallel_config():
@@ -2354,6 +2358,7 @@ def test_executor_config_pickle():
    assert config.max_seq_idle_microseconds == config_copy.max_seq_idle_microseconds
    assert config.backend == config_copy.backend
    assert config.spec_dec_config.fast_logits == config_copy.spec_dec_config.fast_logits
+   assert config.use_gpu_direct_storage == config_copy.use_gpu_direct_storage
 
 
 def test_return_full_tokens():