TensorRT-LLMs/cpp/include/tensorrt_llm/batch_manager/peftCacheManagerConfig.h
Dan Blanaru 16d2467ea8 Update TensorRT-LLM (#2755)
* Update TensorRT-LLM

---------

Co-authored-by: Denis Kayshev <topenkoff@gmail.com>
Co-authored-by: akhoroshev <arthoroshev@gmail.com>
Co-authored-by: Patrick Reiter Horn <patrick.horn@gmail.com>

Update
2025-02-11 03:01:00 +00:00

97 lines
4.1 KiB
C++

/*
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/runtime/common.h"

#include <cstddef>
#include <optional>
#include <string>
#include <utility>
namespace tensorrt_llm::batch_manager
{
// Make runtime::SizeType32 usable unqualified inside tensorrt_llm::batch_manager.
using runtime::SizeType32;
// Configuration values for the PEFT cache manager.
//
// This is a passive bundle of settings: it can be built either from individual values (every argument has a
// default) or by copying each field out of an executor::PeftCacheConfig through its getters. Neither
// constructor validates its inputs.
struct PeftCacheManagerConfig
{
    // Default value associated with deviceCachePercent. It is not applied anywhere in this struct; an unset
    // deviceCachePercent stays std::nullopt. NOTE(review): presumably consumers fall back to this — confirm.
    static float constexpr kDefaultDeviceCachePercent = 0.02F;
    // Default value associated with hostCacheSize (1024^3 bytes). Likewise not applied in this struct.
    static size_t constexpr kDefaultHostCacheSize = 1024 * 1024 * 1024;

    // Builds a config from individual values; each argument initializes the member of the same name.
    explicit PeftCacheManagerConfig(SizeType32 numHostModuleLayer = 0, SizeType32 numDeviceModuleLayer = 0,
        SizeType32 optimalAdapterSize = 8, SizeType32 maxAdapterSize = 64, SizeType32 numPutWorkers = 1,
        SizeType32 numEnsureWorkers = 1, SizeType32 numCopyStreams = 1, SizeType32 maxPagesPerBlockHost = 24,
        SizeType32 maxPagesPerBlockDevice = 8, std::optional<float> deviceCachePercent = std::nullopt,
        std::optional<size_t> hostCacheSize = std::nullopt,
        std::optional<std::string> loraPrefetchDir = std::nullopt)
        : numHostModuleLayer(numHostModuleLayer)
        , numDeviceModuleLayer(numDeviceModuleLayer)
        , optimalAdapterSize(optimalAdapterSize)
        , maxAdapterSize(maxAdapterSize)
        , numPutWorkers(numPutWorkers)
        , numEnsureWorkers(numEnsureWorkers)
        , numCopyStreams(numCopyStreams)
        , maxPagesPerBlockHost(maxPagesPerBlockHost)
        , maxPagesPerBlockDevice(maxPagesPerBlockDevice)
        , deviceCachePercent(deviceCachePercent)
        , hostCacheSize(hostCacheSize)
        // The parameter is a by-value sink, so move it into the member instead of copying the string again.
        , loraPrefetchDir(std::move(loraPrefetchDir))
    {
    }

    // Builds a config by reading every field from the executor-level PeftCacheConfig via its getters.
    explicit PeftCacheManagerConfig(executor::PeftCacheConfig cfg)
        : numHostModuleLayer(cfg.getNumHostModuleLayer())
        , numDeviceModuleLayer(cfg.getNumDeviceModuleLayer())
        , optimalAdapterSize(cfg.getOptimalAdapterSize())
        , maxAdapterSize(cfg.getMaxAdapterSize())
        , numPutWorkers(cfg.getNumPutWorkers())
        , numEnsureWorkers(cfg.getNumEnsureWorkers())
        , numCopyStreams(cfg.getNumCopyStreams())
        , maxPagesPerBlockHost(cfg.getMaxPagesPerBlockHost())
        , maxPagesPerBlockDevice(cfg.getMaxPagesPerBlockDevice())
        , deviceCachePercent(cfg.getDeviceCachePercent())
        , hostCacheSize(cfg.getHostCacheSize())
        , loraPrefetchDir(cfg.getLoraPrefetchDir())
    {
    }

    // number of max sized 1-layer 1-module adapterSize=1 sets of weights that can be stored in host cache
    SizeType32 numHostModuleLayer;
    // number of max sized 1-layer 1-module sets of weights that can be stored in device cache
    SizeType32 numDeviceModuleLayer;
    // optimal adapter size used to set page width
    SizeType32 optimalAdapterSize;
    // max supported adapter size. Used to compute a minimum
    // NOTE(review): the original comment was cut off here; which minimum is meant needs confirming.
    SizeType32 maxAdapterSize;
    // number of worker threads used to put weights into host cache
    SizeType32 numPutWorkers;
    // number of worker threads used to copy weights from host to device
    SizeType32 numEnsureWorkers;
    // number of streams used to copy weights from host to device
    SizeType32 numCopyStreams;
    // number of cache pages per allocation block (host)
    SizeType32 maxPagesPerBlockHost;
    // number of cache pages per allocation block (device)
    SizeType32 maxPagesPerBlockDevice;
    // percent of memory after engine load to use for cache; std::nullopt when not specified
    std::optional<float> deviceCachePercent;
    // size in bytes to use for host cache; std::nullopt when not specified
    std::optional<size_t> hostCacheSize;
    // folder to store the LoRA weights we hope to load during engine initialization; std::nullopt when not specified
    std::optional<std::string> loraPrefetchDir;
};
} // namespace tensorrt_llm::batch_manager