TensorRT-LLMs/cpp/tensorrt_llm/batch_manager/kvCacheTransferManager.cpp
liji-nv 1d4f748773
[fix] Fix illegal mem access and possible accuracy loss. Cherry-pick … (#5017)
Signed-off-by: Jin Li <59594262+liji-nv@users.noreply.github.com>
2025-06-09 17:50:57 +08:00

317 lines
12 KiB
C++

/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cstdint>
#include "tensorrt_llm/batch_manager/kvCacheTransferManager.h"
#include "tensorrt_llm/batch_manager/kvCacheEventManager.h"
#include "tensorrt_llm/batch_manager/kvCacheManager.h"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/kernels/kvCachePartialCopy.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/cudaEvent.h"
#include "tensorrt_llm/runtime/cudaStream.h"
#ifdef ENABLE_CUFILE
#include <cufile.h>
#endif
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <cstring>
#include <string>
#include <vector>
namespace tr = tensorrt_llm::runtime;
namespace tk = tensorrt_llm::kernels;
namespace tensorrt_llm::batch_manager::kv_cache_manager
{
/// \brief Write the full contents of a device tensor to \p filename using plain POSIX I/O.
///
/// Used as the fallback path when GDS (cuFile) is unavailable or fails.
/// \param srcPtr   Device tensor whose bytes are written.
/// \param filename Destination file path; created (0664) and truncated if it exists.
/// \return true on success; failures abort via TLLM_CHECK_WITH_INFO.
static bool gpuToFilePosix(tr::ITensor::SharedPtr const& srcPtr, std::string const& filename)
{
    auto const numBytes = static_cast<ssize_t>(srcPtr->getSizeInBytes());
    std::vector<uint8_t> hostBuffer(numBytes);
    // Stage through host memory *before* opening the file, so a failing check here
    // cannot leak an open file descriptor while it unwinds.
    cudaError_t cpyErr = cudaMemcpy(hostBuffer.data(), srcPtr->data(), numBytes, cudaMemcpyDeviceToHost);
    TLLM_CHECK_WITH_INFO(cpyErr == cudaSuccess, "cudaMemcpy to host failed, error=%d", cpyErr);
    // O_TRUNC: without it, a pre-existing longer file would keep stale bytes past numBytes.
    int fd = ::open(filename.c_str(), O_CREAT | O_WRONLY | O_TRUNC, 0664);
    TLLM_CHECK_WITH_INFO(fd >= 0, "Failed to open '%s' for writing (POSIX fallback)", filename.c_str());
    // ::write may transfer fewer bytes than requested; loop until the whole buffer is out.
    ssize_t totalWritten = 0;
    while (totalWritten < numBytes)
    {
        ssize_t written = ::write(fd, hostBuffer.data() + totalWritten, numBytes - totalWritten);
        if (written < 0)
        {
            ::close(fd); // release the descriptor before the check throws
            TLLM_CHECK_WITH_INFO(false, "POSIX write error=%zd", written);
        }
        totalWritten += written;
    }
    ::close(fd);
    TLLM_LOG_DEBUG("Wrote %zd bytes to %s (POSIX fallback)", totalWritten, filename.c_str());
    return true;
}
/// \brief Read exactly dstPtr->getSizeInBytes() bytes from \p filename into a device tensor
/// using plain POSIX I/O. Fallback path when GDS (cuFile) is unavailable or fails.
///
/// \param dstPtr   Device tensor to fill; its byte size determines how much is read.
/// \param filename Source file path; must contain at least that many bytes.
/// \return true on success; failures (including a short read, which would otherwise
///         upload uninitialized garbage to the GPU) abort via TLLM_CHECK_WITH_INFO.
static bool fileToGpuPosix(tr::ITensor::SharedPtr const& dstPtr, std::string const& filename)
{
    int fd = ::open(filename.c_str(), O_RDONLY);
    TLLM_CHECK_WITH_INFO(fd >= 0, "Failed to open '%s' for reading (POSIX fallback)", filename.c_str());
    auto const numBytes = static_cast<ssize_t>(dstPtr->getSizeInBytes());
    std::vector<uint8_t> hostBuffer(numBytes);
    // ::read may return fewer bytes than requested; loop until full or EOF.
    ssize_t totalRead = 0;
    while (totalRead < numBytes)
    {
        ssize_t bytesRead = ::read(fd, hostBuffer.data() + totalRead, numBytes - totalRead);
        if (bytesRead < 0)
        {
            ::close(fd); // release the descriptor before the check throws
            TLLM_CHECK_WITH_INFO(false, "POSIX read error=%zd", bytesRead);
        }
        if (bytesRead == 0)
        {
            break; // premature EOF: file is shorter than a block
        }
        totalRead += bytesRead;
    }
    // Close before any further check so the descriptor cannot leak on throw.
    ::close(fd);
    TLLM_CHECK_WITH_INFO(totalRead == numBytes, "Short read from '%s': got %zd of %zd bytes (POSIX fallback)",
        filename.c_str(), totalRead, numBytes);
    TLLM_LOG_DEBUG("Read %zd bytes from %s (POSIX fallback)", totalRead, filename.c_str());
    cudaError_t cpyErr = cudaMemcpy(dstPtr->data(), hostBuffer.data(), numBytes, cudaMemcpyHostToDevice);
    TLLM_CHECK_WITH_INFO(cpyErr == cudaSuccess, "cudaMemcpy to device failed, error=%d", cpyErr);
    return true;
}
/// \brief Construct a transfer manager bound to the caller's buffer manager.
///
/// Creates two additional managers, each on its own freshly allocated CUDA stream:
/// mOnboardManager for host/disk -> GPU copies and mOffloadManager for GPU -> host/disk
/// copies, keeping transfer traffic off the main buffer-manager stream.
KVCacheTransferManager::KVCacheTransferManager(tr::BufferManager const& bufferManager)
    : mBufferManager{bufferManager}
    , mOnboardManager(std::make_shared<tr::CudaStream>())
    , mOffloadManager(std::make_shared<tr::CudaStream>())
{
}
/// \brief Resolve the tensor slice backing \p block inside pool \p poolIdx.
///
/// Selects the primary or secondary pool pointer depending on where the block
/// currently resides, then slices out the single-block region at the block's
/// memory-pool index.
/// \param block   Block whose storage is being located.
/// \param pools   All KV cache pools.
/// \param poolIdx Index of the pool to look in; must be < pools.size().
/// \return A one-block tensor view into the selected pool.
tr::ITensor::SharedPtr KVCacheTransferManager::computeBlockPointer(
    BlockPtr const& block, std::vector<KVCacheBlockPool> const& pools, size_t poolIdx)
{
    // Validate the index itself, not merely that the vector is non-empty: the old
    // check (!pools.empty()) let an out-of-range poolIdx reach pools.at() while
    // printing a message that claimed the index had been validated.
    TLLM_CHECK_WITH_INFO(
        poolIdx < pools.size(), "Pool index %zu is out of bounds (have %zu pools)", poolIdx, pools.size());
    auto const& pool = pools.at(poolIdx);
    // Primary = GPU-resident pool, secondary = offload pool (per primaryPtr/secondaryPtr naming).
    auto ptr = block->isPrimary() ? pool.primaryPtr : pool.secondaryPtr;
    auto const blockOffset = block->getMemoryPoolBlockIndex();
    tr::ITensor::SharedPtr blockTensor{tr::ITensor::slice(ptr, blockOffset, 1)};
    return blockTensor;
}
/// \brief Copy one KV cache block from \p src to \p dst across every pool.
///
/// Three transfer modes:
///  - DRAM: GPU <-> CPU copy on the offload/onboard stream, with an optional
///    partial-token copy kernel when only part of the block is needed.
///  - POSIX_DEBUG_FALLBACK: force plain POSIX file I/O.
///  - otherwise: GPUDirect Storage (cuFile) when ENABLE_CUFILE is defined,
///    falling back to POSIX on any cuFile failure.
///
/// \param src             Source block.
/// \param dst             Destination block.
/// \param pools           All KV cache pools; one copy is issued per pool.
/// \param isOffload       true: GPU -> host/disk (offload stream); false: host/disk -> GPU (onboard stream).
/// \param numTokensToCopy Number of valid tokens; <= 0 or >= tokensPerBlock means copy the whole block.
/// \param mode            Transfer mode (see above).
/// \param directory       Required for file-based modes; directory holding block files.
void KVCacheTransferManager::copyBlock(BlockPtr const& src, BlockPtr const& dst,
    std::vector<KVCacheBlockPool> const& pools, bool isOffload, int numTokensToCopy, executor::KvCacheTransferMode mode,
    std::optional<std::string> directory)
{
    TLLM_LOG_DEBUG("copyBlock entered: srcId=%d, dstId=%d, isOffload=%s, mode=%d", src->getBlockId(), dst->getBlockId(),
        (isOffload ? "true" : "false"), static_cast<int>(mode));
    if (mode == executor::KvCacheTransferMode::DRAM)
    {
        TLLM_LOG_DEBUG("Using DRAM-based copy (GPU <-> CPU) for this block.");
        // Iterate over all pools, partial-copy logic
        for (size_t poolIdx = 0; poolIdx < pools.size(); ++poolIdx)
        {
            auto srcPtr = computeBlockPointer(src, pools, poolIdx);
            auto dstPtr = computeBlockPointer(dst, pools, poolIdx);
            // If no partial tokens or if the dataType is not supported for partial copy, copy entire block.
            if (numTokensToCopy <= 0 || srcPtr->getDataType() == nvinfer1::DataType::kINT4
                || srcPtr->getDataType() == nvinfer1::DataType::kFP4)
            {
                // For partial copy not implemented with these data types,
                // just do a full copy.
                (isOffload ? mOffloadManager : mOnboardManager).copy(*srcPtr, *dstPtr);
            }
            else
            {
                int const tokensPerBlock = pools[poolIdx].tokensPerBlock;
                if (numTokensToCopy >= tokensPerBlock)
                {
                    // If requested tokens >= entire block, just do a full copy.
                    (isOffload ? mOffloadManager : mOnboardManager).copy(*srcPtr, *dstPtr);
                }
                else
                {
                    // Partial copy: only the first numTokensToCopy tokens are transferred
                    // by the dedicated kernel, on the direction-appropriate stream.
                    auto stream = (isOffload ? mOffloadManager : mOnboardManager).getStream().get();
                    int const numLayers = pools[poolIdx].numLayers;
                    int const kvFactor = pools[poolIdx].kvFactor;
                    int const numHeads = pools[poolIdx].numKvHeads;
                    int const sizePerHead = pools[poolIdx].sizePerHead;
                    auto shape = srcPtr->getShape();
                    TLLM_CHECK_WITH_INFO(
                        shape.nbDims == 4, "Expected KVCache block to have 4 dims, got %d", shape.nbDims);
                    tk::kvCacheBlockPartialCopy(*dstPtr, *srcPtr, numLayers, numHeads, tokensPerBlock, sizePerHead,
                        numTokensToCopy, kvFactor, stream);
                }
            }
        }
        TLLM_LOG_DEBUG("copyBlock: DRAM mode complete. Returning...");
        return;
    }
    // File-based modes (GDS or POSIX): one file per (source block, pool) pair.
    for (size_t poolIdx = 0; poolIdx < pools.size(); ++poolIdx)
    {
        auto srcPtr = computeBlockPointer(src, pools, poolIdx);
        auto dstPtr = computeBlockPointer(dst, pools, poolIdx);
        TLLM_CHECK_WITH_INFO(
            directory.has_value(), "Expected a directory path for KVCache offload, but none was provided.");
        // Two-pass snprintf: first call sizes the formatted name, second writes it.
        // NOTE(review): the filename is keyed on src->getBlockId() in both directions,
        // so onboard must be called with the same block id the offload used.
        int size = std::snprintf(
            nullptr, 0, "%s/block_%d_pool_%zu.bin", directory.value().c_str(), src->getBlockId(), poolIdx);
        // NOTE(review): the std::string keeps a trailing embedded '\0' (length size + 1);
        // harmless since only c_str() is used below.
        std::string filename(size + 1, '\0');
        std::snprintf(filename.data(), filename.size(), "%s/block_%d_pool_%zu.bin", directory.value().c_str(),
            src->getBlockId(), poolIdx);
        if (mode == executor::KvCacheTransferMode::POSIX_DEBUG_FALLBACK)
        {
            // Debug mode: skip GDS entirely and use the POSIX helpers.
            TLLM_LOG_INFO("Forcing POSIX fallback for file: %s", filename.c_str());
            if (isOffload)
            {
                gpuToFilePosix(srcPtr, filename);
            }
            else
            {
                fileToGpuPosix(dstPtr, filename);
            }
            continue;
        }
        // GDS path: open the file for cuFile registration; fall back to POSIX on failure.
        int openFlags = isOffload ? (O_CREAT | O_WRONLY) : O_RDONLY;
        int fd = ::open(filename.c_str(), openFlags, 0664);
        if (fd < 0)
        {
            TLLM_LOG_ERROR(
                "Failed to open '%s' for %s; fallback POSIX", filename.c_str(), (isOffload ? "writing" : "reading"));
            if (isOffload)
            {
                gpuToFilePosix(srcPtr, filename);
            }
            else
            {
                fileToGpuPosix(dstPtr, filename);
            }
            continue;
        }
#ifdef ENABLE_CUFILE
        // Register the descriptor with cuFile so the GPU can DMA directly to/from the file.
        CUfileDescr_t cufileDesc = {};
        cufileDesc.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
        cufileDesc.handle.fd = fd;
        CUfileHandle_t cufileHandle;
        CUfileError_t status = cuFileHandleRegister(&cufileHandle, &cufileDesc);
        if (status.err != CU_FILE_SUCCESS)
        {
            // Fallback to POSIX
            TLLM_LOG_WARN(
                "cuFileHandleRegister failed (err=%d). Falling back to POSIX for '%s'", status.err, filename.c_str());
            ::close(fd);
            if (isOffload)
                gpuToFilePosix(srcPtr, filename);
            else
                fileToGpuPosix(dstPtr, filename);
            continue;
        }
        // NOTE(review): numBytes comes from srcPtr even on the onboard (read) path;
        // this assumes src and dst block slices are always the same size — confirm.
        ssize_t numBytes = static_cast<ssize_t>(srcPtr->getSizeInBytes());
        if (isOffload)
        {
            // NOTE(review): only written < 0 is treated as failure; a short cuFile
            // transfer (0 <= written < numBytes) would pass silently — confirm upstream.
            ssize_t written = cuFileWrite(cufileHandle, srcPtr->data(), numBytes, 0, 0);
            if (written < 0)
            {
                TLLM_LOG_ERROR("cuFileWrite error=%zd. Fallback to POSIX", written);
                cuFileHandleDeregister(cufileHandle);
                ::close(fd);
                gpuToFilePosix(srcPtr, filename);
                continue;
            }
        }
        else
        {
            ssize_t readCount = cuFileRead(cufileHandle, dstPtr->data(), numBytes, 0, 0);
            if (readCount < 0)
            {
                TLLM_LOG_ERROR("cuFileRead error=%zd. Fallback to POSIX", readCount);
                cuFileHandleDeregister(cufileHandle);
                ::close(fd);
                fileToGpuPosix(dstPtr, filename);
                continue;
            }
        }
        cuFileHandleDeregister(cufileHandle);
        ::close(fd);
#else
        // If GDS isn't enabled, fallback to POSIX automatically
        TLLM_LOG_DEBUG("ENABLE_CUFILE=OFF, so fallback to POSIX for %s", filename.c_str());
        ::close(fd); // close the file opened for GDS
        if (isOffload)
        {
            gpuToFilePosix(srcPtr, filename);
        }
        else
        {
            fileToGpuPosix(dstPtr, filename);
        }
#endif
    }
}
/// \brief Bring an offloaded block back onto the GPU.
///
/// For file-based modes the block must have been offloaded earlier in this
/// sync window (i.e. appear in mPendingOffloads); otherwise the onboard is
/// skipped because no file exists for it. If an offload of the same block is
/// still in flight, the onboard stream first waits on its recorded event.
void KVCacheTransferManager::onboard(BlockPtr const& offloadBlock, BlockPtr const& block,
    std::vector<KVCacheBlockPool> const& pools, int numTokensToCopy, executor::KvCacheTransferMode mode,
    std::optional<std::string> directory)
{
    // Single lookup; the iterator serves both the skip test and the wait below.
    auto const pendingIt = mPendingOffloads.find(offloadBlock->getBlockId());
    bool const wasOffloaded = pendingIt != mPendingOffloads.end();
    if (mode != executor::KvCacheTransferMode::DRAM && !wasOffloaded)
    {
        TLLM_LOG_DEBUG("Skipping onboard for block %d because it was never previously offloaded to disk",
            offloadBlock->getBlockId());
        return;
    }
    if (wasOffloaded)
    {
        // Order the onboard copy behind the matching offload on the GPU timeline.
        mOnboardManager.getStream().wait(pendingIt->second);
    }
    copyBlock(offloadBlock, block, pools, false, numTokensToCopy, mode, directory);
}
/// \brief Move a block off the GPU (to host or disk, depending on \p mode).
///
/// Registers a CUDA event for the block before issuing the copy and records it
/// on the offload stream afterwards, so a later onboard() of the same block can
/// wait until the offload has actually finished.
void KVCacheTransferManager::offload(BlockPtr const& block, BlockPtr const& offloadBlock,
    std::vector<KVCacheBlockPool> const& pools, int numTokensToCopy, executor::KvCacheTransferMode mode,
    std::optional<std::string> directory)
{
    auto const blockId = block->getBlockId();
    mPendingOffloads[blockId] = tr::CudaEvent();
    copyBlock(block, offloadBlock, pools, true, numTokensToCopy, mode, directory);
    // The entry was just inserted above, so at() cannot throw here.
    mOffloadManager.getStream().record(mPendingOffloads.at(blockId));
}
void KVCacheTransferManager::syncTransfers()
{
tr::CudaEvent offloadEvent;
mOffloadManager.getStream().record(offloadEvent);
tr::CudaEvent onboardEvent;
mOnboardManager.getStream().record(onboardEvent);
mBufferManager.getStream().wait(offloadEvent);
mBufferManager.getStream().wait(onboardEvent);
// Once we synchronize, clear our list of pending thransfers.
mPendingOffloads.clear();
}
} // namespace tensorrt_llm::batch_manager::kv_cache_manager