TensorRT-LLMs/cpp/tensorrt_llm/kernels/userbuffers/userbuffersManager.cpp

/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "userbuffersManager.h"
#include "tensorrt_llm/common/logger.h"
namespace tensorrt_llm::runtime::ub
{
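
// Deleter for UBBufferPtr: instead of freeing the memory, the buffer is
// returned to the manager's pool so it can be reused by a later allocation.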
void UserBufferDeleter::operator()(void* ptr)
{
    UserBuffersManager::get_instance().release_buffer(ptr);
}
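
// Meyers singleton: lazily constructed on first use; initialization is
// thread-safe since C++11.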
UserBuffersManager& UserBuffersManager::get_instance()
{
    static UserBuffersManager allocator;
    return allocator;
}
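
// Builds a WorldConfig from the parallelism sizes and rank, brings up the
// UserBuffers communicator, and records the fixed size used for every buffer
// in the pool.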
void UserBuffersManager::initialize(
    int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, int64_t gpus_per_node, int64_t buffer_size)
{
    std::lock_guard<std::mutex> lock(mutex_);
    tensorrt_llm::runtime::WorldConfig world_config(tp_size, pp_size, cp_size, rank, gpus_per_node);
    tensorrt_llm::runtime::ub::ub_initialize(world_config);
    TLLM_CHECK(tensorrt_llm::runtime::ub::ub_is_initialized());
    buffer_size_ = buffer_size;
}
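
// Returns a buffer of buffer_size_ bytes together with an owning pointer
// whose deleter hands the buffer back to the pool. A free pooled buffer is
// reused when available; otherwise a new one is allocated. All pooled
// buffers share the configured size, so the request is only validated
// against buffer_size_.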
std::pair<UBBufferPtr, UBBuffer> UserBuffersManager::allocate_userbuffers(int64_t buffer_size)
{
    std::lock_guard<std::mutex> lock(mutex_);
    TLLM_CHECK(buffer_size <= buffer_size_);
    // Scan the pool for a buffer that is not currently in use and reuse it.
    int i = 0;
    for (auto& buffer : buffers_)
    {
        if (buffer.second)
        {
            i++;
            continue;
        }
        buffer.second = true;
        TLLM_LOG_DEBUG("Reusing buffer %d", i);
        return std::make_pair(std::unique_ptr<void, UserBufferDeleter>(buffer.first.addr), buffer.first);
    }
    auto new_ub = tensorrt_llm::runtime::ub::ub_allocate(buffer_size_);
    TLLM_CHECK(!new_ub.invalid());
    buffers_.push_back({new_ub, true});
    TLLM_LOG_DEBUG("Creating new buffer %d", static_cast<int>(buffers_.size() - 1));
    return std::make_pair(std::unique_ptr<void, UserBufferDeleter>(new_ub.addr), new_ub);
}
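
// Invoked by UserBufferDeleter: marks the pooled buffer at addr as free
// again rather than deallocating it.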
void UserBuffersManager::release_buffer(void* addr)
{
    std::lock_guard<std::mutex> lock(mutex_);
    auto buffer_iter = std::find_if(
        buffers_.begin(), buffers_.end(), [addr](auto const& buffer) { return buffer.first.addr == addr; });
    // The released address must refer to a valid pooled buffer that is currently in use.
    TLLM_CHECK(buffer_iter != buffers_.end());
    TLLM_CHECK(buffer_iter->second);
    TLLM_CHECK(!buffer_iter->first.invalid());
    TLLM_LOG_DEBUG("Releasing buffer %d", static_cast<int>(std::distance(buffers_.begin(), buffer_iter)));
    buffer_iter->second = false;
}
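
// Looks up the pooled buffer whose base address is addr; returns a
// default-constructed (invalid) UBBuffer when the address is not pooled.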
tensorrt_llm::runtime::ub::UBBuffer UserBuffersManager::search_buffer(void* addr)
{
    std::lock_guard<std::mutex> lock(mutex_);
    auto buffer_iter = std::find_if(
        buffers_.begin(), buffers_.end(), [addr](auto const& buffer) { return buffer.first.addr == addr; });
    if (buffer_iter == buffers_.end())
    {
        return tensorrt_llm::runtime::ub::UBBuffer();
    }
    return buffer_iter->first;
}
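
// Exposes the underlying UserBuffers communicator.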
tensorrt_llm::runtime::ub::communicator* UserBuffersManager::comm()
{
    return tensorrt_llm::runtime::ub::ub_comm();
}
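
// Free-function convenience wrapper that forwards to the singleton.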
void initialize_userbuffers_manager(
    int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, int64_t gpus_per_node, int64_t buffer_size)
{
    UserBuffersManager::get_instance().initialize(tp_size, pp_size, cp_size, rank, gpus_per_node, buffer_size);
}
} // namespace tensorrt_llm::runtime::ub
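
// Usage sketch (hypothetical call site; the parameter values are illustrative,
// the identifiers come from this file):
//
//   using namespace tensorrt_llm::runtime::ub;
//   initialize_userbuffers_manager(/*tp_size=*/8, /*pp_size=*/1, /*cp_size=*/1,
//       /*rank=*/0, /*gpus_per_node=*/8, /*buffer_size=*/64 * 1024 * 1024);
//   auto [owner, ub] = UserBuffersManager::get_instance().allocate_userbuffers(1024);
//   // ... use ub.addr ...
//   // Destroying `owner` runs UserBufferDeleter, which returns the buffer
//   // to the pool for reuse.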