TensorRT-LLMs/cpp/tensorrt_llm/common/mcastDevMemUtils.cpp
Zongfei Jing dbaddb3a29
Adding two-shot allreduce kernel and mnnvl multicasting buffer (#4216)
* Adding two-shot allreduce kernel and mnnvl multicasting buffer

Signed-off-by: Shiyu Li <shili@nvidia.com>

Adding comments

Signed-off-by: Shiyu Li <shili@nvidia.com>

Add unittest of the twoshot kernel.

Signed-off-by: Shiyu Li <shili@nvidia.com>

Update dispatch logic

Signed-off-by: Shiyu Li <shili@nvidia.com>

Use cpu barrier instead of GPU at init

Signed-off-by: Shiyu Li <shili@nvidia.com>

Merge dispatch logic fix

Signed-off-by: Shiyu Li <shili@nvidia.com>

Update the kernel to use GPU-managed buffer

Signed-off-by: Shiyu Li <shili@nvidia.com>

* Refine

Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com>

* Clean code

Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com>

* Fix compile error

Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com>

* Fix issue

Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com>

* Clean up

Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com>

* Simplify AllReduce interface

Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com>

* Rename

Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com>

* Fix warning

Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com>

* Tidy code

Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com>

* Rename

Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com>

* Fix compile error

Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com>

* Refine

Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com>

* Skip ut for no_fusion

Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com>

* Refine

Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com>

---------

Signed-off-by: Shiyu Li <shili@nvidia.com>
Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com>
Co-authored-by: Shiyu Li <shili@nvidia.com>
2025-05-22 03:42:36 +08:00

88 lines
2.4 KiB
C++

/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "mcastDevMemUtils.h"
#include <unordered_map>
namespace tensorrt_llm::common
{
// Shorthand for tensorrt_llm::runtime::McastDeviceMemory, used by the registry
// class and the free functions defined later in this file.
using McastDeviceMemory = tensorrt_llm::runtime::McastDeviceMemory;
namespace
{
class McastDevMemBufferRegistry
{
public:
McastDevMemBufferRegistry(McastDevMemBufferRegistry const&) = delete;
McastDevMemBufferRegistry& operator=(McastDevMemBufferRegistry const&) = delete;
static McastDevMemBufferRegistry& getInstance()
{
static McastDevMemBufferRegistry instance;
return instance;
}
void registerBuffer(void* ptr, McastDeviceMemory* buf)
{
_ptr_to_buf[ptr] = buf;
}
void unregisterBuffer(McastDeviceMemory* buf)
{
// Potential performance issue! Can use erase-if when we adopt C++20
// Remove mappings in the table
for (auto it = _ptr_to_buf.begin(); it != _ptr_to_buf.end();)
{
if (it->second == buf)
{
it = _ptr_to_buf.erase(it);
}
else
{
++it;
}
}
}
McastDeviceMemory* findBuffer(void* ptr)
{
auto it = _ptr_to_buf.find(ptr);
return it == _ptr_to_buf.end() ? nullptr : it->second;
}
private:
McastDevMemBufferRegistry() = default;
~McastDevMemBufferRegistry() = default;
std::unordered_map<void*, McastDeviceMemory*> _ptr_to_buf;
};
} // namespace
// Records the association `ptr` -> `buf` in the file-local registry.
// Registering a `ptr` that is already present replaces its earlier association.
void registerMcastDevMemBuffer(void* ptr, McastDeviceMemory* buf)
{
    auto& registry = McastDevMemBufferRegistry::getInstance();
    registry.registerBuffer(ptr, buf);
}
// Drops every entry of the file-local registry whose mapped value is `buf`.
void unregisterMcastDevMemBuffer(McastDeviceMemory* buf)
{
    auto& registry = McastDevMemBufferRegistry::getInstance();
    registry.unregisterBuffer(buf);
}
// Looks up `ptr` in the file-local registry.
// Returns the McastDeviceMemory registered for exactly that pointer, or nullptr
// if the pointer has no entry.
McastDeviceMemory* findMcastDevMemBuffer(void* ptr)
{
    auto& registry = McastDevMemBufferRegistry::getInstance();
    return registry.findBuffer(ptr);
}
} // namespace tensorrt_llm::common