Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-02-13 22:43:46 +08:00)

Commit e72b9f9377: Merge branch 'main' into update_mnnvl_test
Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com>
@@ -25486,7 +25486,7 @@ limitations under the License.
```

### URLs

- `Homepage`: https://github.com/NVIDIA/TensorRT-Model-Optimizer
- `Homepage`: https://github.com/NVIDIA/Model-Optimizer


## nvidia-modelopt-core (0.33.1)

@@ -25513,7 +25513,7 @@ limitations under the License.
```

### URLs

- `Homepage`: https://github.com/NVIDIA/TensorRT-Model-Optimizer
- `Homepage`: https://github.com/NVIDIA/Model-Optimizer


## nvidia-nccl-cu12 (2.27.3)
@@ -164,7 +164,7 @@ state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs.

[➡️ link](https://www.bentoml.com/blog/tuning-tensor-rt-llm-for-optimal-serving-with-bentoml)

* [2024/08/20] 🏎️SDXL with #TensorRT Model Optimizer ⏱️⚡ 🏁 cache diffusion 🏁 quantization aware training 🏁 QLoRA 🏁 #Python 3.12
* [2024/08/20] 🏎️SDXL with #Model Optimizer ⏱️⚡ 🏁 cache diffusion 🏁 quantization aware training 🏁 QLoRA 🏁 #Python 3.12
[➡️ link](https://developer.nvidia.com/blog/nvidia-tensorrt-model-optimizer-v0-15-boosts-inference-performance-and-expands-model-support/)

* [2024/08/13] 🐍 DIY Code Completion with #Mamba ⚡ #TensorRT #LLM for speed 🤖 NIM for ease ☁️ deploy anywhere

@@ -209,7 +209,7 @@ Technical Deep Dive for serious coders ✅+99% compression ✅1 set of weights

* [2024/05/21] ✨@modal_labs has the codes for serverless @AIatMeta Llama 3 on #TensorRT #LLM ✨👀 📚 Marvelous Modal Manual:
Serverless TensorRT LLM (LLaMA 3 8B) | Modal Docs [➡️ link](https://modal.com/docs/examples/trtllm_llama)

* [2024/05/08] NVIDIA TensorRT Model Optimizer -- the newest member of the #TensorRT ecosystem is a library of post-training and training-in-the-loop model optimization techniques ✅quantization ✅sparsity ✅QAT [➡️ blog](https://developer.nvidia.com/blog/accelerate-generative-ai-inference-performance-with-nvidia-tensorrt-model-optimizer-now-publicly-available/)
* [2024/05/08] NVIDIA Model Optimizer -- the newest member of the #TensorRT ecosystem is a library of post-training and training-in-the-loop model optimization techniques ✅quantization ✅sparsity ✅QAT [➡️ blog](https://developer.nvidia.com/blog/accelerate-generative-ai-inference-performance-with-nvidia-tensorrt-model-optimizer-now-publicly-available/)

* [2024/05/07] 🦙🦙🦙 24,000 tokens per second 🛫Meta Llama 3 takes off with #TensorRT #LLM 📚[➡️ link](https://blogs.nvidia.com/blog/meta-llama3-inference-acceleration/)
@@ -81,7 +81,6 @@ inline AllReduceStrategyType SelectStrategyLP(size_t seq_len, size_t hidden_size
    {
        return AllReduceStrategyType::ONESHOT;
    }
    return AllReduceStrategyType::NCCL;
}

// use 1D vector to store the best strategy instead of a map for each sm version
@@ -143,7 +142,7 @@ inline AllReduceStrategyType selectStrategyLookUpTable(
        sm_version = 100;
    }

    // Check if the entry is out of bounds, otherwise return NCCL as fallback
    // Check if the entry is out of bounds, otherwise return NCCL_SYMMETRIC as fallback
    if (AllReduceBestStrategyTable.find(sm_version) == AllReduceBestStrategyTable.end()
        || tp_index >= AllReduceBestStrategyTable.at(sm_version).size()
        || fusion_op_index >= AllReduceBestStrategyTable.at(sm_version).at(tp_index).size()
@@ -151,7 +150,7 @@ inline AllReduceStrategyType selectStrategyLookUpTable(
        || num_token_index
            >= AllReduceBestStrategyTable.at(sm_version).at(tp_index).at(fusion_op_index).at(hidden_size_index).size())
    {
        return AllReduceStrategyType::NCCL;
        return AllReduceStrategyType::NCCL_SYMMETRIC;
    }

    return static_cast<AllReduceStrategyType>(
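Net effect of the two hunks above: when the lookup indices fall outside the precomputed table, the selector now falls back to NCCL_SYMMETRIC rather than NCCL. A minimal, self-contained sketch of that bounds-checked lookup pattern follows; the enum, table shape, and names are simplified placeholders, not the real AllReduceBestStrategyTable.

#include <cstddef>
#include <map>
#include <vector>

// Simplified stand-ins; the real table is additionally indexed by fusion op and hidden size.
enum class Strategy
{
    NCCL,
    NCCL_SYMMETRIC,
    ONESHOT,
    TWOSHOT
};
using StrategyTable = std::map<int, std::vector<std::vector<Strategy>>>;

Strategy lookupWithFallback(StrategyTable const& table, int smVersion, std::size_t tpIndex, std::size_t tokenIndex)
{
    auto it = table.find(smVersion);
    if (it == table.end()                            // unknown SM version
        || tpIndex >= it->second.size()              // TP index out of bounds
        || tokenIndex >= it->second[tpIndex].size()) // token-count index out of bounds
    {
        return Strategy::NCCL_SYMMETRIC; // fallback, changed from NCCL in this PR
    }
    return it->second[tpIndex][tokenIndex];
}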
cpp/tensorrt_llm/common/ncclUtils.cpp (new file, 585 lines)
@@ -0,0 +1,585 @@
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/ncclUtils.h"
|
||||
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include <limits>
|
||||
#include <stdexcept>
|
||||
|
||||
namespace tensorrt_llm::common::nccl_util
|
||||
{
|
||||
|
||||
//==============================================================================
|
||||
// NcclCommResourceManager Implementation
|
||||
//==============================================================================
|
||||
|
||||
NcclCommResourceManager& NcclCommResourceManager::getInstance() noexcept
|
||||
{
|
||||
static NcclCommResourceManager instance;
|
||||
return instance;
|
||||
}
|
||||
|
||||
void NcclCommResourceManager::registerResource(ncclComm_t comm, ResourceCleanupFunc cleanup, char const* debugName)
|
||||
{
|
||||
if (!comm)
|
||||
{
|
||||
TLLM_LOG_WARNING("[NCCLUtil] Attempted to register resource for null NCCL comm");
|
||||
return;
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
auto& resources = mCommResources[comm];
|
||||
resources.emplace_back(std::move(cleanup), debugName ? debugName : "unnamed");
|
||||
|
||||
TLLM_LOG_TRACE("[NCCLUtil] Registered resource '%s' for NCCL comm %p (total: %zu)",
|
||||
debugName ? debugName : "unnamed", static_cast<void*>(comm), resources.size());
|
||||
}
|
||||
|
||||
void NcclCommResourceManager::cleanupResources(ncclComm_t comm) noexcept
|
||||
{
|
||||
if (!comm)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<ResourceEntry> resourcesToClean;
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
auto it = mCommResources.find(comm);
|
||||
if (it == mCommResources.end())
|
||||
{
|
||||
// Nothing registered for this comm, nothing to clean up
|
||||
return;
|
||||
}
|
||||
|
||||
// Move resources out (preserves order) and remove from map
|
||||
resourcesToClean = std::move(it->second);
|
||||
mCommResources.erase(it);
|
||||
|
||||
TLLM_LOG_TRACE(
|
||||
"[NCCLUtil] Cleaning up %zu resources for NCCL comm %p", resourcesToClean.size(), static_cast<void*>(comm));
|
||||
}
|
||||
|
||||
// Clean up outside the lock to avoid deadlocks if cleanup functions try to access the manager
|
||||
// Order is preserved: resources are cleaned up in registration order
|
||||
for (auto& [cleanup, name] : resourcesToClean)
|
||||
{
|
||||
try
|
||||
{
|
||||
TLLM_LOG_TRACE(
|
||||
"[NCCLUtil] Cleaning up resource '%s' for NCCL comm %p", name.c_str(), static_cast<void*>(comm));
|
||||
cleanup();
|
||||
}
|
||||
catch (std::exception const& e)
|
||||
{
|
||||
TLLM_LOG_ERROR("[NCCLUtil] Exception during cleanup of resource '%s' for NCCL comm %p: %s", name.c_str(),
|
||||
static_cast<void*>(comm), e.what());
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
TLLM_LOG_ERROR("[NCCLUtil] Unknown exception during cleanup of resource '%s' for NCCL comm %p",
|
||||
name.c_str(), static_cast<void*>(comm));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool NcclCommResourceManager::hasResources(ncclComm_t comm) const noexcept
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
return mCommResources.find(comm) != mCommResources.end();
|
||||
}
|
||||
|
||||
size_t NcclCommResourceManager::getResourceCount(ncclComm_t comm) const noexcept
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
auto it = mCommResources.find(comm);
|
||||
return it != mCommResources.end() ? it->second.size() : 0;
|
||||
}
|
||||
|
||||
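As a usage illustration (not code from this PR), a component that owns a side allocation tied to a communicator could hook into the manager like this; the scratch buffer and its cleanup body are hypothetical.

#include "tensorrt_llm/common/ncclUtils.h"
#include <cuda_runtime.h>

// Hypothetical caller: free a scratch allocation before its communicator is destroyed.
void attachScratchBuffer(ncclComm_t comm, void* scratchPtr)
{
    using tensorrt_llm::common::nccl_util::NcclCommResourceManager;
    NcclCommResourceManager::getInstance().registerResource(
        comm,
        [scratchPtr]() { cudaFree(scratchPtr); }, // runs inside cleanupResources(), before ncclCommDestroy
        "scratch-buffer");
}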
//==============================================================================
|
||||
// NCCLHelper Implementation
|
||||
//==============================================================================
|
||||
|
||||
NCCLHelper& NCCLHelper::getInstance()
|
||||
{
|
||||
static NCCLHelper instance;
|
||||
return instance;
|
||||
}
|
||||
|
||||
NCCLHelper::NCCLHelper()
|
||||
: mLibraryHandle(nullptr)
|
||||
, mNCCLCommWindowRegister(nullptr)
|
||||
, mNCCLMemAlloc(nullptr)
|
||||
, mIsLoaded(false)
|
||||
{
|
||||
loadNCCLLibrary();
|
||||
}
|
||||
|
||||
NCCLHelper::~NCCLHelper()
|
||||
{
|
||||
if (mLibraryHandle)
|
||||
{
|
||||
#ifdef _WIN32
|
||||
FreeLibrary(mLibraryHandle);
|
||||
#else
|
||||
dlclose(mLibraryHandle);
|
||||
#endif
|
||||
mLibraryHandle = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
void NCCLHelper::loadNCCLLibrary()
|
||||
{
|
||||
try
|
||||
{
|
||||
#ifdef _WIN32
|
||||
char const* libraryNames[] = {"nccl.dll"};
|
||||
#else
|
||||
char const* libraryNames[] = {"libnccl.so"};
|
||||
#endif
|
||||
|
||||
for (auto const* name : libraryNames)
|
||||
{
|
||||
mLibraryHandle = loadLibraryHandle(name);
|
||||
if (mLibraryHandle)
|
||||
{
|
||||
TLLM_LOG_INFO("Successfully loaded NCCL library: %s", name);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!mLibraryHandle)
|
||||
{
|
||||
TLLM_LOG_WARNING("Failed to load NCCL library");
|
||||
return;
|
||||
}
|
||||
|
||||
// Load the required symbols
|
||||
mNCCLCommWindowRegister
|
||||
= reinterpret_cast<ncclCommWindowRegisterFunc>(getSymbolAddress(mLibraryHandle, "ncclCommWindowRegister"));
|
||||
|
||||
mNCCLMemAlloc = reinterpret_cast<ncclMemAllocFunc>(getSymbolAddress(mLibraryHandle, "ncclMemAlloc"));
|
||||
|
||||
if (mNCCLCommWindowRegister == nullptr)
|
||||
{
|
||||
TLLM_LOG_WARNING("Failed to load ncclCommWindowRegister symbol, NCCL symmetric will not be supported.");
|
||||
}
|
||||
|
||||
if (mNCCLMemAlloc == nullptr)
|
||||
{
|
||||
TLLM_LOG_WARNING("Failed to load ncclMemAlloc symbol, NCCL symmetric will not be supported.");
|
||||
}
|
||||
|
||||
if (mNCCLCommWindowRegister != nullptr && mNCCLMemAlloc != nullptr)
|
||||
{
|
||||
mIsLoaded = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
TLLM_LOG_WARNING(
|
||||
"Failed to load required NCCL symbols (both ncclCommWindowRegister and ncclMemAlloc are required)");
|
||||
}
|
||||
}
|
||||
catch (std::exception const& e)
|
||||
{
|
||||
TLLM_LOG_WARNING("Exception while loading NCCL library: %s", e.what());
|
||||
}
|
||||
}
|
||||
|
||||
void* NCCLHelper::loadLibraryHandle(char const* libName)
|
||||
{
|
||||
#ifdef _WIN32
|
||||
return LoadLibraryA(libName);
|
||||
#else
|
||||
return dlopen(libName, RTLD_LAZY | RTLD_GLOBAL);
|
||||
#endif
|
||||
}
|
||||
|
||||
void* NCCLHelper::getSymbolAddress(void* handle, char const* symbolName)
|
||||
{
|
||||
if (!handle)
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
return GetProcAddress(static_cast<HMODULE>(handle), symbolName);
|
||||
#else
|
||||
return dlsym(handle, symbolName);
|
||||
#endif
|
||||
}
|
||||
|
||||
NCCLHelper::ncclCommWindowRegisterFunc NCCLHelper::getNCCLCommWindowRegister()
|
||||
{
|
||||
return mNCCLCommWindowRegister;
|
||||
}
|
||||
|
||||
NCCLHelper::ncclMemAllocFunc NCCLHelper::getNCCLMemAlloc()
|
||||
{
|
||||
return mNCCLMemAlloc;
|
||||
}
|
||||
|
||||
bool NCCLHelper::isLoaded() const
|
||||
{
|
||||
return mIsLoaded;
|
||||
}
|
||||
|
||||
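Assumed call pattern for the helper above: gate on isLoaded() and only then invoke the dynamically resolved function pointers. A small sketch, with the wrapper function invented for illustration:

#include "tensorrt_llm/common/ncclUtils.h"

// Hypothetical wrapper: only touch the dynamically resolved symbols when both were found.
void* tryNcclWindowAlloc(std::size_t bytes)
{
    auto& helper = tensorrt_llm::common::nccl_util::NCCLHelper::getInstance();
    if (!helper.isLoaded()) // ncclMemAlloc or ncclCommWindowRegister missing in the loaded libnccl
    {
        return nullptr;
    }
    void* ptr = nullptr;
    if (helper.getNCCLMemAlloc()(&ptr, bytes) != ncclSuccess)
    {
        return nullptr;
    }
    return ptr;
}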
//==============================================================================
|
||||
// NCCLWindowAllocator Implementation
|
||||
//==============================================================================
|
||||
|
||||
NCCLWindowAllocator& NCCLWindowAllocator::getInstance()
|
||||
{
|
||||
static NCCLWindowAllocator instance;
|
||||
return instance;
|
||||
}
|
||||
|
||||
NCCLWindowBuffer NCCLWindowAllocator::requestBuffer(ncclComm_t comm, size_t size)
|
||||
{
|
||||
TLLM_CHECK_WITH_INFO(comm != nullptr, "NCCL communicator cannot be null");
|
||||
TLLM_CHECK_WITH_INFO(size > 0, "Buffer size must be greater than 0");
|
||||
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
|
||||
// Register cleanup callback for this communicator if not already registered
|
||||
// This is cheap even if no buffers exist yet - cleanup will just return early
|
||||
registerBufferCleanup(comm);
|
||||
|
||||
// Check if we have an available buffer of at least the requested size for this communicator
|
||||
// Use best-fit: find the smallest buffer that's >= requested size
|
||||
auto& commBuffers = mBufferPool[comm];
|
||||
auto bestFit = commBuffers.end();
|
||||
size_t bestFitSize = std::numeric_limits<size_t>::max();
|
||||
|
||||
for (auto it = commBuffers.begin(); it != commBuffers.end(); ++it)
|
||||
{
|
||||
if (!it->inUse && it->buffer.size >= size && it->buffer.size < bestFitSize)
|
||||
{
|
||||
bestFit = it;
|
||||
bestFitSize = it->buffer.size;
|
||||
}
|
||||
}
|
||||
|
||||
if (bestFit != commBuffers.end())
|
||||
{
|
||||
bestFit->inUse = true;
|
||||
TLLM_LOG_TRACE(
|
||||
"[NCCLUtil] Reusing NCCL window buffer for comm %p: handle=%d, ptr=%p, size=%zu (requested: %zu)",
|
||||
static_cast<void*>(comm), bestFit->buffer.handle, bestFit->buffer.ptr, bestFit->buffer.size, size);
|
||||
return bestFit->buffer;
|
||||
}
|
||||
|
||||
// No available buffer found, allocate a new one
|
||||
TLLM_LOG_TRACE(
|
||||
"[NCCLUtil] Allocating new NCCL window buffer for comm %p, size=%zu", static_cast<void*>(comm), size);
|
||||
int handle = static_cast<int>(commBuffers.size());
|
||||
NCCLWindowBuffer buffer = allocateAndRegisterBuffer(comm, size, handle);
|
||||
commBuffers.push_back({buffer, true});
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
NCCLWindowBuffer NCCLWindowAllocator::searchBuffer(ncclComm_t comm, void* ptr) const
|
||||
{
|
||||
if (!comm || !ptr)
|
||||
{
|
||||
return NCCLWindowBuffer();
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
return searchBufferLocked(comm, ptr);
|
||||
}
|
||||
|
||||
void NCCLWindowAllocator::releaseBuffer(ncclComm_t comm, void* ptr)
|
||||
{
|
||||
if (!comm || !ptr)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
auto commIt = mBufferPool.find(comm);
|
||||
if (commIt == mBufferPool.end())
|
||||
{
|
||||
TLLM_LOG_WARNING(
|
||||
"[NCCLUtil] Attempted to release buffer %p for unknown comm %p", ptr, static_cast<void*>(comm));
|
||||
return;
|
||||
}
|
||||
|
||||
for (auto& entry : commIt->second)
|
||||
{
|
||||
if (entry.buffer.ptr == ptr)
|
||||
{
|
||||
entry.inUse = false;
|
||||
TLLM_LOG_TRACE("[NCCLUtil] Released NCCL window buffer for comm %p: ptr=%p", static_cast<void*>(comm), ptr);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
TLLM_LOG_WARNING("[NCCLUtil] Attempted to release unknown buffer %p for comm %p", ptr, static_cast<void*>(comm));
|
||||
}
|
||||
|
||||
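A sketch of the intended request/release cycle, plus the RAII alternative declared in ncclUtils.h; the communicator and the 1 MiB size are placeholders.

#include "tensorrt_llm/common/ncclUtils.h"

using namespace tensorrt_llm::common::nccl_util;

// Illustrative only.
void exampleRequestRelease(ncclComm_t comm)
{
    // Manual pairing of requestBuffer()/releaseBuffer():
    NCCLWindowBuffer buf = NCCLWindowAllocator::getInstance().requestBuffer(comm, 1 << 20);
    // ... enqueue collectives that read/write buf.ptr ...
    NCCLWindowAllocator::getInstance().releaseBuffer(comm, buf.ptr);

    // RAII alternative declared in ncclUtils.h: released automatically at scope exit.
    ScopedNCCLWindowBuffer scoped(comm, 1 << 20);
    // ... use scoped.getPtr() / scoped.getWindow() ...
}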
ncclWindow_t NCCLWindowAllocator::getWindow(ncclComm_t comm, void* ptr) const
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
NCCLWindowBuffer buffer = searchBufferLocked(comm, ptr);
|
||||
return buffer.isValid() ? buffer.window : nullptr;
|
||||
}
|
||||
|
||||
size_t NCCLWindowAllocator::getSize(ncclComm_t comm, void* ptr) const
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
NCCLWindowBuffer buffer = searchBufferLocked(comm, ptr);
|
||||
return buffer.isValid() ? buffer.size : 0;
|
||||
}
|
||||
|
||||
NCCLWindowBuffer NCCLWindowAllocator::getBufferInfo(ncclComm_t comm, void* ptr) const
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
return searchBufferLocked(comm, ptr);
|
||||
}
|
||||
|
||||
size_t NCCLWindowAllocator::getBufferCount(ncclComm_t comm) const
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
auto commIt = mBufferPool.find(comm);
|
||||
return commIt != mBufferPool.end() ? commIt->second.size() : 0;
|
||||
}
|
||||
|
||||
size_t NCCLWindowAllocator::getBufferInUseCount(ncclComm_t comm) const
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
auto commIt = mBufferPool.find(comm);
|
||||
if (commIt == mBufferPool.end())
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
size_t count = 0;
|
||||
for (auto const& entry : commIt->second)
|
||||
{
|
||||
if (entry.inUse)
|
||||
{
|
||||
++count;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
bool NCCLWindowAllocator::isCommValid(ncclComm_t comm) const noexcept
|
||||
{
|
||||
// Simply check for null - all non-null comms are valid
|
||||
// We don't track cleaned-up comms because NCCL can reuse memory addresses,
|
||||
// making pointer-based tracking unreliable. New comms will be registered when used.
|
||||
return comm != nullptr;
|
||||
}
|
||||
|
||||
NCCLWindowBuffer NCCLWindowAllocator::allocateAndRegisterBuffer(ncclComm_t comm, size_t size, int handle)
|
||||
{
|
||||
NCCLWindowBuffer buffer;
|
||||
buffer.handle = handle;
|
||||
|
||||
// Get NCCL helper for dynamic symbol loading
|
||||
auto& ncclHelper = NCCLHelper::getInstance();
|
||||
if (!ncclHelper.isLoaded())
|
||||
{
|
||||
TLLM_THROW("NCCL library could not be loaded for dynamic symbol access");
|
||||
}
|
||||
|
||||
auto ncclMemAllocFunc = ncclHelper.getNCCLMemAlloc();
|
||||
auto ncclCommWindowRegisterFunc = ncclHelper.getNCCLCommWindowRegister();
|
||||
|
||||
// Defensive checks: both function pointers must be non-null
|
||||
if (ncclMemAllocFunc == nullptr)
|
||||
{
|
||||
TLLM_THROW("ncclMemAlloc function pointer is null, cannot allocate NCCL window buffer");
|
||||
}
|
||||
|
||||
if (ncclCommWindowRegisterFunc == nullptr)
|
||||
{
|
||||
TLLM_THROW("ncclCommWindowRegister function pointer is null, cannot register NCCL window buffer");
|
||||
}
|
||||
|
||||
// Allocate device memory using ncclMemAlloc
|
||||
ncclResult_t allocResult = ncclMemAllocFunc(&buffer.ptr, size);
|
||||
if (allocResult != ncclSuccess)
|
||||
{
|
||||
TLLM_THROW("ncclMemAlloc failed with error: %d", allocResult);
|
||||
}
|
||||
buffer.size = size;
|
||||
|
||||
// Register the buffer with NCCL as a window
|
||||
ncclResult_t regResult
|
||||
= ncclCommWindowRegisterFunc(comm, buffer.ptr, size, &buffer.window, NCCL_WIN_COLL_SYMMETRIC);
|
||||
if (regResult != ncclSuccess)
|
||||
{
|
||||
ncclMemFree(buffer.ptr);
|
||||
TLLM_THROW("ncclCommWindowRegister failed with error: %d", regResult);
|
||||
}
|
||||
|
||||
TLLM_LOG_TRACE("[NCCLUtil] Allocated and registered NCCL window buffer: handle=%d, ptr=%p, size=%zu, window=%p",
|
||||
handle, buffer.ptr, size, static_cast<void*>(buffer.window));
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
NCCLWindowBuffer NCCLWindowAllocator::searchBufferLocked(ncclComm_t comm, void* ptr) const
|
||||
{
|
||||
auto commIt = mBufferPool.find(comm);
|
||||
if (commIt == mBufferPool.end())
|
||||
{
|
||||
return NCCLWindowBuffer();
|
||||
}
|
||||
|
||||
for (auto const& entry : commIt->second)
|
||||
{
|
||||
if (entry.buffer.ptr == ptr)
|
||||
{
|
||||
return entry.buffer;
|
||||
}
|
||||
}
|
||||
|
||||
return NCCLWindowBuffer();
|
||||
}
|
||||
|
||||
void NCCLWindowAllocator::registerBufferCleanup(ncclComm_t comm)
|
||||
{
|
||||
// Don't register if already registered
|
||||
if (mRegisteredComms.find(comm) != mRegisteredComms.end())
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
mRegisteredComms.insert(comm);
|
||||
|
||||
// Register cleanup with the resource manager
|
||||
NcclCommResourceManager::getInstance().registerResource(
|
||||
comm, [this, comm]() { this->cleanupBuffersForComm(comm); }, "NCCLWindowAllocator");
|
||||
}
|
||||
|
||||
void NCCLWindowAllocator::cleanupBuffersForComm(ncclComm_t comm) noexcept
|
||||
{
|
||||
if (!comm)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// Synchronize CUDA to ensure all operations using these buffers are complete
|
||||
// before we deregister windows and free memory
|
||||
cudaError_t cudaErr = cudaDeviceSynchronize();
|
||||
if (cudaErr != cudaSuccess)
|
||||
{
|
||||
TLLM_LOG_WARNING("[NCCLUtil] cudaDeviceSynchronize failed with error: %d before cleanup for comm %p", cudaErr,
|
||||
static_cast<void*>(comm));
|
||||
// Continue anyway - the sync failure might be from a previous error
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
|
||||
// Check if we've already cleaned up this communicator
|
||||
if (mRegisteredComms.find(comm) == mRegisteredComms.end())
|
||||
{
|
||||
// Already cleaned up or never registered
|
||||
return;
|
||||
}
|
||||
|
||||
auto commIt = mBufferPool.find(comm);
|
||||
if (commIt == mBufferPool.end())
|
||||
{
|
||||
// No buffers to clean up, but mark as cleaned
|
||||
mRegisteredComms.erase(comm);
|
||||
return;
|
||||
}
|
||||
|
||||
TLLM_LOG_TRACE(
|
||||
"[NCCLUtil] Cleaning up %zu NCCL window buffers for comm %p", commIt->second.size(), static_cast<void*>(comm));
|
||||
|
||||
// Check for buffers still in use - this shouldn't happen if cleanup is called properly,
|
||||
// but we log a warning if it does
|
||||
size_t inUseCount = 0;
|
||||
for (auto const& entry : commIt->second)
|
||||
{
|
||||
if (entry.inUse)
|
||||
{
|
||||
++inUseCount;
|
||||
}
|
||||
}
|
||||
if (inUseCount > 0)
|
||||
{
|
||||
TLLM_LOG_WARNING(
|
||||
"[NCCLUtil] Cleaning up %zu buffers still marked as in-use for comm %p. "
|
||||
"This may indicate buffers weren't properly released before cleanup.",
|
||||
inUseCount, static_cast<void*>(comm));
|
||||
}
|
||||
|
||||
for (auto& entry : commIt->second)
|
||||
{
|
||||
if (entry.buffer.isValid())
|
||||
{
|
||||
// Deregister the window - the communicator is still valid at this point
|
||||
// (cleanup happens before ncclCommDestroy), but we need to be careful
|
||||
// if buffers are still in use by active operations
|
||||
if (entry.buffer.window && comm)
|
||||
{
|
||||
// Note: Even if buffer is marked inUse, we must deregister since
|
||||
// the communicator is being destroyed. The communicator is valid,
|
||||
// but we should handle potential errors gracefully.
|
||||
ncclResult_t result = ncclCommWindowDeregister(comm, entry.buffer.window);
|
||||
if (result != ncclSuccess)
|
||||
{
|
||||
TLLM_LOG_WARNING(
|
||||
"[NCCLUtil] ncclCommWindowDeregister failed with error: %d for comm %p, "
|
||||
"window %p (buffer inUse: %d)",
|
||||
result, static_cast<void*>(comm), static_cast<void*>(entry.buffer.window), entry.inUse);
|
||||
}
|
||||
}
|
||||
|
||||
// Free device memory using ncclMemFree
|
||||
// This should be safe even if deregister failed
|
||||
if (entry.buffer.ptr)
|
||||
{
|
||||
try
|
||||
{
|
||||
ncclResult_t ncclResult = ncclMemFree(entry.buffer.ptr);
|
||||
if (ncclResult != ncclSuccess)
|
||||
{
|
||||
TLLM_LOG_WARNING("[NCCLUtil] ncclMemFree failed with error: %d", ncclResult);
|
||||
}
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
TLLM_LOG_ERROR("[NCCLUtil] Exception during ncclMemFree for ptr %p", entry.buffer.ptr);
|
||||
}
|
||||
}
|
||||
|
||||
TLLM_LOG_TRACE(
|
||||
"[NCCLUtil] Freed NCCL window buffer: ptr=%p, size=%zu", entry.buffer.ptr, entry.buffer.size);
|
||||
}
|
||||
}
|
||||
|
||||
mBufferPool.erase(commIt);
|
||||
mRegisteredComms.erase(comm);
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common::nccl_util
|
||||
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
cpp/tensorrt_llm/common/ncclUtils.h (new file, 397 lines)
@@ -0,0 +1,397 @@
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
#include <nccl.h>
|
||||
#include <torch/extension.h>
|
||||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <numeric>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#else
|
||||
#include <dlfcn.h>
|
||||
#endif
|
||||
|
||||
namespace tensorrt_llm::common::nccl_util
|
||||
{
|
||||
|
||||
//==============================================================================
|
||||
// NCCL Helper - Dynamic Library Loading
|
||||
//==============================================================================
|
||||
|
||||
// Helper class for dynamically loading NCCL symbols (ncclMemAlloc, ncclCommWindowRegister)
|
||||
// This allows the code to work with NCCL libraries that may or may not have these symbols
|
||||
class NCCLHelper
|
||||
{
|
||||
public:
|
||||
static NCCLHelper& getInstance();
|
||||
|
||||
// Dynamic loading function type definition
|
||||
using ncclCommWindowRegisterFunc = ncclResult_t (*)(ncclComm_t, void*, size_t, ncclWindow_t*, int);
|
||||
using ncclMemAllocFunc = ncclResult_t (*)(void**, size_t);
|
||||
|
||||
// Get function pointer for ncclCommWindowRegister
|
||||
ncclCommWindowRegisterFunc getNCCLCommWindowRegister();
|
||||
|
||||
// Get function pointer for ncclMemAlloc
|
||||
ncclMemAllocFunc getNCCLMemAlloc();
|
||||
|
||||
// Check if NCCL library is successfully loaded
|
||||
bool isLoaded() const;
|
||||
|
||||
NCCLHelper(NCCLHelper const&) = delete;
|
||||
NCCLHelper& operator=(NCCLHelper const&) = delete;
|
||||
NCCLHelper(NCCLHelper&&) = delete;
|
||||
NCCLHelper& operator=(NCCLHelper&&) = delete;
|
||||
|
||||
private:
|
||||
NCCLHelper();
|
||||
~NCCLHelper();
|
||||
|
||||
void loadNCCLLibrary();
|
||||
void* loadLibraryHandle(char const* libName);
|
||||
void* getSymbolAddress(void* handle, char const* symbolName);
|
||||
|
||||
#ifdef _WIN32
|
||||
HMODULE mLibraryHandle;
|
||||
#else
|
||||
void* mLibraryHandle;
|
||||
#endif
|
||||
|
||||
ncclCommWindowRegisterFunc mNCCLCommWindowRegister;
|
||||
ncclMemAllocFunc mNCCLMemAlloc;
|
||||
bool mIsLoaded;
|
||||
};
|
||||
|
||||
//==============================================================================
|
||||
// NCCL Resource Management
|
||||
//==============================================================================
|
||||
|
||||
// Resource cleanup function type. Called before the NCCL communicator is destroyed.
|
||||
using ResourceCleanupFunc = std::function<void()>;
|
||||
|
||||
// Manages resources associated with NCCL communicators. Thread-safe singleton that maintains
|
||||
// a pool of resources per NCCL comm. Resources are automatically cleaned up when the
|
||||
// communicator is destroyed.
|
||||
class NcclCommResourceManager
|
||||
{
|
||||
public:
|
||||
static NcclCommResourceManager& getInstance() noexcept;
|
||||
|
||||
// Register a resource cleanup function for a specific NCCL communicator.
|
||||
// The cleanup function will be called before ncclCommDestroy.
|
||||
// Thread-safe: Uses global mutex to serialize all operations.
|
||||
void registerResource(ncclComm_t comm, ResourceCleanupFunc cleanup, char const* debugName = nullptr);
|
||||
|
||||
// Cleanup all resources associated with a communicator. Called automatically by
|
||||
// the shared_ptr deleter before ncclCommDestroy.
|
||||
// Thread-safe: Uses global mutex to serialize cleanup operations.
|
||||
// Order-preserving: Resources are cleaned up in registration order.
|
||||
void cleanupResources(ncclComm_t comm) noexcept;
|
||||
|
||||
// Check if a communicator has registered resources.
|
||||
bool hasResources(ncclComm_t comm) const noexcept;
|
||||
|
||||
// Get the number of resources registered for a communicator.
|
||||
size_t getResourceCount(ncclComm_t comm) const noexcept;
|
||||
|
||||
NcclCommResourceManager(NcclCommResourceManager const&) = delete;
|
||||
NcclCommResourceManager& operator=(NcclCommResourceManager const&) = delete;
|
||||
NcclCommResourceManager(NcclCommResourceManager&&) = delete;
|
||||
NcclCommResourceManager& operator=(NcclCommResourceManager&&) = delete;
|
||||
|
||||
private:
|
||||
NcclCommResourceManager() = default;
|
||||
~NcclCommResourceManager() = default;
|
||||
|
||||
using ResourceEntry = std::pair<ResourceCleanupFunc, std::string>;
|
||||
|
||||
mutable std::mutex mMutex;
|
||||
std::unordered_map<ncclComm_t, std::vector<ResourceEntry>> mCommResources;
|
||||
};
|
||||
|
||||
// RAII helper to register a resource with a NCCL communicator.
|
||||
// Automatically registers cleanup function on construction.
|
||||
template <typename ResourceType>
|
||||
class NcclCommResource
|
||||
{
|
||||
public:
|
||||
NcclCommResource(ncclComm_t comm, ResourceType&& resource, std::function<void(ResourceType&)> cleanup,
|
||||
char const* debugName = nullptr)
|
||||
: mComm(comm)
|
||||
, mResource(std::forward<ResourceType>(resource))
|
||||
, mCleanup(std::move(cleanup))
|
||||
, mRegistered(true)
|
||||
{
|
||||
// Register with the manager
|
||||
NcclCommResourceManager::getInstance().registerResource(
|
||||
comm,
|
||||
[this]()
|
||||
{
|
||||
if (mCleanup)
|
||||
{
|
||||
mCleanup(mResource);
|
||||
}
|
||||
},
|
||||
debugName);
|
||||
}
|
||||
|
||||
ResourceType& get()
|
||||
{
|
||||
return mResource;
|
||||
}
|
||||
|
||||
ResourceType const& get() const
|
||||
{
|
||||
return mResource;
|
||||
}
|
||||
|
||||
NcclCommResource(NcclCommResource const&) = delete;
|
||||
NcclCommResource& operator=(NcclCommResource const&) = delete;
|
||||
NcclCommResource(NcclCommResource&&) = delete;
|
||||
NcclCommResource& operator=(NcclCommResource&&) = delete;
|
||||
|
||||
private:
|
||||
ncclComm_t mComm;
|
||||
ResourceType mResource;
|
||||
std::function<void(ResourceType&)> mCleanup;
|
||||
bool mRegistered;
|
||||
};
|
||||
|
||||
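The NcclCommResource wrapper above is not exercised elsewhere in this diff; a hypothetical long-lived holder (the buffer type and cleanup body are invented) might look like the following. Note that the registered lambda captures the wrapper itself, so the wrapper must stay alive until the communicator's cleanup has run.

#include "tensorrt_llm/common/ncclUtils.h"
#include <cuda_runtime.h>

// Hypothetical holder that ties a device workspace to a communicator's lifetime.
struct CommWorkspace
{
    CommWorkspace(ncclComm_t comm, void* devPtr)
        : mGuard(comm, std::move(devPtr), [](void*& p) { cudaFree(p); p = nullptr; }, "workspace")
    {
    }

    tensorrt_llm::common::nccl_util::NcclCommResource<void*> mGuard;
};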
//==============================================================================
|
||||
// NCCL Window Buffer Allocation
|
||||
//==============================================================================
|
||||
|
||||
// Represents a buffer with an associated NCCL window
|
||||
struct NCCLWindowBuffer
|
||||
{
|
||||
void* ptr; // Device pointer (same as UBBuffer.addr)
|
||||
int handle; // Buffer handle/index (for compatibility with UB interface)
|
||||
size_t size; // Size in bytes
|
||||
ncclWindow_t window; // NCCL window handle
|
||||
|
||||
NCCLWindowBuffer(void* p = nullptr, int h = -1, size_t s = 0, ncclWindow_t w = nullptr)
|
||||
: ptr(p)
|
||||
, handle(h)
|
||||
, size(s)
|
||||
, window(w)
|
||||
{
|
||||
}
|
||||
|
||||
[[nodiscard]] bool isValid() const
|
||||
{
|
||||
return ptr != nullptr && handle >= 0 && size > 0 && window != nullptr;
|
||||
}
|
||||
|
||||
[[nodiscard]] bool invalid() const
|
||||
{
|
||||
return !isValid();
|
||||
}
|
||||
|
||||
// Alias for compatibility with UBBuffer interface
|
||||
void* addr() const
|
||||
{
|
||||
return ptr;
|
||||
}
|
||||
};
|
||||
|
||||
// Manages NCCL window-registered buffers with pooling and automatic cleanup.
|
||||
// Buffers are tied to the lifetime of their associated NCCL communicator.
|
||||
class NCCLWindowAllocator
|
||||
{
|
||||
public:
|
||||
static NCCLWindowAllocator& getInstance();
|
||||
|
||||
// Request a buffer for the given communicator and size.
|
||||
// If an unused buffer of at least the requested size exists for this communicator, it will be reused.
|
||||
// Uses best-fit strategy: selects the smallest available buffer that meets the size requirement.
|
||||
// Otherwise, a new buffer is allocated and registered.
|
||||
NCCLWindowBuffer requestBuffer(ncclComm_t comm, size_t size);
|
||||
|
||||
// Search for a buffer by pointer. Returns an invalid buffer if not found.
|
||||
// This matches the UBManager.search_buffer() interface.
|
||||
NCCLWindowBuffer searchBuffer(ncclComm_t comm, void* ptr) const;
|
||||
|
||||
// Release a buffer back to the pool for potential reuse
|
||||
void releaseBuffer(ncclComm_t comm, void* ptr);
|
||||
|
||||
// Get the window handle for a specific buffer pointer
|
||||
ncclWindow_t getWindow(ncclComm_t comm, void* ptr) const;
|
||||
|
||||
// Get the size of a specific buffer pointer
|
||||
size_t getSize(ncclComm_t comm, void* ptr) const;
|
||||
|
||||
// Get buffer info by pointer
|
||||
NCCLWindowBuffer getBufferInfo(ncclComm_t comm, void* ptr) const;
|
||||
|
||||
// Get the number of buffers allocated for a communicator
|
||||
size_t getBufferCount(ncclComm_t comm) const;
|
||||
|
||||
// Get the number of buffers in use for a communicator
|
||||
size_t getBufferInUseCount(ncclComm_t comm) const;
|
||||
|
||||
// Check if a communicator is valid (non-null)
|
||||
// Note: We don't track cleaned-up comms because NCCL can reuse memory addresses.
|
||||
// All non-null comms are considered valid and will be registered when first used.
|
||||
bool isCommValid(ncclComm_t comm) const noexcept;
|
||||
|
||||
NCCLWindowAllocator(NCCLWindowAllocator const&) = delete;
|
||||
NCCLWindowAllocator& operator=(NCCLWindowAllocator const&) = delete;
|
||||
NCCLWindowAllocator(NCCLWindowAllocator&&) = delete;
|
||||
NCCLWindowAllocator& operator=(NCCLWindowAllocator&&) = delete;
|
||||
|
||||
private:
|
||||
NCCLWindowAllocator() = default;
|
||||
~NCCLWindowAllocator() = default;
|
||||
|
||||
// Allocate a new buffer and register it with NCCL as a window
|
||||
NCCLWindowBuffer allocateAndRegisterBuffer(ncclComm_t comm, size_t size, int handle);
|
||||
|
||||
// Search for a buffer by pointer (assumes mMutex is already locked)
|
||||
NCCLWindowBuffer searchBufferLocked(ncclComm_t comm, void* ptr) const;
|
||||
|
||||
// Register cleanup function for all buffers associated with a communicator
|
||||
void registerBufferCleanup(ncclComm_t comm);
|
||||
|
||||
// Cleanup all buffers for a specific communicator
|
||||
void cleanupBuffersForComm(ncclComm_t comm) noexcept;
|
||||
|
||||
struct BufferEntry
|
||||
{
|
||||
NCCLWindowBuffer buffer;
|
||||
bool inUse;
|
||||
};
|
||||
|
||||
mutable std::mutex mMutex;
|
||||
std::unordered_map<ncclComm_t, std::vector<BufferEntry>> mBufferPool;
|
||||
std::unordered_set<ncclComm_t> mRegisteredComms;
|
||||
};
|
||||
|
||||
// RAII wrapper for NCCL window buffers
|
||||
class ScopedNCCLWindowBuffer
|
||||
{
|
||||
public:
|
||||
ScopedNCCLWindowBuffer(ncclComm_t comm, size_t size)
|
||||
: mComm(comm)
|
||||
, mBuffer(NCCLWindowAllocator::getInstance().requestBuffer(comm, size))
|
||||
{
|
||||
}
|
||||
|
||||
~ScopedNCCLWindowBuffer()
|
||||
{
|
||||
if (mBuffer.isValid())
|
||||
{
|
||||
NCCLWindowAllocator::getInstance().releaseBuffer(mComm, mBuffer.ptr);
|
||||
}
|
||||
}
|
||||
|
||||
void* getPtr() const
|
||||
{
|
||||
return mBuffer.ptr;
|
||||
}
|
||||
|
||||
size_t getSize() const
|
||||
{
|
||||
return mBuffer.size;
|
||||
}
|
||||
|
||||
ncclWindow_t getWindow() const
|
||||
{
|
||||
return mBuffer.window;
|
||||
}
|
||||
|
||||
NCCLWindowBuffer const& getBuffer() const
|
||||
{
|
||||
return mBuffer;
|
||||
}
|
||||
|
||||
ScopedNCCLWindowBuffer(ScopedNCCLWindowBuffer const&) = delete;
|
||||
ScopedNCCLWindowBuffer& operator=(ScopedNCCLWindowBuffer const&) = delete;
|
||||
ScopedNCCLWindowBuffer(ScopedNCCLWindowBuffer&&) = delete;
|
||||
ScopedNCCLWindowBuffer& operator=(ScopedNCCLWindowBuffer&&) = delete;
|
||||
|
||||
private:
|
||||
ncclComm_t mComm;
|
||||
NCCLWindowBuffer mBuffer;
|
||||
};
|
||||
|
||||
// Creates a PyTorch tensor backed by an NCCL window buffer.
|
||||
// The tensor will automatically release the buffer back to the pool when destroyed.
|
||||
// This is analogous to torch_ext::create_userbuffers_tensor() but for NCCLWindowAllocator.
|
||||
inline std::pair<torch::Tensor, NCCLWindowBuffer> createNCCLWindowTensor(
|
||||
ncclComm_t comm, at::IntArrayRef shape, torch::ScalarType dtype)
|
||||
{
|
||||
// Calculate buffer size
|
||||
int64_t buffer_size
|
||||
= std::accumulate(shape.begin(), shape.end(), 1LL, std::multiplies<int64_t>()) * torch::elementSize(dtype);
|
||||
|
||||
// Calculate strides
|
||||
std::vector<int64_t> strides_vec(shape.size());
|
||||
if (!shape.empty())
|
||||
{
|
||||
strides_vec[shape.size() - 1] = 1;
|
||||
for (int64_t i = static_cast<int64_t>(shape.size()) - 1; i >= 1; --i)
|
||||
{
|
||||
strides_vec[i - 1] = strides_vec[i] * shape[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Request buffer from allocator
|
||||
auto& allocator = NCCLWindowAllocator::getInstance();
|
||||
auto buffer = allocator.requestBuffer(comm, buffer_size);
|
||||
|
||||
// Defensive validation: ensure buffer is valid before proceeding
|
||||
if (!buffer.isValid())
|
||||
{
|
||||
std::ostringstream oss;
|
||||
oss << "Failed to allocate NCCL window buffer: invalid buffer returned from requestBuffer "
|
||||
<< "(comm=" << static_cast<void*>(comm) << ", buffer_size=" << buffer_size << ")";
|
||||
throw std::runtime_error(oss.str());
|
||||
}
|
||||
|
||||
// Create custom deleter that releases the buffer
|
||||
auto deleter = [comm, ptr = buffer.ptr](void*) { NCCLWindowAllocator::getInstance().releaseBuffer(comm, ptr); };
|
||||
|
||||
// Create tensor from the buffer
|
||||
auto tensor = torch::from_blob(buffer.ptr, shape, strides_vec, deleter, torch::dtype(dtype).device(torch::kCUDA));
|
||||
|
||||
return std::make_pair(tensor, buffer);
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common::nccl_util
|
||||
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
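A caller-side sketch of createNCCLWindowTensor() from the header above; the helper name, shape, and dtype are placeholders.

#include "tensorrt_llm/common/ncclUtils.h"

// Hypothetical helper: back a [tokens, hidden] CUDA tensor with an NCCL window; the
// tensor's deleter returns the buffer to the pool when the tensor is destroyed.
std::pair<torch::Tensor, tensorrt_llm::common::nccl_util::NCCLWindowBuffer> makeReduceScratch(
    ncclComm_t comm, int64_t tokens, int64_t hidden)
{
    using tensorrt_llm::common::nccl_util::createNCCLWindowTensor;
    return createNCCLWindowTensor(comm, {tokens, hidden}, torch::kBFloat16);
}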
@@ -16,6 +16,7 @@
 */

#include "tensorrt_llm/common/opUtils.h"
#include "tensorrt_llm/common/ncclUtils.h"
#include "tensorrt_llm/runtime/utils/mpiTags.h"
#include "tensorrt_llm/runtime/utils/mpiUtils.h"

@@ -112,7 +113,29 @@ std::shared_ptr<ncclComm_t> getComm(std::set<int> const& group)
    std::shared_ptr<ncclComm_t> ncclComm(new ncclComm_t,
        [](ncclComm_t* comm)
        {
            ncclCommDestroy(*comm);
            if (!comm)
            {
                return;
            }

            // STEP 1: Clean up resources and destroy NCCL communicator if it's valid
            if (*comm)
            {
                // Clean up all registered resources FIRST
                tensorrt_llm::common::nccl_util::NcclCommResourceManager::getInstance().cleanupResources(*comm);

                // Now destroy the NCCL communicator
                ncclResult_t result = ncclCommDestroy(*comm);
                if (result != ncclSuccess)
                {
                    TLLM_LOG_WARNING("ncclCommDestroy failed with error: %d", result);
                }

                // Clear the communicator value before freeing the pointer
                *comm = nullptr;
            }

            // STEP 2: Always free the pointer memory (regardless of whether *comm was valid)
            delete comm;
        });
#if defined(_WIN32)
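To make the ordering explicit, here is a condensed sketch of the lifetime pattern encoded in the deleter above. Only the cleanupResources()/ncclCommDestroy() pairing is taken from this PR; the wrapper function itself is illustrative.

#include "tensorrt_llm/common/ncclUtils.h"
#include <memory>

// Illustrative wrapper: registered resources are torn down inside the shared_ptr
// deleter, strictly before ncclCommDestroy; the heap-allocated handle is always freed.
std::shared_ptr<ncclComm_t> makeManagedComm(ncclComm_t raw)
{
    return std::shared_ptr<ncclComm_t>(new ncclComm_t(raw),
        [](ncclComm_t* comm)
        {
            if (comm && *comm)
            {
                tensorrt_llm::common::nccl_util::NcclCommResourceManager::getInstance().cleanupResources(*comm);
                ncclCommDestroy(*comm); // only after the registered cleanups have run
            }
            delete comm;
        });
}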
@@ -22,16 +22,8 @@ namespace tensorrt_llm::runtime::ub
{
UserBufferAllocator& UserBufferAllocator::Instance()
{
    if (use_nccl_symmetric)
    {
        static NCCLUserBufferAllocator _;
        return _;
    }
    else
    {
        static UserBufferAllocator _;
        return _;
    }
    static UserBufferAllocator _;
    return _;
}

void UserBufferAllocator::initialize(tensorrt_llm::runtime::WorldConfig const& worldConfig)
@ -83,167 +75,4 @@ communicator* UserBufferAllocator::comm()
|
||||
return mUbComm;
|
||||
}
|
||||
|
||||
void NCCLUserBufferAllocator::initialize(tensorrt_llm::runtime::WorldConfig const& worldConfig)
|
||||
{
|
||||
if (!isInitialized())
|
||||
{
|
||||
TLLM_LOG_INFO("Initializing NCCLUserBufferAllocator");
|
||||
std::set<int> group;
|
||||
for (int i = 0; i < worldConfig.getSize(); i++)
|
||||
{
|
||||
group.insert(i);
|
||||
}
|
||||
mComm = getComm(group);
|
||||
mIsInitialized = true;
|
||||
}
|
||||
}
|
||||
|
||||
UBBuffer NCCLUserBufferAllocator::registerUBBuffer(size_t bytes)
|
||||
{
|
||||
TLLM_CHECK(isInitialized());
|
||||
UBBuffer ub_buffer;
|
||||
|
||||
auto& ncclHelper = getNCCLHelper();
|
||||
if (!ncclHelper.isLoaded())
|
||||
{
|
||||
TLLM_THROW("NCCL library could not be loaded for dynamic symbol access");
|
||||
}
|
||||
|
||||
auto ncclMemAllocFunc = ncclHelper.getNCCLMemAlloc();
|
||||
auto ncclCommWindowRegisterFunc = ncclHelper.getNCCLCommWindowRegister();
|
||||
|
||||
NCCLCHECK(ncclMemAllocFunc(&ub_buffer.addr, bytes));
|
||||
NCCLCHECK(ncclCommWindowRegisterFunc((*mComm), ub_buffer.addr, bytes, &ub_buffer.window, NCCL_WIN_COLL_SYMMETRIC));
|
||||
ub_buffer.handle = 5;
|
||||
ub_buffer.size = bytes;
|
||||
return ub_buffer;
|
||||
}
|
||||
|
||||
// Static member definitions
|
||||
std::unique_ptr<NCCLHelper> NCCLUserBufferAllocator::mNCCLHelper = nullptr;
|
||||
|
||||
NCCLHelper& NCCLUserBufferAllocator::getNCCLHelper()
|
||||
{
|
||||
if (!mNCCLHelper)
|
||||
{
|
||||
mNCCLHelper = std::make_unique<NCCLHelper>();
|
||||
}
|
||||
return *mNCCLHelper;
|
||||
}
|
||||
|
||||
// NCCLHelper implementation
|
||||
NCCLHelper::NCCLHelper()
|
||||
: mLibraryHandle(nullptr)
|
||||
, mNCCLCommWindowRegister(nullptr)
|
||||
, mNCCLMemAlloc(nullptr)
|
||||
, mIsLoaded(false)
|
||||
{
|
||||
loadNCCLLibrary();
|
||||
}
|
||||
|
||||
NCCLHelper::~NCCLHelper()
|
||||
{
|
||||
if (mLibraryHandle)
|
||||
{
|
||||
#ifdef _WIN32
|
||||
FreeLibrary(mLibraryHandle);
|
||||
#else
|
||||
dlclose(mLibraryHandle);
|
||||
#endif
|
||||
mLibraryHandle = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
void NCCLHelper::loadNCCLLibrary()
|
||||
{
|
||||
try
|
||||
{
|
||||
#ifdef _WIN32
|
||||
char const* libraryNames[] = {"nccl.dll"};
|
||||
#else
|
||||
char const* libraryNames[] = {"libnccl.so"};
|
||||
#endif
|
||||
|
||||
for (int i = 0; libraryNames[i] != nullptr; ++i)
|
||||
{
|
||||
mLibraryHandle = loadLibraryHandle(libraryNames[i]);
|
||||
if (mLibraryHandle)
|
||||
{
|
||||
TLLM_LOG_INFO("Successfully loaded NCCL library: %s", libraryNames[i]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!mLibraryHandle)
|
||||
{
|
||||
TLLM_LOG_WARNING("Failed to load NCCL library");
|
||||
return;
|
||||
}
|
||||
|
||||
// Load the required symbols
|
||||
mNCCLCommWindowRegister
|
||||
= reinterpret_cast<ncclCommWindowRegisterFunc>(getSymbolAddress(mLibraryHandle, "ncclCommWindowRegister"));
|
||||
|
||||
mNCCLMemAlloc = reinterpret_cast<ncclMemAllocFunc>(getSymbolAddress(mLibraryHandle, "ncclMemAlloc"));
|
||||
|
||||
if (mNCCLCommWindowRegister == nullptr)
|
||||
{
|
||||
TLLM_LOG_WARNING("Failed to load ncclCommWindowRegister symbol, NCCL symmetric will not be supported.");
|
||||
}
|
||||
|
||||
if (mNCCLMemAlloc)
|
||||
{
|
||||
mIsLoaded = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
TLLM_LOG_WARNING("Failed to load required NCCL symbols");
|
||||
}
|
||||
}
|
||||
catch (std::exception const& e)
|
||||
{
|
||||
TLLM_LOG_WARNING("Exception while loading NCCL library: %s", e.what());
|
||||
}
|
||||
}
|
||||
|
||||
void* NCCLHelper::loadLibraryHandle(char const* libName)
|
||||
{
|
||||
#ifdef _WIN32
|
||||
return LoadLibraryA(libName);
|
||||
#else
|
||||
return dlopen(libName, RTLD_LAZY | RTLD_GLOBAL);
|
||||
#endif
|
||||
}
|
||||
|
||||
void* NCCLHelper::getSymbolAddress(void* handle, char const* symbolName)
|
||||
{
|
||||
if (!handle)
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
return GetProcAddress(static_cast<HMODULE>(handle), symbolName);
|
||||
#else
|
||||
return dlsym(handle, symbolName);
|
||||
#endif
|
||||
}
|
||||
|
||||
NCCLHelper::ncclCommWindowRegisterFunc NCCLHelper::getNCCLCommWindowRegister()
|
||||
{
|
||||
return mNCCLCommWindowRegister;
|
||||
}
|
||||
|
||||
NCCLHelper::ncclMemAllocFunc NCCLHelper::getNCCLMemAlloc()
|
||||
{
|
||||
return mNCCLMemAlloc;
|
||||
}
|
||||
|
||||
bool NCCLHelper::isLoaded() const
|
||||
{
|
||||
return mIsLoaded;
|
||||
}
|
||||
|
||||
bool UserBufferAllocator::use_nccl_symmetric = false;
|
||||
|
||||
}; // namespace tensorrt_llm::runtime::ub
|
||||
|
||||
@ -19,11 +19,6 @@
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
#include "nccl.h"
|
||||
#include "userbuffers.h"
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#else
|
||||
#include <dlfcn.h>
|
||||
#endif
|
||||
#else
|
||||
using ncclWindow_t = void*;
|
||||
#endif
|
||||
@ -69,8 +64,6 @@ public:
|
||||
communicator* comm();
|
||||
virtual UBBuffer registerUBBuffer(size_t bytes);
|
||||
|
||||
static bool use_nccl_symmetric;
|
||||
|
||||
private:
|
||||
communicator* mUbComm;
|
||||
|
||||
@ -80,55 +73,6 @@ protected:
|
||||
tensorrt_llm::runtime::WorldConfig mWorldConfig;
|
||||
};
|
||||
|
||||
class NCCLHelper
|
||||
{
|
||||
public:
|
||||
NCCLHelper();
|
||||
~NCCLHelper();
|
||||
|
||||
// Dynamic loading function type definition
|
||||
using ncclCommWindowRegisterFunc = ncclResult_t (*)(ncclComm_t, void*, size_t, ncclWindow_t*, int);
|
||||
using ncclMemAllocFunc = ncclResult_t (*)(void**, size_t);
|
||||
|
||||
// Get function pointer for ncclCommWindowRegister
|
||||
ncclCommWindowRegisterFunc getNCCLCommWindowRegister();
|
||||
|
||||
// Get function pointer for ncclMemAlloc
|
||||
ncclMemAllocFunc getNCCLMemAlloc();
|
||||
|
||||
// Check if NCCL library is successfully loaded
|
||||
bool isLoaded() const;
|
||||
|
||||
private:
|
||||
void loadNCCLLibrary();
|
||||
void* loadLibraryHandle(char const* libName);
|
||||
void* getSymbolAddress(void* handle, char const* symbolName);
|
||||
|
||||
#ifdef _WIN32
|
||||
HMODULE mLibraryHandle;
|
||||
#else
|
||||
void* mLibraryHandle;
|
||||
#endif
|
||||
|
||||
ncclCommWindowRegisterFunc mNCCLCommWindowRegister;
|
||||
ncclMemAllocFunc mNCCLMemAlloc;
|
||||
bool mIsLoaded;
|
||||
};
|
||||
|
||||
class NCCLUserBufferAllocator : public UserBufferAllocator
|
||||
{
|
||||
public:
|
||||
void initialize(tensorrt_llm::runtime::WorldConfig const& world_config) override;
|
||||
UBBuffer registerUBBuffer(size_t bytes) override;
|
||||
|
||||
// Get shared NCCLHelper instance
|
||||
static NCCLHelper& getNCCLHelper();
|
||||
|
||||
private:
|
||||
std::shared_ptr<ncclComm_t> mComm;
|
||||
static std::unique_ptr<NCCLHelper> mNCCLHelper;
|
||||
};
|
||||
|
||||
#else
|
||||
using communicator = void;
|
||||
#endif
|
||||
|
||||
@@ -14,6 +14,7 @@
 * limitations under the License.
 */
#include "userbuffersManager.h"
#include "tensorrt_llm/common/logger.h"

namespace tensorrt_llm::runtime::ub
{
@@ -29,14 +30,11 @@ UserBuffersManager& UserBuffersManager::get_instance()
    return allocator;
}

void UserBuffersManager::initialize(int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank,
    int64_t gpus_per_node, int64_t buffer_size, bool use_nccl_symmetric)
void UserBuffersManager::initialize(
    int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, int64_t gpus_per_node, int64_t buffer_size)
{
    std::lock_guard<std::mutex> lock(mutex_);
    tensorrt_llm::runtime::WorldConfig world_config(tp_size, pp_size, cp_size, rank, gpus_per_node);
#if ENABLE_MULTI_DEVICE
    UserBufferAllocator::Instance().use_nccl_symmetric = use_nccl_symmetric;
#endif
    tensorrt_llm::runtime::ub::ub_initialize(world_config);
    TLLM_CHECK(tensorrt_llm::runtime::ub::ub_is_initialized());
    buffer_size_ = buffer_size;
@@ -98,11 +96,10 @@ tensorrt_llm::runtime::ub::communicator* UserBuffersManager::comm()
    return tensorrt_llm::runtime::ub::ub_comm();
}

void initialize_userbuffers_manager(int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank,
    int64_t gpus_per_node, int64_t buffer_size, bool use_nccl_symmetric)
void initialize_userbuffers_manager(
    int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, int64_t gpus_per_node, int64_t buffer_size)
{
    UserBuffersManager::get_instance().initialize(
        tp_size, pp_size, cp_size, rank, gpus_per_node, buffer_size, use_nccl_symmetric);
    UserBuffersManager::get_instance().initialize(tp_size, pp_size, cp_size, rank, gpus_per_node, buffer_size);
}

} // namespace tensorrt_llm::runtime::ub
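Call sites drop the trailing flag accordingly. A representative invocation after this change, with placeholder values; the NCCL-symmetric choice now appears to flow through the NCCL_SYMMETRIC allreduce strategy elsewhere in this PR rather than through this call.

#include "userbuffersManager.h"

// Placeholder values for a single 8-GPU node; buffer_size is 64 MiB here.
void initUbForSingleNode()
{
    tensorrt_llm::runtime::ub::initialize_userbuffers_manager(
        /*tp_size=*/8, /*pp_size=*/1, /*cp_size=*/1, /*rank=*/0,
        /*gpus_per_node=*/8, /*buffer_size=*/int64_t{64} << 20);
}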
@ -46,9 +46,8 @@ public:
|
||||
//! @param gpus_per_node The number of GPUs per node.
|
||||
//! @param buffer_size The size of the buffer to allocate. All buffers allocated by this manager will have this
|
||||
//! size.
|
||||
//! @param use_nccl_symmetric Whether to use NCCL symmetric communication.
|
||||
void initialize(int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, int64_t gpus_per_node,
|
||||
int64_t buffer_size, bool use_nccl_symmetric);
|
||||
void initialize(
|
||||
int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, int64_t gpus_per_node, int64_t buffer_size);
|
||||
|
||||
//! @brief Create a UB tensor from the given shape, strides and data type. The function will choose available UB
|
||||
//! buffer or create a new one if no available buffer is found.
|
||||
@ -76,7 +75,7 @@ private:
|
||||
int64_t buffer_size_;
|
||||
};
|
||||
|
||||
void initialize_userbuffers_manager(int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank,
|
||||
int64_t gpus_per_node, int64_t buffer_size, bool use_nccl_symmetric);
|
||||
void initialize_userbuffers_manager(
|
||||
int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, int64_t gpus_per_node, int64_t buffer_size);
|
||||
|
||||
} // namespace tensorrt_llm::runtime::ub
|
||||
|
||||
@ -137,13 +137,12 @@ bool AllreducePlugin::supportsFormatCombination(
|
||||
int pos, nvinfer1::PluginTensorDesc const* inOut, int nbInputs, int nbOutputs) noexcept
|
||||
{
|
||||
int base_inputs = 0;
|
||||
if (mStrategy == AllReduceStrategyType::NCCL || mStrategy == AllReduceStrategyType::UB)
|
||||
switch (mStrategy)
|
||||
{
|
||||
base_inputs = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
base_inputs = 2;
|
||||
case AllReduceStrategyType::NCCL:
|
||||
case AllReduceStrategyType::UB:
|
||||
case AllReduceStrategyType::NCCL_SYMMETRIC: base_inputs = 1; break;
|
||||
default: base_inputs = 2; break;
|
||||
}
|
||||
int fusion_op_extra_inputs = 0;
|
||||
int scale_idx = 0;
|
||||
@ -169,9 +168,15 @@ bool AllreducePlugin::supportsFormatCombination(
|
||||
|
||||
TLLM_CHECK(nbInputs == (base_inputs + fusion_op_extra_inputs));
|
||||
|
||||
if (mStrategy != AllReduceStrategyType::NCCL && mStrategy != AllReduceStrategyType::UB && pos == 1)
|
||||
if (pos == 1)
|
||||
{
|
||||
return (inOut[pos].type == nvinfer1::DataType::kINT64) && (inOut[pos].format == TensorFormat::kLINEAR);
|
||||
switch (mStrategy)
|
||||
{
|
||||
case AllReduceStrategyType::NCCL:
|
||||
case AllReduceStrategyType::UB:
|
||||
case AllReduceStrategyType::NCCL_SYMMETRIC: break;
|
||||
default: return (inOut[pos].type == nvinfer1::DataType::kINT64) && (inOut[pos].format == TensorFormat::kLINEAR);
|
||||
}
|
||||
}
|
||||
if (mStrategy == AllReduceStrategyType::UB)
|
||||
{
|
||||
@ -222,25 +227,26 @@ AllReduceStrategyType AllreducePlugin::selectImplementation(
|
||||
{
|
||||
if (!isAuto)
|
||||
{
|
||||
TLLM_LOG_INFO("Since Peer to Peer not supported, fallback to AllReduceStrategy: NCCL");
|
||||
TLLM_LOG_INFO("Since Peer to Peer not supported, fallback to AllReduceStrategy: NCCL_SYMMETRIC");
|
||||
}
|
||||
else if (forceDeterministic)
|
||||
{
|
||||
TLLM_LOG_WARNING(
|
||||
"Since Peer to Peer not supported, fallback to AllReduceStrategy: NCCL. NCCL might produce "
|
||||
"Since Peer to Peer not supported, fallback to AllReduceStrategy: NCCL_SYMMETRIC. NCCL_SYMMETRIC might "
|
||||
"produce "
|
||||
"non-deterministic results.");
|
||||
}
|
||||
return AllReduceStrategyType::NCCL;
|
||||
return AllReduceStrategyType::NCCL_SYMMETRIC;
|
||||
}
|
||||
|
||||
if (isAuto && !mIsNVLINKSupported && !forceDeterministic)
|
||||
{
|
||||
return AllReduceStrategyType::NCCL;
|
||||
return AllReduceStrategyType::NCCL_SYMMETRIC;
|
||||
}
|
||||
|
||||
auto const maxWorkspaceSize = utils::customAllReduceUtils::getMaxRequiredWorkspaceSize(worldSize);
|
||||
|
||||
AllReduceStrategyType strat = AllReduceStrategyType::NCCL;
|
||||
AllReduceStrategyType strat = AllReduceStrategyType::NCCL_SYMMETRIC;
|
||||
auto const messageSizeBytes = messageSize * common::getDTypeSize(type);
|
||||
|
||||
if (messageSizeBytes <= maxWorkspaceSize)
|
||||
@ -268,7 +274,7 @@ AllReduceStrategyType AllreducePlugin::selectImplementation(
|
||||
}
|
||||
else
|
||||
{
|
||||
strat = AllReduceStrategyType::NCCL;
|
||||
strat = AllReduceStrategyType::NCCL_SYMMETRIC;
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -279,7 +285,7 @@ AllReduceStrategyType AllreducePlugin::selectImplementation(
|
||||
}
|
||||
else
|
||||
{
|
||||
strat = AllReduceStrategyType::NCCL;
|
||||
strat = AllReduceStrategyType::NCCL_SYMMETRIC;
|
||||
}
|
||||
}
|
||||
|
||||
@ -287,30 +293,31 @@ AllReduceStrategyType AllreducePlugin::selectImplementation(
|
||||
{
|
||||
if (!isAuto)
|
||||
{
|
||||
TLLM_LOG_WARNING("Since not aligned, fallback to AllReduceStrategy: NCCL");
|
||||
TLLM_LOG_WARNING("Since not aligned, fallback to AllReduceStrategy: NCCL_SYMMETRIC");
|
||||
}
|
||||
else if (forceDeterministic)
|
||||
{
|
||||
TLLM_LOG_WARNING(
|
||||
"Since not aligned, fallback to AllReduceStrategy: NCCL. NCCL might produce "
|
||||
"Since not aligned, fallback to AllReduceStrategy: NCCL_SYMMETRIC. NCCL_SYMMETRIC might produce "
|
||||
"non-deterministic results.");
|
||||
}
|
||||
strat = AllReduceStrategyType::NCCL;
|
||||
strat = AllReduceStrategyType::NCCL_SYMMETRIC;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!isAuto)
|
||||
{
|
||||
TLLM_LOG_WARNING("Since messageSize > maxWorkspace, fallback to AllReduceStrategy: NCCL");
|
||||
TLLM_LOG_WARNING("Since messageSize > maxWorkspace, fallback to AllReduceStrategy: NCCL_SYMMETRIC");
|
||||
}
|
||||
else if (forceDeterministic)
|
||||
{
|
||||
TLLM_LOG_WARNING(
|
||||
"Since messageSize > maxWorkspace, fallback to AllReduceStrategy: NCCL. NCCL might produce "
|
||||
"Since messageSize > maxWorkspace, fallback to AllReduceStrategy: NCCL_SYMMETRIC. NCCL_SYMMETRIC might "
|
||||
"produce "
|
||||
"non-deterministic results.");
|
||||
}
|
||||
strat = AllReduceStrategyType::NCCL;
|
||||
strat = AllReduceStrategyType::NCCL_SYMMETRIC;
|
||||
}
|
||||
|
||||
return strat;
|
||||
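Condensing the branches above: wherever the custom all-reduce kernels cannot be used, the plugin now falls back to NCCL_SYMMETRIC where it previously chose NCCL. A simplified illustration that deliberately ignores the AUTO, forceDeterministic, and fusion-op handling shown above:

#include <cstddef>

// Simplified decision sketch; not the full selectImplementation().
enum class SketchStrategy
{
    NCCL,
    NCCL_SYMMETRIC,
    ONESHOT
};

SketchStrategy sketchSelect(
    bool p2pSupported, bool nvlinkSupported, bool aligned, std::size_t messageBytes, std::size_t maxWorkspaceBytes)
{
    if (!p2pSupported || !nvlinkSupported)
    {
        return SketchStrategy::NCCL_SYMMETRIC; // previously NCCL
    }
    if (messageBytes > maxWorkspaceBytes || !aligned)
    {
        return SketchStrategy::NCCL_SYMMETRIC; // previously NCCL
    }
    return SketchStrategy::ONESHOT; // custom kernel path (ONESHOT/TWOSHOT chosen by size)
}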
@ -337,6 +344,10 @@ int AllreducePlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfe
|
||||
{
|
||||
runtimeStrategy = AllReduceStrategyType::NCCL;
|
||||
}
|
||||
else if (mStrategy == AllReduceStrategyType::NCCL_SYMMETRIC)
|
||||
{
|
||||
runtimeStrategy = AllReduceStrategyType::NCCL_SYMMETRIC;
|
||||
}
|
||||
else if (mStrategy == AllReduceStrategyType::UB)
|
||||
{
|
||||
runtimeStrategy = AllReduceStrategyType::UB;
|
||||
@ -355,6 +366,11 @@ int AllreducePlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfe
|
||||
TLLM_LOG_DEBUG("AllReducePlugin strategy for rank %d: NCCL", rank);
|
||||
break;
|
||||
}
|
||||
case AllReduceStrategyType::NCCL_SYMMETRIC:
|
||||
{
|
||||
TLLM_LOG_DEBUG("AllReducePlugin strategy for rank %d: NCCL_SYMMETRIC", rank);
|
||||
break;
|
||||
}
|
||||
case AllReduceStrategyType::ONESHOT:
|
||||
{
|
||||
TLLM_LOG_DEBUG("AllReducePlugin strategy for rank %d: ONESHOT", rank);
|
||||
@ -373,14 +389,14 @@ int AllreducePlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfe
|
||||
default: break;
|
||||
}
|
||||
|
||||
if (runtimeStrategy == AllReduceStrategyType::NCCL)
|
||||
if (runtimeStrategy == AllReduceStrategyType::NCCL || runtimeStrategy == AllReduceStrategyType::NCCL_SYMMETRIC)
|
||||
{
|
||||
if (mOp == AllReduceFusionOp::RESIDUAL_RMS_NORM || mOp == AllReduceFusionOp::RESIDUAL_RMS_PREPOST_NORM)
|
||||
{
|
||||
NCCLCHECK(ncclAllReduce(inputs[0], outputs[1], size, (*getDtypeMap())[mType], ncclSum, *mNcclComm, stream));
|
||||
tensorrt_llm::kernels::AllReduceParams params;
|
||||
int fusion_ptr_idx = 0;
|
||||
if (mStrategy == AllReduceStrategyType::NCCL)
|
||||
if (mStrategy == AllReduceStrategyType::NCCL || mStrategy == AllReduceStrategyType::NCCL_SYMMETRIC)
|
||||
{
|
||||
fusion_ptr_idx = 1;
|
||||
}
|
||||
|
||||
@ -15,10 +15,12 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/cudaDriverWrapper.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/common/customAllReduceUtils.h"
|
||||
#include "tensorrt_llm/common/dataType.h"
|
||||
#include "tensorrt_llm/common/mcastDevMemUtils.h"
|
||||
#include "tensorrt_llm/common/ncclUtils.h"
|
||||
#include "tensorrt_llm/common/opUtils.h"
|
||||
#include "tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h"
|
||||
#include "tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.h"
|
||||
@ -39,6 +41,7 @@
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
#include <ATen/cuda/EmptyTensor.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <cuda.h>
|
||||
#include <nccl.h>
|
||||
#include <torch/csrc/distributed/c10d/FileStore.hpp>
|
||||
#include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
|
||||
@ -51,6 +54,7 @@
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <unordered_set>
|
||||
|
||||
// using namespace nvinfer1;
|
||||
@ -238,6 +242,9 @@ public:
|
||||
AllreduceOp(
|
||||
std::set<int> group, nvinfer1::DataType type, AllReduceStrategyType strategy, AllReduceFusionOp op, float eps)
|
||||
: mGroup(std::move(group))
|
||||
, mIsNVLINKSupported(false)
|
||||
, mIsP2PSupported(false)
|
||||
, mIsMNNVLSupported(false)
|
||||
, mType(type)
|
||||
, mStrategy(strategy)
|
||||
, mOp(op)
|
||||
@ -248,6 +255,9 @@ public:
|
||||
AllreduceOp(std::set<int> group, c10::intrusive_ptr<c10d::ProcessGroup> const& process_group_,
|
||||
nvinfer1::DataType type, AllReduceStrategyType strategy, AllReduceFusionOp op, float eps)
|
||||
: mGroup(std::move(group))
|
||||
, mIsNVLINKSupported(false)
|
||||
, mIsP2PSupported(false)
|
||||
, mIsMNNVLSupported(false)
|
||||
, mType(type)
|
||||
, mStrategy(strategy)
|
||||
, mOp(op)
|
||||
@ -437,44 +447,117 @@ private:
|
||||
torch::optional<torch::Tensor> const& residual, torch::optional<torch::Tensor> const& norm_weight,
|
||||
torch::optional<torch::Tensor> const& scale, torch::optional<torch::Tensor> const& bias)
|
||||
{
|
||||
// Handle ProcessGroup path first - cannot extract NCCL comm for window registration
|
||||
// Use ProcessGroup's allreduce directly and return early
|
||||
if (mNcclComm.index() == 1)
|
||||
{
|
||||
auto torchPg = std::get<1>(mNcclComm);
|
||||
|
||||
torch::Tensor reduceOutput = input.clone();
|
||||
std::vector tensors{reduceOutput};
|
||||
PGCHECK_THROW(torchPg->allreduce(tensors, {c10d::ReduceOp::SUM}));
|
||||
|
||||
if (mOp == AllReduceFusionOp::NONE)
|
||||
{
|
||||
return {reduceOutput};
|
||||
}
|
||||
|
||||
// Treat any other patterns as fallback cases.
|
||||
return fallbackRunSubsequentOps(input, residual, norm_weight, scale, bias, reduceOutput);
|
||||
}
|
||||
|
||||
// From here on, we have a raw NCCL comm - can proceed with window registration
|
||||
auto rawComm = std::get<0>(mNcclComm);
|
||||
ncclComm_t comm = *rawComm;
|
||||
TLLM_CHECK_WITH_INFO(comm != nullptr, "NCCL communicator is null");
|
||||
TLLM_LOG_DEBUG("[runNCCLAllReduceSymmetric] Using raw NCCL comm path (not ProcessGroup)");
|
||||
|
||||
using tensorrt_llm::common::nccl_util::NCCLWindowAllocator;
|
||||
using tensorrt_llm::common::nccl_util::createNCCLWindowTensor;
|
||||
|
||||
auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
|
||||
int size = input.numel();
|
||||
auto& ub_manager = tensorrt_llm::runtime::ub::UserBuffersManager::get_instance();
|
||||
auto ub_tensor0 = input;
|
||||
auto ub_buffer0 = ub_manager.search_buffer(input.data_ptr());
|
||||
if (ub_buffer0.invalid())
|
||||
size_t bufferSizeBytes = size * input.element_size();
|
||||
|
||||
// Using unregistered input buffers with NCCL symmetric requires a memcpy.
// This is an overhead introduced by using NCCL_SYMMETRIC instead of NCCL.
// Both the memcpy cost and the perf benefit of NCCL_SYMMETRIC scale linearly with the message size.
// But a local memcpy is cheaper than the remote operations, so the benefit grows stronger with larger
// message sizes. Additionally, the perf benefit scales with the number of ranks, since multimem enables
// O(const.) versus O(N) complexity. Hence we model this cutoff with a linear model. The numbers below were
// obtained on GB200 by scanning different message sizes and rank counts: each rank count yields a single
// message size at which the regression sets in, and the formula below was obtained by fitting a linear
// model to those regression onsets. This empirical heuristic can be overridden with the
// TLLM_NCCL_MIN_REGISTRATION environment variable.
|
||||
double const a = -4986.43478503;
|
||||
double const b = 156716.52177552;
|
||||
int nRanks;
|
||||
NCCLCHECK_THROW(ncclCommCount(comm, &nRanks));
|
||||
size_t minRegistrationThreshold = static_cast<size_t>(std::max(0.0, a * nRanks + b)) * input.element_size();
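// Illustrative sketch only (not part of this change): with the constants fitted above, the element-count
// cutoff max(0, a * nRanks + b) evaluates to roughly 117k elements at 8 ranks, ~77k at 16 ranks, and 0 at
// 32 ranks or more, so on large NVLink domains registration is always considered worthwhile. For a 2-byte
// dtype the 8-rank cutoff corresponds to roughly 230 KB:
//   size_t exampleCutoffBytes = static_cast<size_t>(std::max(0.0, a * 8 + b)) * 2;  // ~233650 bytes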
|
||||
// Disable window registration if neither NVLink nor MNNVL is supported
|
||||
// TODO replace in NCCL 2.29 with comm query
|
||||
if (!mIsNVLINKSupported && !mIsMNNVLSupported)
|
||||
{
|
||||
auto [symmetric_input, symmetric_ub_buffer0]
|
||||
= torch_ext::create_userbuffers_tensor(input.sizes(), input.scalar_type());
|
||||
cudaMemcpyAsync(symmetric_ub_buffer0.addr, input.data_ptr(), size * input.element_size(),
|
||||
cudaMemcpyDeviceToDevice, stream);
|
||||
ub_buffer0 = symmetric_ub_buffer0;
|
||||
ub_tensor0 = symmetric_input;
|
||||
minRegistrationThreshold = std::numeric_limits<size_t>::max();
|
||||
}
|
||||
char const* envThreshold = std::getenv("TLLM_NCCL_MIN_REGISTRATION");
|
||||
if (envThreshold != nullptr)
|
||||
{
|
||||
minRegistrationThreshold = static_cast<size_t>(std::atoi(envThreshold)) * input.element_size();
|
||||
}
|
||||
|
||||
TLLM_CHECK(!ub_buffer0.invalid());
|
||||
auto [norm_out, ub_buffer1] = torch_ext::create_userbuffers_tensor(input.sizes(), input.scalar_type());
|
||||
// Search for existing buffer
|
||||
auto& allocator = NCCLWindowAllocator::getInstance();
|
||||
auto windowBuffer0 = allocator.searchBuffer(comm, input.data_ptr());
|
||||
|
||||
std::visit(overloaded{[&, norm_out_ = norm_out](std::shared_ptr<ncclComm_t>& rawComm)
|
||||
{
|
||||
NCCLCHECK_THROW(ncclAllReduce(ub_buffer0.addr, norm_out_.mutable_data_ptr(), size,
|
||||
(*getDtypeMap())[mType], ncclSum, *rawComm, stream));
|
||||
},
|
||||
[&, norm_out_ = norm_out](c10::intrusive_ptr<c10d::ProcessGroup>& torchPg)
|
||||
{
|
||||
PGCHECK_THROW(PgHelper{torchPg}.allreduce(ub_tensor0, {c10d::ReduceOp::SUM}));
|
||||
std::ignore = norm_out_.copy_(ub_tensor0, true);
|
||||
}},
|
||||
mNcclComm);
|
||||
torch::Tensor inputTensor = input;
|
||||
void* inputPtr = input.data_ptr();
|
||||
|
||||
// If buffer is not registered, decide whether to register based on size
|
||||
if (!windowBuffer0.isValid())
|
||||
{
|
||||
if (bufferSizeBytes < minRegistrationThreshold)
|
||||
{
|
||||
// Small buffer: use input directly without window registration
|
||||
TLLM_LOG_DEBUG(
|
||||
"[runNCCLAllReduceSymmetric] Buffer size %zu bytes < threshold %zu bytes, "
|
||||
"skipping window registration",
|
||||
bufferSizeBytes, minRegistrationThreshold);
|
||||
// inputTensor and inputPtr remain pointing to original input
|
||||
}
|
||||
else
|
||||
{
|
||||
// Large buffer: create window buffer and copy input (can swap inputTensor reference)
|
||||
auto [symmetricInput, symmetricBuffer0]
|
||||
= createNCCLWindowTensor(comm, input.sizes(), input.scalar_type());
|
||||
TLLM_CUDA_CHECK(cudaMemcpyAsync(
|
||||
symmetricBuffer0.ptr, input.data_ptr(), bufferSizeBytes, cudaMemcpyDeviceToDevice, stream));
|
||||
windowBuffer0 = symmetricBuffer0;
|
||||
inputTensor = symmetricInput; // Swap to window-backed tensor
|
||||
inputPtr = windowBuffer0.ptr;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Buffer already registered - use it directly
|
||||
inputPtr = windowBuffer0.ptr;
|
||||
}
|
||||
|
||||
// Use window-backed output buffer
|
||||
auto [normOut, windowBuffer1] = createNCCLWindowTensor(comm, input.sizes(), input.scalar_type());
|
||||
torch::Tensor outputTensor = normOut;
|
||||
void* outputPtr = windowBuffer1.ptr;
|
||||
|
||||
// Perform allreduce
|
||||
NCCLCHECK_THROW(ncclAllReduce(inputPtr, outputPtr, size, (*getDtypeMap())[mType], ncclSum, comm, stream));
|
||||
|
||||
if (mOp == AllReduceFusionOp::NONE)
|
||||
{
|
||||
return {norm_out};
|
||||
return {outputTensor};
|
||||
}
|
||||
|
||||
// Treat any other patterns as fallback cases.
|
||||
return fallbackRunSubsequentOps(input, residual, norm_weight, scale, bias, norm_out);
|
||||
return fallbackRunSubsequentOps(input, residual, norm_weight, scale, bias, outputTensor);
|
||||
}
|
||||
|
||||
std::vector<torch::Tensor> runLowPrecisionAllReduce(torch::Tensor const& input,
|
||||
@ -799,16 +882,104 @@ private:
|
||||
|
||||
void initGroupTopology()
|
||||
{
|
||||
static std::map<std::set<int>, std::tuple<bool, bool>> cache;
|
||||
static std::map<std::set<int>, std::tuple<bool, bool, bool>> cache;
|
||||
if (cache.find(mGroup) != cache.end())
|
||||
{
|
||||
auto [is_NVLINK_supported, is_P2P_supported] = cache[mGroup];
|
||||
auto [is_NVLINK_supported, is_P2P_supported, is_MNNVL_supported] = cache[mGroup];
|
||||
mIsNVLINKSupported = is_NVLINK_supported;
|
||||
mIsP2PSupported = is_P2P_supported;
|
||||
mIsMNNVLSupported = is_MNNVL_supported;
|
||||
return;
|
||||
}
|
||||
setGroupTopology();
|
||||
cache[mGroup] = {mIsNVLINKSupported, mIsP2PSupported};
|
||||
cache[mGroup] = {mIsNVLINKSupported, mIsP2PSupported, mIsMNNVLSupported};
|
||||
}
|
||||
|
||||
bool checkMNNVLSupport(int device_id)
|
||||
{
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
// 1. Check CUDA driver version (needs >= 12.1, i.e. 12010 in CUDA's 1000*major + 10*minor encoding)
|
||||
int cuda_driver_version = -1;
|
||||
TLLM_CUDA_CHECK(cudaDriverGetVersion(&cuda_driver_version));
|
||||
if (cuda_driver_version < 12010)
|
||||
{
|
||||
TLLM_LOG_DEBUG("MNNVL check: CUDA Driver version %d < 12010", cuda_driver_version);
|
||||
return false;
|
||||
}
|
||||
|
||||
// 2. Check multicast support
|
||||
CUdevice cu_device;
|
||||
TLLM_CU_CHECK(cuDeviceGet(&cu_device, device_id));
|
||||
auto cuda_driver = tensorrt_llm::common::CUDADriverWrapper::getInstance();
|
||||
|
||||
int multicast_supported = 0;
|
||||
TLLM_CU_CHECK(cuda_driver->cuDeviceGetAttribute(
|
||||
&multicast_supported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, cu_device));
|
||||
if (!multicast_supported)
|
||||
{
|
||||
TLLM_LOG_DEBUG("MNNVL check: Device %d does not support multicast", device_id);
|
||||
return false;
|
||||
}
|
||||
|
||||
// 3. Check fabric handle support
|
||||
int fabric_handle_supported = 0;
|
||||
TLLM_CU_CHECK(cuda_driver->cuDeviceGetAttribute(
|
||||
&fabric_handle_supported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, cu_device));
|
||||
if (!fabric_handle_supported)
|
||||
{
|
||||
TLLM_LOG_DEBUG("MNNVL check: Device %d does not support fabric handles", device_id);
|
||||
return false;
|
||||
}
|
||||
|
||||
// 4. Check NVML GPU Fabric Info
|
||||
nvmlDevice_t nvml_device;
|
||||
NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(device_id, &nvml_device));
|
||||
|
||||
nvmlGpuFabricInfo_t fabric_info;
|
||||
NVML_CHECK_THROW(nvmlDeviceGetGpuFabricInfo(nvml_device, &fabric_info));
|
||||
|
||||
// Check if fabric is fully initialized
|
||||
if (fabric_info.state != NVML_GPU_FABRIC_STATE_COMPLETED || fabric_info.status != NVML_SUCCESS)
|
||||
{
|
||||
TLLM_LOG_DEBUG(
|
||||
"MNNVL check: Fabric state not complete - state=%u status=%u", fabric_info.state, fabric_info.status);
|
||||
return false;
|
||||
}
|
||||
|
||||
// 5. Check NVLink links are active (similar to Python support_nvlink(True))
|
||||
unsigned int active_links = 0;
|
||||
unsigned int available_links = 0;
|
||||
|
||||
for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++)
|
||||
{
|
||||
unsigned int cap_p2p = 0;
|
||||
nvmlReturn_t cap_result
|
||||
= nvmlDeviceGetNvLinkCapability(nvml_device, link, NVML_NVLINK_CAP_P2P_SUPPORTED, &cap_p2p);
|
||||
if (cap_result == NVML_SUCCESS && cap_p2p)
|
||||
{
|
||||
available_links++;
|
||||
nvmlEnableState_t link_state;
|
||||
if (nvmlDeviceGetNvLinkState(nvml_device, link, &link_state) == NVML_SUCCESS
|
||||
&& link_state == NVML_FEATURE_ENABLED)
|
||||
{
|
||||
active_links++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool all_links_up = (active_links == available_links && available_links > 0);
|
||||
if (!all_links_up)
|
||||
{
|
||||
TLLM_LOG_DEBUG(
|
||||
"MNNVL check: Not all NVLink links active - active=%u available=%u", active_links, available_links);
|
||||
return false;
|
||||
}
|
||||
|
||||
TLLM_LOG_INFO("MNNVL check: Device %d supports MNNVL (fabric_clique=%u)", device_id, fabric_info.cliqueId);
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
void setGroupTopology()
|
||||
@ -820,108 +991,190 @@ private:
|
||||
[&](c10::intrusive_ptr<c10d::ProcessGroup>& torchPg) { return getLocalGroupTorch(mGroup); }},
|
||||
mNcclComm);
|
||||
|
||||
if (mGroup.size() != local_group.size())
|
||||
{
|
||||
mIsP2PSupported = false;
|
||||
mIsNVLINKSupported = false;
|
||||
TLLM_LOG_INFO("Found inter-node TP group for rank %d", rank);
|
||||
return;
|
||||
}
|
||||
TLLM_LOG_INFO("TP group is intra-node for rank %d", rank);
|
||||
bool is_inter_node = (mGroup.size() != local_group.size());
|
||||
|
||||
NvmlManager nvml_manager;
|
||||
mIsP2PSupported = true;
|
||||
mIsNVLINKSupported = true;
|
||||
mIsMNNVLSupported = false;
|
||||
|
||||
// TODO(ytong): Should we provide group topology info instead of querying it here?
|
||||
// Use cudaDeviceCanAccessPeer to determine whether p2p is supported,
|
||||
// and use nvml to determine whether there are nvlink links between ranks.
|
||||
for (int first_device_id : local_group)
|
||||
// First, check NVLink within local group (intra-node)
|
||||
if (!local_group.empty())
|
||||
{
|
||||
for (int second_device_id : local_group)
|
||||
for (int first_device_id : local_group)
|
||||
{
|
||||
if (first_device_id >= second_device_id)
|
||||
for (int second_device_id : local_group)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
int can_access_peer = 0;
|
||||
TLLM_CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, first_device_id, second_device_id));
|
||||
|
||||
if (!can_access_peer)
|
||||
{
|
||||
mIsP2PSupported = false;
|
||||
mIsNVLINKSupported = false;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
nvmlDevice_t first_device;
|
||||
NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(first_device_id, &first_device));
|
||||
|
||||
bool is_NVLINK = false;
|
||||
|
||||
for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++)
|
||||
{
|
||||
nvmlPciInfo_t remote_pci_info;
|
||||
if (nvmlDeviceGetNvLinkRemotePciInfo_v2(first_device, link, &remote_pci_info) != NVML_SUCCESS)
|
||||
if (first_device_id >= second_device_id)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
nvmlDevice_t remote_device;
|
||||
auto const result = nvmlDeviceGetHandleByPciBusId_v2(remote_pci_info.busId, &remote_device);
|
||||
int can_access_peer = 0;
|
||||
TLLM_CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, first_device_id, second_device_id));
|
||||
|
||||
if (result == NVML_SUCCESS)
|
||||
if (!can_access_peer)
|
||||
{
|
||||
// Two GPUs are connected directly through nvlink
|
||||
unsigned int remote_device_id;
|
||||
NVML_CHECK_THROW(nvmlDeviceGetIndex(remote_device, &remote_device_id));
|
||||
|
||||
if (remote_device_id == static_cast<unsigned int>(second_device_id))
|
||||
{
|
||||
is_NVLINK = true;
|
||||
}
|
||||
mIsP2PSupported = false;
|
||||
mIsNVLINKSupported = false;
|
||||
TLLM_LOG_INFO(
|
||||
"P2P not supported between local devices %d and %d", first_device_id, second_device_id);
|
||||
// Continue checking other pairs, but mark as not supported
|
||||
continue;
|
||||
}
|
||||
else if (result == NVML_ERROR_NOT_FOUND)
|
||||
|
||||
nvmlDevice_t first_device;
|
||||
NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(first_device_id, &first_device));
|
||||
|
||||
bool is_NVLINK = false;
|
||||
|
||||
for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++)
|
||||
{
|
||||
// Maybe Two GPUs are connected via nvswitch,
|
||||
// now remotePciInfo represents the pci information of nvswitch,
|
||||
// determine whether nvlink is supported by whether two GPUs are connected to the same
|
||||
// nvswitch.
|
||||
nvmlDevice_t second_device;
|
||||
NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(second_device_id, &second_device));
|
||||
|
||||
for (unsigned int second_link = 0; second_link < NVML_NVLINK_MAX_LINKS; second_link++)
|
||||
nvmlPciInfo_t remote_pci_info;
|
||||
if (nvmlDeviceGetNvLinkRemotePciInfo_v2(first_device, link, &remote_pci_info) != NVML_SUCCESS)
|
||||
{
|
||||
nvmlPciInfo_t second_remote_pci_info;
|
||||
if (nvmlDeviceGetNvLinkRemotePciInfo_v2(second_device, second_link, &second_remote_pci_info)
|
||||
!= NVML_SUCCESS)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (strcmp(remote_pci_info.busId, second_remote_pci_info.busId) == 0)
|
||||
nvmlDevice_t remote_device;
|
||||
auto const result = nvmlDeviceGetHandleByPciBusId_v2(remote_pci_info.busId, &remote_device);
|
||||
|
||||
if (result == NVML_SUCCESS)
|
||||
{
|
||||
// Two GPUs are connected directly through nvlink
|
||||
unsigned int remote_device_id;
|
||||
NVML_CHECK_THROW(nvmlDeviceGetIndex(remote_device, &remote_device_id));
|
||||
|
||||
if (remote_device_id == static_cast<unsigned int>(second_device_id))
|
||||
{
|
||||
is_NVLINK = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
NVML_CHECK_THROW(result);
|
||||
else if (result == NVML_ERROR_NOT_FOUND)
|
||||
{
|
||||
// Maybe Two GPUs are connected via nvswitch,
|
||||
// now remotePciInfo represents the pci information of nvswitch,
|
||||
// determine whether nvlink is supported by whether two GPUs are connected to the same
|
||||
// nvswitch.
|
||||
nvmlDevice_t second_device;
|
||||
NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(second_device_id, &second_device));
|
||||
|
||||
for (unsigned int second_link = 0; second_link < NVML_NVLINK_MAX_LINKS; second_link++)
|
||||
{
|
||||
nvmlPciInfo_t second_remote_pci_info;
|
||||
if (nvmlDeviceGetNvLinkRemotePciInfo_v2(
|
||||
second_device, second_link, &second_remote_pci_info)
|
||||
!= NVML_SUCCESS)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if (strcmp(remote_pci_info.busId, second_remote_pci_info.busId) == 0)
|
||||
{
|
||||
is_NVLINK = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
NVML_CHECK_THROW(result);
|
||||
}
|
||||
|
||||
if (is_NVLINK)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (is_NVLINK)
|
||||
{
|
||||
break;
|
||||
}
|
||||
mIsNVLINKSupported &= is_NVLINK;
|
||||
}
|
||||
|
||||
mIsNVLINKSupported &= is_NVLINK;
|
||||
}
|
||||
}
|
||||
|
||||
// For inter-node groups, check MNNVL support
|
||||
if (is_inter_node)
|
||||
{
|
||||
TLLM_LOG_INFO("Found inter-node TP group for rank %d, checking MNNVL support", rank);
|
||||
|
||||
// Check MNNVL support on local device(s)
|
||||
bool local_mnnvl_supported = false;
|
||||
if (!local_group.empty())
|
||||
{
|
||||
// Check MNNVL on first device in local group (all devices on same node should have same MNNVL status)
|
||||
int check_device = *local_group.begin();
|
||||
local_mnnvl_supported = checkMNNVLSupport(check_device);
|
||||
}
|
||||
|
||||
// Gather MNNVL status from all ranks in the group
|
||||
int local_mnnvl_status = local_mnnvl_supported ? 1 : 0;
|
||||
std::vector<int> all_mnnvl_status(mGroup.size());
|
||||
|
||||
std::visit(overloaded{[&](std::shared_ptr<ncclComm_t>& comm_ptr)
|
||||
{
|
||||
// For NCCL comm, use MPI to gather status
|
||||
// Use MPI allgather to collect MNNVL status
|
||||
// Create a sub-communicator for the group
|
||||
std::vector<int> group_ranks(mGroup.begin(), mGroup.end());
|
||||
MPI_Group world_group, new_group;
|
||||
MPI_Comm group_comm;
|
||||
MPI_Comm_group(COMM_SESSION, &world_group);
|
||||
MPI_Group_incl(world_group, group_ranks.size(), group_ranks.data(), &new_group);
|
||||
MPI_Comm_create_group(COMM_SESSION, new_group, 0, &group_comm);
|
||||
|
||||
if (group_comm != MPI_COMM_NULL)
|
||||
{
|
||||
MPI_Allgather(&local_mnnvl_status, 1, MPI_INT, all_mnnvl_status.data(), 1, MPI_INT,
|
||||
group_comm);
|
||||
MPI_Comm_free(&group_comm);
|
||||
}
|
||||
MPI_Group_free(&new_group);
|
||||
MPI_Group_free(&world_group);
|
||||
},
|
||||
[&](c10::intrusive_ptr<c10d::ProcessGroup>& torchPg)
|
||||
{
|
||||
// For ProcessGroup, use allgather directly
|
||||
// Note: This assumes the ProcessGroup is already set up for the correct group
|
||||
std::vector<torch::Tensor> input_tensors
|
||||
= {torch::tensor({local_mnnvl_status}, torch::kInt32)};
|
||||
std::vector<std::vector<torch::Tensor>> output_tensors(1);
|
||||
output_tensors[0].resize(mGroup.size());
|
||||
auto work = torchPg->allgather(output_tensors, input_tensors);
|
||||
if (work)
|
||||
{
|
||||
work->wait();
|
||||
for (size_t i = 0; i < mGroup.size(); ++i)
|
||||
{
|
||||
all_mnnvl_status[i] = output_tensors[0][i].item<int>();
|
||||
}
|
||||
}
|
||||
}},
|
||||
mNcclComm);
|
||||
|
||||
// Check if all ranks support MNNVL
|
||||
bool all_ranks_support_mnnvl = true;
|
||||
for (int status : all_mnnvl_status)
|
||||
{
|
||||
if (status == 0)
|
||||
{
|
||||
all_ranks_support_mnnvl = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// For inter-node: MNNVL support means all nodes have MNNVL
|
||||
// Also need local NVLink for optimal performance
|
||||
mIsMNNVLSupported = mIsNVLINKSupported && all_ranks_support_mnnvl;
|
||||
mIsP2PSupported = false; // P2P doesn't work across nodes
|
||||
|
||||
TLLM_LOG_INFO("Inter-node topology: local_NVLink=%d, local_MNNVL=%d, all_ranks_MNNVL=%d, final_MNNVL=%d",
|
||||
mIsNVLINKSupported ? 1 : 0, local_mnnvl_status, all_ranks_support_mnnvl ? 1 : 0,
|
||||
mIsMNNVLSupported ? 1 : 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
TLLM_LOG_INFO("TP group is intra-node for rank %d", rank);
|
||||
}
|
||||
}
|
||||
|
||||
AllReduceStrategyType selectImplementation(size_t seq_len, size_t hidden_size)
|
||||
@ -951,12 +1204,12 @@ private:
|
||||
|
||||
if (ifFallbackToNCCL(seq_len, message_size_bytes, max_workspace_size))
|
||||
{
|
||||
return AllReduceStrategyType::NCCL;
|
||||
return AllReduceStrategyType::NCCL_SYMMETRIC;
|
||||
}
|
||||
|
||||
// This rule based heuristic only chooses between NCCL and MIN_LATENCY strategies.
|
||||
// From this point, all fusion patterns are supported by all these strategies: NCCL, ONESHOT, TWOSHOT and
|
||||
// MIN_LATENCY.
|
||||
// This rule based heuristic only chooses between NCCL_SYMMETRIC and MIN_LATENCY strategies.
|
||||
// From this point, all fusion patterns are supported by all these strategies: NCCL_SYMMETRIC, ONESHOT, TWOSHOT
|
||||
// and MIN_LATENCY.
|
||||
if (mStrategy != AllReduceStrategyType::AUTO)
|
||||
{
|
||||
// Check TWOSHOT constraint: seq_len >= tp_size
|
||||
@ -973,12 +1226,11 @@ private:
|
||||
return tensorrt_llm::utils::customAllReduceUtils::selectStrategyLookUpTable(
|
||||
seq_len, hidden_size, mOp, mGroup.size());
|
||||
}
|
||||
return AllReduceStrategyType::NCCL;
|
||||
}
|
||||
|
||||
bool ifFallbackToNCCL(size_t seq_len, size_t message_size_bytes, size_t max_workspace_size)
|
||||
{
|
||||
// If messageSize is less than maxWorkspaceSize, use NCCL, regardless of the fusion type.
|
||||
// If messageSize is greater than maxWorkspaceSize or topology is unsuitable, use NCCL_SYMMETRIC fallback.
|
||||
if (message_size_bytes > max_workspace_size || !mIsP2PSupported || !mIsNVLINKSupported)
|
||||
{
|
||||
return true;
|
||||
@ -1006,6 +1258,7 @@ private:
|
||||
std::set<int> mGroup;
|
||||
bool mIsNVLINKSupported;
|
||||
bool mIsP2PSupported;
|
||||
bool mIsMNNVLSupported;
|
||||
nvinfer1::DataType mType;
|
||||
AllReduceStrategyType mStrategy;
|
||||
AllReduceFusionOp mOp;
|
||||
|
||||
@ -20,3 +20,9 @@ target_link_libraries(cacheTransceiverTest PRIVATE ${Python3_LIBRARIES})
|
||||
|
||||
add_gtest(mpiUtilsTest mpiUtilsTest.cpp)
|
||||
add_gtest(userBufferTest userBufferTest.cpp)
|
||||
add_gtest(ncclUtilsTest ncclUtilsTest.cpp)
|
||||
target_link_libraries(ncclUtilsTest PRIVATE ${Python3_LIBRARIES})
|
||||
if(BUILD_PYT)
|
||||
target_compile_definitions(ncclUtilsTest PUBLIC BUILD_PYT)
|
||||
target_link_libraries(ncclUtilsTest PUBLIC ${TORCH_LIBRARIES})
|
||||
endif()
|
||||
|
||||
cpp/tests/unit_tests/multi_gpu/ncclUtilsTest.cpp (new file, 745 lines)
@ -0,0 +1,745 @@
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/ncclUtils.h"
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include "tensorrt_llm/common/opUtils.h"
|
||||
#include "tensorrt_llm/runtime/utils/mpiUtils.h"
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <nccl.h>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
#if ENABLE_MULTI_DEVICE && BUILD_PYT
|
||||
#include <torch/extension.h>
|
||||
#endif
|
||||
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
|
||||
namespace mpi = tensorrt_llm::mpi;
|
||||
namespace tr = tensorrt_llm::runtime;
|
||||
namespace nccl_util = tensorrt_llm::common::nccl_util;
|
||||
|
||||
using ::getComm;
|
||||
|
||||
// Helper function to create a split communicator for testing
|
||||
// This allows us to test cleanup behavior explicitly by controlling the lifetime
|
||||
std::shared_ptr<ncclComm_t> createSplitComm(ncclComm_t parentComm, int color, int key)
|
||||
{
|
||||
ncclComm_t newComm;
|
||||
ncclResult_t result = ncclCommSplit(parentComm, color, key, &newComm, nullptr);
|
||||
if (result != ncclSuccess)
|
||||
{
|
||||
TLLM_THROW("ncclCommSplit failed with error: %d", result);
|
||||
}
|
||||
|
||||
// Create a shared_ptr with custom deleter that cleans up resources first
|
||||
return std::shared_ptr<ncclComm_t>(new ncclComm_t(newComm),
|
||||
[](ncclComm_t* comm)
|
||||
{
|
||||
if (comm && *comm)
|
||||
{
|
||||
// STEP 1: Clean up all registered resources FIRST
|
||||
tensorrt_llm::common::nccl_util::NcclCommResourceManager::getInstance().cleanupResources(*comm);
|
||||
|
||||
// STEP 2: Now destroy the NCCL communicator
|
||||
ncclResult_t result = ncclCommDestroy(*comm);
|
||||
if (result != ncclSuccess)
|
||||
{
|
||||
TLLM_LOG_WARNING("ncclCommDestroy failed with error: %d", result);
|
||||
}
|
||||
|
||||
// STEP 3: Free the memory
|
||||
delete comm;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
//==============================================================================
|
||||
// NcclCommResourceManager Tests
|
||||
//==============================================================================
|
||||
|
||||
class NcclCommResourceManagerTest : public ::testing::Test
|
||||
{
|
||||
protected:
|
||||
void SetUp() override
|
||||
{
|
||||
auto& comm = mpi::MpiComm::world();
|
||||
mWorldSize = comm.getSize();
|
||||
mRank = comm.getRank();
|
||||
|
||||
if (mWorldSize < 2)
|
||||
{
|
||||
GTEST_SKIP() << "Requires at least 2 ranks (got " << mWorldSize << ")";
|
||||
}
|
||||
|
||||
// Set CUDA device for this rank (required before NCCL initialization)
|
||||
int deviceCount = 0;
|
||||
TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
|
||||
if (deviceCount > 0)
|
||||
{
|
||||
int deviceId = mRank % deviceCount;
|
||||
TLLM_CUDA_CHECK(cudaSetDevice(deviceId));
|
||||
}
|
||||
|
||||
// Create a communicator for testing
|
||||
std::set<int> group;
|
||||
for (int i = 0; i < mWorldSize; ++i)
|
||||
{
|
||||
group.insert(i);
|
||||
}
|
||||
mComm = getComm(group);
|
||||
}
|
||||
|
||||
void TearDown() override
|
||||
{
|
||||
// Communicator cleanup happens automatically via shared_ptr deleter
|
||||
mComm.reset();
|
||||
}
|
||||
|
||||
int mWorldSize;
|
||||
int mRank;
|
||||
std::shared_ptr<ncclComm_t> mComm;
|
||||
};
|
||||
|
||||
TEST_F(NcclCommResourceManagerTest, ResourceRegistration)
|
||||
{
|
||||
auto& manager = nccl_util::NcclCommResourceManager::getInstance();
|
||||
|
||||
// Create a separate comm using split for this test
|
||||
auto testComm = createSplitComm(*mComm, 0, mRank);
|
||||
|
||||
// Register a resource
|
||||
bool cleanupCalled = false;
|
||||
manager.registerResource(
|
||||
*testComm, [&cleanupCalled]() { cleanupCalled = true; }, "TestResource");
|
||||
|
||||
EXPECT_TRUE(manager.hasResources(*testComm));
|
||||
EXPECT_EQ(manager.getResourceCount(*testComm), 1);
|
||||
EXPECT_FALSE(cleanupCalled); // Cleanup not called yet
|
||||
|
||||
// Store the raw comm value before destruction
|
||||
ncclComm_t rawComm = *testComm;
|
||||
|
||||
// Cleanup should be called when comm is destroyed
|
||||
testComm.reset();
|
||||
|
||||
// Verify cleanup was called
|
||||
EXPECT_TRUE(cleanupCalled);
|
||||
|
||||
// Verify cleanup: check that the old comm (now destroyed) no longer has resources
|
||||
// Note: The comm is destroyed, but we can still check the manager's internal state
|
||||
// The cleanup should have removed all resources for this comm
|
||||
EXPECT_FALSE(manager.hasResources(rawComm));
|
||||
EXPECT_EQ(manager.getResourceCount(rawComm), 0);
|
||||
}
|
||||
|
||||
TEST_F(NcclCommResourceManagerTest, MultipleResources)
|
||||
{
|
||||
auto& manager = nccl_util::NcclCommResourceManager::getInstance();
|
||||
|
||||
// Create a separate comm using split for this test
|
||||
auto testComm = createSplitComm(*mComm, 0, mRank);
|
||||
|
||||
std::vector<int> cleanupOrder;
|
||||
manager.registerResource(
|
||||
*testComm, [&cleanupOrder]() { cleanupOrder.push_back(1); }, "Resource1");
|
||||
manager.registerResource(
|
||||
*testComm, [&cleanupOrder]() { cleanupOrder.push_back(2); }, "Resource2");
|
||||
manager.registerResource(
|
||||
*testComm, [&cleanupOrder]() { cleanupOrder.push_back(3); }, "Resource3");
|
||||
|
||||
EXPECT_EQ(manager.getResourceCount(*testComm), 3);
|
||||
|
||||
// Cleanup order should be preserved - destroy comm and verify order
|
||||
testComm.reset();
|
||||
|
||||
// Verify cleanup order was preserved (1, 2, 3)
|
||||
EXPECT_EQ(cleanupOrder.size(), 3);
|
||||
EXPECT_EQ(cleanupOrder[0], 1);
|
||||
EXPECT_EQ(cleanupOrder[1], 2);
|
||||
EXPECT_EQ(cleanupOrder[2], 3);
|
||||
}
|
||||
|
||||
TEST_F(NcclCommResourceManagerTest, ResourceCount)
|
||||
{
|
||||
auto& manager = nccl_util::NcclCommResourceManager::getInstance();
|
||||
|
||||
// Create a separate comm using split for this test
|
||||
auto testComm = createSplitComm(*mComm, 0, mRank);
|
||||
|
||||
EXPECT_FALSE(manager.hasResources(*testComm));
|
||||
EXPECT_EQ(manager.getResourceCount(*testComm), 0);
|
||||
|
||||
manager.registerResource(
|
||||
*testComm, []() {}, "Test1");
|
||||
EXPECT_EQ(manager.getResourceCount(*testComm), 1);
|
||||
|
||||
manager.registerResource(
|
||||
*testComm, []() {}, "Test2");
|
||||
EXPECT_EQ(manager.getResourceCount(*testComm), 2);
|
||||
|
||||
testComm.reset();
|
||||
}
|
||||
|
||||
//==============================================================================
|
||||
// NCCLWindowAllocator Tests
|
||||
//==============================================================================
|
||||
|
||||
class NCCLWindowAllocatorTest : public ::testing::Test
|
||||
{
|
||||
protected:
|
||||
void SetUp() override
|
||||
{
|
||||
auto& comm = mpi::MpiComm::world();
|
||||
mWorldSize = comm.getSize();
|
||||
mRank = comm.getRank();
|
||||
|
||||
if (mWorldSize < 2)
|
||||
{
|
||||
GTEST_SKIP() << "Requires at least 2 ranks (got " << mWorldSize << ")";
|
||||
}
|
||||
|
||||
// Set CUDA device for this rank (required before NCCL initialization)
|
||||
int deviceCount = 0;
|
||||
TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
|
||||
if (deviceCount > 0)
|
||||
{
|
||||
int deviceId = mRank % deviceCount;
|
||||
TLLM_CUDA_CHECK(cudaSetDevice(deviceId));
|
||||
}
|
||||
|
||||
// Check if NCCL symmetric is supported
|
||||
auto& ncclHelper = nccl_util::NCCLHelper::getInstance();
|
||||
if (!ncclHelper.isLoaded())
|
||||
{
|
||||
GTEST_SKIP() << "NCCL library with symmetric memory support is not available";
|
||||
}
|
||||
|
||||
std::set<int> group;
|
||||
for (int i = 0; i < mWorldSize; ++i)
|
||||
{
|
||||
group.insert(i);
|
||||
}
|
||||
mComm = getComm(group);
|
||||
}
|
||||
|
||||
void TearDown() override
|
||||
{
|
||||
// Cleanup happens automatically
|
||||
mComm.reset();
|
||||
}
|
||||
|
||||
int mWorldSize;
|
||||
int mRank;
|
||||
std::shared_ptr<ncclComm_t> mComm;
|
||||
};
|
||||
|
||||
TEST_F(NCCLWindowAllocatorTest, BasicAllocation)
|
||||
{
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
|
||||
const size_t bufferSize = 1024 * 1024; // 1MB
|
||||
auto buffer = allocator.requestBuffer(*mComm, bufferSize);
|
||||
|
||||
EXPECT_TRUE(buffer.isValid());
|
||||
EXPECT_NE(buffer.ptr, nullptr);
|
||||
EXPECT_NE(buffer.window, nullptr);
|
||||
EXPECT_EQ(buffer.size, bufferSize);
|
||||
EXPECT_GE(buffer.handle, 0);
|
||||
|
||||
// Verify we can search for it
|
||||
auto found = allocator.searchBuffer(*mComm, buffer.ptr);
|
||||
EXPECT_TRUE(found.isValid());
|
||||
EXPECT_EQ(found.ptr, buffer.ptr);
|
||||
|
||||
// Release the buffer
|
||||
allocator.releaseBuffer(*mComm, buffer.ptr);
|
||||
}
|
||||
|
||||
TEST_F(NCCLWindowAllocatorTest, BufferReuse)
|
||||
{
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
|
||||
const size_t bufferSize = 512 * 1024; // 512KB
|
||||
|
||||
// Allocate first buffer
|
||||
auto buffer1 = allocator.requestBuffer(*mComm, bufferSize);
|
||||
EXPECT_TRUE(buffer1.isValid());
|
||||
void* ptr1 = buffer1.ptr;
|
||||
|
||||
// Release it
|
||||
allocator.releaseBuffer(*mComm, ptr1);
|
||||
|
||||
// Request another buffer of the same size - should reuse
|
||||
auto buffer2 = allocator.requestBuffer(*mComm, bufferSize);
|
||||
EXPECT_TRUE(buffer2.isValid());
|
||||
EXPECT_EQ(buffer2.ptr, ptr1); // Should be the same buffer
|
||||
|
||||
allocator.releaseBuffer(*mComm, buffer2.ptr);
|
||||
}
|
||||
|
||||
TEST_F(NCCLWindowAllocatorTest, BestFitReuse)
|
||||
{
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
|
||||
// Allocate buffers of different sizes
|
||||
auto buffer1MB = allocator.requestBuffer(*mComm, 1024 * 1024);
|
||||
auto buffer2MB = allocator.requestBuffer(*mComm, 2 * 1024 * 1024);
|
||||
auto buffer512KB = allocator.requestBuffer(*mComm, 512 * 1024);
|
||||
|
||||
void* ptr1MB = buffer1MB.ptr;
|
||||
void* ptr2MB = buffer2MB.ptr;
|
||||
void* ptr512KB = buffer512KB.ptr;
|
||||
|
||||
// Release all
|
||||
allocator.releaseBuffer(*mComm, ptr1MB);
|
||||
allocator.releaseBuffer(*mComm, ptr2MB);
|
||||
allocator.releaseBuffer(*mComm, ptr512KB);
|
||||
|
||||
// Request 768KB - should reuse 1MB (best fit, smallest that fits)
|
||||
auto buffer768KB = allocator.requestBuffer(*mComm, 768 * 1024);
|
||||
EXPECT_TRUE(buffer768KB.isValid());
|
||||
EXPECT_EQ(buffer768KB.ptr, ptr1MB); // Should reuse 1MB buffer
|
||||
EXPECT_EQ(buffer768KB.size, 1024 * 1024); // Original size
|
||||
|
||||
allocator.releaseBuffer(*mComm, buffer768KB.ptr);
|
||||
}
|
||||
|
||||
TEST_F(NCCLWindowAllocatorTest, MultipleBuffers)
|
||||
{
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
|
||||
const size_t bufferSize = 256 * 1024;
|
||||
std::vector<void*> ptrs;
|
||||
|
||||
// Allocate multiple buffers
|
||||
for (int i = 0; i < 5; ++i)
|
||||
{
|
||||
auto buffer = allocator.requestBuffer(*mComm, bufferSize);
|
||||
EXPECT_TRUE(buffer.isValid());
|
||||
ptrs.push_back(buffer.ptr);
|
||||
}
|
||||
|
||||
EXPECT_EQ(allocator.getBufferCount(*mComm), 5);
|
||||
EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 5);
|
||||
|
||||
// Release all
|
||||
for (auto* ptr : ptrs)
|
||||
{
|
||||
allocator.releaseBuffer(*mComm, ptr);
|
||||
}
|
||||
|
||||
EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 0);
|
||||
EXPECT_EQ(allocator.getBufferCount(*mComm), 5); // Buffers still exist, just not in use
|
||||
}
|
||||
|
||||
TEST_F(NCCLWindowAllocatorTest, SearchBuffer)
|
||||
{
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
|
||||
const size_t bufferSize = 128 * 1024;
|
||||
auto buffer = allocator.requestBuffer(*mComm, bufferSize);
|
||||
|
||||
// Test searchBuffer
|
||||
auto found = allocator.searchBuffer(*mComm, buffer.ptr);
|
||||
EXPECT_TRUE(found.isValid());
|
||||
EXPECT_EQ(found.ptr, buffer.ptr);
|
||||
// Compare against actual allocated size (ncclMemAlloc may allocate more than requested)
|
||||
EXPECT_EQ(found.size, buffer.size);
|
||||
EXPECT_GE(found.size, bufferSize); // At least the requested size
|
||||
|
||||
// Test search for non-existent buffer
|
||||
void* fakePtr = reinterpret_cast<void*>(0xDEADBEEF);
|
||||
auto notFound = allocator.searchBuffer(*mComm, fakePtr);
|
||||
EXPECT_FALSE(notFound.isValid());
|
||||
|
||||
allocator.releaseBuffer(*mComm, buffer.ptr);
|
||||
}
|
||||
|
||||
TEST_F(NCCLWindowAllocatorTest, GetWindowAndSize)
|
||||
{
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
|
||||
const size_t bufferSize = 64 * 1024;
|
||||
auto buffer = allocator.requestBuffer(*mComm, bufferSize);
|
||||
|
||||
// Test getWindow
|
||||
auto window = allocator.getWindow(*mComm, buffer.ptr);
|
||||
EXPECT_NE(window, nullptr);
|
||||
EXPECT_EQ(window, buffer.window);
|
||||
|
||||
// Test getSize - compare against actual allocated size (ncclMemAlloc may allocate more than requested)
|
||||
auto size = allocator.getSize(*mComm, buffer.ptr);
|
||||
EXPECT_EQ(size, buffer.size);
|
||||
EXPECT_GE(size, bufferSize); // At least the requested size
|
||||
|
||||
// Test with invalid pointer
|
||||
void* fakePtr = reinterpret_cast<void*>(0xDEADBEEF);
|
||||
EXPECT_EQ(allocator.getWindow(*mComm, fakePtr), nullptr);
|
||||
EXPECT_EQ(allocator.getSize(*mComm, fakePtr), 0);
|
||||
|
||||
allocator.releaseBuffer(*mComm, buffer.ptr);
|
||||
}
|
||||
|
||||
TEST_F(NCCLWindowAllocatorTest, GetBufferInfo)
|
||||
{
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
|
||||
const size_t bufferSize = 32 * 1024;
|
||||
auto buffer = allocator.requestBuffer(*mComm, bufferSize);
|
||||
|
||||
auto info = allocator.getBufferInfo(*mComm, buffer.ptr);
|
||||
EXPECT_TRUE(info.isValid());
|
||||
EXPECT_EQ(info.ptr, buffer.ptr);
|
||||
EXPECT_EQ(info.size, buffer.size);
|
||||
EXPECT_EQ(info.handle, buffer.handle);
|
||||
EXPECT_EQ(info.window, buffer.window);
|
||||
|
||||
allocator.releaseBuffer(*mComm, buffer.ptr);
|
||||
}
|
||||
|
||||
TEST_F(NCCLWindowAllocatorTest, ScopedBuffer)
|
||||
{
|
||||
const size_t bufferSize = 16 * 1024;
|
||||
|
||||
{
|
||||
nccl_util::ScopedNCCLWindowBuffer scopedBuffer(*mComm, bufferSize);
|
||||
EXPECT_TRUE(scopedBuffer.getBuffer().isValid());
|
||||
EXPECT_NE(scopedBuffer.getPtr(), nullptr);
|
||||
// Compare against actual allocated size (ncclMemAlloc may allocate more than requested)
|
||||
EXPECT_EQ(scopedBuffer.getSize(), scopedBuffer.getBuffer().size);
|
||||
EXPECT_GE(scopedBuffer.getSize(), bufferSize); // At least the requested size
|
||||
EXPECT_NE(scopedBuffer.getWindow(), nullptr);
|
||||
|
||||
// Buffer should be in use
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 1);
|
||||
}
|
||||
|
||||
// Buffer should be released when scoped buffer goes out of scope
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 0);
|
||||
}
|
||||
|
||||
TEST_F(NCCLWindowAllocatorTest, CleanupOnCommDestroy)
|
||||
{
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
|
||||
// Create a separate comm using split for this test
|
||||
auto testComm = createSplitComm(*mComm, 0, mRank);
|
||||
|
||||
// Store the raw comm value before destruction
|
||||
ncclComm_t rawComm = *testComm;
|
||||
|
||||
// Allocate some buffers
|
||||
const size_t bufferSize = 8 * 1024;
|
||||
auto buffer1 = allocator.requestBuffer(*testComm, bufferSize);
|
||||
auto buffer2 = allocator.requestBuffer(*testComm, bufferSize * 2);
|
||||
|
||||
EXPECT_EQ(allocator.getBufferCount(*testComm), 2);
|
||||
EXPECT_EQ(allocator.getBufferInUseCount(*testComm), 2);
|
||||
|
||||
// Verify buffers are valid
|
||||
EXPECT_TRUE(buffer1.isValid());
|
||||
EXPECT_TRUE(buffer2.isValid());
|
||||
|
||||
// Manually release buffers before cleanup to avoid warnings
|
||||
allocator.releaseBuffer(*testComm, buffer1.ptr);
|
||||
allocator.releaseBuffer(*testComm, buffer2.ptr);
|
||||
|
||||
// Verify buffers are released but still exist in pool
|
||||
EXPECT_EQ(allocator.getBufferInUseCount(*testComm), 0);
|
||||
EXPECT_EQ(allocator.getBufferCount(*testComm), 2); // Buffers still exist, just not in use
|
||||
|
||||
// Destroy the communicator - buffers should be cleaned up automatically
|
||||
testComm.reset();
|
||||
|
||||
// Verify cleanup: check that the old comm (now destroyed) no longer has buffers
|
||||
// Note: The comm is destroyed, but we can still check the allocator's internal state
|
||||
// The cleanup should have removed all buffers for this comm
|
||||
EXPECT_EQ(allocator.getBufferCount(rawComm), 0);
|
||||
EXPECT_EQ(allocator.getBufferInUseCount(rawComm), 0);
|
||||
// Note: isCommValid only checks for null, not cleaned-up state, because NCCL can reuse addresses
|
||||
// The real check is that buffers are gone, which we verify above
|
||||
}
|
||||
|
||||
TEST_F(NCCLWindowAllocatorTest, CommValidity)
|
||||
{
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
|
||||
// Valid comm should be valid
|
||||
EXPECT_TRUE(allocator.isCommValid(*mComm));
|
||||
|
||||
// Null comm should be invalid
|
||||
EXPECT_FALSE(allocator.isCommValid(nullptr));
|
||||
}
|
||||
|
||||
//==============================================================================
|
||||
// Integration Tests
|
||||
//==============================================================================
|
||||
|
||||
TEST_F(NCCLWindowAllocatorTest, MultipleComms)
|
||||
{
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
|
||||
// Create two different communicators using split (different colors)
|
||||
auto comm1 = createSplitComm(*mComm, 0, mRank);
|
||||
auto comm2 = createSplitComm(*mComm, 1, mRank);
|
||||
|
||||
const size_t bufferSize = 4 * 1024;
|
||||
|
||||
// Allocate buffers from both comms
|
||||
auto buffer1 = allocator.requestBuffer(*comm1, bufferSize);
|
||||
auto buffer2 = allocator.requestBuffer(*comm2, bufferSize);
|
||||
|
||||
EXPECT_TRUE(buffer1.isValid());
|
||||
EXPECT_TRUE(buffer2.isValid());
|
||||
|
||||
// Buffers should be tracked separately per comm
|
||||
EXPECT_EQ(allocator.getBufferCount(*comm1), 1);
|
||||
EXPECT_EQ(allocator.getBufferCount(*comm2), 1);
|
||||
EXPECT_NE(buffer1.ptr, buffer2.ptr); // Different buffers from different comms
|
||||
|
||||
allocator.releaseBuffer(*comm1, buffer1.ptr);
|
||||
allocator.releaseBuffer(*comm2, buffer2.ptr);
|
||||
|
||||
// Clean up comms
|
||||
comm1.reset();
|
||||
comm2.reset();
|
||||
}
|
||||
|
||||
#if ENABLE_MULTI_DEVICE && BUILD_PYT
|
||||
//==============================================================================
|
||||
// createNCCLWindowTensor Tests
|
||||
//==============================================================================
|
||||
|
||||
class CreateNCCLWindowTensorTest : public ::testing::Test
|
||||
{
|
||||
protected:
|
||||
void SetUp() override
|
||||
{
|
||||
auto& comm = mpi::MpiComm::world();
|
||||
mWorldSize = comm.getSize();
|
||||
mRank = comm.getRank();
|
||||
|
||||
if (mWorldSize < 2)
|
||||
{
|
||||
GTEST_SKIP() << "Requires at least 2 ranks (got " << mWorldSize << ")";
|
||||
}
|
||||
|
||||
// Set CUDA device for this rank (required before NCCL initialization)
|
||||
int deviceCount = 0;
|
||||
TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
|
||||
if (deviceCount > 0)
|
||||
{
|
||||
int deviceId = mRank % deviceCount;
|
||||
TLLM_CUDA_CHECK(cudaSetDevice(deviceId));
|
||||
}
|
||||
|
||||
// Check if NCCL symmetric is supported
|
||||
auto& ncclHelper = nccl_util::NCCLHelper::getInstance();
|
||||
if (!ncclHelper.isLoaded())
|
||||
{
|
||||
GTEST_SKIP() << "NCCL library with symmetric memory support is not available";
|
||||
}
|
||||
|
||||
std::set<int> group;
|
||||
for (int i = 0; i < mWorldSize; ++i)
|
||||
{
|
||||
group.insert(i);
|
||||
}
|
||||
mComm = getComm(group);
|
||||
}
|
||||
|
||||
void TearDown() override
|
||||
{
|
||||
mComm.reset();
|
||||
}
|
||||
|
||||
int mWorldSize;
|
||||
int mRank;
|
||||
std::shared_ptr<ncclComm_t> mComm;
|
||||
};
|
||||
|
||||
TEST_F(CreateNCCLWindowTensorTest, BasicTensorCreation)
|
||||
{
|
||||
using nccl_util::createNCCLWindowTensor;
|
||||
|
||||
// Create a tensor with shape [4, 8] and float32 dtype
|
||||
std::vector<int64_t> shape = {4, 8};
|
||||
auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32);
|
||||
|
||||
// Verify tensor properties
|
||||
EXPECT_TRUE(tensor.defined());
|
||||
EXPECT_EQ(tensor.dtype(), torch::kFloat32);
|
||||
EXPECT_EQ(tensor.device().type(), torch::kCUDA);
|
||||
EXPECT_EQ(tensor.dim(), 2);
|
||||
EXPECT_EQ(tensor.size(0), 4);
|
||||
EXPECT_EQ(tensor.size(1), 8);
|
||||
EXPECT_EQ(tensor.numel(), 4 * 8);
|
||||
|
||||
// Verify buffer properties
|
||||
EXPECT_TRUE(buffer.isValid());
|
||||
EXPECT_NE(buffer.ptr, nullptr);
|
||||
// ncclMemAlloc may allocate more than requested, so check at least the requested size
|
||||
EXPECT_GE(buffer.size, 4 * 8 * sizeof(float));
|
||||
EXPECT_NE(buffer.window, nullptr);
|
||||
|
||||
// Verify tensor data pointer matches buffer pointer
|
||||
EXPECT_EQ(tensor.data_ptr(), buffer.ptr);
|
||||
|
||||
// Tensor should be in use
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 1);
|
||||
}
|
||||
|
||||
TEST_F(CreateNCCLWindowTensorTest, DifferentDtypes)
|
||||
{
|
||||
using nccl_util::createNCCLWindowTensor;
|
||||
|
||||
std::vector<int64_t> shape = {10};
|
||||
|
||||
// Test float32
|
||||
{
|
||||
auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32);
|
||||
EXPECT_EQ(tensor.dtype(), torch::kFloat32);
|
||||
// ncclMemAlloc may allocate more than requested, so check at least the requested size
|
||||
EXPECT_GE(buffer.size, 10 * sizeof(float));
|
||||
EXPECT_EQ(tensor.data_ptr(), buffer.ptr);
|
||||
}
|
||||
|
||||
// Test float16
|
||||
{
|
||||
auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat16);
|
||||
EXPECT_EQ(tensor.dtype(), torch::kFloat16);
|
||||
// ncclMemAlloc may allocate more than requested, so check at least the requested size
|
||||
EXPECT_GE(buffer.size, 10 * sizeof(at::Half));
|
||||
EXPECT_EQ(tensor.data_ptr(), buffer.ptr);
|
||||
}
|
||||
|
||||
// Test int32
|
||||
{
|
||||
auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kInt32);
|
||||
EXPECT_EQ(tensor.dtype(), torch::kInt32);
|
||||
// ncclMemAlloc may allocate more than requested, so check at least the requested size
|
||||
EXPECT_GE(buffer.size, 10 * sizeof(int32_t));
|
||||
EXPECT_EQ(tensor.data_ptr(), buffer.ptr);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(CreateNCCLWindowTensorTest, DifferentShapes)
|
||||
{
|
||||
using nccl_util::createNCCLWindowTensor;
|
||||
|
||||
// 1D tensor
|
||||
{
|
||||
std::vector<int64_t> shape = {100};
|
||||
auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32);
|
||||
EXPECT_EQ(tensor.dim(), 1);
|
||||
EXPECT_EQ(tensor.size(0), 100);
|
||||
// ncclMemAlloc may allocate more than requested, so check at least the requested size
|
||||
EXPECT_GE(buffer.size, 100 * sizeof(float));
|
||||
}
|
||||
|
||||
// 3D tensor
|
||||
{
|
||||
std::vector<int64_t> shape = {2, 3, 4};
|
||||
auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32);
|
||||
EXPECT_EQ(tensor.dim(), 3);
|
||||
EXPECT_EQ(tensor.size(0), 2);
|
||||
EXPECT_EQ(tensor.size(1), 3);
|
||||
EXPECT_EQ(tensor.size(2), 4);
|
||||
// ncclMemAlloc may allocate more than requested, so check at least the requested size
|
||||
EXPECT_GE(buffer.size, 2 * 3 * 4 * sizeof(float));
|
||||
}
|
||||
|
||||
// 4D tensor
|
||||
{
|
||||
std::vector<int64_t> shape = {1, 2, 3, 4};
|
||||
auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32);
|
||||
EXPECT_EQ(tensor.dim(), 4);
|
||||
EXPECT_EQ(tensor.numel(), 1 * 2 * 3 * 4);
|
||||
// ncclMemAlloc may allocate more than requested, so check at least the requested size
|
||||
EXPECT_GE(buffer.size, 1 * 2 * 3 * 4 * sizeof(float));
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(CreateNCCLWindowTensorTest, TensorDeleterReleasesBuffer)
|
||||
{
|
||||
using nccl_util::createNCCLWindowTensor;
|
||||
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
|
||||
{
|
||||
std::vector<int64_t> shape = {16, 16};
|
||||
auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32);
|
||||
|
||||
EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 1);
|
||||
EXPECT_TRUE(buffer.isValid());
|
||||
void* bufferPtr = buffer.ptr;
|
||||
|
||||
// Tensor goes out of scope - deleter should release the buffer
|
||||
}
|
||||
|
||||
// Buffer should be released (not in use anymore)
|
||||
EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 0);
|
||||
|
||||
// Buffer should still exist in the pool (for reuse)
|
||||
EXPECT_GE(allocator.getBufferCount(*mComm), 1);
|
||||
}
|
||||
|
||||
TEST_F(CreateNCCLWindowTensorTest, MultipleTensors)
|
||||
{
|
||||
using nccl_util::createNCCLWindowTensor;
|
||||
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
|
||||
std::vector<int64_t> shape = {8, 8};
|
||||
auto [tensor1, buffer1] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32);
|
||||
auto [tensor2, buffer2] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32);
|
||||
auto [tensor3, buffer3] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32);
|
||||
|
||||
EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 3);
|
||||
EXPECT_NE(buffer1.ptr, buffer2.ptr);
|
||||
EXPECT_NE(buffer2.ptr, buffer3.ptr);
|
||||
EXPECT_NE(buffer1.ptr, buffer3.ptr);
|
||||
|
||||
// All tensors should be valid
|
||||
EXPECT_TRUE(tensor1.defined());
|
||||
EXPECT_TRUE(tensor2.defined());
|
||||
EXPECT_TRUE(tensor3.defined());
|
||||
}
|
||||
|
||||
TEST_F(CreateNCCLWindowTensorTest, TensorStrides)
|
||||
{
|
||||
using nccl_util::createNCCLWindowTensor;
|
||||
|
||||
std::vector<int64_t> shape = {3, 4, 5};
|
||||
auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32);
|
||||
|
||||
// Verify strides are correct (row-major order)
|
||||
EXPECT_EQ(tensor.stride(0), 4 * 5); // stride for first dimension
|
||||
EXPECT_EQ(tensor.stride(1), 5); // stride for second dimension
|
||||
EXPECT_EQ(tensor.stride(2), 1); // stride for third dimension
|
||||
}
|
||||
|
||||
#endif // ENABLE_MULTI_DEVICE && BUILD_PYT
|
||||
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
@ -46,7 +46,7 @@ In this third blog of our scaling Expert Parallelism (EP) series, we push the pe
|
||||
|
||||
The wo GEMM is the final linear layer within the multi-head attention block that produces the final outputs. While DeepSeek R1's MLA modifies the initial projections for keys and values, the wo GEMM operator remains a critical and standard component for finalizing the attention computation. In this term, "wo" stands for the weight matrix of the output projection.
|
||||
|
||||
We've evaluated that quantizing the wo GEMM to FP4 still satisfies the accuracy requirements, maintaining a similar MTP accept rate (AR) while improving end-to-end performance. The [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) team has published checkpoints that additionally quantize the wo module in attention layers to FP4 on HuggingFace:
|
||||
We've evaluated that quantizing the wo GEMM to FP4 still satisfies the accuracy requirements, maintaining a similar MTP accept rate (AR) while improving end-to-end performance. The [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) team has published checkpoints that additionally quantize the wo module in attention layers to FP4 on HuggingFace:
|
||||
* https://huggingface.co/nvidia/DeepSeek-R1-FP4-v2
|
||||
* https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2
|
||||
|
||||
|
||||
@ -67,7 +67,7 @@ We have explored a mixed precision recipe, which provides a better tradeoff betw
|
||||
|
||||
*TensorRT LLM already supports [FP8 Attention](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/deepseek_v3#fp8-kv-cache-and-mla), but for this latency scenario low-precision attention computation does not help performance, so we choose bf16 precision for the attention modules.
|
||||
|
||||
** nvfp4 model checkpoint is generated by the [NVIDIA TensorRT Model Optimizer toolkit](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
|
||||
** nvfp4 model checkpoint is generated by the [NVIDIA Model Optimizer toolkit](https://github.com/NVIDIA/Model-Optimizer).
|
||||
|
||||
*** RouterGEMM uses bf16 inputs/weights with fp32 outputs for numerical stability
|
||||
|
||||
|
||||
@ -29,7 +29,7 @@ The mixed precision recipe for DeepSeek R1 throughput scenario is almost the sam
|
||||
* FP8 KV cache and FP8 attention, rather than BF16 precision.
|
||||
* FP4 Allgather for better communication bandwidth utilization.
|
||||
|
||||
The checkpoint used in this blog is hosted in [nvidia/DeepSeek-R1-FP4](https://huggingface.co/nvidia/DeepSeek-R1-FP4), generated by [NVIDIA Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). The accuracy scores on common datasets for this FP4 checkpoint with the TensorRT LLM implementation are:
The checkpoint used in this blog is hosted in [nvidia/DeepSeek-R1-FP4](https://huggingface.co/nvidia/DeepSeek-R1-FP4), generated by [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer). The accuracy scores on common datasets for this FP4 checkpoint with the TensorRT LLM implementation are:
|
||||
|
||||
| Precision | GPQA Diamond | MATH-500 |
|
||||
| :-- | :-- | :-- |
|
||||
|
||||
@ -423,10 +423,10 @@ checkpoint. For the Llama-3.1 models, TensorRT LLM provides the following checkp
|
||||
- [`nvidia/Llama-3.1-70B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-70B-Instruct-FP8)
|
||||
- [`nvidia/Llama-3.1-405B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-405B-Instruct-FP8)
|
||||
|
||||
To understand more about how to quantize your own checkpoints, refer to ModelOpt [documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/deployment/1_tensorrt_llm.html).
|
||||
To understand more about how to quantize your own checkpoints, refer to ModelOpt [documentation](https://nvidia.github.io/Model-Optimizer/deployment/1_tensorrt_llm.html).
|
||||
|
||||
`trtllm-bench` utilizes the `hf_quant_config.json` file present in the pre-quantized checkpoints above. The configuration
|
||||
file is present in checkpoints quantized with [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer)
|
||||
file is present in checkpoints quantized with [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer)
|
||||
and describes the compute and KV cache quantization that checkpoint was compiled with. For example, from the checkpoints
|
||||
above:
|
||||
|
||||
|
||||
@ -21,7 +21,7 @@ and shows the throughput scenario under maximum load. The reported metric is `To
|
||||
|
||||
The performance numbers below were collected using the steps described in this document.
|
||||
|
||||
Testing was performed on models with weights quantized using [ModelOpt](https://nvidia.github.io/TensorRT-Model-Optimizer/#) and published by NVIDIA on the [Model Optimizer HuggingFace Collection](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4).
|
||||
Testing was performed on models with weights quantized using [ModelOpt](https://nvidia.github.io/Model-Optimizer/#) and published by NVIDIA on the [Model Optimizer HuggingFace Collection](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4).
|
||||
|
||||
*(NEW for v1.0) RTX 6000 Pro Blackwell Server Edition Benchmarks:*
|
||||
|
||||
|
||||
@ -120,7 +120,7 @@ Optimize attention operations with different attention kernel implementations:
|
||||
|
||||
### Precision Support
|
||||
|
||||
AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`TensorRT-Model-Optimizer`](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
|
||||
AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`Model-Optimizer`](https://github.com/NVIDIA/Model-Optimizer).
|
||||
|
||||
**Supported precision types include:**
|
||||
|
||||
|
||||
@ -23,7 +23,7 @@ The default PyTorch backend supports FP4 and FP8 quantization on the latest Blac
|
||||
|
||||
### Running Pre-quantized Models
|
||||
|
||||
TensorRT LLM can directly run [pre-quantized models](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4) generated with the [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
|
||||
TensorRT LLM can directly run [pre-quantized models](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4) generated with the [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer).
|
||||
|
||||
```python
|
||||
from tensorrt_llm import LLM
|
||||
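# A minimal sketch of the rest of this example (assumed checkpoint name; any entry
# from the ModelOpt Hugging Face collection linked above follows the same pattern):
llm = LLM(model="nvidia/Llama-3.1-8B-Instruct-FP8")
llm.generate("Hello, my name is")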
@ -54,8 +54,8 @@ If a pre-quantized model is not available on the [Hugging Face Hub](https://hugg
|
||||
Follow this step-by-step guide to quantize a model:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git
|
||||
cd TensorRT-Model-Optimizer/examples/llm_ptq
|
||||
git clone https://github.com/NVIDIA/Model-Optimizer.git
|
||||
cd Model-Optimizer/examples/llm_ptq
|
||||
scripts/huggingface_example.sh --model <huggingface_model_card> --quant fp8 --export_fmt hf
|
||||
```
|
||||
|
||||
@ -108,4 +108,4 @@ FP8 block wise scaling GEMM kernels for sm100 are using MXFP8 recipe (E4M3 act/w
|
||||
## Quick Links
|
||||
|
||||
- [Pre-quantized Models by ModelOpt](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4)
|
||||
- [ModelOpt Support Matrix](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/0_support_matrix.html)
|
||||
- [ModelOpt Support Matrix](https://nvidia.github.io/Model-Optimizer/guides/0_support_matrix.html)
|
||||
|
||||
@ -662,7 +662,7 @@ checkpoint. For the Llama-3.1 models, TensorRT-LLM provides the following checkp
|
||||
- [`nvidia/Llama-3.1-405B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-405B-Instruct-FP8)
|
||||
|
||||
`trtllm-bench` utilizes the `hf_quant_config.json` file present in the pre-quantized checkpoints above. The configuration
|
||||
file is present in checkpoints quantized with [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer)
|
||||
file is present in checkpoints quantized with [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer)
|
||||
and describes the compute and KV cache quantization that checkpoint was compiled with. For example, from the checkpoints
|
||||
above:
|
||||
|
||||
|
||||
@ -118,7 +118,7 @@ Optimize attention operations with different attention kernel implementations:
|
||||
|
||||
### Precision Support
|
||||
|
||||
AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`TensorRT-Model-Optimizer`](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
|
||||
AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`Model-Optimizer`](https://github.com/NVIDIA/Model-Optimizer).
|
||||
|
||||
**Supported precision types include:**
|
||||
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
# Quantization
|
||||
|
||||
The PyTorch backend supports FP8 and NVFP4 quantization. You can pass quantized models in HF model hub,
|
||||
which are generated by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
|
||||
which are generated by [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer).
|
||||
|
||||
```python
|
||||
from tensorrt_llm._torch import LLM
|
||||
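# Minimal sketch (assumed checkpoint name): load a Model Optimizer FP8 checkpoint
# from the HF model hub with the PyTorch-backend LLM class imported above.
llm = LLM(model="nvidia/Llama-3.1-8B-Instruct-FP8")
llm.generate("Hello, my name is")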
@ -12,7 +12,7 @@ llm.generate("Hello, my name is")
|
||||
Or you can run the following commands to quantize a model yourself:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git
|
||||
cd TensorRT-Model-Optimizer/examples/llm_ptq
|
||||
git clone https://github.com/NVIDIA/Model-Optimizer.git
|
||||
cd Model-Optimizer/examples/llm_ptq
|
||||
scripts/huggingface_example.sh --model <huggingface_model_card> --quant fp8 --export_fmt hf
|
||||
```
|
||||
|
||||
@ -90,16 +90,16 @@ python lm_eval_ad.py \
|
||||
--model autodeploy --model_args model=meta-llama/Meta-Llama-3.1-8B-Instruct,world_size=2 --tasks mmlu
|
||||
```
|
||||
|
||||
### Mixed-precision Quantization using TensorRT Model Optimizer
|
||||
### Mixed-precision Quantization using Model Optimizer
|
||||
|
||||
The TensorRT Model Optimizer [AutoQuantize](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) algorithm is a PTQ algorithm from ModelOpt that quantizes a model by searching for the best quantization format per layer while meeting the performance constraint specified by the user. This way, `AutoQuantize` lets you trade off model accuracy for performance.
|
||||
The Model Optimizer [AutoQuantize](https://nvidia.github.io/Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) algorithm is a PTQ algorithm from ModelOpt that quantizes a model by searching for the best quantization format per layer while meeting the performance constraint specified by the user. This way, `AutoQuantize` lets you trade off model accuracy for performance.
|
||||
|
||||
Currently `AutoQuantize` supports only `effective_bits` as the performance constraint (for both weight-only quantization and weight & activation quantization). See
|
||||
[AutoQuantize documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) for more details.
|
||||
[AutoQuantize documentation](https://nvidia.github.io/Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) for more details.
|
||||
|
||||
#### 1. Quantize a model with ModelOpt
|
||||
|
||||
Refer to [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/examples/llm_autodeploy/README.md) for generating a quantized model checkpoint.
|
||||
Refer to [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer/blob/main/examples/llm_autodeploy/README.md) for generating a quantized model checkpoint.
|
||||
|
||||
#### 2. Deploy the quantized model with AutoDeploy
|
||||
|
||||
|
||||
@ -212,7 +212,7 @@ In disaggregated serving, the context workers and generation workers have differ
|
||||
### Prerequisites
|
||||
|
||||
To enable mixed precision serving, you will need:
|
||||
1. A quantized checkpoint created with [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer)
|
||||
1. A quantized checkpoint created with [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer)
|
||||
2. The original unquantized checkpoint (Can also be quantized)
|
||||
3. Both checkpoints must use the same KV cache dtype to ensure compatibility during transfer
|
||||
|
||||
|
||||
@ -29,7 +29,7 @@ def run_medusa_decoding(use_modelopt_ckpt=False, model_dir=None):
|
||||
llm_kwargs = {}
|
||||
|
||||
if use_modelopt_ckpt:
|
||||
# This is a Llama-3.1-8B combined with Medusa heads provided by TensorRT Model Optimizer.
|
||||
# This is a Llama-3.1-8B combined with Medusa heads provided by Model Optimizer.
|
||||
# Both the base model (except lm_head) and Medusa heads have been quantized in FP8.
|
||||
model = model_dir or "nvidia/Llama-3.1-8B-Medusa-FP8"
|
||||
|
||||
@ -85,7 +85,7 @@ if __name__ == '__main__':
|
||||
parser.add_argument(
|
||||
'--use_modelopt_ckpt',
|
||||
action='store_true',
|
||||
help="Use FP8-quantized checkpoint from TensorRT Model Optimizer.")
|
||||
help="Use FP8-quantized checkpoint from Model Optimizer.")
|
||||
# TODO: remove this arg after ModelOpt ckpt is public on HF
|
||||
parser.add_argument('--model_dir', type=Path, default=None)
|
||||
args = parser.parse_args()
|
||||
|
||||
@ -9,7 +9,7 @@ def main():
|
||||
build_config.max_num_tokens = 1024
|
||||
|
||||
# Model could accept HF model name, a path to local HF model,
|
||||
# or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
|
||||
# or Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
|
||||
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||
build_config=build_config)
|
||||
|
||||
|
||||
@ -7,7 +7,7 @@ from tensorrt_llm import LLM, SamplingParams
|
||||
def main():
|
||||
|
||||
# Model could accept HF model name, a path to local HF model,
|
||||
# or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
|
||||
# or Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
|
||||
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
|
||||
|
||||
# Sample prompts.
|
||||
|
||||
@ -4,7 +4,7 @@ from tensorrt_llm import LLM, SamplingParams
|
||||
def main():
|
||||
|
||||
# Model could accept HF model name, a path to local HF model,
|
||||
# or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
|
||||
# or Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
|
||||
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
|
||||
|
||||
# Sample prompts.
|
||||
|
||||
@ -19,7 +19,7 @@ For more info about Medusa visit [speculative decoding documentation](https://nv
|
||||
The TensorRT LLM Medusa example code is located in [`examples/medusa`](./). There is one [`convert_checkpoint.py`](./convert_checkpoint.py) file to convert and build the [TensorRT](https://developer.nvidia.com/tensorrt) engine(s) needed to run models with Medusa decoding support.
|
||||
In this example, we demonstrate the usage of two models:
|
||||
1. The Vicuna 7B model from Hugging Face [`FasterDecoding/medusa-vicuna-7b-v1.3`](https://huggingface.co/FasterDecoding/medusa-vicuna-7b-v1.3) with its Medusa heads [`medusa-vicuna-7b-v1.3`](https://huggingface.co/FasterDecoding/medusa-vicuna-7b-v1.3).
|
||||
2. The quantized checkpoint [`nvidia/Llama-3.1-8B-Medusa-FP8`](https://huggingface.co/nvidia/Llama-3.1-8B-Medusa-FP8) on Hugging Face by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) (ModelOpt). This model is based on [Llama-3.1 8B](https://huggingface.co/meta-llama/Llama-3.1-8B) and enhanced with Medusa heads, with both the base model (except lm_head) and Medusa heads already quantized in FP8.
|
||||
2. The quantized checkpoint [`nvidia/Llama-3.1-8B-Medusa-FP8`](https://huggingface.co/nvidia/Llama-3.1-8B-Medusa-FP8) on Hugging Face by [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) (ModelOpt). This model is based on [Llama-3.1 8B](https://huggingface.co/meta-llama/Llama-3.1-8B) and enhanced with Medusa heads, with both the base model (except lm_head) and Medusa heads already quantized in FP8.
|
||||
|
||||
### Build TensorRT engine(s)
|
||||
Get the weights by downloading base model [`vicuna-7b-v1.3`](https://huggingface.co/lmsys/vicuna-7b-v1.3) and Medusa Heads [`medusa-vicuna-7b-v1.3`](https://huggingface.co/FasterDecoding/medusa-vicuna-7b-v1.3) from HF.
|
||||
|
||||
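One way to fetch both sets of weights is through the `huggingface_hub` Python API; a minimal sketch (the local directory names are arbitrary choices):

```python
from huggingface_hub import snapshot_download

# Download the Vicuna base model and the Medusa heads referenced above.
snapshot_download(repo_id="lmsys/vicuna-7b-v1.3", local_dir="vicuna-7b-v1.3")
snapshot_download(repo_id="FasterDecoding/medusa-vicuna-7b-v1.3",
                  local_dir="medusa-vicuna-7b-v1.3")
```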
@ -773,7 +773,7 @@ You can enable FP8 MLA through either of these methods:
|
||||
|
||||
**Option 1: Checkpoint config**
|
||||
|
||||
TensorRT LLM automatically detects the `hf_quant_config.json` file in the model directory, which configures both GEMM and KV cache quantization. For example, see the FP4 DeepSeek-R1 checkpoint [configuration](https://huggingface.co/nvidia/DeepSeek-R1-FP4/blob/main/hf_quant_config.json) provided by [ModelOpt](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
|
||||
TensorRT LLM automatically detects the `hf_quant_config.json` file in the model directory, which configures both GEMM and KV cache quantization. For example, see the FP4 DeepSeek-R1 checkpoint [configuration](https://huggingface.co/nvidia/DeepSeek-R1-FP4/blob/main/hf_quant_config.json) provided by [ModelOpt](https://github.com/NVIDIA/Model-Optimizer).
|
||||
|
||||
To enable FP8 MLA, modify the `kv_cache_quant_algo` property. The following shows the config for DeepSeek's block-wise FP8 GEMM quantization + FP8 MLA:
|
||||
|
||||
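An indicative sketch of the relevant fields, written here as a Python dict (the actual `hf_quant_config.json` is JSON, and the algorithm name `FP8_BLOCK_SCALES` is an assumption to verify against the linked checkpoint config):

```python
# Indicative quantization section for block-wise FP8 GEMM + FP8 MLA (KV cache).
hf_quant_config = {
    "quantization": {
        "quant_algo": "FP8_BLOCK_SCALES",  # assumed name for DeepSeek block-wise FP8 GEMM
        "kv_cache_quant_algo": "FP8",      # FP8 MLA / FP8 KV cache
    }
}
```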
@ -808,14 +808,14 @@ Or you can follow the steps to generate one by yourselves.
|
||||
|
||||
#### Activation calibration
|
||||
|
||||
[ModelOpt](https://github.com/NVIDIA/TensorRT-Model-Optimizer) is used for calibrating activations of MoE layers. We provide a calibrated file at [HF model hub](https://huggingface.co/Barrrrry/DeepSeek-R1-W4AFP8/blob/main/act_scales.safetensors) or you can run the following commands to generate by yourselves.
|
||||
[ModelOpt](https://github.com/NVIDIA/Model-Optimizer) is used for calibrating activations of MoE layers. We provide a calibrated file at [HF model hub](https://huggingface.co/Barrrrry/DeepSeek-R1-W4AFP8/blob/main/act_scales.safetensors) or you can run the following commands to generate by yourselves.
|
||||
|
||||
```bash
|
||||
# Make sure there are enough GPU resources (8x H200) to run the following commands
|
||||
PATH_OF_DEEPSEEK_R1=/llm-models/DeepSeek-R1/DeepSeek-R1
|
||||
|
||||
# Install ModelOpt from source
|
||||
git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git && cd TensorRT-Model-Optimizer
|
||||
git clone https://github.com/NVIDIA/Model-Optimizer.git && cd Model-Optimizer
|
||||
pip install "nvidia-modelopt[all]" -U --extra-index-url https://pypi.nvidia.com
|
||||
|
||||
# Clone DeepSeek-V3 (base model of R1) Github repository for FP8 inference,
|
||||
|
||||
@ -85,17 +85,17 @@ The output will be like:
|
||||
|
||||
#### PyTorch flow Quantization
|
||||
|
||||
For PyTorch flow, TRT-LLM supports quantized format generated by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
|
||||
For PyTorch flow, TRT-LLM supports quantized format generated by [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer).
|
||||
|
||||
You can either use pre-quantized models from the HF model hub, or generate a quantized model yourself and then run it with the command below:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git
|
||||
cd TensorRT-Model-Optimizer/examples/llm_ptq
|
||||
git clone https://github.com/NVIDIA/Model-Optimizer.git
|
||||
cd Model-Optimizer/examples/llm_ptq
|
||||
scripts/huggingface_example.sh --model hf_models/$MODEL_NAME --quant fp8 --export_fmt hf
|
||||
```
|
||||
|
||||
For more information, please refer to the official [docs](https://github.com/NVIDIA/TensorRT-Model-Optimizer) or the [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) repository.
|
||||
For more information, please refer to the official [docs](https://github.com/NVIDIA/Model-Optimizer) or the [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) repository.
|
||||
|
||||
Troubleshooting
|
||||
|
||||
@ -107,7 +107,7 @@ Hint: Move the offending context manager(s) to outside the compiled region.
|
||||
Hint: This graph break may have been caused by an earlier graph break. Resolving the earlier graph break may resolve this one.
|
||||
```
|
||||
|
||||
This error may indicate an incompatibility between `torch.compile()` and the `HybridCache` module of the transformers library. As a result, [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) (ModelOpt) cannot perform PTQ with HybridCache.
|
||||
This error may indicate an incompatibility between `torch.compile()` and the `HybridCache` module of the transformers library. As a result, [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) (ModelOpt) cannot perform PTQ with HybridCache.
|
||||
|
||||
Temporarily switching to `DynamicCache` when creating PTQ models could help address the issue. This can be done by updating the `cache_implementation` field in the `generation_config.json` file located in the model checkpoint directory, for example:
|
||||
```json
|
||||
|
||||
@ -1559,7 +1559,7 @@ Explanation:
|
||||
|
||||
|
||||
### Launch trtllm-serve OpenAI-compatible API server
|
||||
TensorRT LLM supports NVIDIA TensorRT Model Optimizer quantized FP8 checkpoints
|
||||
TensorRT LLM supports NVIDIA Model Optimizer quantized FP8 checkpoints
|
||||
``` bash
|
||||
trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 \
|
||||
--tp_size 8 \
|
||||
|
||||
@ -42,7 +42,7 @@ Explanation:
|
||||
|
||||
|
||||
#### 2. Launch trtllm-serve OpenAI-compatible API server
|
||||
TensorRT LLM supports NVIDIA TensorRT Model Optimizer quantized FP8 checkpoints
|
||||
TensorRT LLM supports NVIDIA Model Optimizer quantized FP8 checkpoints
|
||||
``` bash
|
||||
trtllm-serve nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8 \
|
||||
--max_batch_size 512 \
|
||||
@ -94,7 +94,7 @@ Explanation:
|
||||
|
||||
|
||||
#### 2. Launch trtllm-serve OpenAI-compatible API server
|
||||
TensorRT LLM supports NVIDIA TensorRT Model Optimizer quantized FP8 checkpoints.
|
||||
TensorRT LLM supports NVIDIA Model Optimizer quantized FP8 checkpoints.
|
||||
``` bash
|
||||
trtllm-serve nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8 \
|
||||
--max_batch_size 8 \
|
||||
@ -140,7 +140,7 @@ Explanation:
|
||||
|
||||
|
||||
#### 2. Launch trtllm-serve OpenAI-compatible API server
|
||||
TensorRT LLM supports NVIDIA TensorRT Model Optimizer quantized FP8 checkpoints.
|
||||
TensorRT LLM supports NVIDIA Model Optimizer quantized FP8 checkpoints.
|
||||
``` bash
|
||||
trtllm-serve nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8 \
|
||||
--tp_size 8 \
|
||||
|
||||
@ -663,19 +663,19 @@ trtllm-eval --model=Qwen3-30B-A3B/ --tokenizer=Qwen3-30B-A3B/ --backend=pytorch
|
||||
To quantize the Qwen3 model for use with the PyTorch backend, we'll use NVIDIA's Model Optimizer (ModelOpt) tool. Follow these steps:
|
||||
|
||||
```bash
|
||||
# Clone the TensorRT Model Optimizer (ModelOpt)
|
||||
git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git
|
||||
pushd TensorRT-Model-Optimizer
|
||||
# Clone the Model Optimizer (ModelOpt)
|
||||
git clone https://github.com/NVIDIA/Model-Optimizer.git
|
||||
pushd Model-Optimizer
|
||||
|
||||
# install the ModelOpt
|
||||
pip install -e .
|
||||
|
||||
# Quantize the Qwen3-235B-A22B model by nvfp4
|
||||
# By default, the checkpoint would be stored in `TensorRT-Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/`.
|
||||
# By default, the checkpoint would be stored in `Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/`.
|
||||
./examples/llm_ptq/scripts/huggingface_example.sh --model Qwen3-235B-A22B/ --quant nvfp4 --export_fmt hf
|
||||
|
||||
# Quantize the Qwen3-32B model by fp8_pc_pt
|
||||
# By default, the checkpoint would be stored in `TensorRT-Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-32B_fp8_pc_pt_hf/`.
|
||||
# By default, the checkpoint would be stored in `Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-32B_fp8_pc_pt_hf/`.
|
||||
./examples/llm_ptq/scripts/huggingface_example.sh --model Qwen3-32B/ --quant fp8_pc_pt --export_fmt hf
|
||||
popd
|
||||
```
|
||||
@ -687,7 +687,7 @@ To run the benchmark, we suggest using the `trtllm-bench` tool. Please refer to
|
||||
```bash
|
||||
#!/bin/bash
|
||||
|
||||
folder_model=TensorRT-Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/
|
||||
folder_model=Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/
|
||||
path_config=extra-llm-api-config.yml
|
||||
num_gpus=8
|
||||
ep_size=8
|
||||
@ -727,7 +727,7 @@ trtllm-bench --model ${folder_model} --model_path ${folder_model} throughput \
|
||||
We suggest benchmarking with a real dataset, which prevents improperly distributed tokens in the MoE. Here, we use the `aa_prompt_isl_1k_osl_2k_qwen3_10000samples.txt` dataset, which has 10000 samples with an average input length of 1024 and an average output length of 2048. If you don't have a dataset (this one or another) and you want to run the benchmark, you can use the following command to generate a random dataset:
|
||||
|
||||
```bash
|
||||
folder_model=TensorRT-Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/
|
||||
folder_model=Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/
|
||||
min_input_len=1024
|
||||
min_output_len=2048
|
||||
concurrency=128
|
||||
|
||||
@ -11,7 +11,7 @@ The detailed LLM quantization recipe is distributed to the README.md of the corr
|
||||
|
||||
## Installation
|
||||
|
||||
The NVIDIA TensorRT Model Optimizer quantization toolkit is installed automatically as a dependency of TensorRT-LLM.
|
||||
The NVIDIA Model Optimizer quantization toolkit is installed automatically as a dependency of TensorRT-LLM.
|
||||
|
||||
```bash
|
||||
# Install the additional requirements
|
||||
|
||||
@ -1126,7 +1126,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
|
||||
def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, gpuCount=1, nodeCount=1, runWithSbatch=false, skipInstallWheel=false, cpver="cp312")
|
||||
{
|
||||
echo "Run Slurm job with native sbatch: $runWithSbatch"
|
||||
if(nodeCount > 1 || runWithSbatch) {
|
||||
if (nodeCount > 1 || runWithSbatch) {
|
||||
runLLMTestlistWithSbatch(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, gpuCount, nodeCount, skipInstallWheel, cpver)
|
||||
} else {
|
||||
runLLMTestlistWithAgent(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, gpuCount, skipInstallWheel, cpver)
|
||||
@ -1639,6 +1639,7 @@ def launchTestListCheck(pipeline)
|
||||
sh "tar -zxf ${tarName}"
|
||||
def llmPath = sh (script: "realpath .", returnStdout: true).trim()
|
||||
def llmSrc = "${llmPath}/TensorRT-LLM/src"
|
||||
trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install -r ${llmSrc}/requirements-dev.txt")
|
||||
sh "NVIDIA_TRITON_SERVER_VERSION=25.10 LLM_ROOT=${llmSrc} LLM_BACKEND_ROOT=${llmSrc}/triton_backend python3 ${llmSrc}/scripts/check_test_list.py --l0 --qa --waive"
|
||||
} catch (InterruptedException e) {
|
||||
throw e
|
||||
@ -2492,7 +2493,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
|
||||
error "Some tests still failed after rerun attempts, please check the test report."
|
||||
}
|
||||
|
||||
if (perfMode) {
|
||||
if (perfMode && !stageName.contains("Perf-Sanity")) {
|
||||
basePerfFilename = stageName.contains("PyTorch") ? "base_perf_pytorch.csv" : "base_perf.csv"
|
||||
basePerfPath = "${llmSrc}/tests/integration/defs/perf/${basePerfFilename}"
|
||||
stage("Check perf result") {
|
||||
@ -2903,12 +2904,14 @@ def launchTestJobs(pipeline, testFilter)
|
||||
"DGX_B200-4_GPUs-PyTorch-2": ["b200-x4", "l0_dgx_b200", 2, 2, 4],
|
||||
"DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4", "l0_dgx_b200", 1, 1, 4],
|
||||
"DGX_B200-8_GPUs-PyTorch-1": ["b200-x8", "l0_dgx_b200", 1, 1, 8],
|
||||
"DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 1, 4, 1, true],
|
||||
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 1, 4],
|
||||
"DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 2, 4, 1, true],
|
||||
"DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-trtllm", "l0_dgx_b200", 2, 2, 4, 1, true],
|
||||
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
|
||||
"DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
|
||||
// Perf sanity post merge test
|
||||
// Disable perf stages due to https://nvbugs/5643646
|
||||
// "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "perf_sanity_l0_dgx_b200", 1, 1, 4],
|
||||
// "DGX_B300-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b300-x4", "perf_sanity_l0_dgx_b300", 1, 1, 4],
|
||||
// "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 1, 4],
|
||||
// "DGX_B200-8_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 1, 8],
|
||||
// "DGX_B300-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b300-x4", "l0_dgx_b300_perf_sanity", 1, 1, 4],
|
||||
]
|
||||
fullSet += x86SlurmTestConfigs.keySet()
|
||||
|
||||
@ -2933,8 +2936,11 @@ def launchTestJobs(pipeline, testFilter)
|
||||
fullSet += SBSATestConfigs.keySet()
|
||||
|
||||
SBSASlurmTestConfigs = [
|
||||
"GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
|
||||
"GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 2, 4],
|
||||
"GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4],
|
||||
"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
|
||||
// Perf sanity post merge test
|
||||
"GB200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 1, 4],
|
||||
// Disable GB300 stages due to nodes will be offline temporarily.
|
||||
// "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
|
||||
// "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
|
||||
@ -2949,6 +2955,8 @@ def launchTestJobs(pipeline, testFilter)
|
||||
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
|
||||
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
|
||||
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
|
||||
// Perf sanity post merge test
|
||||
"GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_perf_sanity", 1, 1, 8, 2],
|
||||
]
|
||||
fullSet += multiNodesSBSAConfigs.keySet()
|
||||
|
||||
|
||||
@ -29,10 +29,14 @@ set_value_in_command() {
|
||||
echo "$result"
|
||||
}
|
||||
|
||||
# Only the first process will save the job ID
|
||||
# Only the first process will save the job ID and set the git config
|
||||
if [ $SLURM_PROCID -eq 0 ]; then
|
||||
# Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
|
||||
echo $SLURM_JOB_ID > $jobWorkspace/slurm_job_id.txt
|
||||
# Update HOME/.gitconfig
|
||||
if ! git config --global --get-all safe.directory | grep -Fxq "*"; then
|
||||
git config --global --add safe.directory "*"
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $SLURM_LOCALID -eq 0 ]; then
|
||||
@ -47,7 +51,6 @@ if [ $SLURM_LOCALID -eq 0 ]; then
|
||||
fi
|
||||
cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt
|
||||
cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
|
||||
git config --global --add safe.directory "*"
|
||||
gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
|
||||
hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
|
||||
echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
|
||||
@ -106,7 +109,7 @@ echo "Full Command: $pytestCommand"
|
||||
eval $pytestCommand
|
||||
echo "Rank${SLURM_PROCID} Pytest finished execution"
|
||||
|
||||
if [ "$perfMode" = "true" ]; then
|
||||
if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Perf-Sanity* ]]; then
|
||||
if [[ "$stageName" == *PyTorch* ]]; then
|
||||
basePerfFilename="base_perf_pytorch.csv"
|
||||
else
|
||||
|
||||
@ -23,10 +23,9 @@ MARKER_LIST_IN_TEST = [" TIMEOUT"]
|
||||
|
||||
|
||||
def install_python_dependencies(llm_src):
|
||||
subprocess.run(
|
||||
f"cd {llm_src} && pip3 install --retries 1 -r requirements-dev.txt",
|
||||
shell=True,
|
||||
check=True)
|
||||
subprocess.run(f"cd {llm_src} && pip3 install -r requirements-dev.txt",
|
||||
shell=True,
|
||||
check=True)
|
||||
subprocess.run(
|
||||
f"pip3 install --force-reinstall --no-deps {llm_src}/../tensorrt_llm-*.whl",
|
||||
shell=True,
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
{
|
||||
"commit_hash": "e4c707845ff58fcc0b1d87afb4dd0e64885c780a",
|
||||
"timestamp": "2025-12-07T02:39:14Z"
|
||||
"commit_hash": "8e27ce7084d9fab1051e88fc945732e59689761b",
|
||||
"timestamp": "2025-12-08T02:39:23Z"
|
||||
}
|
||||
|
||||
@ -511,6 +511,11 @@ class AutoTunerProfilingCache:
|
||||
cache = {}
|
||||
cache_data = serializable_cache["cache_data"]
|
||||
|
||||
def lists_to_tuples(obj):
|
||||
if isinstance(obj, list):
|
||||
return tuple(lists_to_tuples(x) for x in obj)
|
||||
return obj
|
||||
|
||||
for key_str, value in cache_data.items():
|
||||
# Reconstruct the tuple key safely
|
||||
try:
|
||||
@ -521,7 +526,7 @@ class AutoTunerProfilingCache:
|
||||
continue
|
||||
|
||||
runner_id = value["runner_id"]
|
||||
tactic = value["tactic"]
|
||||
tactic = lists_to_tuples(value["tactic"])
|
||||
min_time = value["min_time"]
|
||||
|
||||
cache[key] = (runner_id, tactic, min_time)
|
||||
|
||||
@ -486,10 +486,10 @@ class CublasLtFP4GemmRunner(TunableRunner):
|
||||
self.cublaslt_runner = CublasLtFP4GemmRunner.runner_dict[instance_key]
|
||||
|
||||
def unique_id(self):
|
||||
return hash((
|
||||
return (
|
||||
self.to_userbuffers,
|
||||
self.output_dtype,
|
||||
))
|
||||
)
|
||||
|
||||
def get_valid_tactics(self, inputs: List[torch.Tensor],
|
||||
profile: OptimizationProfile, **kwargs) -> List[int]:
|
||||
|
||||
@ -170,18 +170,23 @@ class ConfigurableMoE(MoE):
|
||||
# ConfigurableMoE's super().__init__() was called with real layer_idx and initialized load balancer.
|
||||
# Backend was created with init_load_balancer=False and without_comm=True to avoid
|
||||
# duplicate initialization. Now sync all attributes from ConfigurableMoE to backend.
|
||||
self.backend.aux_stream_dict = self.aux_stream_dict
|
||||
self.backend.layer_idx = self.layer_idx
|
||||
self.backend.layer_idx_str = self.layer_idx_str
|
||||
self.backend.num_slots = self.num_slots
|
||||
self.backend.layer_load_balancer = self.layer_load_balancer
|
||||
self.backend.repeat_count = self.repeat_count
|
||||
self.backend.repeat_idx = self.repeat_idx
|
||||
self.backend.initial_local_expert_ids = self.initial_local_expert_ids
|
||||
self.backend.initial_global_assignments = self.initial_global_assignments
|
||||
self.backend.slot_start = self.slot_start
|
||||
self.backend.slot_end = self.slot_end
|
||||
self.backend.expert_size_per_partition = self.expert_size_per_partition
|
||||
if self.backend is not None:
|
||||
# Add a check to WAR the issue that the backend is none during torch.compile
|
||||
assert not torch.compiler.is_compiling(), (
|
||||
"Backend should not be none if not in torch.compile"
|
||||
)
|
||||
self.backend.aux_stream_dict = self.aux_stream_dict
|
||||
self.backend.layer_idx = self.layer_idx
|
||||
self.backend.layer_idx_str = self.layer_idx_str
|
||||
self.backend.num_slots = self.num_slots
|
||||
self.backend.layer_load_balancer = self.layer_load_balancer
|
||||
self.backend.repeat_count = self.repeat_count
|
||||
self.backend.repeat_idx = self.repeat_idx
|
||||
self.backend.initial_local_expert_ids = self.initial_local_expert_ids
|
||||
self.backend.initial_global_assignments = self.initial_global_assignments
|
||||
self.backend.slot_start = self.slot_start
|
||||
self.backend.slot_end = self.slot_end
|
||||
self.backend.expert_size_per_partition = self.expert_size_per_partition
|
||||
|
||||
# Create weights here, because the backend needs the layer_load_balancer info to create weights
|
||||
model_config._frozen = False
|
||||
|
||||
@ -2844,11 +2844,17 @@ class PyTorchModelEngine(ModelEngine):
|
||||
# Disable UB for unsupported platforms
|
||||
if not ub.ub_supported():
|
||||
return False
|
||||
use_nccl_symmetric = self.llm_args.allreduce_strategy == "NCCL_SYMMETRIC"
|
||||
ub.initialize_userbuffers_manager(
|
||||
self.mapping.tp_size, self.mapping.pp_size, self.mapping.cp_size,
|
||||
self.mapping.rank, self.mapping.gpus_per_node,
|
||||
hidden_size * self.max_num_tokens * 2, use_nccl_symmetric)
|
||||
# NCCL_SYMMETRIC strategy no longer requires UserBuffer allocator initialization.
|
||||
# It uses NCCLWindowAllocator from ncclUtils directly.
|
||||
if self.llm_args.allreduce_strategy == "NCCL_SYMMETRIC":
|
||||
# Skip UB initialization for NCCL_SYMMETRIC - it uses NCCLWindowAllocator directly
|
||||
return False
|
||||
ub.initialize_userbuffers_manager(self.mapping.tp_size,
|
||||
self.mapping.pp_size,
|
||||
self.mapping.cp_size,
|
||||
self.mapping.rank,
|
||||
self.mapping.gpus_per_node,
|
||||
hidden_size * self.max_num_tokens * 2)
|
||||
|
||||
return True
|
||||
|
||||
|
||||
@ -4020,7 +4020,10 @@ def create_allreduce_plugin(
|
||||
pfc = trt.PluginFieldCollection(pfc)
|
||||
ar_plug = allreduce_plg_creator.create_plugin("allreduce", pfc)
|
||||
plug_inputs = [tensor]
|
||||
if all_reduce_params.strategy != AllReduceStrategy.NCCL and all_reduce_params.strategy != AllReduceStrategy.UB:
|
||||
if all_reduce_params.strategy not in {
|
||||
AllReduceStrategy.NCCL, AllReduceStrategy.UB,
|
||||
AllReduceStrategy.NCCL_SYMMETRIC
|
||||
}:
|
||||
plug_inputs.append(workspace)
|
||||
if all_reduce_params.fusion_op != AllReduceFusionOp.NONE:
|
||||
if all_reduce_params.has_bias() == 1:
|
||||
@ -4092,7 +4095,7 @@ def allreduce(
|
||||
workspace = None
|
||||
if all_reduce_params.strategy != AllReduceStrategy.NCCL and all_reduce_params.strategy != AllReduceStrategy.UB:
|
||||
if current_all_reduce_helper().workspace is None:
|
||||
all_reduce_params.strategy = AllReduceStrategy.NCCL
|
||||
all_reduce_params.strategy = AllReduceStrategy.NCCL_SYMMETRIC
|
||||
else:
|
||||
workspace = current_all_reduce_helper().workspace.trt_tensor
|
||||
if all_reduce_params.strategy == AllReduceStrategy.UB:
|
||||
|
||||
@ -13,9 +13,37 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import os
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from mpi4py.futures import MPIPoolExecutor
|
||||
|
||||
|
||||
def patch_mpi_pool_session_for_env(mocker, env_vars: dict):
|
||||
"""
|
||||
Patch MpiPoolSession._start_mpi_pool to propagate environment variables to MPI child processes.
|
||||
|
||||
Uses MPIPoolExecutor's built-in `env` parameter instead of `initializer` to avoid
|
||||
segfault issues during process cleanup (UCX memory cache conflicts with PyTorch
|
||||
tensor cleanup during Py_FinalizeEx).
|
||||
|
||||
Args:
|
||||
mocker: pytest-mock mocker fixture
|
||||
env_vars: Dictionary of environment variable name -> value to propagate
|
||||
"""
|
||||
from tensorrt_llm.llmapi.mpi_session import MpiPoolSession
|
||||
|
||||
def patched_start_mpi_pool(self):
|
||||
assert not self.mpi_pool, 'MPI session already started'
|
||||
self.mpi_pool = MPIPoolExecutor(max_workers=self.n_workers,
|
||||
path=sys.path,
|
||||
env=env_vars)
|
||||
|
||||
mocker.patch.object(MpiPoolSession, '_start_mpi_pool',
|
||||
patched_start_mpi_pool)
|
||||
|
||||
|
||||
from defs.conftest import get_sm_version, is_sm_100f
|
||||
|
||||
from tensorrt_llm import LLM
|
||||
@ -1830,9 +1858,24 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
|
||||
ids=["tp4", "ep4", "tp2pp2", "pp4"])
|
||||
@parametrize_with_ids("mtp_nextn", [0, 2])
|
||||
@parametrize_with_ids("moe_backend", ["CUTLASS", "TRTLLM", "CUTEDSL"])
|
||||
@pytest.mark.parametrize("enable_configurable_moe", [0, 1],
|
||||
ids=lambda x: ""
|
||||
if x == 0 else "enable_configurable_moe")
|
||||
def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
|
||||
overlap_scheduler, tp_size, pp_size, ep_size,
|
||||
torch_compile, mtp_nextn, moe_backend):
|
||||
torch_compile, mtp_nextn, moe_backend,
|
||||
enable_configurable_moe, mocker):
|
||||
# Handle ENABLE_CONFIGURABLE_MOE environment variable
|
||||
if enable_configurable_moe == 1 and moe_backend != "TRTLLM":
|
||||
pytest.skip(
|
||||
f"ENABLE_CONFIGURABLE_MOE=1 is only supported with TRTLLM backend, "
|
||||
f"current backend is {moe_backend}")
|
||||
|
||||
# Patch MpiPoolSession to propagate env vars to MPI worker processes
|
||||
env_value = "1" if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0"
|
||||
patch_mpi_pool_session_for_env(mocker,
|
||||
{"ENABLE_CONFIGURABLE_MOE": env_value})
|
||||
|
||||
if moe_backend == "TRTLLM" and (get_sm_version() == 120
|
||||
or get_sm_version() == 121):
|
||||
pytest.skip(
|
||||
@ -3452,9 +3495,23 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness):
|
||||
ids=["latency", "ep2", "ep4"])
|
||||
@pytest.mark.parametrize("activation_dtype", ["static_fp8", "mxfp8"],
|
||||
ids=["fp8", "mxfp8"])
|
||||
@pytest.mark.parametrize("enable_configurable_moe", [0, 1],
|
||||
ids=lambda x: ""
|
||||
if x == 0 else "enable_configurable_moe")
|
||||
def test_w4a8_mxfp4(self, moe_backend, tp_size, pp_size, ep_size,
|
||||
attention_dp, cuda_graph, overlap_scheduler,
|
||||
activation_dtype):
|
||||
activation_dtype, enable_configurable_moe, mocker):
|
||||
# Handle ENABLE_CONFIGURABLE_MOE environment variable
|
||||
if enable_configurable_moe == 1 and moe_backend != "TRTLLM":
|
||||
pytest.skip(
|
||||
f"ENABLE_CONFIGURABLE_MOE=1 is only supported with TRTLLM backend, "
|
||||
f"current backend is {moe_backend}")
|
||||
|
||||
# Patch MpiPoolSession to propagate env vars to MPI worker processes
|
||||
env_value = "1" if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0"
|
||||
patch_mpi_pool_session_for_env(mocker,
|
||||
{"ENABLE_CONFIGURABLE_MOE": env_value})
|
||||
|
||||
if moe_backend == "TRITON":
|
||||
if not IS_TRITON_KERNELS_AVAILABLE:
|
||||
pytest.skip("TRITON moe backend is not available.")
|
||||
@ -3906,9 +3963,23 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
|
||||
(4, 1, 4, True, True, True),
|
||||
],
|
||||
ids=["tp4", "ep4", "dp4"])
|
||||
@pytest.mark.parametrize("enable_configurable_moe", [0, 1],
|
||||
ids=lambda x: ""
|
||||
if x == 0 else "enable_configurable_moe")
|
||||
def test_w4_4gpus(self, kv_cache_dtype, moe_backend, tp_size, pp_size,
|
||||
ep_size, attention_dp, cuda_graph, overlap_scheduler,
|
||||
mocker):
|
||||
enable_configurable_moe, mocker):
|
||||
# Handle ENABLE_CONFIGURABLE_MOE environment variable
|
||||
if enable_configurable_moe == 1 and moe_backend != "TRTLLM":
|
||||
pytest.skip(
|
||||
f"ENABLE_CONFIGURABLE_MOE=1 is only supported with TRTLLM backend, "
|
||||
f"current backend is {moe_backend}")
|
||||
|
||||
# Patch MpiPoolSession to propagate env vars to MPI worker processes
|
||||
env_value = "1" if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0"
|
||||
patch_mpi_pool_session_for_env(mocker,
|
||||
{"ENABLE_CONFIGURABLE_MOE": env_value})
|
||||
|
||||
if moe_backend == "TRITON":
|
||||
if not IS_TRITON_KERNELS_AVAILABLE:
|
||||
pytest.skip("Triton kernels are not available")
|
||||
@ -3925,7 +3996,8 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
|
||||
|
||||
pytorch_config = dict(
|
||||
disable_overlap_scheduler=not overlap_scheduler,
|
||||
cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
|
||||
cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
|
||||
moe_config=MoeConfig(backend=moe_backend))
|
||||
|
||||
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
|
||||
dtype=kv_cache_dtype)
|
||||
@ -3939,8 +4011,7 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
|
||||
max_seq_len=max_seq_len,
|
||||
max_batch_size=720,
|
||||
**pytorch_config,
|
||||
enable_attention_dp=attention_dp,
|
||||
moe_config=MoeConfig(backend=moe_backend))
|
||||
enable_attention_dp=attention_dp)
|
||||
|
||||
with llm:
|
||||
model_name = "GPT-OSS/120B-MXFP4"
|
||||
@ -4252,8 +4323,17 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
|
||||
@pytest.mark.parametrize(
|
||||
"kv_cache_dtype",
|
||||
["auto", pytest.param("fp8", marks=skip_pre_blackwell)])
|
||||
def test_w4_4gpus_online_eplb(self, kv_cache_dtype, mocker):
|
||||
@pytest.mark.parametrize("enable_configurable_moe", [0, 1],
|
||||
ids=lambda x: ""
|
||||
if x == 0 else "enable_configurable_moe")
|
||||
def test_w4_4gpus_online_eplb(self, kv_cache_dtype, enable_configurable_moe,
|
||||
mocker):
|
||||
"""Test GPTOSS with online expert parallel load balancer using TRTLLM backend and attention DP."""
|
||||
# Patch MpiPoolSession to propagate env vars to MPI worker processes
|
||||
env_value = "1" if enable_configurable_moe == 1 else "0"
|
||||
patch_mpi_pool_session_for_env(mocker,
|
||||
{"ENABLE_CONFIGURABLE_MOE": env_value})
|
||||
|
||||
mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
|
||||
mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
|
||||
{"scores_filter": "exact_match,flexible-extract"})
|
||||
|
||||
@ -2209,6 +2209,94 @@ def pytest_generate_tests(metafunc: pytest.Metafunc):
|
||||
metafunc.parametrize("case", uts, ids=lambda x: x)
|
||||
|
||||
|
||||
# Test cases that use enable_configurable_moe parameter and need ID conversion
|
||||
TESTS_WITH_CONFIGURABLE_MOE = [
|
||||
"TestDeepSeekV3Lite::test_nvfp4_4gpus",
|
||||
"TestGPTOSS::test_w4_4gpus",
|
||||
"TestGPTOSS::test_w4_4gpus_online_eplb",
|
||||
"TestQwen3_30B_A3B::test_w4a8_mxfp4",
|
||||
]
|
||||
|
||||
|
||||
def _convert_clean_to_original_moe_test_id(test_id):
|
||||
"""Convert clean MoE test ID back to original format for pytest collection.
|
||||
|
||||
Example: "test_llm_api_pytorch.py::test_foo[param]" -> "test_llm_api_pytorch.py::test_foo[-param]"
|
||||
|
||||
This is needed because the `enable_configurable_moe` parameter uses an empty string
|
||||
as its ID when the value is 0, resulting in test IDs like "test_foo[-param]".
|
||||
We clean these up in pytest_collection_modifyitems, but pytest filters tests
|
||||
during collection using the original IDs. So when the user runs with the clean test name,
|
||||
we need to convert it back to match the original.
|
||||
"""
|
||||
if "test_llm_api_pytorch.py" not in test_id:
|
||||
return test_id
|
||||
|
||||
# Match pattern like "test_name[params]" and add leading dash after "["
|
||||
# But only if params don't already start with "-" or "enable_configurable_moe"
|
||||
match = re.search(r"\[([^\]]+)\]", test_id)
|
||||
if match:
|
||||
params = match.group(1)
|
||||
# Skip if already has leading dash or starts with enable_configurable_moe
|
||||
if not params.startswith("-") and not params.startswith(
|
||||
"enable_configurable_moe"):
|
||||
# Add leading dash to params
|
||||
new_params = "-" + params
|
||||
test_id = test_id.replace(f"[{params}]", f"[{new_params}]")
|
||||
|
||||
return test_id
|
||||
|
||||
|
||||
def pytest_sessionstart(session):
|
||||
"""Convert clean MoE test IDs in config.args to original format for collection.
|
||||
|
||||
This is needed because pytest filters tests during collection using original IDs.
|
||||
When the user runs with the clean test name, we convert it back to match the original.
|
||||
"""
|
||||
args = session.config.args
|
||||
for i, arg in enumerate(args):
|
||||
if "test_llm_api_pytorch.py" in arg and "[" in arg:
|
||||
# Only apply conversion to specific tests that use enable_configurable_moe
|
||||
should_convert = any(test_name in arg
|
||||
for test_name in TESTS_WITH_CONFIGURABLE_MOE)
|
||||
if should_convert:
|
||||
args[i] = _convert_clean_to_original_moe_test_id(arg)
|
||||
|
||||
|
||||
def _clean_moe_test_ids(items):
|
||||
"""Clean up test IDs by removing leading/trailing dashes from parameter IDs.
|
||||
|
||||
This is needed because the `enable_configurable_moe` parameter ID can be empty,
|
||||
resulting in ugly test IDs like "test_foo[-True]" or "test_foo[--abc]".
|
||||
We clean these up to "test_foo[True]" or "test_foo[abc]" so that:
|
||||
1. Test names in waive files and test lists remain unchanged
|
||||
2. Test reports look cleaner
|
||||
"""
|
||||
for item in items:
|
||||
if "test_llm_api_pytorch.py" in item.nodeid and "[" in item.nodeid:
|
||||
# Only apply cleanup to specific tests that use enable_configurable_moe
|
||||
should_cleanup = any(test_name in item.nodeid
|
||||
for test_name in TESTS_WITH_CONFIGURABLE_MOE)
|
||||
if should_cleanup:
|
||||
original_nodeid = item.nodeid
|
||||
original_name = item.name
|
||||
nodeid = item.nodeid
|
||||
name = item.name
|
||||
|
||||
# Clean up leading/trailing dashes in nodeid
|
||||
nodeid = nodeid.replace("[-", "[")
|
||||
nodeid = nodeid.replace("-]", "]")
|
||||
|
||||
# Clean up leading/trailing dashes in name
|
||||
name = name.replace("[-", "[")
|
||||
name = name.replace("-]", "]")
|
||||
|
||||
if nodeid != original_nodeid:
|
||||
item._nodeid = nodeid
|
||||
if name != original_name:
|
||||
item.name = name
|
||||
|
||||
|
||||
@pytest.hookimpl(tryfirst=True, hookwrapper=True)
|
||||
def pytest_collection_modifyitems(session, config, items):
|
||||
testlist_path = config.getoption("--test-list")
|
||||
@ -2217,6 +2305,10 @@ def pytest_collection_modifyitems(session, config, items):
|
||||
perf_test = config.getoption("--perf")
|
||||
test_model_suites = config.getoption("--test-model-suites")
|
||||
|
||||
# TODO Once the MoE refactor is complete, this should be removed.
|
||||
# This is a temporary WAR to minimize the impact of the MoE refactor on the existing test lists.
|
||||
_clean_moe_test_ids(items)
|
||||
|
||||
if perf_test:
|
||||
global ALL_PYTEST_ITEMS
|
||||
ALL_PYTEST_ITEMS = None
|
||||
|
||||
@ -127,6 +127,24 @@ def run_user_buffer_tests(build_dir: _pl.Path, nprocs=2, timeout=300):
|
||||
timeout=timeout)
|
||||
|
||||
|
||||
def run_nccl_utils_tests(build_dir: _pl.Path, nprocs=2, timeout=300):
|
||||
tests_dir = build_dir / "tests" / "unit_tests" / "multi_gpu"
|
||||
mgpu_env = get_multi_gpu_env()
|
||||
|
||||
nccl_utils_test = [
|
||||
"mpirun",
|
||||
"-n",
|
||||
f"{nprocs}",
|
||||
"--allow-run-as-root",
|
||||
"ncclUtilsTest",
|
||||
]
|
||||
|
||||
_cpp.run_command(nccl_utils_test,
|
||||
cwd=tests_dir,
|
||||
env=mgpu_env,
|
||||
timeout=timeout)
|
||||
|
||||
|
||||
def run_llama_executor_leader_tests(build_dir: _pl.Path, timeout=1500):
|
||||
tests_dir = build_dir / "tests" / "e2e_tests"
|
||||
|
||||
@ -505,6 +523,15 @@ def test_user_buffer(build_google_tests, nprocs, build_dir):
|
||||
run_user_buffer_tests(build_dir=build_dir, nprocs=nprocs, timeout=300)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
|
||||
indirect=True)
|
||||
@pytest.mark.parametrize("nprocs", [2, 8], ids=["2proc", "8proc"])
|
||||
def test_nccl_utils(build_google_tests, nprocs, build_dir):
|
||||
|
||||
if platform.system() != "Windows":
|
||||
run_nccl_utils_tests(build_dir=build_dir, nprocs=nprocs, timeout=300)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
|
||||
indirect=True)
|
||||
@pytest.mark.parametrize("multi_gpu_model", ["t5"], indirect=True)
|
||||
|
||||
@ -114,13 +114,16 @@ def validate_timing_metrics(perf_metrics_item, request_context=""):
|
||||
)), f"gen server_first_token_time is not numeric in {request_context}"
|
||||
assert gen_server_arrival <= gen_server_first_token, f"gen server_arrival_time > server_first_token_time in {request_context}"
|
||||
|
||||
# Network Time Protocol can ensure ms-level accuracy on a LAN
|
||||
ntp_tolerance = 1e-3
|
||||
|
||||
# Validate timing relationships between different levels
|
||||
# Disaggregated server should receive request before individual servers
|
||||
assert disagg_arrival <= ctx_server_arrival, f"disagg_arrival > ctx_server_arrival in {request_context}"
|
||||
assert disagg_arrival <= gen_server_arrival, f"disagg_arrival > gen_server_arrival in {request_context}"
|
||||
assert disagg_arrival - ntp_tolerance <= ctx_server_arrival, f"disagg_arrival > ctx_server_arrival in {request_context}"
|
||||
assert disagg_arrival - ntp_tolerance <= gen_server_arrival, f"disagg_arrival > gen_server_arrival in {request_context}"
|
||||
|
||||
# Context should complete before generation starts
|
||||
assert ctx_server_first_token <= gen_server_arrival, f"ctx_server_first_token > gen_server_arrival in {request_context}"
|
||||
assert ctx_server_first_token - ntp_tolerance <= gen_server_arrival, f"ctx_server_first_token > gen_server_arrival in {request_context}"
|
||||
|
||||
# Validate internal timing consistency
|
||||
ctx_arrival_time = ctx_metrics["arrival_time"]
|
||||
|
||||
@ -249,8 +249,8 @@ class JobManager:
|
||||
logger.error(f"Job submission exception: {error_msg}")
|
||||
# Clean up temporary file on exception
|
||||
temp_config_path = test_config.temp_config_path
|
||||
if os.path.exists(temp_config_path):
|
||||
os.remove(temp_config_path)
|
||||
# if os.path.exists(temp_config_path):
|
||||
# os.remove(temp_config_path)
|
||||
return False, error_msg
|
||||
|
||||
@staticmethod
|
||||
|
||||
@ -12,6 +12,8 @@ No complex process tree cleanup is needed because:
|
||||
import subprocess
|
||||
from typing import Optional
|
||||
|
||||
from utils.logger import logger
|
||||
|
||||
|
||||
def exec_cmd(*popenargs, timeout: Optional[float] = None, **kwargs) -> int:
|
||||
"""Execute command and return exit code.
|
||||
@ -54,4 +56,10 @@ def exec_cmd_with_output(*popenargs, timeout: Optional[float] = None, **kwargs)
|
||||
check=True,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
# Log stderr if it exists
|
||||
if result.stderr:
|
||||
stderr_output = result.stderr.decode()
|
||||
logger.error(f"Command stderr: {stderr_output}")
|
||||
|
||||
return result.stdout.decode()
|
||||
|
||||
@ -14,6 +14,7 @@ slurm:
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: "--gres=gpu:4"
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
@ -36,6 +37,8 @@ environment:
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
|
||||
server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
|
||||
@ -14,6 +14,7 @@ slurm:
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: "--gres=gpu:4"
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
@ -36,6 +37,8 @@ environment:
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
|
||||
server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
|
||||
@ -14,6 +14,7 @@ slurm:
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: "--gres=gpu:4"
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
@ -36,6 +37,8 @@ environment:
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
|
||||
server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
|
||||
@ -14,6 +14,7 @@ slurm:
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: "--gres=gpu:4"
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
@ -36,6 +37,8 @@ environment:
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
|
||||
server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
|
||||
@ -14,6 +14,7 @@ slurm:
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: "--gres=gpu:4"
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
@ -36,6 +37,8 @@ environment:
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
|
||||
server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
|
||||
@ -14,6 +14,7 @@ slurm:
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: "--gres=gpu:4"
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
@ -36,6 +37,8 @@ environment:
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
|
||||
server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
|
||||
@ -14,6 +14,7 @@ slurm:
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: "--gres=gpu:4"
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
@ -36,6 +37,8 @@ environment:
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
|
||||
server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
|
||||
@ -14,6 +14,7 @@ slurm:
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: "--gres=gpu:4"
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
@ -36,6 +37,8 @@ environment:
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
|
||||
server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
|
||||
@ -0,0 +1,95 @@
# nvbugs: 5561153
metadata:
  model_name: Qwen3-235B-A22B-FP8
  precision: fp8
  model_dir_name: Qwen3-235B-A22B-FP8
  supported_gpus:
    - GB200
    - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 21
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: "--gres=gpu:4"
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 8
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: 1 2 4 8 16 36
  input_length: 1024
  output_length: 1024
  dataset_file: <dataset_file>
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    enable_attention_dp: false
    pipeline_parallel_size: 1
    max_batch_size: 64
    max_num_tokens: 2048
    max_seq_len: 2051
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 128
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: true
      free_gpu_memory_fraction: 0.7
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 2048
      backend: NIXL
    stream_interval: 20
    num_postprocess_workers: 4
    allreduce_strategy: MNNVL
    disable_overlap_scheduler: false
  ctx:
    max_batch_size: 32
    max_num_tokens: 2048
    max_seq_len: 2051
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    enable_attention_dp: false
    pipeline_parallel_size: 1
    print_iter_log: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: true
      free_gpu_memory_fraction: 0.7
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 2048
      backend: NIXL
@ -0,0 +1,95 @@
# nvbugs: 5561153
metadata:
  model_name: Qwen3-235B-A22B-FP8
  precision: fp8
  model_dir_name: Qwen3-235B-A22B-FP8
  supported_gpus:
    - GB200
    - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 21
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: "--gres=gpu:4"
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 8
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: 1 2 4 8 16 36
  input_length: 1024
  output_length: 1024
  dataset_file: <dataset_file>
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    enable_attention_dp: false
    pipeline_parallel_size: 1
    max_batch_size: 64
    max_num_tokens: 2048
    max_seq_len: 2051
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 128
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: true
      free_gpu_memory_fraction: 0.7
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 2048
      backend: UCX
    stream_interval: 20
    num_postprocess_workers: 4
    allreduce_strategy: MNNVL
    disable_overlap_scheduler: false
  ctx:
    max_batch_size: 32
    max_num_tokens: 2048
    max_seq_len: 2051
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    enable_attention_dp: false
    pipeline_parallel_size: 1
    print_iter_log: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: true
      free_gpu_memory_fraction: 0.7
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 2048
      backend: UCX
@ -14,6 +14,7 @@ slurm:
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: "--gres=gpu:4"
  numa_bind: true
benchmark:
  mode: e2e

@ -37,6 +38,8 @@ environment:
  build_wheel: false
  trtllm_wheel_path: ''
  work_dir: <full_path_to_work_dir>
  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
profiling:
  nsys_on: false
accuracy:

@ -14,6 +14,7 @@ slurm:
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: "--gres=gpu:4"
  numa_bind: true
benchmark:
  mode: e2e

@ -36,6 +37,8 @@ environment:
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
profiling:
  nsys_on: false
accuracy:
@ -21,6 +21,7 @@ slurm:
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: "--gres=gpu:4"
  numa_bind: true
benchmark:
  mode: gen_only

@ -15,6 +15,7 @@ slurm:
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: "--gres=gpu:4"
  numa_bind: true
benchmark:
  mode: gen_only

@ -37,6 +38,8 @@ environment:
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
profiling:
  nsys_on: false
accuracy:
@ -0,0 +1,110 @@
# nvbugs: 5422621
metadata:
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-V2
  supported_gpus:
    - GB200
    - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
  config_index: 7
  dataset_file: datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: "--gres=gpu:4"
  numa_bind: true
benchmark:
  mode: gen_only
  use_nv_sa_benchmark: false
  multi_round: 8
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '12288'
  input_length: 1024
  output_length: 1024
  dataset_file: <dataset_file>
hardware:
  gpus_per_node: 4
  num_ctx_servers: 2
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    enable_layerwise_nvtx_marker: true
    tensor_parallel_size: 48
    moe_expert_parallel_size: 48
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    pipeline_parallel_size: 1
    max_batch_size: 1024
    max_num_tokens: 1024
    max_seq_len: 2176
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
        - 1
        - 2
        - 4
        - 8
        - 16
        - 32
        - 64
        - 128
        - 256
        - 512
        - 768
        - 1024
        - 2048
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.7
      dtype: fp8
    moe_config:
      backend: WIDEEP
      load_balancer:
        num_slots: 288
        layer_updates_per_iter: 1
    cache_transceiver_config:
      max_tokens_in_buffer: 8320
      backend: DEFAULT
    stream_interval: 20
  ctx:
    enable_layerwise_nvtx_marker: true
    max_batch_size: 4
    max_num_tokens: 4480
    max_seq_len: 2176
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    enable_attention_dp: true
    pipeline_parallel_size: 1
    print_iter_log: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.85
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 8320
      backend: DEFAULT
@ -15,6 +15,7 @@ slurm:
  account: <account>
  job_time: 02:00:00
  job_name: disaggr-test
  extra_args: "--gres=gpu:4"
  numa_bind: true
hardware:
  gpus_per_node: 4

@ -37,6 +38,8 @@ environment:
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
profiling:
  nsys_on: false
accuracy:
@ -16,6 +16,8 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-NIXL]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-UCX]
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-NIXL]
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-NIXL]
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-UCX]

@ -7,6 +7,7 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_
test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-UCX]
test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL]
test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL]
test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT]
# test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL]
# test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL]
# test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-NIXL]
@ -1,6 +1,7 @@
"""Disaggregated Benchmark Configuration."""

import os
from datetime import datetime

SESSION_COLLECT_CMD_TYPE = "session_collect"

@ -169,7 +170,8 @@ def extract_config_fields(config_data: dict) -> dict:

    # Generate derived fields
    dep_flag = "dep" if gen_enable_dp else "tep"
    log_base = f"{isl}-{osl}"
    date_prefix = datetime.now().strftime("%Y%m%d")
    log_base = f"{date_prefix}/{isl}-{osl}"
    context_dir = (
        f"ctx{ctx_num}_gen{gen_num}_{dep_flag}{gen_tp_size}_"
        f"batch{gen_batch_size}_eplb{eplb_slots}_mtp{mtp_size}"
@ -20,6 +20,7 @@ import os
import re
import sys
import time
from datetime import datetime

from defs.trt_test_alternative import print_info

@ -32,40 +33,6 @@ from jenkins.scripts.open_search_db import OpenSearchDB
PROJECT_ROOT = "sandbox-temp-trtllm-ci-perf-v1"  # "sandbox-trtllm-ci-perf"
TEST_INFO_PROJECT_NAME = f"{PROJECT_ROOT}-test_info"

# Server config fields to compare
SERVER_FIELDS = [
    "s_model_name",
    "l_gpus",
    "l_tp",
    "l_ep",
    "l_pp",
    "l_max_num_tokens",
    "b_enable_chunked_prefill",
    "b_disable_overlap_scheduler",
    "s_attention_backend",
    "s_moe_backend",
    "l_moe_max_num_tokens",
    "l_stream_interval",
    "b_enable_attention_dp",
    "b_attention_dp_balance",
    "l_batching_wait_iters",
    "l_timeout_iters",
    "s_kv_cache_dtype",
    "b_enable_block_reuse",
    "d_free_gpu_memory_fraction",
    "l_max_batch_size",
    "b_enable_padding",
]

# Client config fields to compare
CLIENT_FIELDS = [
    "l_concurrency",
    "l_iterations",
    "l_isl",
    "l_osl",
    "d_random_range_ratio",
]

# Metrics where larger is better
MAXIMIZE_METRICS = [
    "d_seq_throughput",

@ -137,6 +104,7 @@ def get_job_info():
    trigger_mr_link = ""
    trigger_mr_id = ""
    trigger_mr_commit = ""
    artifact_url = ""
    if is_pr_job:
        # Get PR info from github_pr_api_url
        github_pr_api_url = global_vars.get("github_pr_api_url", "")

@ -162,6 +130,9 @@ def get_job_info():

        # Set trigger_mr_commit to commit
        trigger_mr_commit = commit
        artifact_url = f"https://urm.nvidia.com/artifactory/sw-tensorrt-generic/llm-artifacts/LLM/main/L0_PostMerge/{job_id}" if job_id else ""
    else:
        artifact_url = f"https://urm.nvidia.com/artifactory/sw-tensorrt-generic/llm-artifacts/LLM/main/L0_PostMerge/{job_id}" if job_id else ""

    return {
        "b_is_baseline": False,

@ -185,11 +156,12 @@ def get_job_info():
        "s_trigger_mr_link": trigger_mr_link,
        "s_trigger_mr_id": trigger_mr_id,
        "s_trigger_mr_commit": trigger_mr_commit,
        "s_artifact_url": artifact_url,
        "b_is_regression": False,
    }


def query_history_data():
def query_history_data(gpu_type):
    """
    Query post-merge data with specific gpu type and model name
    """

@ -209,6 +181,16 @@ def query_history_data():
                        "b_is_post_merge": True
                    }
                },
                {
                    "term": {
                        "b_is_regression": False
                    }
                },
                {
                    "term": {
                        "s_gpu_type": gpu_type
                    }
                },
                {
                    "range": {
                        "ts_created": {

@ -263,30 +245,38 @@ def query_history_data():
    return []


def match(history_data, new_data):
def match(history_data, new_data, match_keys):
    """
    Check if the server and client config of history data matches the new data
    """
    # Combine all fields to compare (excluding log links)
    fields_to_compare = SERVER_FIELDS + CLIENT_FIELDS

    def is_empty(value):
        """Check if a value is empty (None, empty string, etc.)"""
        return value is None or value == ""

    # Compare each field
    for field in fields_to_compare:
        history_value = history_data.get(field)
        new_value = new_data.get(field)
    def should_skip_field(field):
        # Skip fields starting with @, _, ts_
        if field.startswith('@') or field.startswith('_') or field.startswith(
                'ts_'):
            return True
        # Skip log links and speculative_model_dir and job configs
        if field in [
                's_speculative_model_dir', 's_server_log_link',
                's_ctx_server_log_link', 's_gen_server_log_link',
                's_client_log_link'
        ]:
            return True
        return False

    # If both are empty, consider them equal
    for field in match_keys:
        # Skip excluded fields
        if should_skip_field(field):
            continue
        history_value = history_data.get(field, None)
        new_value = new_data.get(field, None)
        if is_empty(history_value) and is_empty(new_value):
            continue

        # If values don't match, return False
        if history_value != new_value:
            return False

    return True


@ -339,27 +329,44 @@ def calculate_best_perf_result(history_data_list, new_data):
    return best_metrics


def get_history_data(new_data_dict):
def get_history_data(new_data_dict, gpu_type, match_keys):
    """
    Query history post-merge data for each cmd_idx
    """

    def get_latest_data(data_list):
        if not data_list:
            return None
        time_format = "%b %d, %Y @ %H:%M:%S.%f"
        # Find the item with the maximum ts_created value
        latest_data = max(
            data_list,
            key=lambda x: datetime.strptime(x["ts_created"], time_format))
        return latest_data

    history_baseline_dict = {}
    history_data_dict = {}
    cmd_idxs = new_data_dict.keys()
    for cmd_idx in cmd_idxs:
        history_data_dict[cmd_idx] = []
        history_baseline_dict[cmd_idx] = None
    history_data_list = query_history_data()
        history_baseline_dict[cmd_idx] = []
    history_data_list = []
    if cmd_idxs:
        history_data_list = query_history_data(gpu_type)
    if history_data_list:
        for history_data in history_data_list:
            for cmd_idx in cmd_idxs:
                if match(history_data, new_data_dict[cmd_idx]):
                if match(history_data, new_data_dict[cmd_idx], match_keys):
                    if history_data.get("b_is_baseline") and history_data.get(
                            "b_is_baseline") == True:
                        history_baseline_dict[cmd_idx] = history_data
                        history_baseline_dict[cmd_idx].append(history_data)
                    else:
                        history_data_dict[cmd_idx].append(history_data)
                    break
    # Sometime database has several baselines and we only use the latest baseline one
    for cmd_idx, baseline_list in history_baseline_dict.items():
        latest_baseline = get_latest_data(baseline_list)
        history_baseline_dict[cmd_idx] = latest_baseline
    return history_baseline_dict, history_data_dict


@ -477,6 +484,8 @@ def post_new_perf_data(new_baseline_data_dict, new_data_dict,
    # Only post regressive test cases when post-merge.
    if new_baseline_data_dict:
        data_list.extend(regressive_data_list)
    if not data_list:
        return
    try:
        print_info(
            f"Ready to post {len(data_list)} data to {TEST_INFO_PROJECT_NAME}")
File diff suppressed because it is too large.
Some files were not shown because too many files have changed in this diff.