Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-02-13 22:43:46 +08:00)

Commit e72b9f9377: Merge branch 'main' into update_mnnvl_test
Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com>
@@ -25486,7 +25486,7 @@ limitations under the License.
```

### URLs

- `Homepage`: https://github.com/NVIDIA/TensorRT-Model-Optimizer
- `Homepage`: https://github.com/NVIDIA/Model-Optimizer


## nvidia-modelopt-core (0.33.1)

@@ -25513,7 +25513,7 @@ limitations under the License.
```

### URLs

- `Homepage`: https://github.com/NVIDIA/TensorRT-Model-Optimizer
- `Homepage`: https://github.com/NVIDIA/Model-Optimizer


## nvidia-nccl-cu12 (2.27.3)
@@ -164,7 +164,7 @@ state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs.

[➡️ link](https://www.bentoml.com/blog/tuning-tensor-rt-llm-for-optimal-serving-with-bentoml)

* [2024/08/20] 🏎️SDXL with #TensorRT Model Optimizer ⏱️⚡ 🏁 cache diffusion 🏁 quantization aware training 🏁 QLoRA 🏁 #Python 3.12
* [2024/08/20] 🏎️SDXL with #Model Optimizer ⏱️⚡ 🏁 cache diffusion 🏁 quantization aware training 🏁 QLoRA 🏁 #Python 3.12
[➡️ link](https://developer.nvidia.com/blog/nvidia-tensorrt-model-optimizer-v0-15-boosts-inference-performance-and-expands-model-support/)

* [2024/08/13] 🐍 DIY Code Completion with #Mamba ⚡ #TensorRT #LLM for speed 🤖 NIM for ease ☁️ deploy anywhere

@@ -209,7 +209,7 @@ Technical Deep Dive for serious coders ✅+99% compression ✅1 set of weights

* [2024/05/21] ✨@modal_labs has the codes for serverless @AIatMeta Llama 3 on #TensorRT #LLM ✨👀 📚 Marvelous Modal Manual:
Serverless TensorRT LLM (LLaMA 3 8B) | Modal Docs [➡️ link](https://modal.com/docs/examples/trtllm_llama)

* [2024/05/08] NVIDIA TensorRT Model Optimizer -- the newest member of the #TensorRT ecosystem is a library of post-training and training-in-the-loop model optimization techniques ✅quantization ✅sparsity ✅QAT [➡️ blog](https://developer.nvidia.com/blog/accelerate-generative-ai-inference-performance-with-nvidia-tensorrt-model-optimizer-now-publicly-available/)
* [2024/05/08] NVIDIA Model Optimizer -- the newest member of the #TensorRT ecosystem is a library of post-training and training-in-the-loop model optimization techniques ✅quantization ✅sparsity ✅QAT [➡️ blog](https://developer.nvidia.com/blog/accelerate-generative-ai-inference-performance-with-nvidia-tensorrt-model-optimizer-now-publicly-available/)

* [2024/05/07] 🦙🦙🦙 24,000 tokens per second 🛫Meta Llama 3 takes off with #TensorRT #LLM 📚[➡️ link](https://blogs.nvidia.com/blog/meta-llama3-inference-acceleration/)
@@ -81,7 +81,6 @@ inline AllReduceStrategyType SelectStrategyLP(size_t seq_len, size_t hidden_size
    {
        return AllReduceStrategyType::ONESHOT;
    }
    return AllReduceStrategyType::NCCL;
}

// use 1D vector to store the best strategy instead of a map for each sm version
@@ -143,7 +142,7 @@ inline AllReduceStrategyType selectStrategyLookUpTable(
        sm_version = 100;
    }

    // Check if the entry is out of bounds, otherwise return NCCL as fallback
    // Check if the entry is out of bounds, otherwise return NCCL_SYMMETRIC as fallback
    if (AllReduceBestStrategyTable.find(sm_version) == AllReduceBestStrategyTable.end()
        || tp_index >= AllReduceBestStrategyTable.at(sm_version).size()
        || fusion_op_index >= AllReduceBestStrategyTable.at(sm_version).at(tp_index).size()
@@ -151,7 +150,7 @@ inline AllReduceStrategyType selectStrategyLookUpTable(
        || num_token_index
            >= AllReduceBestStrategyTable.at(sm_version).at(tp_index).at(fusion_op_index).at(hidden_size_index).size())
    {
        return AllReduceStrategyType::NCCL;
        return AllReduceStrategyType::NCCL_SYMMETRIC;
    }

    return static_cast<AllReduceStrategyType>(
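Net effect of the two hunks above: when the lookup indices fall outside the precomputed table, the selector now falls back to NCCL_SYMMETRIC rather than NCCL. A minimal, self-contained sketch of that bounds-checked lookup pattern follows; the enum, table shape, and names are simplified placeholders, not the real AllReduceBestStrategyTable.

#include <cstddef>
#include <map>
#include <vector>

// Simplified stand-ins; the real table is additionally indexed by fusion op and hidden size.
enum class Strategy
{
    NCCL,
    NCCL_SYMMETRIC,
    ONESHOT,
    TWOSHOT
};
using StrategyTable = std::map<int, std::vector<std::vector<Strategy>>>;

Strategy lookupWithFallback(StrategyTable const& table, int smVersion, std::size_t tpIndex, std::size_t tokenIndex)
{
    auto it = table.find(smVersion);
    if (it == table.end()                            // unknown SM version
        || tpIndex >= it->second.size()              // TP index out of bounds
        || tokenIndex >= it->second[tpIndex].size()) // token-count index out of bounds
    {
        return Strategy::NCCL_SYMMETRIC; // fallback, changed from NCCL in this PR
    }
    return it->second[tpIndex][tokenIndex];
}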
cpp/tensorrt_llm/common/ncclUtils.cpp (new file, 585 lines)
@@ -0,0 +1,585 @@
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/ncclUtils.h"
|
||||
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include <limits>
|
||||
#include <stdexcept>
|
||||
|
||||
namespace tensorrt_llm::common::nccl_util
|
||||
{
|
||||
|
||||
//==============================================================================
|
||||
// NcclCommResourceManager Implementation
|
||||
//==============================================================================
|
||||
|
||||
NcclCommResourceManager& NcclCommResourceManager::getInstance() noexcept
|
||||
{
|
||||
static NcclCommResourceManager instance;
|
||||
return instance;
|
||||
}
|
||||
|
||||
void NcclCommResourceManager::registerResource(ncclComm_t comm, ResourceCleanupFunc cleanup, char const* debugName)
|
||||
{
|
||||
if (!comm)
|
||||
{
|
||||
TLLM_LOG_WARNING("[NCCLUtil] Attempted to register resource for null NCCL comm");
|
||||
return;
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
auto& resources = mCommResources[comm];
|
||||
resources.emplace_back(std::move(cleanup), debugName ? debugName : "unnamed");
|
||||
|
||||
TLLM_LOG_TRACE("[NCCLUtil] Registered resource '%s' for NCCL comm %p (total: %zu)",
|
||||
debugName ? debugName : "unnamed", static_cast<void*>(comm), resources.size());
|
||||
}
|
||||
|
||||
void NcclCommResourceManager::cleanupResources(ncclComm_t comm) noexcept
|
||||
{
|
||||
if (!comm)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<ResourceEntry> resourcesToClean;
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
auto it = mCommResources.find(comm);
|
||||
if (it == mCommResources.end())
|
||||
{
|
||||
// Nothing registered for this comm, nothing to clean up
|
||||
return;
|
||||
}
|
||||
|
||||
// Move resources out (preserves order) and remove from map
|
||||
resourcesToClean = std::move(it->second);
|
||||
mCommResources.erase(it);
|
||||
|
||||
TLLM_LOG_TRACE(
|
||||
"[NCCLUtil] Cleaning up %zu resources for NCCL comm %p", resourcesToClean.size(), static_cast<void*>(comm));
|
||||
}
|
||||
|
||||
// Clean up outside the lock to avoid deadlocks if cleanup functions try to access the manager
|
||||
// Order is preserved: resources are cleaned up in registration order
|
||||
for (auto& [cleanup, name] : resourcesToClean)
|
||||
{
|
||||
try
|
||||
{
|
||||
TLLM_LOG_TRACE(
|
||||
"[NCCLUtil] Cleaning up resource '%s' for NCCL comm %p", name.c_str(), static_cast<void*>(comm));
|
||||
cleanup();
|
||||
}
|
||||
catch (std::exception const& e)
|
||||
{
|
||||
TLLM_LOG_ERROR("[NCCLUtil] Exception during cleanup of resource '%s' for NCCL comm %p: %s", name.c_str(),
|
||||
static_cast<void*>(comm), e.what());
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
TLLM_LOG_ERROR("[NCCLUtil] Unknown exception during cleanup of resource '%s' for NCCL comm %p",
|
||||
name.c_str(), static_cast<void*>(comm));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool NcclCommResourceManager::hasResources(ncclComm_t comm) const noexcept
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
return mCommResources.find(comm) != mCommResources.end();
|
||||
}
|
||||
|
||||
size_t NcclCommResourceManager::getResourceCount(ncclComm_t comm) const noexcept
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
auto it = mCommResources.find(comm);
|
||||
return it != mCommResources.end() ? it->second.size() : 0;
|
||||
}
|
||||
|
||||
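As a usage illustration (not code from this PR), a component that owns a side allocation tied to a communicator could hook into the manager like this; the scratch buffer and its cleanup body are hypothetical.

#include "tensorrt_llm/common/ncclUtils.h"
#include <cuda_runtime.h>

// Hypothetical caller: free a scratch allocation before its communicator is destroyed.
void attachScratchBuffer(ncclComm_t comm, void* scratchPtr)
{
    using tensorrt_llm::common::nccl_util::NcclCommResourceManager;
    NcclCommResourceManager::getInstance().registerResource(
        comm,
        [scratchPtr]() { cudaFree(scratchPtr); }, // runs inside cleanupResources(), before ncclCommDestroy
        "scratch-buffer");
}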
//==============================================================================
|
||||
// NCCLHelper Implementation
|
||||
//==============================================================================
|
||||
|
||||
NCCLHelper& NCCLHelper::getInstance()
|
||||
{
|
||||
static NCCLHelper instance;
|
||||
return instance;
|
||||
}
|
||||
|
||||
NCCLHelper::NCCLHelper()
|
||||
: mLibraryHandle(nullptr)
|
||||
, mNCCLCommWindowRegister(nullptr)
|
||||
, mNCCLMemAlloc(nullptr)
|
||||
, mIsLoaded(false)
|
||||
{
|
||||
loadNCCLLibrary();
|
||||
}
|
||||
|
||||
NCCLHelper::~NCCLHelper()
|
||||
{
|
||||
if (mLibraryHandle)
|
||||
{
|
||||
#ifdef _WIN32
|
||||
FreeLibrary(mLibraryHandle);
|
||||
#else
|
||||
dlclose(mLibraryHandle);
|
||||
#endif
|
||||
mLibraryHandle = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
void NCCLHelper::loadNCCLLibrary()
|
||||
{
|
||||
try
|
||||
{
|
||||
#ifdef _WIN32
|
||||
char const* libraryNames[] = {"nccl.dll"};
|
||||
#else
|
||||
char const* libraryNames[] = {"libnccl.so"};
|
||||
#endif
|
||||
|
||||
for (auto const* name : libraryNames)
|
||||
{
|
||||
mLibraryHandle = loadLibraryHandle(name);
|
||||
if (mLibraryHandle)
|
||||
{
|
||||
TLLM_LOG_INFO("Successfully loaded NCCL library: %s", name);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!mLibraryHandle)
|
||||
{
|
||||
TLLM_LOG_WARNING("Failed to load NCCL library");
|
||||
return;
|
||||
}
|
||||
|
||||
// Load the required symbols
|
||||
mNCCLCommWindowRegister
|
||||
= reinterpret_cast<ncclCommWindowRegisterFunc>(getSymbolAddress(mLibraryHandle, "ncclCommWindowRegister"));
|
||||
|
||||
mNCCLMemAlloc = reinterpret_cast<ncclMemAllocFunc>(getSymbolAddress(mLibraryHandle, "ncclMemAlloc"));
|
||||
|
||||
if (mNCCLCommWindowRegister == nullptr)
|
||||
{
|
||||
TLLM_LOG_WARNING("Failed to load ncclCommWindowRegister symbol, NCCL symmetric will not be supported.");
|
||||
}
|
||||
|
||||
if (mNCCLMemAlloc == nullptr)
|
||||
{
|
||||
TLLM_LOG_WARNING("Failed to load ncclMemAlloc symbol, NCCL symmetric will not be supported.");
|
||||
}
|
||||
|
||||
if (mNCCLCommWindowRegister != nullptr && mNCCLMemAlloc != nullptr)
|
||||
{
|
||||
mIsLoaded = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
TLLM_LOG_WARNING(
|
||||
"Failed to load required NCCL symbols (both ncclCommWindowRegister and ncclMemAlloc are required)");
|
||||
}
|
||||
}
|
||||
catch (std::exception const& e)
|
||||
{
|
||||
TLLM_LOG_WARNING("Exception while loading NCCL library: %s", e.what());
|
||||
}
|
||||
}
|
||||
|
||||
void* NCCLHelper::loadLibraryHandle(char const* libName)
|
||||
{
|
||||
#ifdef _WIN32
|
||||
return LoadLibraryA(libName);
|
||||
#else
|
||||
return dlopen(libName, RTLD_LAZY | RTLD_GLOBAL);
|
||||
#endif
|
||||
}
|
||||
|
||||
void* NCCLHelper::getSymbolAddress(void* handle, char const* symbolName)
|
||||
{
|
||||
if (!handle)
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
return GetProcAddress(static_cast<HMODULE>(handle), symbolName);
|
||||
#else
|
||||
return dlsym(handle, symbolName);
|
||||
#endif
|
||||
}
|
||||
|
||||
NCCLHelper::ncclCommWindowRegisterFunc NCCLHelper::getNCCLCommWindowRegister()
|
||||
{
|
||||
return mNCCLCommWindowRegister;
|
||||
}
|
||||
|
||||
NCCLHelper::ncclMemAllocFunc NCCLHelper::getNCCLMemAlloc()
|
||||
{
|
||||
return mNCCLMemAlloc;
|
||||
}
|
||||
|
||||
bool NCCLHelper::isLoaded() const
|
||||
{
|
||||
return mIsLoaded;
|
||||
}
|
||||
|
||||
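Assumed call pattern for the helper above: gate on isLoaded() and only then invoke the dynamically resolved function pointers. A small sketch, with the wrapper function invented for illustration:

#include "tensorrt_llm/common/ncclUtils.h"

// Hypothetical wrapper: only touch the dynamically resolved symbols when both were found.
void* tryNcclWindowAlloc(std::size_t bytes)
{
    auto& helper = tensorrt_llm::common::nccl_util::NCCLHelper::getInstance();
    if (!helper.isLoaded()) // ncclMemAlloc or ncclCommWindowRegister missing in the loaded libnccl
    {
        return nullptr;
    }
    void* ptr = nullptr;
    if (helper.getNCCLMemAlloc()(&ptr, bytes) != ncclSuccess)
    {
        return nullptr;
    }
    return ptr;
}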
//==============================================================================
|
||||
// NCCLWindowAllocator Implementation
|
||||
//==============================================================================
|
||||
|
||||
NCCLWindowAllocator& NCCLWindowAllocator::getInstance()
|
||||
{
|
||||
static NCCLWindowAllocator instance;
|
||||
return instance;
|
||||
}
|
||||
|
||||
NCCLWindowBuffer NCCLWindowAllocator::requestBuffer(ncclComm_t comm, size_t size)
|
||||
{
|
||||
TLLM_CHECK_WITH_INFO(comm != nullptr, "NCCL communicator cannot be null");
|
||||
TLLM_CHECK_WITH_INFO(size > 0, "Buffer size must be greater than 0");
|
||||
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
|
||||
// Register cleanup callback for this communicator if not already registered
|
||||
// This is cheap even if no buffers exist yet - cleanup will just return early
|
||||
registerBufferCleanup(comm);
|
||||
|
||||
// Check if we have an available buffer of at least the requested size for this communicator
|
||||
// Use best-fit: find the smallest buffer that's >= requested size
|
||||
auto& commBuffers = mBufferPool[comm];
|
||||
auto bestFit = commBuffers.end();
|
||||
size_t bestFitSize = std::numeric_limits<size_t>::max();
|
||||
|
||||
for (auto it = commBuffers.begin(); it != commBuffers.end(); ++it)
|
||||
{
|
||||
if (!it->inUse && it->buffer.size >= size && it->buffer.size < bestFitSize)
|
||||
{
|
||||
bestFit = it;
|
||||
bestFitSize = it->buffer.size;
|
||||
}
|
||||
}
|
||||
|
||||
if (bestFit != commBuffers.end())
|
||||
{
|
||||
bestFit->inUse = true;
|
||||
TLLM_LOG_TRACE(
|
||||
"[NCCLUtil] Reusing NCCL window buffer for comm %p: handle=%d, ptr=%p, size=%zu (requested: %zu)",
|
||||
static_cast<void*>(comm), bestFit->buffer.handle, bestFit->buffer.ptr, bestFit->buffer.size, size);
|
||||
return bestFit->buffer;
|
||||
}
|
||||
|
||||
// No available buffer found, allocate a new one
|
||||
TLLM_LOG_TRACE(
|
||||
"[NCCLUtil] Allocating new NCCL window buffer for comm %p, size=%zu", static_cast<void*>(comm), size);
|
||||
int handle = static_cast<int>(commBuffers.size());
|
||||
NCCLWindowBuffer buffer = allocateAndRegisterBuffer(comm, size, handle);
|
||||
commBuffers.push_back({buffer, true});
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
NCCLWindowBuffer NCCLWindowAllocator::searchBuffer(ncclComm_t comm, void* ptr) const
|
||||
{
|
||||
if (!comm || !ptr)
|
||||
{
|
||||
return NCCLWindowBuffer();
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
return searchBufferLocked(comm, ptr);
|
||||
}
|
||||
|
||||
void NCCLWindowAllocator::releaseBuffer(ncclComm_t comm, void* ptr)
|
||||
{
|
||||
if (!comm || !ptr)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
auto commIt = mBufferPool.find(comm);
|
||||
if (commIt == mBufferPool.end())
|
||||
{
|
||||
TLLM_LOG_WARNING(
|
||||
"[NCCLUtil] Attempted to release buffer %p for unknown comm %p", ptr, static_cast<void*>(comm));
|
||||
return;
|
||||
}
|
||||
|
||||
for (auto& entry : commIt->second)
|
||||
{
|
||||
if (entry.buffer.ptr == ptr)
|
||||
{
|
||||
entry.inUse = false;
|
||||
TLLM_LOG_TRACE("[NCCLUtil] Released NCCL window buffer for comm %p: ptr=%p", static_cast<void*>(comm), ptr);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
TLLM_LOG_WARNING("[NCCLUtil] Attempted to release unknown buffer %p for comm %p", ptr, static_cast<void*>(comm));
|
||||
}
|
||||
|
||||
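A sketch of the intended request/release cycle, plus the RAII alternative declared in ncclUtils.h; the communicator and the 1 MiB size are placeholders.

#include "tensorrt_llm/common/ncclUtils.h"

using namespace tensorrt_llm::common::nccl_util;

// Illustrative only.
void exampleRequestRelease(ncclComm_t comm)
{
    // Manual pairing of requestBuffer()/releaseBuffer():
    NCCLWindowBuffer buf = NCCLWindowAllocator::getInstance().requestBuffer(comm, 1 << 20);
    // ... enqueue collectives that read/write buf.ptr ...
    NCCLWindowAllocator::getInstance().releaseBuffer(comm, buf.ptr);

    // RAII alternative declared in ncclUtils.h: released automatically at scope exit.
    ScopedNCCLWindowBuffer scoped(comm, 1 << 20);
    // ... use scoped.getPtr() / scoped.getWindow() ...
}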
ncclWindow_t NCCLWindowAllocator::getWindow(ncclComm_t comm, void* ptr) const
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
NCCLWindowBuffer buffer = searchBufferLocked(comm, ptr);
|
||||
return buffer.isValid() ? buffer.window : nullptr;
|
||||
}
|
||||
|
||||
size_t NCCLWindowAllocator::getSize(ncclComm_t comm, void* ptr) const
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
NCCLWindowBuffer buffer = searchBufferLocked(comm, ptr);
|
||||
return buffer.isValid() ? buffer.size : 0;
|
||||
}
|
||||
|
||||
NCCLWindowBuffer NCCLWindowAllocator::getBufferInfo(ncclComm_t comm, void* ptr) const
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
return searchBufferLocked(comm, ptr);
|
||||
}
|
||||
|
||||
size_t NCCLWindowAllocator::getBufferCount(ncclComm_t comm) const
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
auto commIt = mBufferPool.find(comm);
|
||||
return commIt != mBufferPool.end() ? commIt->second.size() : 0;
|
||||
}
|
||||
|
||||
size_t NCCLWindowAllocator::getBufferInUseCount(ncclComm_t comm) const
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
auto commIt = mBufferPool.find(comm);
|
||||
if (commIt == mBufferPool.end())
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
size_t count = 0;
|
||||
for (auto const& entry : commIt->second)
|
||||
{
|
||||
if (entry.inUse)
|
||||
{
|
||||
++count;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
bool NCCLWindowAllocator::isCommValid(ncclComm_t comm) const noexcept
|
||||
{
|
||||
// Simply check for null - all non-null comms are valid
|
||||
// We don't track cleaned-up comms because NCCL can reuse memory addresses,
|
||||
// making pointer-based tracking unreliable. New comms will be registered when used.
|
||||
return comm != nullptr;
|
||||
}
|
||||
|
||||
NCCLWindowBuffer NCCLWindowAllocator::allocateAndRegisterBuffer(ncclComm_t comm, size_t size, int handle)
|
||||
{
|
||||
NCCLWindowBuffer buffer;
|
||||
buffer.handle = handle;
|
||||
|
||||
// Get NCCL helper for dynamic symbol loading
|
||||
auto& ncclHelper = NCCLHelper::getInstance();
|
||||
if (!ncclHelper.isLoaded())
|
||||
{
|
||||
TLLM_THROW("NCCL library could not be loaded for dynamic symbol access");
|
||||
}
|
||||
|
||||
auto ncclMemAllocFunc = ncclHelper.getNCCLMemAlloc();
|
||||
auto ncclCommWindowRegisterFunc = ncclHelper.getNCCLCommWindowRegister();
|
||||
|
||||
// Defensive checks: both function pointers must be non-null
|
||||
if (ncclMemAllocFunc == nullptr)
|
||||
{
|
||||
TLLM_THROW("ncclMemAlloc function pointer is null, cannot allocate NCCL window buffer");
|
||||
}
|
||||
|
||||
if (ncclCommWindowRegisterFunc == nullptr)
|
||||
{
|
||||
TLLM_THROW("ncclCommWindowRegister function pointer is null, cannot register NCCL window buffer");
|
||||
}
|
||||
|
||||
// Allocate device memory using ncclMemAlloc
|
||||
ncclResult_t allocResult = ncclMemAllocFunc(&buffer.ptr, size);
|
||||
if (allocResult != ncclSuccess)
|
||||
{
|
||||
TLLM_THROW("ncclMemAlloc failed with error: %d", allocResult);
|
||||
}
|
||||
buffer.size = size;
|
||||
|
||||
// Register the buffer with NCCL as a window
|
||||
ncclResult_t regResult
|
||||
= ncclCommWindowRegisterFunc(comm, buffer.ptr, size, &buffer.window, NCCL_WIN_COLL_SYMMETRIC);
|
||||
if (regResult != ncclSuccess)
|
||||
{
|
||||
ncclMemFree(buffer.ptr);
|
||||
TLLM_THROW("ncclCommWindowRegister failed with error: %d", regResult);
|
||||
}
|
||||
|
||||
TLLM_LOG_TRACE("[NCCLUtil] Allocated and registered NCCL window buffer: handle=%d, ptr=%p, size=%zu, window=%p",
|
||||
handle, buffer.ptr, size, static_cast<void*>(buffer.window));
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
NCCLWindowBuffer NCCLWindowAllocator::searchBufferLocked(ncclComm_t comm, void* ptr) const
|
||||
{
|
||||
auto commIt = mBufferPool.find(comm);
|
||||
if (commIt == mBufferPool.end())
|
||||
{
|
||||
return NCCLWindowBuffer();
|
||||
}
|
||||
|
||||
for (auto const& entry : commIt->second)
|
||||
{
|
||||
if (entry.buffer.ptr == ptr)
|
||||
{
|
||||
return entry.buffer;
|
||||
}
|
||||
}
|
||||
|
||||
return NCCLWindowBuffer();
|
||||
}
|
||||
|
||||
void NCCLWindowAllocator::registerBufferCleanup(ncclComm_t comm)
|
||||
{
|
||||
// Don't register if already registered
|
||||
if (mRegisteredComms.find(comm) != mRegisteredComms.end())
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
mRegisteredComms.insert(comm);
|
||||
|
||||
// Register cleanup with the resource manager
|
||||
NcclCommResourceManager::getInstance().registerResource(
|
||||
comm, [this, comm]() { this->cleanupBuffersForComm(comm); }, "NCCLWindowAllocator");
|
||||
}
|
||||
|
||||
void NCCLWindowAllocator::cleanupBuffersForComm(ncclComm_t comm) noexcept
|
||||
{
|
||||
if (!comm)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// Synchronize CUDA to ensure all operations using these buffers are complete
|
||||
// before we deregister windows and free memory
|
||||
cudaError_t cudaErr = cudaDeviceSynchronize();
|
||||
if (cudaErr != cudaSuccess)
|
||||
{
|
||||
TLLM_LOG_WARNING("[NCCLUtil] cudaDeviceSynchronize failed with error: %d before cleanup for comm %p", cudaErr,
|
||||
static_cast<void*>(comm));
|
||||
// Continue anyway - the sync failure might be from a previous error
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
|
||||
// Check if we've already cleaned up this communicator
|
||||
if (mRegisteredComms.find(comm) == mRegisteredComms.end())
|
||||
{
|
||||
// Already cleaned up or never registered
|
||||
return;
|
||||
}
|
||||
|
||||
auto commIt = mBufferPool.find(comm);
|
||||
if (commIt == mBufferPool.end())
|
||||
{
|
||||
// No buffers to clean up, but mark as cleaned
|
||||
mRegisteredComms.erase(comm);
|
||||
return;
|
||||
}
|
||||
|
||||
TLLM_LOG_TRACE(
|
||||
"[NCCLUtil] Cleaning up %zu NCCL window buffers for comm %p", commIt->second.size(), static_cast<void*>(comm));
|
||||
|
||||
// Check for buffers still in use - this shouldn't happen if cleanup is called properly,
|
||||
// but we log a warning if it does
|
||||
size_t inUseCount = 0;
|
||||
for (auto const& entry : commIt->second)
|
||||
{
|
||||
if (entry.inUse)
|
||||
{
|
||||
++inUseCount;
|
||||
}
|
||||
}
|
||||
if (inUseCount > 0)
|
||||
{
|
||||
TLLM_LOG_WARNING(
|
||||
"[NCCLUtil] Cleaning up %zu buffers still marked as in-use for comm %p. "
|
||||
"This may indicate buffers weren't properly released before cleanup.",
|
||||
inUseCount, static_cast<void*>(comm));
|
||||
}
|
||||
|
||||
for (auto& entry : commIt->second)
|
||||
{
|
||||
if (entry.buffer.isValid())
|
||||
{
|
||||
// Deregister the window - the communicator is still valid at this point
|
||||
// (cleanup happens before ncclCommDestroy), but we need to be careful
|
||||
// if buffers are still in use by active operations
|
||||
if (entry.buffer.window && comm)
|
||||
{
|
||||
// Note: Even if buffer is marked inUse, we must deregister since
|
||||
// the communicator is being destroyed. The communicator is valid,
|
||||
// but we should handle potential errors gracefully.
|
||||
ncclResult_t result = ncclCommWindowDeregister(comm, entry.buffer.window);
|
||||
if (result != ncclSuccess)
|
||||
{
|
||||
TLLM_LOG_WARNING(
|
||||
"[NCCLUtil] ncclCommWindowDeregister failed with error: %d for comm %p, "
|
||||
"window %p (buffer inUse: %d)",
|
||||
result, static_cast<void*>(comm), static_cast<void*>(entry.buffer.window), entry.inUse);
|
||||
}
|
||||
}
|
||||
|
||||
// Free device memory using ncclMemFree
|
||||
// This should be safe even if deregister failed
|
||||
if (entry.buffer.ptr)
|
||||
{
|
||||
try
|
||||
{
|
||||
ncclResult_t ncclResult = ncclMemFree(entry.buffer.ptr);
|
||||
if (ncclResult != ncclSuccess)
|
||||
{
|
||||
TLLM_LOG_WARNING("[NCCLUtil] ncclMemFree failed with error: %d", ncclResult);
|
||||
}
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
TLLM_LOG_ERROR("[NCCLUtil] Exception during ncclMemFree for ptr %p", entry.buffer.ptr);
|
||||
}
|
||||
}
|
||||
|
||||
TLLM_LOG_TRACE(
|
||||
"[NCCLUtil] Freed NCCL window buffer: ptr=%p, size=%zu", entry.buffer.ptr, entry.buffer.size);
|
||||
}
|
||||
}
|
||||
|
||||
mBufferPool.erase(commIt);
|
||||
mRegisteredComms.erase(comm);
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common::nccl_util
|
||||
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
cpp/tensorrt_llm/common/ncclUtils.h (new file, 397 lines)
@@ -0,0 +1,397 @@
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
#include <nccl.h>
|
||||
#include <torch/extension.h>
|
||||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <numeric>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#else
|
||||
#include <dlfcn.h>
|
||||
#endif
|
||||
|
||||
namespace tensorrt_llm::common::nccl_util
|
||||
{
|
||||
|
||||
//==============================================================================
|
||||
// NCCL Helper - Dynamic Library Loading
|
||||
//==============================================================================
|
||||
|
||||
// Helper class for dynamically loading NCCL symbols (ncclMemAlloc, ncclCommWindowRegister)
|
||||
// This allows the code to work with NCCL libraries that may or may not have these symbols
|
||||
class NCCLHelper
|
||||
{
|
||||
public:
|
||||
static NCCLHelper& getInstance();
|
||||
|
||||
// Dynamic loading function type definition
|
||||
using ncclCommWindowRegisterFunc = ncclResult_t (*)(ncclComm_t, void*, size_t, ncclWindow_t*, int);
|
||||
using ncclMemAllocFunc = ncclResult_t (*)(void**, size_t);
|
||||
|
||||
// Get function pointer for ncclCommWindowRegister
|
||||
ncclCommWindowRegisterFunc getNCCLCommWindowRegister();
|
||||
|
||||
// Get function pointer for ncclMemAlloc
|
||||
ncclMemAllocFunc getNCCLMemAlloc();
|
||||
|
||||
// Check if NCCL library is successfully loaded
|
||||
bool isLoaded() const;
|
||||
|
||||
NCCLHelper(NCCLHelper const&) = delete;
|
||||
NCCLHelper& operator=(NCCLHelper const&) = delete;
|
||||
NCCLHelper(NCCLHelper&&) = delete;
|
||||
NCCLHelper& operator=(NCCLHelper&&) = delete;
|
||||
|
||||
private:
|
||||
NCCLHelper();
|
||||
~NCCLHelper();
|
||||
|
||||
void loadNCCLLibrary();
|
||||
void* loadLibraryHandle(char const* libName);
|
||||
void* getSymbolAddress(void* handle, char const* symbolName);
|
||||
|
||||
#ifdef _WIN32
|
||||
HMODULE mLibraryHandle;
|
||||
#else
|
||||
void* mLibraryHandle;
|
||||
#endif
|
||||
|
||||
ncclCommWindowRegisterFunc mNCCLCommWindowRegister;
|
||||
ncclMemAllocFunc mNCCLMemAlloc;
|
||||
bool mIsLoaded;
|
||||
};
|
||||
|
||||
//==============================================================================
|
||||
// NCCL Resource Management
|
||||
//==============================================================================
|
||||
|
||||
// Resource cleanup function type. Called before the NCCL communicator is destroyed.
|
||||
using ResourceCleanupFunc = std::function<void()>;
|
||||
|
||||
// Manages resources associated with NCCL communicators. Thread-safe singleton that maintains
|
||||
// a pool of resources per NCCL comm. Resources are automatically cleaned up when the
|
||||
// communicator is destroyed.
|
||||
class NcclCommResourceManager
|
||||
{
|
||||
public:
|
||||
static NcclCommResourceManager& getInstance() noexcept;
|
||||
|
||||
// Register a resource cleanup function for a specific NCCL communicator.
|
||||
// The cleanup function will be called before ncclCommDestroy.
|
||||
// Thread-safe: Uses global mutex to serialize all operations.
|
||||
void registerResource(ncclComm_t comm, ResourceCleanupFunc cleanup, char const* debugName = nullptr);
|
||||
|
||||
// Cleanup all resources associated with a communicator. Called automatically by
|
||||
// the shared_ptr deleter before ncclCommDestroy.
|
||||
// Thread-safe: Uses global mutex to serialize cleanup operations.
|
||||
// Order-preserving: Resources are cleaned up in registration order.
|
||||
void cleanupResources(ncclComm_t comm) noexcept;
|
||||
|
||||
// Check if a communicator has registered resources.
|
||||
bool hasResources(ncclComm_t comm) const noexcept;
|
||||
|
||||
// Get the number of resources registered for a communicator.
|
||||
size_t getResourceCount(ncclComm_t comm) const noexcept;
|
||||
|
||||
NcclCommResourceManager(NcclCommResourceManager const&) = delete;
|
||||
NcclCommResourceManager& operator=(NcclCommResourceManager const&) = delete;
|
||||
NcclCommResourceManager(NcclCommResourceManager&&) = delete;
|
||||
NcclCommResourceManager& operator=(NcclCommResourceManager&&) = delete;
|
||||
|
||||
private:
|
||||
NcclCommResourceManager() = default;
|
||||
~NcclCommResourceManager() = default;
|
||||
|
||||
using ResourceEntry = std::pair<ResourceCleanupFunc, std::string>;
|
||||
|
||||
mutable std::mutex mMutex;
|
||||
std::unordered_map<ncclComm_t, std::vector<ResourceEntry>> mCommResources;
|
||||
};
|
||||
|
||||
// RAII helper to register a resource with a NCCL communicator.
|
||||
// Automatically registers cleanup function on construction.
|
||||
template <typename ResourceType>
|
||||
class NcclCommResource
|
||||
{
|
||||
public:
|
||||
NcclCommResource(ncclComm_t comm, ResourceType&& resource, std::function<void(ResourceType&)> cleanup,
|
||||
char const* debugName = nullptr)
|
||||
: mComm(comm)
|
||||
, mResource(std::forward<ResourceType>(resource))
|
||||
, mCleanup(std::move(cleanup))
|
||||
, mRegistered(true)
|
||||
{
|
||||
// Register with the manager
|
||||
NcclCommResourceManager::getInstance().registerResource(
|
||||
comm,
|
||||
[this]()
|
||||
{
|
||||
if (mCleanup)
|
||||
{
|
||||
mCleanup(mResource);
|
||||
}
|
||||
},
|
||||
debugName);
|
||||
}
|
||||
|
||||
ResourceType& get()
|
||||
{
|
||||
return mResource;
|
||||
}
|
||||
|
||||
ResourceType const& get() const
|
||||
{
|
||||
return mResource;
|
||||
}
|
||||
|
||||
NcclCommResource(NcclCommResource const&) = delete;
|
||||
NcclCommResource& operator=(NcclCommResource const&) = delete;
|
||||
NcclCommResource(NcclCommResource&&) = delete;
|
||||
NcclCommResource& operator=(NcclCommResource&&) = delete;
|
||||
|
||||
private:
|
||||
ncclComm_t mComm;
|
||||
ResourceType mResource;
|
||||
std::function<void(ResourceType&)> mCleanup;
|
||||
bool mRegistered;
|
||||
};
|
||||
|
||||
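The NcclCommResource wrapper above is not exercised elsewhere in this diff; a hypothetical long-lived holder (the buffer type and cleanup body are invented) might look like the following. Note that the registered lambda captures the wrapper itself, so the wrapper must stay alive until the communicator's cleanup has run.

#include "tensorrt_llm/common/ncclUtils.h"
#include <cuda_runtime.h>

// Hypothetical holder that ties a device workspace to a communicator's lifetime.
struct CommWorkspace
{
    CommWorkspace(ncclComm_t comm, void* devPtr)
        : mGuard(comm, std::move(devPtr), [](void*& p) { cudaFree(p); p = nullptr; }, "workspace")
    {
    }

    tensorrt_llm::common::nccl_util::NcclCommResource<void*> mGuard;
};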
//==============================================================================
|
||||
// NCCL Window Buffer Allocation
|
||||
//==============================================================================
|
||||
|
||||
// Represents a buffer with an associated NCCL window
|
||||
struct NCCLWindowBuffer
|
||||
{
|
||||
void* ptr; // Device pointer (same as UBBuffer.addr)
|
||||
int handle; // Buffer handle/index (for compatibility with UB interface)
|
||||
size_t size; // Size in bytes
|
||||
ncclWindow_t window; // NCCL window handle
|
||||
|
||||
NCCLWindowBuffer(void* p = nullptr, int h = -1, size_t s = 0, ncclWindow_t w = nullptr)
|
||||
: ptr(p)
|
||||
, handle(h)
|
||||
, size(s)
|
||||
, window(w)
|
||||
{
|
||||
}
|
||||
|
||||
[[nodiscard]] bool isValid() const
|
||||
{
|
||||
return ptr != nullptr && handle >= 0 && size > 0 && window != nullptr;
|
||||
}
|
||||
|
||||
[[nodiscard]] bool invalid() const
|
||||
{
|
||||
return !isValid();
|
||||
}
|
||||
|
||||
// Alias for compatibility with UBBuffer interface
|
||||
void* addr() const
|
||||
{
|
||||
return ptr;
|
||||
}
|
||||
};
|
||||
|
||||
// Manages NCCL window-registered buffers with pooling and automatic cleanup.
|
||||
// Buffers are tied to the lifetime of their associated NCCL communicator.
|
||||
class NCCLWindowAllocator
|
||||
{
|
||||
public:
|
||||
static NCCLWindowAllocator& getInstance();
|
||||
|
||||
// Request a buffer for the given communicator and size.
|
||||
// If an unused buffer of at least the requested size exists for this communicator, it will be reused.
|
||||
// Uses best-fit strategy: selects the smallest available buffer that meets the size requirement.
|
||||
// Otherwise, a new buffer is allocated and registered.
|
||||
NCCLWindowBuffer requestBuffer(ncclComm_t comm, size_t size);
|
||||
|
||||
// Search for a buffer by pointer. Returns an invalid buffer if not found.
|
||||
// This matches the UBManager.search_buffer() interface.
|
||||
NCCLWindowBuffer searchBuffer(ncclComm_t comm, void* ptr) const;
|
||||
|
||||
// Release a buffer back to the pool for potential reuse
|
||||
void releaseBuffer(ncclComm_t comm, void* ptr);
|
||||
|
||||
// Get the window handle for a specific buffer pointer
|
||||
ncclWindow_t getWindow(ncclComm_t comm, void* ptr) const;
|
||||
|
||||
// Get the size of a specific buffer pointer
|
||||
size_t getSize(ncclComm_t comm, void* ptr) const;
|
||||
|
||||
// Get buffer info by pointer
|
||||
NCCLWindowBuffer getBufferInfo(ncclComm_t comm, void* ptr) const;
|
||||
|
||||
// Get the number of buffers allocated for a communicator
|
||||
size_t getBufferCount(ncclComm_t comm) const;
|
||||
|
||||
// Get the number of buffers in use for a communicator
|
||||
size_t getBufferInUseCount(ncclComm_t comm) const;
|
||||
|
||||
// Check if a communicator is valid (non-null)
|
||||
// Note: We don't track cleaned-up comms because NCCL can reuse memory addresses.
|
||||
// All non-null comms are considered valid and will be registered when first used.
|
||||
bool isCommValid(ncclComm_t comm) const noexcept;
|
||||
|
||||
NCCLWindowAllocator(NCCLWindowAllocator const&) = delete;
|
||||
NCCLWindowAllocator& operator=(NCCLWindowAllocator const&) = delete;
|
||||
NCCLWindowAllocator(NCCLWindowAllocator&&) = delete;
|
||||
NCCLWindowAllocator& operator=(NCCLWindowAllocator&&) = delete;
|
||||
|
||||
private:
|
||||
NCCLWindowAllocator() = default;
|
||||
~NCCLWindowAllocator() = default;
|
||||
|
||||
// Allocate a new buffer and register it with NCCL as a window
|
||||
NCCLWindowBuffer allocateAndRegisterBuffer(ncclComm_t comm, size_t size, int handle);
|
||||
|
||||
// Search for a buffer by pointer (assumes mMutex is already locked)
|
||||
NCCLWindowBuffer searchBufferLocked(ncclComm_t comm, void* ptr) const;
|
||||
|
||||
// Register cleanup function for all buffers associated with a communicator
|
||||
void registerBufferCleanup(ncclComm_t comm);
|
||||
|
||||
// Cleanup all buffers for a specific communicator
|
||||
void cleanupBuffersForComm(ncclComm_t comm) noexcept;
|
||||
|
||||
struct BufferEntry
|
||||
{
|
||||
NCCLWindowBuffer buffer;
|
||||
bool inUse;
|
||||
};
|
||||
|
||||
mutable std::mutex mMutex;
|
||||
std::unordered_map<ncclComm_t, std::vector<BufferEntry>> mBufferPool;
|
||||
std::unordered_set<ncclComm_t> mRegisteredComms;
|
||||
};
|
||||
|
||||
// RAII wrapper for NCCL window buffers
|
||||
class ScopedNCCLWindowBuffer
|
||||
{
|
||||
public:
|
||||
ScopedNCCLWindowBuffer(ncclComm_t comm, size_t size)
|
||||
: mComm(comm)
|
||||
, mBuffer(NCCLWindowAllocator::getInstance().requestBuffer(comm, size))
|
||||
{
|
||||
}
|
||||
|
||||
~ScopedNCCLWindowBuffer()
|
||||
{
|
||||
if (mBuffer.isValid())
|
||||
{
|
||||
NCCLWindowAllocator::getInstance().releaseBuffer(mComm, mBuffer.ptr);
|
||||
}
|
||||
}
|
||||
|
||||
void* getPtr() const
|
||||
{
|
||||
return mBuffer.ptr;
|
||||
}
|
||||
|
||||
size_t getSize() const
|
||||
{
|
||||
return mBuffer.size;
|
||||
}
|
||||
|
||||
ncclWindow_t getWindow() const
|
||||
{
|
||||
return mBuffer.window;
|
||||
}
|
||||
|
||||
NCCLWindowBuffer const& getBuffer() const
|
||||
{
|
||||
return mBuffer;
|
||||
}
|
||||
|
||||
ScopedNCCLWindowBuffer(ScopedNCCLWindowBuffer const&) = delete;
|
||||
ScopedNCCLWindowBuffer& operator=(ScopedNCCLWindowBuffer const&) = delete;
|
||||
ScopedNCCLWindowBuffer(ScopedNCCLWindowBuffer&&) = delete;
|
||||
ScopedNCCLWindowBuffer& operator=(ScopedNCCLWindowBuffer&&) = delete;
|
||||
|
||||
private:
|
||||
ncclComm_t mComm;
|
||||
NCCLWindowBuffer mBuffer;
|
||||
};
|
||||
|
||||
// Creates a PyTorch tensor backed by an NCCL window buffer.
|
||||
// The tensor will automatically release the buffer back to the pool when destroyed.
|
||||
// This is analogous to torch_ext::create_userbuffers_tensor() but for NCCLWindowAllocator.
|
||||
inline std::pair<torch::Tensor, NCCLWindowBuffer> createNCCLWindowTensor(
|
||||
ncclComm_t comm, at::IntArrayRef shape, torch::ScalarType dtype)
|
||||
{
|
||||
// Calculate buffer size
|
||||
int64_t buffer_size
|
||||
= std::accumulate(shape.begin(), shape.end(), 1LL, std::multiplies<int64_t>()) * torch::elementSize(dtype);
|
||||
|
||||
// Calculate strides
|
||||
std::vector<int64_t> strides_vec(shape.size());
|
||||
if (!shape.empty())
|
||||
{
|
||||
strides_vec[shape.size() - 1] = 1;
|
||||
for (int64_t i = static_cast<int64_t>(shape.size()) - 1; i >= 1; --i)
|
||||
{
|
||||
strides_vec[i - 1] = strides_vec[i] * shape[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Request buffer from allocator
|
||||
auto& allocator = NCCLWindowAllocator::getInstance();
|
||||
auto buffer = allocator.requestBuffer(comm, buffer_size);
|
||||
|
||||
// Defensive validation: ensure buffer is valid before proceeding
|
||||
if (!buffer.isValid())
|
||||
{
|
||||
std::ostringstream oss;
|
||||
oss << "Failed to allocate NCCL window buffer: invalid buffer returned from requestBuffer "
|
||||
<< "(comm=" << static_cast<void*>(comm) << ", buffer_size=" << buffer_size << ")";
|
||||
throw std::runtime_error(oss.str());
|
||||
}
|
||||
|
||||
// Create custom deleter that releases the buffer
|
||||
auto deleter = [comm, ptr = buffer.ptr](void*) { NCCLWindowAllocator::getInstance().releaseBuffer(comm, ptr); };
|
||||
|
||||
// Create tensor from the buffer
|
||||
auto tensor = torch::from_blob(buffer.ptr, shape, strides_vec, deleter, torch::dtype(dtype).device(torch::kCUDA));
|
||||
|
||||
return std::make_pair(tensor, buffer);
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common::nccl_util
|
||||
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
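A caller-side sketch of createNCCLWindowTensor() from the header above; the helper name, shape, and dtype are placeholders.

#include "tensorrt_llm/common/ncclUtils.h"

// Hypothetical helper: back a [tokens, hidden] CUDA tensor with an NCCL window; the
// tensor's deleter returns the buffer to the pool when the tensor is destroyed.
std::pair<torch::Tensor, tensorrt_llm::common::nccl_util::NCCLWindowBuffer> makeReduceScratch(
    ncclComm_t comm, int64_t tokens, int64_t hidden)
{
    using tensorrt_llm::common::nccl_util::createNCCLWindowTensor;
    return createNCCLWindowTensor(comm, {tokens, hidden}, torch::kBFloat16);
}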
@@ -16,6 +16,7 @@
 */

#include "tensorrt_llm/common/opUtils.h"
#include "tensorrt_llm/common/ncclUtils.h"
#include "tensorrt_llm/runtime/utils/mpiTags.h"
#include "tensorrt_llm/runtime/utils/mpiUtils.h"

@@ -112,7 +113,29 @@ std::shared_ptr<ncclComm_t> getComm(std::set<int> const& group)
    std::shared_ptr<ncclComm_t> ncclComm(new ncclComm_t,
        [](ncclComm_t* comm)
        {
            ncclCommDestroy(*comm);
            if (!comm)
            {
                return;
            }

            // STEP 1: Clean up resources and destroy NCCL communicator if it's valid
            if (*comm)
            {
                // Clean up all registered resources FIRST
                tensorrt_llm::common::nccl_util::NcclCommResourceManager::getInstance().cleanupResources(*comm);

                // Now destroy the NCCL communicator
                ncclResult_t result = ncclCommDestroy(*comm);
                if (result != ncclSuccess)
                {
                    TLLM_LOG_WARNING("ncclCommDestroy failed with error: %d", result);
                }

                // Clear the communicator value before freeing the pointer
                *comm = nullptr;
            }

            // STEP 2: Always free the pointer memory (regardless of whether *comm was valid)
            delete comm;
        });
#if defined(_WIN32)
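To make the ordering explicit, here is a condensed sketch of the lifetime pattern encoded in the deleter above. Only the cleanupResources()/ncclCommDestroy() pairing is taken from this PR; the wrapper function itself is illustrative.

#include "tensorrt_llm/common/ncclUtils.h"
#include <memory>

// Illustrative wrapper: registered resources are torn down inside the shared_ptr
// deleter, strictly before ncclCommDestroy; the heap-allocated handle is always freed.
std::shared_ptr<ncclComm_t> makeManagedComm(ncclComm_t raw)
{
    return std::shared_ptr<ncclComm_t>(new ncclComm_t(raw),
        [](ncclComm_t* comm)
        {
            if (comm && *comm)
            {
                tensorrt_llm::common::nccl_util::NcclCommResourceManager::getInstance().cleanupResources(*comm);
                ncclCommDestroy(*comm); // only after the registered cleanups have run
            }
            delete comm;
        });
}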
@@ -22,16 +22,8 @@ namespace tensorrt_llm::runtime::ub
{
UserBufferAllocator& UserBufferAllocator::Instance()
{
    if (use_nccl_symmetric)
    {
        static NCCLUserBufferAllocator _;
        return _;
    }
    else
    {
        static UserBufferAllocator _;
        return _;
    }
    static UserBufferAllocator _;
    return _;
}

void UserBufferAllocator::initialize(tensorrt_llm::runtime::WorldConfig const& worldConfig)
@ -83,167 +75,4 @@ communicator* UserBufferAllocator::comm()
|
||||
return mUbComm;
|
||||
}
|
||||
|
||||
void NCCLUserBufferAllocator::initialize(tensorrt_llm::runtime::WorldConfig const& worldConfig)
|
||||
{
|
||||
if (!isInitialized())
|
||||
{
|
||||
TLLM_LOG_INFO("Initializing NCCLUserBufferAllocator");
|
||||
std::set<int> group;
|
||||
for (int i = 0; i < worldConfig.getSize(); i++)
|
||||
{
|
||||
group.insert(i);
|
||||
}
|
||||
mComm = getComm(group);
|
||||
mIsInitialized = true;
|
||||
}
|
||||
}
|
||||
|
||||
UBBuffer NCCLUserBufferAllocator::registerUBBuffer(size_t bytes)
|
||||
{
|
||||
TLLM_CHECK(isInitialized());
|
||||
UBBuffer ub_buffer;
|
||||
|
||||
auto& ncclHelper = getNCCLHelper();
|
||||
if (!ncclHelper.isLoaded())
|
||||
{
|
||||
TLLM_THROW("NCCL library could not be loaded for dynamic symbol access");
|
||||
}
|
||||
|
||||
auto ncclMemAllocFunc = ncclHelper.getNCCLMemAlloc();
|
||||
auto ncclCommWindowRegisterFunc = ncclHelper.getNCCLCommWindowRegister();
|
||||
|
||||
NCCLCHECK(ncclMemAllocFunc(&ub_buffer.addr, bytes));
|
||||
NCCLCHECK(ncclCommWindowRegisterFunc((*mComm), ub_buffer.addr, bytes, &ub_buffer.window, NCCL_WIN_COLL_SYMMETRIC));
|
||||
ub_buffer.handle = 5;
|
||||
ub_buffer.size = bytes;
|
||||
return ub_buffer;
|
||||
}
|
||||
|
||||
// Static member definitions
|
||||
std::unique_ptr<NCCLHelper> NCCLUserBufferAllocator::mNCCLHelper = nullptr;
|
||||
|
||||
NCCLHelper& NCCLUserBufferAllocator::getNCCLHelper()
|
||||
{
|
||||
if (!mNCCLHelper)
|
||||
{
|
||||
mNCCLHelper = std::make_unique<NCCLHelper>();
|
||||
}
|
||||
return *mNCCLHelper;
|
||||
}
|
||||
|
||||
// NCCLHelper implementation
|
||||
NCCLHelper::NCCLHelper()
|
||||
: mLibraryHandle(nullptr)
|
||||
, mNCCLCommWindowRegister(nullptr)
|
||||
, mNCCLMemAlloc(nullptr)
|
||||
, mIsLoaded(false)
|
||||
{
|
||||
loadNCCLLibrary();
|
||||
}
|
||||
|
||||
NCCLHelper::~NCCLHelper()
|
||||
{
|
||||
if (mLibraryHandle)
|
||||
{
|
||||
#ifdef _WIN32
|
||||
FreeLibrary(mLibraryHandle);
|
||||
#else
|
||||
dlclose(mLibraryHandle);
|
||||
#endif
|
||||
mLibraryHandle = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
void NCCLHelper::loadNCCLLibrary()
|
||||
{
|
||||
try
|
||||
{
|
||||
#ifdef _WIN32
|
||||
char const* libraryNames[] = {"nccl.dll"};
|
||||
#else
|
||||
char const* libraryNames[] = {"libnccl.so"};
|
||||
#endif
|
||||
|
||||
for (int i = 0; libraryNames[i] != nullptr; ++i)
|
||||
{
|
||||
mLibraryHandle = loadLibraryHandle(libraryNames[i]);
|
||||
if (mLibraryHandle)
|
||||
{
|
||||
TLLM_LOG_INFO("Successfully loaded NCCL library: %s", libraryNames[i]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!mLibraryHandle)
|
||||
{
|
||||
TLLM_LOG_WARNING("Failed to load NCCL library");
|
||||
return;
|
||||
}
|
||||
|
||||
// Load the required symbols
|
||||
mNCCLCommWindowRegister
|
||||
= reinterpret_cast<ncclCommWindowRegisterFunc>(getSymbolAddress(mLibraryHandle, "ncclCommWindowRegister"));
|
||||
|
||||
mNCCLMemAlloc = reinterpret_cast<ncclMemAllocFunc>(getSymbolAddress(mLibraryHandle, "ncclMemAlloc"));
|
||||
|
||||
if (mNCCLCommWindowRegister == nullptr)
|
||||
{
|
||||
TLLM_LOG_WARNING("Failed to load ncclCommWindowRegister symbol, NCCL symmetric will not be supported.");
|
||||
}
|
||||
|
||||
if (mNCCLMemAlloc)
|
||||
{
|
||||
mIsLoaded = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
TLLM_LOG_WARNING("Failed to load required NCCL symbols");
|
||||
}
|
||||
}
|
||||
catch (std::exception const& e)
|
||||
{
|
||||
TLLM_LOG_WARNING("Exception while loading NCCL library: %s", e.what());
|
||||
}
|
||||
}
|
||||
|
||||
void* NCCLHelper::loadLibraryHandle(char const* libName)
|
||||
{
|
||||
#ifdef _WIN32
|
||||
return LoadLibraryA(libName);
|
||||
#else
|
||||
return dlopen(libName, RTLD_LAZY | RTLD_GLOBAL);
|
||||
#endif
|
||||
}
|
||||
|
||||
void* NCCLHelper::getSymbolAddress(void* handle, char const* symbolName)
|
||||
{
|
||||
if (!handle)
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
return GetProcAddress(static_cast<HMODULE>(handle), symbolName);
|
||||
#else
|
||||
return dlsym(handle, symbolName);
|
||||
#endif
|
||||
}
|
||||
|
||||
NCCLHelper::ncclCommWindowRegisterFunc NCCLHelper::getNCCLCommWindowRegister()
|
||||
{
|
||||
return mNCCLCommWindowRegister;
|
||||
}
|
||||
|
||||
NCCLHelper::ncclMemAllocFunc NCCLHelper::getNCCLMemAlloc()
|
||||
{
|
||||
return mNCCLMemAlloc;
|
||||
}
|
||||
|
||||
bool NCCLHelper::isLoaded() const
|
||||
{
|
||||
return mIsLoaded;
|
||||
}
|
||||
|
||||
bool UserBufferAllocator::use_nccl_symmetric = false;
|
||||
|
||||
}; // namespace tensorrt_llm::runtime::ub
|
||||
|
||||
@ -19,11 +19,6 @@
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
#include "nccl.h"
|
||||
#include "userbuffers.h"
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#else
|
||||
#include <dlfcn.h>
|
||||
#endif
|
||||
#else
|
||||
using ncclWindow_t = void*;
|
||||
#endif
|
||||
@ -69,8 +64,6 @@ public:
|
||||
communicator* comm();
|
||||
virtual UBBuffer registerUBBuffer(size_t bytes);
|
||||
|
||||
static bool use_nccl_symmetric;
|
||||
|
||||
private:
|
||||
communicator* mUbComm;
|
||||
|
||||
@ -80,55 +73,6 @@ protected:
|
||||
tensorrt_llm::runtime::WorldConfig mWorldConfig;
|
||||
};
|
||||
|
||||
class NCCLHelper
|
||||
{
|
||||
public:
|
||||
NCCLHelper();
|
||||
~NCCLHelper();
|
||||
|
||||
// Dynamic loading function type definition
|
||||
using ncclCommWindowRegisterFunc = ncclResult_t (*)(ncclComm_t, void*, size_t, ncclWindow_t*, int);
|
||||
using ncclMemAllocFunc = ncclResult_t (*)(void**, size_t);
|
||||
|
||||
// Get function pointer for ncclCommWindowRegister
|
||||
ncclCommWindowRegisterFunc getNCCLCommWindowRegister();
|
||||
|
||||
// Get function pointer for ncclMemAlloc
|
||||
ncclMemAllocFunc getNCCLMemAlloc();
|
||||
|
||||
// Check if NCCL library is successfully loaded
|
||||
bool isLoaded() const;
|
||||
|
||||
private:
|
||||
void loadNCCLLibrary();
|
||||
void* loadLibraryHandle(char const* libName);
|
||||
void* getSymbolAddress(void* handle, char const* symbolName);
|
||||
|
||||
#ifdef _WIN32
|
||||
HMODULE mLibraryHandle;
|
||||
#else
|
||||
void* mLibraryHandle;
|
||||
#endif
|
||||
|
||||
ncclCommWindowRegisterFunc mNCCLCommWindowRegister;
|
||||
ncclMemAllocFunc mNCCLMemAlloc;
|
||||
bool mIsLoaded;
|
||||
};
|
||||
|
||||
class NCCLUserBufferAllocator : public UserBufferAllocator
|
||||
{
|
||||
public:
|
||||
void initialize(tensorrt_llm::runtime::WorldConfig const& world_config) override;
|
||||
UBBuffer registerUBBuffer(size_t bytes) override;
|
||||
|
||||
// Get shared NCCLHelper instance
|
||||
static NCCLHelper& getNCCLHelper();
|
||||
|
||||
private:
|
||||
std::shared_ptr<ncclComm_t> mComm;
|
||||
static std::unique_ptr<NCCLHelper> mNCCLHelper;
|
||||
};
|
||||
|
||||
#else
|
||||
using communicator = void;
|
||||
#endif
|
||||
|
||||
@@ -14,6 +14,7 @@
 * limitations under the License.
 */
#include "userbuffersManager.h"
#include "tensorrt_llm/common/logger.h"

namespace tensorrt_llm::runtime::ub
{
@@ -29,14 +30,11 @@ UserBuffersManager& UserBuffersManager::get_instance()
    return allocator;
}

void UserBuffersManager::initialize(int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank,
    int64_t gpus_per_node, int64_t buffer_size, bool use_nccl_symmetric)
void UserBuffersManager::initialize(
    int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, int64_t gpus_per_node, int64_t buffer_size)
{
    std::lock_guard<std::mutex> lock(mutex_);
    tensorrt_llm::runtime::WorldConfig world_config(tp_size, pp_size, cp_size, rank, gpus_per_node);
#if ENABLE_MULTI_DEVICE
    UserBufferAllocator::Instance().use_nccl_symmetric = use_nccl_symmetric;
#endif
    tensorrt_llm::runtime::ub::ub_initialize(world_config);
    TLLM_CHECK(tensorrt_llm::runtime::ub::ub_is_initialized());
    buffer_size_ = buffer_size;
@@ -98,11 +96,10 @@ tensorrt_llm::runtime::ub::communicator* UserBuffersManager::comm()
    return tensorrt_llm::runtime::ub::ub_comm();
}

void initialize_userbuffers_manager(int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank,
    int64_t gpus_per_node, int64_t buffer_size, bool use_nccl_symmetric)
void initialize_userbuffers_manager(
    int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, int64_t gpus_per_node, int64_t buffer_size)
{
    UserBuffersManager::get_instance().initialize(
        tp_size, pp_size, cp_size, rank, gpus_per_node, buffer_size, use_nccl_symmetric);
    UserBuffersManager::get_instance().initialize(tp_size, pp_size, cp_size, rank, gpus_per_node, buffer_size);
}

} // namespace tensorrt_llm::runtime::ub
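Call sites drop the trailing flag accordingly. A representative invocation after this change, with placeholder values; the NCCL-symmetric choice now appears to flow through the NCCL_SYMMETRIC allreduce strategy elsewhere in this PR rather than through this call.

#include "userbuffersManager.h"

// Placeholder values for a single 8-GPU node; buffer_size is 64 MiB here.
void initUbForSingleNode()
{
    tensorrt_llm::runtime::ub::initialize_userbuffers_manager(
        /*tp_size=*/8, /*pp_size=*/1, /*cp_size=*/1, /*rank=*/0,
        /*gpus_per_node=*/8, /*buffer_size=*/int64_t{64} << 20);
}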
@ -46,9 +46,8 @@ public:
|
||||
//! @param gpus_per_node The number of GPUs per node.
|
||||
//! @param buffer_size The size of the buffer to allocate. All buffers allocated by this manager will have this
|
||||
//! size.
|
||||
//! @param use_nccl_symmetric Whether to use NCCL symmetric communication.
|
||||
void initialize(int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, int64_t gpus_per_node,
|
||||
int64_t buffer_size, bool use_nccl_symmetric);
|
||||
void initialize(
|
||||
int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, int64_t gpus_per_node, int64_t buffer_size);
|
||||
|
||||
//! @brief Create a UB tensor from the given shape, strides and data type. The function will choose available UB
|
||||
//! buffer or create a new one if no available buffer is found.
|
||||
@ -76,7 +75,7 @@ private:
|
||||
int64_t buffer_size_;
|
||||
};
|
||||
|
||||
void initialize_userbuffers_manager(int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank,
|
||||
int64_t gpus_per_node, int64_t buffer_size, bool use_nccl_symmetric);
|
||||
void initialize_userbuffers_manager(
|
||||
int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, int64_t gpus_per_node, int64_t buffer_size);
|
||||
|
||||
} // namespace tensorrt_llm::runtime::ub
|
||||
|
||||
@ -137,13 +137,12 @@ bool AllreducePlugin::supportsFormatCombination(
|
||||
int pos, nvinfer1::PluginTensorDesc const* inOut, int nbInputs, int nbOutputs) noexcept
|
||||
{
|
||||
int base_inputs = 0;
|
||||
if (mStrategy == AllReduceStrategyType::NCCL || mStrategy == AllReduceStrategyType::UB)
|
||||
switch (mStrategy)
|
||||
{
|
||||
base_inputs = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
base_inputs = 2;
|
||||
case AllReduceStrategyType::NCCL:
|
||||
case AllReduceStrategyType::UB:
|
||||
case AllReduceStrategyType::NCCL_SYMMETRIC: base_inputs = 1; break;
|
||||
default: base_inputs = 2; break;
|
||||
}
|
||||
int fusion_op_extra_inputs = 0;
|
||||
int scale_idx = 0;
|
||||
@ -169,9 +168,15 @@ bool AllreducePlugin::supportsFormatCombination(
|
||||
|
||||
TLLM_CHECK(nbInputs == (base_inputs + fusion_op_extra_inputs));
|
||||
|
||||
if (mStrategy != AllReduceStrategyType::NCCL && mStrategy != AllReduceStrategyType::UB && pos == 1)
|
||||
if (pos == 1)
|
||||
{
|
||||
return (inOut[pos].type == nvinfer1::DataType::kINT64) && (inOut[pos].format == TensorFormat::kLINEAR);
|
||||
switch (mStrategy)
|
||||
{
|
||||
case AllReduceStrategyType::NCCL:
|
||||
case AllReduceStrategyType::UB:
|
||||
case AllReduceStrategyType::NCCL_SYMMETRIC: break;
|
||||
default: return (inOut[pos].type == nvinfer1::DataType::kINT64) && (inOut[pos].format == TensorFormat::kLINEAR);
|
||||
}
|
||||
}
|
||||
if (mStrategy == AllReduceStrategyType::UB)
|
||||
{
|
||||
@ -222,25 +227,26 @@ AllReduceStrategyType AllreducePlugin::selectImplementation(
|
||||
{
|
||||
if (!isAuto)
|
||||
{
|
||||
TLLM_LOG_INFO("Since Peer to Peer not supported, fallback to AllReduceStrategy: NCCL");
|
||||
TLLM_LOG_INFO("Since Peer to Peer not supported, fallback to AllReduceStrategy: NCCL_SYMMETRIC");
|
||||
}
|
||||
else if (forceDeterministic)
|
||||
{
|
||||
TLLM_LOG_WARNING(
|
||||
"Since Peer to Peer not supported, fallback to AllReduceStrategy: NCCL. NCCL might produce "
|
||||
"Since Peer to Peer not supported, fallback to AllReduceStrategy: NCCL_SYMMETRIC. NCCL_SYMMETRIC might "
|
||||
"produce "
|
||||
"non-deterministic results.");
|
||||
}
|
||||
return AllReduceStrategyType::NCCL;
|
||||
return AllReduceStrategyType::NCCL_SYMMETRIC;
|
||||
}
|
||||
|
||||
if (isAuto && !mIsNVLINKSupported && !forceDeterministic)
|
||||
{
|
||||
return AllReduceStrategyType::NCCL;
|
||||
return AllReduceStrategyType::NCCL_SYMMETRIC;
|
||||
}
|
||||
|
||||
auto const maxWorkspaceSize = utils::customAllReduceUtils::getMaxRequiredWorkspaceSize(worldSize);
|
||||
|
||||
AllReduceStrategyType strat = AllReduceStrategyType::NCCL;
|
||||
AllReduceStrategyType strat = AllReduceStrategyType::NCCL_SYMMETRIC;
|
||||
auto const messageSizeBytes = messageSize * common::getDTypeSize(type);
|
||||
|
||||
if (messageSizeBytes <= maxWorkspaceSize)
|
||||
@ -268,7 +274,7 @@ AllReduceStrategyType AllreducePlugin::selectImplementation(
|
||||
}
|
||||
else
|
||||
{
|
||||
strat = AllReduceStrategyType::NCCL;
|
||||
strat = AllReduceStrategyType::NCCL_SYMMETRIC;
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -279,7 +285,7 @@ AllReduceStrategyType AllreducePlugin::selectImplementation(
|
||||
}
|
||||
else
|
||||
{
|
||||
strat = AllReduceStrategyType::NCCL;
|
||||
strat = AllReduceStrategyType::NCCL_SYMMETRIC;
|
||||
}
|
||||
}
|
||||
|
||||
@ -287,30 +293,31 @@ AllReduceStrategyType AllreducePlugin::selectImplementation(
|
||||
{
|
||||
if (!isAuto)
|
||||
{
|
||||
TLLM_LOG_WARNING("Since not aligned, fallback to AllReduceStrategy: NCCL");
|
||||
TLLM_LOG_WARNING("Since not aligned, fallback to AllReduceStrategy: NCCL_SYMMETRIC");
|
||||
}
|
||||
else if (forceDeterministic)
|
||||
{
|
||||
TLLM_LOG_WARNING(
|
||||
"Since not aligned, fallback to AllReduceStrategy: NCCL. NCCL might produce "
|
||||
"Since not aligned, fallback to AllReduceStrategy: NCCL_SYMMETRIC. NCCL_SYMMETRIC might produce "
|
||||
"non-deterministic results.");
|
||||
}
|
||||
strat = AllReduceStrategyType::NCCL;
|
||||
strat = AllReduceStrategyType::NCCL_SYMMETRIC;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!isAuto)
|
||||
{
|
||||
TLLM_LOG_WARNING("Since messageSize > maxWorkspace, fallback to AllReduceStrategy: NCCL");
|
||||
TLLM_LOG_WARNING("Since messageSize > maxWorkspace, fallback to AllReduceStrategy: NCCL_SYMMETRIC");
|
||||
}
|
||||
else if (forceDeterministic)
|
||||
{
|
||||
TLLM_LOG_WARNING(
|
||||
"Since messageSize > maxWorkspace, fallback to AllReduceStrategy: NCCL. NCCL might produce "
|
||||
"Since messageSize > maxWorkspace, fallback to AllReduceStrategy: NCCL_SYMMETRIC. NCCL_SYMMETRIC might "
|
||||
"produce "
|
||||
"non-deterministic results.");
|
||||
}
|
||||
strat = AllReduceStrategyType::NCCL;
|
||||
strat = AllReduceStrategyType::NCCL_SYMMETRIC;
|
||||
}
|
||||
|
||||
return strat;
|
||||
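Condensing the branches above: wherever the custom all-reduce kernels cannot be used, the plugin now falls back to NCCL_SYMMETRIC where it previously chose NCCL. A simplified illustration that deliberately ignores the AUTO, forceDeterministic, and fusion-op handling shown above:

#include <cstddef>

// Simplified decision sketch; not the full selectImplementation().
enum class SketchStrategy
{
    NCCL,
    NCCL_SYMMETRIC,
    ONESHOT
};

SketchStrategy sketchSelect(
    bool p2pSupported, bool nvlinkSupported, bool aligned, std::size_t messageBytes, std::size_t maxWorkspaceBytes)
{
    if (!p2pSupported || !nvlinkSupported)
    {
        return SketchStrategy::NCCL_SYMMETRIC; // previously NCCL
    }
    if (messageBytes > maxWorkspaceBytes || !aligned)
    {
        return SketchStrategy::NCCL_SYMMETRIC; // previously NCCL
    }
    return SketchStrategy::ONESHOT; // custom kernel path (ONESHOT/TWOSHOT chosen by size)
}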
@ -337,6 +344,10 @@ int AllreducePlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfe
|
||||
{
|
||||
runtimeStrategy = AllReduceStrategyType::NCCL;
|
||||
}
|
||||
else if (mStrategy == AllReduceStrategyType::NCCL_SYMMETRIC)
|
||||
{
|
||||
runtimeStrategy = AllReduceStrategyType::NCCL_SYMMETRIC;
|
||||
}
|
||||
else if (mStrategy == AllReduceStrategyType::UB)
|
||||
{
|
||||
runtimeStrategy = AllReduceStrategyType::UB;
|
||||
@ -355,6 +366,11 @@ int AllreducePlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfe
|
||||
TLLM_LOG_DEBUG("AllReducePlugin strategy for rank %d: NCCL", rank);
|
||||
break;
|
||||
}
|
||||
case AllReduceStrategyType::NCCL_SYMMETRIC:
|
||||
{
|
||||
TLLM_LOG_DEBUG("AllReducePlugin strategy for rank %d: NCCL_SYMMETRIC", rank);
|
||||
break;
|
||||
}
|
||||
case AllReduceStrategyType::ONESHOT:
|
||||
{
|
||||
TLLM_LOG_DEBUG("AllReducePlugin strategy for rank %d: ONESHOT", rank);
|
||||
@ -373,14 +389,14 @@ int AllreducePlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfe
|
||||
default: break;
|
||||
}
|
||||
|
||||
if (runtimeStrategy == AllReduceStrategyType::NCCL)
|
||||
if (runtimeStrategy == AllReduceStrategyType::NCCL || runtimeStrategy == AllReduceStrategyType::NCCL_SYMMETRIC)
|
||||
{
|
||||
if (mOp == AllReduceFusionOp::RESIDUAL_RMS_NORM || mOp == AllReduceFusionOp::RESIDUAL_RMS_PREPOST_NORM)
|
||||
{
|
||||
NCCLCHECK(ncclAllReduce(inputs[0], outputs[1], size, (*getDtypeMap())[mType], ncclSum, *mNcclComm, stream));
|
||||
tensorrt_llm::kernels::AllReduceParams params;
|
||||
int fusion_ptr_idx = 0;
|
||||
if (mStrategy == AllReduceStrategyType::NCCL)
|
||||
if (mStrategy == AllReduceStrategyType::NCCL || mStrategy == AllReduceStrategyType::NCCL_SYMMETRIC)
|
||||
{
|
||||
fusion_ptr_idx = 1;
|
||||
}
|
||||
|
||||
@ -15,10 +15,12 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/cudaDriverWrapper.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/common/customAllReduceUtils.h"
|
||||
#include "tensorrt_llm/common/dataType.h"
|
||||
#include "tensorrt_llm/common/mcastDevMemUtils.h"
|
||||
#include "tensorrt_llm/common/ncclUtils.h"
|
||||
#include "tensorrt_llm/common/opUtils.h"
|
||||
#include "tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h"
|
||||
#include "tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.h"
|
||||
@ -39,6 +41,7 @@
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
#include <ATen/cuda/EmptyTensor.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <cuda.h>
|
||||
#include <nccl.h>
|
||||
#include <torch/csrc/distributed/c10d/FileStore.hpp>
|
||||
#include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
|
||||
@ -51,6 +54,7 @@
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <unordered_set>
|
||||
|
||||
// using namespace nvinfer1;
|
||||
@ -238,6 +242,9 @@ public:
|
||||
AllreduceOp(
|
||||
std::set<int> group, nvinfer1::DataType type, AllReduceStrategyType strategy, AllReduceFusionOp op, float eps)
|
||||
: mGroup(std::move(group))
|
||||
, mIsNVLINKSupported(false)
|
||||
, mIsP2PSupported(false)
|
||||
, mIsMNNVLSupported(false)
|
||||
, mType(type)
|
||||
, mStrategy(strategy)
|
||||
, mOp(op)
|
||||
@ -248,6 +255,9 @@ public:
|
||||
AllreduceOp(std::set<int> group, c10::intrusive_ptr<c10d::ProcessGroup> const& process_group_,
|
||||
nvinfer1::DataType type, AllReduceStrategyType strategy, AllReduceFusionOp op, float eps)
|
||||
: mGroup(std::move(group))
|
||||
, mIsNVLINKSupported(false)
|
||||
, mIsP2PSupported(false)
|
||||
, mIsMNNVLSupported(false)
|
||||
, mType(type)
|
||||
, mStrategy(strategy)
|
||||
, mOp(op)
|
||||
@ -437,44 +447,117 @@ private:
|
||||
torch::optional<torch::Tensor> const& residual, torch::optional<torch::Tensor> const& norm_weight,
|
||||
torch::optional<torch::Tensor> const& scale, torch::optional<torch::Tensor> const& bias)
|
||||
{
|
||||
// Handle ProcessGroup path first - cannot extract NCCL comm for window registration
|
||||
// Use ProcessGroup's allreduce directly and return early
|
||||
if (mNcclComm.index() == 1)
|
||||
{
|
||||
auto torchPg = std::get<1>(mNcclComm);
|
||||
|
||||
torch::Tensor reduceOutput = input.clone();
|
||||
std::vector tensors{reduceOutput};
|
||||
PGCHECK_THROW(torchPg->allreduce(tensors, {c10d::ReduceOp::SUM}));
|
||||
|
||||
if (mOp == AllReduceFusionOp::NONE)
|
||||
{
|
||||
return {reduceOutput};
|
||||
}
|
||||
|
||||
// Treat any other patterns as fallback cases.
|
||||
return fallbackRunSubsequentOps(input, residual, norm_weight, scale, bias, reduceOutput);
|
||||
}
|
||||
|
||||
// From here on, we have a raw NCCL comm - can proceed with window registration
|
||||
auto rawComm = std::get<0>(mNcclComm);
|
||||
ncclComm_t comm = *rawComm;
|
||||
TLLM_CHECK_WITH_INFO(comm != nullptr, "NCCL communicator is null");
|
||||
TLLM_LOG_DEBUG("[runNCCLAllReduceSymmetric] Using raw NCCL comm path (not ProcessGroup)");
|
||||
|
||||
using tensorrt_llm::common::nccl_util::NCCLWindowAllocator;
|
||||
using tensorrt_llm::common::nccl_util::createNCCLWindowTensor;
|
||||
|
||||
auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
|
||||
int size = input.numel();
|
||||
auto& ub_manager = tensorrt_llm::runtime::ub::UserBuffersManager::get_instance();
|
||||
auto ub_tensor0 = input;
|
||||
auto ub_buffer0 = ub_manager.search_buffer(input.data_ptr());
|
||||
if (ub_buffer0.invalid())
|
||||
size_t bufferSizeBytes = size * input.element_size();
|
||||
|
||||
// Using unregistered input buffers with NCCL symmetric requires a memcpy.
// This is an overhead introduced by using NCCL_SYMMETRIC instead of NCCL.
// Both the memcpy cost and the perf benefit of NCCL_SYMMETRIC scale linearly with the message size.
// But a local memcpy is cheaper than the remote operations, so the benefit grows stronger with larger
// message sizes. Additionally, the perf benefit scales with the number of ranks, since multimem enables
// O(const.) versus O(N) complexity. Hence we model this cutoff with a linear model. The numbers below were
// obtained on GB200 by scanning different message sizes and rank counts: each rank count yields a single
// message size at which the regression sets in, and the formula below was obtained by fitting a linear
// model to those regression onsets. This empirical heuristic can be overridden with the
// TLLM_NCCL_MIN_REGISTRATION environment variable.
|
||||
double const a = -4986.43478503;
|
||||
double const b = 156716.52177552;
|
||||
int nRanks;
|
||||
NCCLCHECK_THROW(ncclCommCount(comm, &nRanks));
|
||||
size_t minRegistrationThreshold = static_cast<size_t>(std::max(0.0, a * nRanks + b)) * input.element_size();
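// Illustrative sketch only (not part of this change): with the constants fitted above, the element-count
// cutoff max(0, a * nRanks + b) evaluates to roughly 117k elements at 8 ranks, ~77k at 16 ranks, and 0 at
// 32 ranks or more, so on large NVLink domains registration is always considered worthwhile. For a 2-byte
// dtype the 8-rank cutoff corresponds to roughly 230 KB:
//   size_t exampleCutoffBytes = static_cast<size_t>(std::max(0.0, a * 8 + b)) * 2;  // ~233650 bytes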
|
||||
// Disable window registration if neither NVLink nor MNNVL is supported
|
||||
// TODO replace in NCCL 2.29 with comm query
|
||||
if (!mIsNVLINKSupported && !mIsMNNVLSupported)
|
||||
{
|
||||
auto [symmetric_input, symmetric_ub_buffer0]
|
||||
= torch_ext::create_userbuffers_tensor(input.sizes(), input.scalar_type());
|
||||
cudaMemcpyAsync(symmetric_ub_buffer0.addr, input.data_ptr(), size * input.element_size(),
|
||||
cudaMemcpyDeviceToDevice, stream);
|
||||
ub_buffer0 = symmetric_ub_buffer0;
|
||||
ub_tensor0 = symmetric_input;
|
||||
minRegistrationThreshold = std::numeric_limits<size_t>::max();
|
||||
}
|
||||
char const* envThreshold = std::getenv("TLLM_NCCL_MIN_REGISTRATION");
|
||||
if (envThreshold != nullptr)
|
||||
{
|
||||
minRegistrationThreshold = static_cast<size_t>(std::atoi(envThreshold)) * input.element_size();
|
||||
}
|
||||
|
||||
TLLM_CHECK(!ub_buffer0.invalid());
|
||||
auto [norm_out, ub_buffer1] = torch_ext::create_userbuffers_tensor(input.sizes(), input.scalar_type());
|
||||
// Search for existing buffer
|
||||
auto& allocator = NCCLWindowAllocator::getInstance();
|
||||
auto windowBuffer0 = allocator.searchBuffer(comm, input.data_ptr());
|
||||
|
||||
std::visit(overloaded{[&, norm_out_ = norm_out](std::shared_ptr<ncclComm_t>& rawComm)
|
||||
{
|
||||
NCCLCHECK_THROW(ncclAllReduce(ub_buffer0.addr, norm_out_.mutable_data_ptr(), size,
|
||||
(*getDtypeMap())[mType], ncclSum, *rawComm, stream));
|
||||
},
|
||||
[&, norm_out_ = norm_out](c10::intrusive_ptr<c10d::ProcessGroup>& torchPg)
|
||||
{
|
||||
PGCHECK_THROW(PgHelper{torchPg}.allreduce(ub_tensor0, {c10d::ReduceOp::SUM}));
|
||||
std::ignore = norm_out_.copy_(ub_tensor0, true);
|
||||
}},
|
||||
mNcclComm);
|
||||
torch::Tensor inputTensor = input;
|
||||
void* inputPtr = input.data_ptr();
|
||||
|
||||
// If buffer is not registered, decide whether to register based on size
|
||||
if (!windowBuffer0.isValid())
|
||||
{
|
||||
if (bufferSizeBytes < minRegistrationThreshold)
|
||||
{
|
||||
// Small buffer: use input directly without window registration
|
||||
TLLM_LOG_DEBUG(
|
||||
"[runNCCLAllReduceSymmetric] Buffer size %zu bytes < threshold %zu bytes, "
|
||||
"skipping window registration",
|
||||
bufferSizeBytes, minRegistrationThreshold);
|
||||
// inputTensor and inputPtr remain pointing to original input
|
||||
}
|
||||
else
|
||||
{
|
||||
// Large buffer: create window buffer and copy input (can swap inputTensor reference)
|
||||
auto [symmetricInput, symmetricBuffer0]
|
||||
= createNCCLWindowTensor(comm, input.sizes(), input.scalar_type());
|
||||
TLLM_CUDA_CHECK(cudaMemcpyAsync(
|
||||
symmetricBuffer0.ptr, input.data_ptr(), bufferSizeBytes, cudaMemcpyDeviceToDevice, stream));
|
||||
windowBuffer0 = symmetricBuffer0;
|
||||
inputTensor = symmetricInput; // Swap to window-backed tensor
|
||||
inputPtr = windowBuffer0.ptr;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Buffer already registered - use it directly
|
||||
inputPtr = windowBuffer0.ptr;
|
||||
}
|
||||
|
||||
// Use window-backed output buffer
|
||||
auto [normOut, windowBuffer1] = createNCCLWindowTensor(comm, input.sizes(), input.scalar_type());
|
||||
torch::Tensor outputTensor = normOut;
|
||||
void* outputPtr = windowBuffer1.ptr;
|
||||
|
||||
// Perform allreduce
|
||||
NCCLCHECK_THROW(ncclAllReduce(inputPtr, outputPtr, size, (*getDtypeMap())[mType], ncclSum, comm, stream));
|
||||
|
||||
if (mOp == AllReduceFusionOp::NONE)
|
||||
{
|
||||
return {norm_out};
|
||||
return {outputTensor};
|
||||
}
|
||||
|
||||
// Treat any other patterns as fallback cases.
|
||||
return fallbackRunSubsequentOps(input, residual, norm_weight, scale, bias, norm_out);
|
||||
return fallbackRunSubsequentOps(input, residual, norm_weight, scale, bias, outputTensor);
|
||||
}
|
||||
|
||||
std::vector<torch::Tensor> runLowPrecisionAllReduce(torch::Tensor const& input,
|
||||
@ -799,16 +882,104 @@ private:
|
||||
|
||||
void initGroupTopology()
|
||||
{
|
||||
static std::map<std::set<int>, std::tuple<bool, bool>> cache;
|
||||
static std::map<std::set<int>, std::tuple<bool, bool, bool>> cache;
|
||||
if (cache.find(mGroup) != cache.end())
|
||||
{
|
||||
auto [is_NVLINK_supported, is_P2P_supported] = cache[mGroup];
|
||||
auto [is_NVLINK_supported, is_P2P_supported, is_MNNVL_supported] = cache[mGroup];
|
||||
mIsNVLINKSupported = is_NVLINK_supported;
|
||||
mIsP2PSupported = is_P2P_supported;
|
||||
mIsMNNVLSupported = is_MNNVL_supported;
|
||||
return;
|
||||
}
|
||||
setGroupTopology();
|
||||
cache[mGroup] = {mIsNVLINKSupported, mIsP2PSupported};
|
||||
cache[mGroup] = {mIsNVLINKSupported, mIsP2PSupported, mIsMNNVLSupported};
|
||||
}
|
||||
|
||||
bool checkMNNVLSupport(int device_id)
|
||||
{
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
// 1. Check CUDA driver version (needs >= 12.1, i.e. 12010 in CUDA's 1000*major + 10*minor encoding)
|
||||
int cuda_driver_version = -1;
|
||||
TLLM_CUDA_CHECK(cudaDriverGetVersion(&cuda_driver_version));
|
||||
if (cuda_driver_version < 12010)
|
||||
{
|
||||
TLLM_LOG_DEBUG("MNNVL check: CUDA Driver version %d < 12010", cuda_driver_version);
|
||||
return false;
|
||||
}
|
||||
|
||||
// 2. Check multicast support
|
||||
CUdevice cu_device;
|
||||
TLLM_CU_CHECK(cuDeviceGet(&cu_device, device_id));
|
||||
auto cuda_driver = tensorrt_llm::common::CUDADriverWrapper::getInstance();
|
||||
|
||||
int multicast_supported = 0;
|
||||
TLLM_CU_CHECK(cuda_driver->cuDeviceGetAttribute(
|
||||
&multicast_supported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, cu_device));
|
||||
if (!multicast_supported)
|
||||
{
|
||||
TLLM_LOG_DEBUG("MNNVL check: Device %d does not support multicast", device_id);
|
||||
return false;
|
||||
}
|
||||
|
||||
// 3. Check fabric handle support
|
||||
int fabric_handle_supported = 0;
|
||||
TLLM_CU_CHECK(cuda_driver->cuDeviceGetAttribute(
|
||||
&fabric_handle_supported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, cu_device));
|
||||
if (!fabric_handle_supported)
|
||||
{
|
||||
TLLM_LOG_DEBUG("MNNVL check: Device %d does not support fabric handles", device_id);
|
||||
return false;
|
||||
}
|
||||
|
||||
// 4. Check NVML GPU Fabric Info
|
||||
nvmlDevice_t nvml_device;
|
||||
NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(device_id, &nvml_device));
|
||||
|
||||
nvmlGpuFabricInfo_t fabric_info;
|
||||
NVML_CHECK_THROW(nvmlDeviceGetGpuFabricInfo(nvml_device, &fabric_info));
|
||||
|
||||
// Check if fabric is fully initialized
|
||||
if (fabric_info.state != NVML_GPU_FABRIC_STATE_COMPLETED || fabric_info.status != NVML_SUCCESS)
|
||||
{
|
||||
TLLM_LOG_DEBUG(
|
||||
"MNNVL check: Fabric state not complete - state=%u status=%u", fabric_info.state, fabric_info.status);
|
||||
return false;
|
||||
}
|
||||
|
||||
// 5. Check NVLink links are active (similar to Python support_nvlink(True))
|
||||
unsigned int active_links = 0;
|
||||
unsigned int available_links = 0;
|
||||
|
||||
for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++)
|
||||
{
|
||||
unsigned int cap_p2p = 0;
|
||||
nvmlReturn_t cap_result
|
||||
= nvmlDeviceGetNvLinkCapability(nvml_device, link, NVML_NVLINK_CAP_P2P_SUPPORTED, &cap_p2p);
|
||||
if (cap_result == NVML_SUCCESS && cap_p2p)
|
||||
{
|
||||
available_links++;
|
||||
nvmlEnableState_t link_state;
|
||||
if (nvmlDeviceGetNvLinkState(nvml_device, link, &link_state) == NVML_SUCCESS
|
||||
&& link_state == NVML_FEATURE_ENABLED)
|
||||
{
|
||||
active_links++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool all_links_up = (active_links == available_links && available_links > 0);
|
||||
if (!all_links_up)
|
||||
{
|
||||
TLLM_LOG_DEBUG(
|
||||
"MNNVL check: Not all NVLink links active - active=%u available=%u", active_links, available_links);
|
||||
return false;
|
||||
}
|
||||
|
||||
TLLM_LOG_INFO("MNNVL check: Device %d supports MNNVL (fabric_clique=%u)", device_id, fabric_info.cliqueId);
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
void setGroupTopology()
|
||||
@ -820,108 +991,190 @@ private:
|
||||
[&](c10::intrusive_ptr<c10d::ProcessGroup>& torchPg) { return getLocalGroupTorch(mGroup); }},
|
||||
mNcclComm);
|
||||
|
||||
if (mGroup.size() != local_group.size())
|
||||
{
|
||||
mIsP2PSupported = false;
|
||||
mIsNVLINKSupported = false;
|
||||
TLLM_LOG_INFO("Found inter-node TP group for rank %d", rank);
|
||||
return;
|
||||
}
|
||||
TLLM_LOG_INFO("TP group is intra-node for rank %d", rank);
|
||||
bool is_inter_node = (mGroup.size() != local_group.size());
|
||||
|
||||
NvmlManager nvml_manager;
|
||||
mIsP2PSupported = true;
|
||||
mIsNVLINKSupported = true;
|
||||
mIsMNNVLSupported = false;
|
||||
|
||||
// TODO(ytong): Should we provide group topology info instead of querying it here?
|
||||
// Use cudaDeviceCanAccessPeer to determine whether p2p is supported,
|
||||
// and use nvml to determine whether there are nvlink links between ranks.
|
||||
for (int first_device_id : local_group)
|
||||
// First, check NVLink within local group (intra-node)
|
||||
if (!local_group.empty())
|
||||
{
|
||||
for (int second_device_id : local_group)
|
||||
for (int first_device_id : local_group)
|
||||
{
|
||||
if (first_device_id >= second_device_id)
|
||||
for (int second_device_id : local_group)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
int can_access_peer = 0;
|
||||
TLLM_CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, first_device_id, second_device_id));
|
||||
|
||||
if (!can_access_peer)
|
||||
{
|
||||
mIsP2PSupported = false;
|
||||
mIsNVLINKSupported = false;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
nvmlDevice_t first_device;
|
||||
NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(first_device_id, &first_device));
|
||||
|
||||
bool is_NVLINK = false;
|
||||
|
||||
for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++)
|
||||
{
|
||||
nvmlPciInfo_t remote_pci_info;
|
||||
if (nvmlDeviceGetNvLinkRemotePciInfo_v2(first_device, link, &remote_pci_info) != NVML_SUCCESS)
|
||||
if (first_device_id >= second_device_id)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
nvmlDevice_t remote_device;
|
||||
auto const result = nvmlDeviceGetHandleByPciBusId_v2(remote_pci_info.busId, &remote_device);
|
||||
int can_access_peer = 0;
|
||||
TLLM_CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, first_device_id, second_device_id));
|
||||
|
||||
if (result == NVML_SUCCESS)
|
||||
if (!can_access_peer)
|
||||
{
|
||||
// Two GPUs are connected directly through nvlink
|
||||
unsigned int remote_device_id;
|
||||
NVML_CHECK_THROW(nvmlDeviceGetIndex(remote_device, &remote_device_id));
|
||||
|
||||
if (remote_device_id == static_cast<unsigned int>(second_device_id))
|
||||
{
|
||||
is_NVLINK = true;
|
||||
}
|
||||
mIsP2PSupported = false;
|
||||
mIsNVLINKSupported = false;
|
||||
TLLM_LOG_INFO(
|
||||
"P2P not supported between local devices %d and %d", first_device_id, second_device_id);
|
||||
// Continue checking other pairs, but mark as not supported
|
||||
continue;
|
||||
}
|
||||
else if (result == NVML_ERROR_NOT_FOUND)
|
||||
|
||||
nvmlDevice_t first_device;
|
||||
NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(first_device_id, &first_device));
|
||||
|
||||
bool is_NVLINK = false;
|
||||
|
||||
for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++)
|
||||
{
|
||||
// Maybe Two GPUs are connected via nvswitch,
|
||||
// now remotePciInfo represents the pci information of nvswitch,
|
||||
// determine whether nvlink is supported by whether two GPUs are connected to the same
|
||||
// nvswitch.
|
||||
nvmlDevice_t second_device;
|
||||
NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(second_device_id, &second_device));
|
||||
|
||||
for (unsigned int second_link = 0; second_link < NVML_NVLINK_MAX_LINKS; second_link++)
|
||||
nvmlPciInfo_t remote_pci_info;
|
||||
if (nvmlDeviceGetNvLinkRemotePciInfo_v2(first_device, link, &remote_pci_info) != NVML_SUCCESS)
|
||||
{
|
||||
nvmlPciInfo_t second_remote_pci_info;
|
||||
if (nvmlDeviceGetNvLinkRemotePciInfo_v2(second_device, second_link, &second_remote_pci_info)
|
||||
!= NVML_SUCCESS)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (strcmp(remote_pci_info.busId, second_remote_pci_info.busId) == 0)
|
||||
nvmlDevice_t remote_device;
|
||||
auto const result = nvmlDeviceGetHandleByPciBusId_v2(remote_pci_info.busId, &remote_device);
|
||||
|
||||
if (result == NVML_SUCCESS)
|
||||
{
|
||||
// Two GPUs are connected directly through nvlink
|
||||
unsigned int remote_device_id;
|
||||
NVML_CHECK_THROW(nvmlDeviceGetIndex(remote_device, &remote_device_id));
|
||||
|
||||
if (remote_device_id == static_cast<unsigned int>(second_device_id))
|
||||
{
|
||||
is_NVLINK = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
NVML_CHECK_THROW(result);
|
||||
else if (result == NVML_ERROR_NOT_FOUND)
|
||||
{
|
||||
// Maybe Two GPUs are connected via nvswitch,
|
||||
// now remotePciInfo represents the pci information of nvswitch,
|
||||
// determine whether nvlink is supported by whether two GPUs are connected to the same
|
||||
// nvswitch.
|
||||
nvmlDevice_t second_device;
|
||||
NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(second_device_id, &second_device));
|
||||
|
||||
for (unsigned int second_link = 0; second_link < NVML_NVLINK_MAX_LINKS; second_link++)
|
||||
{
|
||||
nvmlPciInfo_t second_remote_pci_info;
|
||||
if (nvmlDeviceGetNvLinkRemotePciInfo_v2(
|
||||
second_device, second_link, &second_remote_pci_info)
|
||||
!= NVML_SUCCESS)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if (strcmp(remote_pci_info.busId, second_remote_pci_info.busId) == 0)
|
||||
{
|
||||
is_NVLINK = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
NVML_CHECK_THROW(result);
|
||||
}
|
||||
|
||||
if (is_NVLINK)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (is_NVLINK)
|
||||
{
|
||||
break;
|
||||
}
|
||||
mIsNVLINKSupported &= is_NVLINK;
|
||||
}
|
||||
|
||||
mIsNVLINKSupported &= is_NVLINK;
|
||||
}
|
||||
}
|
||||
|
||||
// For inter-node groups, check MNNVL support
|
||||
if (is_inter_node)
|
||||
{
|
||||
TLLM_LOG_INFO("Found inter-node TP group for rank %d, checking MNNVL support", rank);
|
||||
|
||||
// Check MNNVL support on local device(s)
|
||||
bool local_mnnvl_supported = false;
|
||||
if (!local_group.empty())
|
||||
{
|
||||
// Check MNNVL on first device in local group (all devices on same node should have same MNNVL status)
|
||||
int check_device = *local_group.begin();
|
||||
local_mnnvl_supported = checkMNNVLSupport(check_device);
|
||||
}
|
||||
|
||||
// Gather MNNVL status from all ranks in the group
|
||||
int local_mnnvl_status = local_mnnvl_supported ? 1 : 0;
|
||||
std::vector<int> all_mnnvl_status(mGroup.size());
|
||||
|
||||
std::visit(overloaded{[&](std::shared_ptr<ncclComm_t>& comm_ptr)
|
||||
{
|
||||
// For NCCL comm, use MPI to gather status
|
||||
// Use MPI allgather to collect MNNVL status
|
||||
// Create a sub-communicator for the group
|
||||
std::vector<int> group_ranks(mGroup.begin(), mGroup.end());
|
||||
MPI_Group world_group, new_group;
|
||||
MPI_Comm group_comm;
|
||||
MPI_Comm_group(COMM_SESSION, &world_group);
|
||||
MPI_Group_incl(world_group, group_ranks.size(), group_ranks.data(), &new_group);
|
||||
MPI_Comm_create_group(COMM_SESSION, new_group, 0, &group_comm);
|
||||
|
||||
if (group_comm != MPI_COMM_NULL)
|
||||
{
|
||||
MPI_Allgather(&local_mnnvl_status, 1, MPI_INT, all_mnnvl_status.data(), 1, MPI_INT,
|
||||
group_comm);
|
||||
MPI_Comm_free(&group_comm);
|
||||
}
|
||||
MPI_Group_free(&new_group);
|
||||
MPI_Group_free(&world_group);
|
||||
},
|
||||
[&](c10::intrusive_ptr<c10d::ProcessGroup>& torchPg)
|
||||
{
|
||||
// For ProcessGroup, use allgather directly
|
||||
// Note: This assumes the ProcessGroup is already set up for the correct group
|
||||
std::vector<torch::Tensor> input_tensors
|
||||
= {torch::tensor({local_mnnvl_status}, torch::kInt32)};
|
||||
std::vector<std::vector<torch::Tensor>> output_tensors(1);
|
||||
output_tensors[0].resize(mGroup.size());
|
||||
auto work = torchPg->allgather(output_tensors, input_tensors);
|
||||
if (work)
|
||||
{
|
||||
work->wait();
|
||||
for (size_t i = 0; i < mGroup.size(); ++i)
|
||||
{
|
||||
all_mnnvl_status[i] = output_tensors[0][i].item<int>();
|
||||
}
|
||||
}
|
||||
}},
|
||||
mNcclComm);
|
||||
|
||||
// Check if all ranks support MNNVL
|
||||
bool all_ranks_support_mnnvl = true;
|
||||
for (int status : all_mnnvl_status)
|
||||
{
|
||||
if (status == 0)
|
||||
{
|
||||
all_ranks_support_mnnvl = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// For inter-node: MNNVL support means all nodes have MNNVL
|
||||
// Also need local NVLink for optimal performance
|
||||
mIsMNNVLSupported = mIsNVLINKSupported && all_ranks_support_mnnvl;
|
||||
mIsP2PSupported = false; // P2P doesn't work across nodes
|
||||
|
||||
TLLM_LOG_INFO("Inter-node topology: local_NVLink=%d, local_MNNVL=%d, all_ranks_MNNVL=%d, final_MNNVL=%d",
|
||||
mIsNVLINKSupported ? 1 : 0, local_mnnvl_status, all_ranks_support_mnnvl ? 1 : 0,
|
||||
mIsMNNVLSupported ? 1 : 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
TLLM_LOG_INFO("TP group is intra-node for rank %d", rank);
|
||||
}
|
||||
}
|
||||
|
||||
AllReduceStrategyType selectImplementation(size_t seq_len, size_t hidden_size)
|
||||
@ -951,12 +1204,12 @@ private:
|
||||
|
||||
if (ifFallbackToNCCL(seq_len, message_size_bytes, max_workspace_size))
|
||||
{
|
||||
return AllReduceStrategyType::NCCL;
|
||||
return AllReduceStrategyType::NCCL_SYMMETRIC;
|
||||
}
|
||||
|
||||
// This rule based heuristic only chooses between NCCL and MIN_LATENCY strategies.
|
||||
// From this point, all fusion patterns are supported by all these strategies: NCCL, ONESHOT, TWOSHOT and
|
||||
// MIN_LATENCY.
|
||||
// This rule based heuristic only chooses between NCCL_SYMMETRIC and MIN_LATENCY strategies.
|
||||
// From this point, all fusion patterns are supported by all these strategies: NCCL_SYMMETRIC, ONESHOT, TWOSHOT
|
||||
// and MIN_LATENCY.
|
||||
if (mStrategy != AllReduceStrategyType::AUTO)
|
||||
{
|
||||
// Check TWOSHOT constraint: seq_len >= tp_size
|
||||
@ -973,12 +1226,11 @@ private:
|
||||
return tensorrt_llm::utils::customAllReduceUtils::selectStrategyLookUpTable(
|
||||
seq_len, hidden_size, mOp, mGroup.size());
|
||||
}
|
||||
return AllReduceStrategyType::NCCL;
|
||||
}
|
||||
|
||||
bool ifFallbackToNCCL(size_t seq_len, size_t message_size_bytes, size_t max_workspace_size)
|
||||
{
|
||||
// If messageSize is less than maxWorkspaceSize, use NCCL, regardless of the fusion type.
|
||||
// If messageSize is greater than maxWorkspaceSize or topology is unsuitable, use NCCL_SYMMETRIC fallback.
|
||||
if (message_size_bytes > max_workspace_size || !mIsP2PSupported || !mIsNVLINKSupported)
|
||||
{
|
||||
return true;
|
||||
@ -1006,6 +1258,7 @@ private:
|
||||
std::set<int> mGroup;
|
||||
bool mIsNVLINKSupported;
|
||||
bool mIsP2PSupported;
|
||||
bool mIsMNNVLSupported;
|
||||
nvinfer1::DataType mType;
|
||||
AllReduceStrategyType mStrategy;
|
||||
AllReduceFusionOp mOp;
|
||||
|
||||
@ -20,3 +20,9 @@ target_link_libraries(cacheTransceiverTest PRIVATE ${Python3_LIBRARIES})
|
||||
|
||||
add_gtest(mpiUtilsTest mpiUtilsTest.cpp)
|
||||
add_gtest(userBufferTest userBufferTest.cpp)
|
||||
add_gtest(ncclUtilsTest ncclUtilsTest.cpp)
|
||||
target_link_libraries(ncclUtilsTest PRIVATE ${Python3_LIBRARIES})
|
||||
if(BUILD_PYT)
|
||||
target_compile_definitions(ncclUtilsTest PUBLIC BUILD_PYT)
|
||||
target_link_libraries(ncclUtilsTest PUBLIC ${TORCH_LIBRARIES})
|
||||
endif()
|
||||
|
||||
cpp/tests/unit_tests/multi_gpu/ncclUtilsTest.cpp (new file, 745 lines)
@ -0,0 +1,745 @@
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/ncclUtils.h"
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include "tensorrt_llm/common/opUtils.h"
|
||||
#include "tensorrt_llm/runtime/utils/mpiUtils.h"
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <nccl.h>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
#if ENABLE_MULTI_DEVICE && BUILD_PYT
|
||||
#include <torch/extension.h>
|
||||
#endif
|
||||
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
|
||||
namespace mpi = tensorrt_llm::mpi;
|
||||
namespace tr = tensorrt_llm::runtime;
|
||||
namespace nccl_util = tensorrt_llm::common::nccl_util;
|
||||
|
||||
using ::getComm;
|
||||
|
||||
// Helper function to create a split communicator for testing
|
||||
// This allows us to test cleanup behavior explicitly by controlling the lifetime
|
||||
std::shared_ptr<ncclComm_t> createSplitComm(ncclComm_t parentComm, int color, int key)
|
||||
{
|
||||
ncclComm_t newComm;
|
||||
ncclResult_t result = ncclCommSplit(parentComm, color, key, &newComm, nullptr);
|
||||
if (result != ncclSuccess)
|
||||
{
|
||||
TLLM_THROW("ncclCommSplit failed with error: %d", result);
|
||||
}
|
||||
|
||||
// Create a shared_ptr with custom deleter that cleans up resources first
|
||||
return std::shared_ptr<ncclComm_t>(new ncclComm_t(newComm),
|
||||
[](ncclComm_t* comm)
|
||||
{
|
||||
if (comm && *comm)
|
||||
{
|
||||
// STEP 1: Clean up all registered resources FIRST
|
||||
tensorrt_llm::common::nccl_util::NcclCommResourceManager::getInstance().cleanupResources(*comm);
|
||||
|
||||
// STEP 2: Now destroy the NCCL communicator
|
||||
ncclResult_t result = ncclCommDestroy(*comm);
|
||||
if (result != ncclSuccess)
|
||||
{
|
||||
TLLM_LOG_WARNING("ncclCommDestroy failed with error: %d", result);
|
||||
}
|
||||
|
||||
// STEP 3: Free the memory
|
||||
delete comm;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
//==============================================================================
|
||||
// NcclCommResourceManager Tests
|
||||
//==============================================================================
|
||||
|
||||
class NcclCommResourceManagerTest : public ::testing::Test
|
||||
{
|
||||
protected:
|
||||
void SetUp() override
|
||||
{
|
||||
auto& comm = mpi::MpiComm::world();
|
||||
mWorldSize = comm.getSize();
|
||||
mRank = comm.getRank();
|
||||
|
||||
if (mWorldSize < 2)
|
||||
{
|
||||
GTEST_SKIP() << "Requires at least 2 ranks (got " << mWorldSize << ")";
|
||||
}
|
||||
|
||||
// Set CUDA device for this rank (required before NCCL initialization)
|
||||
int deviceCount = 0;
|
||||
TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
|
||||
if (deviceCount > 0)
|
||||
{
|
||||
int deviceId = mRank % deviceCount;
|
||||
TLLM_CUDA_CHECK(cudaSetDevice(deviceId));
|
||||
}
|
||||
|
||||
// Create a communicator for testing
|
||||
std::set<int> group;
|
||||
for (int i = 0; i < mWorldSize; ++i)
|
||||
{
|
||||
group.insert(i);
|
||||
}
|
||||
mComm = getComm(group);
|
||||
}
|
||||
|
||||
void TearDown() override
|
||||
{
|
||||
// Communicator cleanup happens automatically via shared_ptr deleter
|
||||
mComm.reset();
|
||||
}
|
||||
|
||||
int mWorldSize;
|
||||
int mRank;
|
||||
std::shared_ptr<ncclComm_t> mComm;
|
||||
};
|
||||
|
||||
TEST_F(NcclCommResourceManagerTest, ResourceRegistration)
|
||||
{
|
||||
auto& manager = nccl_util::NcclCommResourceManager::getInstance();
|
||||
|
||||
// Create a separate comm using split for this test
|
||||
auto testComm = createSplitComm(*mComm, 0, mRank);
|
||||
|
||||
// Register a resource
|
||||
bool cleanupCalled = false;
|
||||
manager.registerResource(
|
||||
*testComm, [&cleanupCalled]() { cleanupCalled = true; }, "TestResource");
|
||||
|
||||
EXPECT_TRUE(manager.hasResources(*testComm));
|
||||
EXPECT_EQ(manager.getResourceCount(*testComm), 1);
|
||||
EXPECT_FALSE(cleanupCalled); // Cleanup not called yet
|
||||
|
||||
// Store the raw comm value before destruction
|
||||
ncclComm_t rawComm = *testComm;
|
||||
|
||||
// Cleanup should be called when comm is destroyed
|
||||
testComm.reset();
|
||||
|
||||
// Verify cleanup was called
|
||||
EXPECT_TRUE(cleanupCalled);
|
||||
|
||||
// Verify cleanup: check that the old comm (now destroyed) no longer has resources
|
||||
// Note: The comm is destroyed, but we can still check the manager's internal state
|
||||
// The cleanup should have removed all resources for this comm
|
||||
EXPECT_FALSE(manager.hasResources(rawComm));
|
||||
EXPECT_EQ(manager.getResourceCount(rawComm), 0);
|
||||
}
|
||||
|
||||
TEST_F(NcclCommResourceManagerTest, MultipleResources)
|
||||
{
|
||||
auto& manager = nccl_util::NcclCommResourceManager::getInstance();
|
||||
|
||||
// Create a separate comm using split for this test
|
||||
auto testComm = createSplitComm(*mComm, 0, mRank);
|
||||
|
||||
std::vector<int> cleanupOrder;
|
||||
manager.registerResource(
|
||||
*testComm, [&cleanupOrder]() { cleanupOrder.push_back(1); }, "Resource1");
|
||||
manager.registerResource(
|
||||
*testComm, [&cleanupOrder]() { cleanupOrder.push_back(2); }, "Resource2");
|
||||
manager.registerResource(
|
||||
*testComm, [&cleanupOrder]() { cleanupOrder.push_back(3); }, "Resource3");
|
||||
|
||||
EXPECT_EQ(manager.getResourceCount(*testComm), 3);
|
||||
|
||||
// Cleanup order should be preserved - destroy comm and verify order
|
||||
testComm.reset();
|
||||
|
||||
// Verify cleanup order was preserved (1, 2, 3)
|
||||
EXPECT_EQ(cleanupOrder.size(), 3);
|
||||
EXPECT_EQ(cleanupOrder[0], 1);
|
||||
EXPECT_EQ(cleanupOrder[1], 2);
|
||||
EXPECT_EQ(cleanupOrder[2], 3);
|
||||
}
|
||||
|
||||
TEST_F(NcclCommResourceManagerTest, ResourceCount)
|
||||
{
|
||||
auto& manager = nccl_util::NcclCommResourceManager::getInstance();
|
||||
|
||||
// Create a separate comm using split for this test
|
||||
auto testComm = createSplitComm(*mComm, 0, mRank);
|
||||
|
||||
EXPECT_FALSE(manager.hasResources(*testComm));
|
||||
EXPECT_EQ(manager.getResourceCount(*testComm), 0);
|
||||
|
||||
manager.registerResource(
|
||||
*testComm, []() {}, "Test1");
|
||||
EXPECT_EQ(manager.getResourceCount(*testComm), 1);
|
||||
|
||||
manager.registerResource(
|
||||
*testComm, []() {}, "Test2");
|
||||
EXPECT_EQ(manager.getResourceCount(*testComm), 2);
|
||||
|
||||
testComm.reset();
|
||||
}
|
||||
|
||||
//==============================================================================
|
||||
// NCCLWindowAllocator Tests
|
||||
//==============================================================================
|
||||
|
||||
class NCCLWindowAllocatorTest : public ::testing::Test
|
||||
{
|
||||
protected:
|
||||
void SetUp() override
|
||||
{
|
||||
auto& comm = mpi::MpiComm::world();
|
||||
mWorldSize = comm.getSize();
|
||||
mRank = comm.getRank();
|
||||
|
||||
if (mWorldSize < 2)
|
||||
{
|
||||
GTEST_SKIP() << "Requires at least 2 ranks (got " << mWorldSize << ")";
|
||||
}
|
||||
|
||||
// Set CUDA device for this rank (required before NCCL initialization)
|
||||
int deviceCount = 0;
|
||||
TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
|
||||
if (deviceCount > 0)
|
||||
{
|
||||
int deviceId = mRank % deviceCount;
|
||||
TLLM_CUDA_CHECK(cudaSetDevice(deviceId));
|
||||
}
|
||||
|
||||
// Check if NCCL symmetric is supported
|
||||
auto& ncclHelper = nccl_util::NCCLHelper::getInstance();
|
||||
if (!ncclHelper.isLoaded())
|
||||
{
|
||||
GTEST_SKIP() << "NCCL library with symmetric memory support is not available";
|
||||
}
|
||||
|
||||
std::set<int> group;
|
||||
for (int i = 0; i < mWorldSize; ++i)
|
||||
{
|
||||
group.insert(i);
|
||||
}
|
||||
mComm = getComm(group);
|
||||
}
|
||||
|
||||
void TearDown() override
|
||||
{
|
||||
// Cleanup happens automatically
|
||||
mComm.reset();
|
||||
}
|
||||
|
||||
int mWorldSize;
|
||||
int mRank;
|
||||
std::shared_ptr<ncclComm_t> mComm;
|
||||
};
|
||||
|
||||
TEST_F(NCCLWindowAllocatorTest, BasicAllocation)
|
||||
{
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
|
||||
const size_t bufferSize = 1024 * 1024; // 1MB
|
||||
auto buffer = allocator.requestBuffer(*mComm, bufferSize);
|
||||
|
||||
EXPECT_TRUE(buffer.isValid());
|
||||
EXPECT_NE(buffer.ptr, nullptr);
|
||||
EXPECT_NE(buffer.window, nullptr);
|
||||
EXPECT_EQ(buffer.size, bufferSize);
|
||||
EXPECT_GE(buffer.handle, 0);
|
||||
|
||||
// Verify we can search for it
|
||||
auto found = allocator.searchBuffer(*mComm, buffer.ptr);
|
||||
EXPECT_TRUE(found.isValid());
|
||||
EXPECT_EQ(found.ptr, buffer.ptr);
|
||||
|
||||
// Release the buffer
|
||||
allocator.releaseBuffer(*mComm, buffer.ptr);
|
||||
}
|
||||
|
||||
TEST_F(NCCLWindowAllocatorTest, BufferReuse)
|
||||
{
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
|
||||
const size_t bufferSize = 512 * 1024; // 512KB
|
||||
|
||||
// Allocate first buffer
|
||||
auto buffer1 = allocator.requestBuffer(*mComm, bufferSize);
|
||||
EXPECT_TRUE(buffer1.isValid());
|
||||
void* ptr1 = buffer1.ptr;
|
||||
|
||||
// Release it
|
||||
allocator.releaseBuffer(*mComm, ptr1);
|
||||
|
||||
// Request another buffer of the same size - should reuse
|
||||
auto buffer2 = allocator.requestBuffer(*mComm, bufferSize);
|
||||
EXPECT_TRUE(buffer2.isValid());
|
||||
EXPECT_EQ(buffer2.ptr, ptr1); // Should be the same buffer
|
||||
|
||||
allocator.releaseBuffer(*mComm, buffer2.ptr);
|
||||
}
|
||||
|
||||
TEST_F(NCCLWindowAllocatorTest, BestFitReuse)
|
||||
{
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
|
||||
// Allocate buffers of different sizes
|
||||
auto buffer1MB = allocator.requestBuffer(*mComm, 1024 * 1024);
|
||||
auto buffer2MB = allocator.requestBuffer(*mComm, 2 * 1024 * 1024);
|
||||
auto buffer512KB = allocator.requestBuffer(*mComm, 512 * 1024);
|
||||
|
||||
void* ptr1MB = buffer1MB.ptr;
|
||||
void* ptr2MB = buffer2MB.ptr;
|
||||
void* ptr512KB = buffer512KB.ptr;
|
||||
|
||||
// Release all
|
||||
allocator.releaseBuffer(*mComm, ptr1MB);
|
||||
allocator.releaseBuffer(*mComm, ptr2MB);
|
||||
allocator.releaseBuffer(*mComm, ptr512KB);
|
||||
|
||||
// Request 768KB - should reuse 1MB (best fit, smallest that fits)
|
||||
auto buffer768KB = allocator.requestBuffer(*mComm, 768 * 1024);
|
||||
EXPECT_TRUE(buffer768KB.isValid());
|
||||
EXPECT_EQ(buffer768KB.ptr, ptr1MB); // Should reuse 1MB buffer
|
||||
EXPECT_EQ(buffer768KB.size, 1024 * 1024); // Original size
|
||||
|
||||
allocator.releaseBuffer(*mComm, buffer768KB.ptr);
|
||||
}
|
||||
|
||||
TEST_F(NCCLWindowAllocatorTest, MultipleBuffers)
|
||||
{
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
|
||||
const size_t bufferSize = 256 * 1024;
|
||||
std::vector<void*> ptrs;
|
||||
|
||||
// Allocate multiple buffers
|
||||
for (int i = 0; i < 5; ++i)
|
||||
{
|
||||
auto buffer = allocator.requestBuffer(*mComm, bufferSize);
|
||||
EXPECT_TRUE(buffer.isValid());
|
||||
ptrs.push_back(buffer.ptr);
|
||||
}
|
||||
|
||||
EXPECT_EQ(allocator.getBufferCount(*mComm), 5);
|
||||
EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 5);
|
||||
|
||||
// Release all
|
||||
for (auto* ptr : ptrs)
|
||||
{
|
||||
allocator.releaseBuffer(*mComm, ptr);
|
||||
}
|
||||
|
||||
EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 0);
|
||||
EXPECT_EQ(allocator.getBufferCount(*mComm), 5); // Buffers still exist, just not in use
|
||||
}
|
||||
|
||||
TEST_F(NCCLWindowAllocatorTest, SearchBuffer)
|
||||
{
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
|
||||
const size_t bufferSize = 128 * 1024;
|
||||
auto buffer = allocator.requestBuffer(*mComm, bufferSize);
|
||||
|
||||
// Test searchBuffer
|
||||
auto found = allocator.searchBuffer(*mComm, buffer.ptr);
|
||||
EXPECT_TRUE(found.isValid());
|
||||
EXPECT_EQ(found.ptr, buffer.ptr);
|
||||
// Compare against actual allocated size (ncclMemAlloc may allocate more than requested)
|
||||
EXPECT_EQ(found.size, buffer.size);
|
||||
EXPECT_GE(found.size, bufferSize); // At least the requested size
|
||||
|
||||
// Test search for non-existent buffer
|
||||
void* fakePtr = reinterpret_cast<void*>(0xDEADBEEF);
|
||||
auto notFound = allocator.searchBuffer(*mComm, fakePtr);
|
||||
EXPECT_FALSE(notFound.isValid());
|
||||
|
||||
allocator.releaseBuffer(*mComm, buffer.ptr);
|
||||
}
|
||||
|
||||
TEST_F(NCCLWindowAllocatorTest, GetWindowAndSize)
|
||||
{
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
|
||||
const size_t bufferSize = 64 * 1024;
|
||||
auto buffer = allocator.requestBuffer(*mComm, bufferSize);
|
||||
|
||||
// Test getWindow
|
||||
auto window = allocator.getWindow(*mComm, buffer.ptr);
|
||||
EXPECT_NE(window, nullptr);
|
||||
EXPECT_EQ(window, buffer.window);
|
||||
|
||||
// Test getSize - compare against actual allocated size (ncclMemAlloc may allocate more than requested)
|
||||
auto size = allocator.getSize(*mComm, buffer.ptr);
|
||||
EXPECT_EQ(size, buffer.size);
|
||||
EXPECT_GE(size, bufferSize); // At least the requested size
|
||||
|
||||
// Test with invalid pointer
|
||||
void* fakePtr = reinterpret_cast<void*>(0xDEADBEEF);
|
||||
EXPECT_EQ(allocator.getWindow(*mComm, fakePtr), nullptr);
|
||||
EXPECT_EQ(allocator.getSize(*mComm, fakePtr), 0);
|
||||
|
||||
allocator.releaseBuffer(*mComm, buffer.ptr);
|
||||
}
|
||||
|
||||
TEST_F(NCCLWindowAllocatorTest, GetBufferInfo)
|
||||
{
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
|
||||
const size_t bufferSize = 32 * 1024;
|
||||
auto buffer = allocator.requestBuffer(*mComm, bufferSize);
|
||||
|
||||
auto info = allocator.getBufferInfo(*mComm, buffer.ptr);
|
||||
EXPECT_TRUE(info.isValid());
|
||||
EXPECT_EQ(info.ptr, buffer.ptr);
|
||||
EXPECT_EQ(info.size, buffer.size);
|
||||
EXPECT_EQ(info.handle, buffer.handle);
|
||||
EXPECT_EQ(info.window, buffer.window);
|
||||
|
||||
allocator.releaseBuffer(*mComm, buffer.ptr);
|
||||
}
|
||||
|
||||
TEST_F(NCCLWindowAllocatorTest, ScopedBuffer)
|
||||
{
|
||||
const size_t bufferSize = 16 * 1024;
|
||||
|
||||
{
|
||||
nccl_util::ScopedNCCLWindowBuffer scopedBuffer(*mComm, bufferSize);
|
||||
EXPECT_TRUE(scopedBuffer.getBuffer().isValid());
|
||||
EXPECT_NE(scopedBuffer.getPtr(), nullptr);
|
||||
// Compare against actual allocated size (ncclMemAlloc may allocate more than requested)
|
||||
EXPECT_EQ(scopedBuffer.getSize(), scopedBuffer.getBuffer().size);
|
||||
EXPECT_GE(scopedBuffer.getSize(), bufferSize); // At least the requested size
|
||||
EXPECT_NE(scopedBuffer.getWindow(), nullptr);
|
||||
|
||||
// Buffer should be in use
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 1);
|
||||
}
|
||||
|
||||
// Buffer should be released when scoped buffer goes out of scope
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 0);
|
||||
}
|
||||
|
||||
TEST_F(NCCLWindowAllocatorTest, CleanupOnCommDestroy)
|
||||
{
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
|
||||
// Create a separate comm using split for this test
|
||||
auto testComm = createSplitComm(*mComm, 0, mRank);
|
||||
|
||||
// Store the raw comm value before destruction
|
||||
ncclComm_t rawComm = *testComm;
|
||||
|
||||
// Allocate some buffers
|
||||
const size_t bufferSize = 8 * 1024;
|
||||
auto buffer1 = allocator.requestBuffer(*testComm, bufferSize);
|
||||
auto buffer2 = allocator.requestBuffer(*testComm, bufferSize * 2);
|
||||
|
||||
EXPECT_EQ(allocator.getBufferCount(*testComm), 2);
|
||||
EXPECT_EQ(allocator.getBufferInUseCount(*testComm), 2);
|
||||
|
||||
// Verify buffers are valid
|
||||
EXPECT_TRUE(buffer1.isValid());
|
||||
EXPECT_TRUE(buffer2.isValid());
|
||||
|
||||
// Manually release buffers before cleanup to avoid warnings
|
||||
allocator.releaseBuffer(*testComm, buffer1.ptr);
|
||||
allocator.releaseBuffer(*testComm, buffer2.ptr);
|
||||
|
||||
// Verify buffers are released but still exist in pool
|
||||
EXPECT_EQ(allocator.getBufferInUseCount(*testComm), 0);
|
||||
EXPECT_EQ(allocator.getBufferCount(*testComm), 2); // Buffers still exist, just not in use
|
||||
|
||||
// Destroy the communicator - buffers should be cleaned up automatically
|
||||
testComm.reset();
|
||||
|
||||
// Verify cleanup: check that the old comm (now destroyed) no longer has buffers
|
||||
// Note: The comm is destroyed, but we can still check the allocator's internal state
|
||||
// The cleanup should have removed all buffers for this comm
|
||||
EXPECT_EQ(allocator.getBufferCount(rawComm), 0);
|
||||
EXPECT_EQ(allocator.getBufferInUseCount(rawComm), 0);
|
||||
// Note: isCommValid only checks for null, not cleaned-up state, because NCCL can reuse addresses
|
||||
// The real check is that buffers are gone, which we verify above
|
||||
}
|
||||
|
||||
TEST_F(NCCLWindowAllocatorTest, CommValidity)
|
||||
{
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
|
||||
// Valid comm should be valid
|
||||
EXPECT_TRUE(allocator.isCommValid(*mComm));
|
||||
|
||||
// Null comm should be invalid
|
||||
EXPECT_FALSE(allocator.isCommValid(nullptr));
|
||||
}
|
||||
|
||||
//==============================================================================
|
||||
// Integration Tests
|
||||
//==============================================================================
|
||||
|
||||
TEST_F(NCCLWindowAllocatorTest, MultipleComms)
|
||||
{
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
|
||||
// Create two different communicators using split (different colors)
|
||||
auto comm1 = createSplitComm(*mComm, 0, mRank);
|
||||
auto comm2 = createSplitComm(*mComm, 1, mRank);
|
||||
|
||||
const size_t bufferSize = 4 * 1024;
|
||||
|
||||
// Allocate buffers from both comms
|
||||
auto buffer1 = allocator.requestBuffer(*comm1, bufferSize);
|
||||
auto buffer2 = allocator.requestBuffer(*comm2, bufferSize);
|
||||
|
||||
EXPECT_TRUE(buffer1.isValid());
|
||||
EXPECT_TRUE(buffer2.isValid());
|
||||
|
||||
// Buffers should be tracked separately per comm
|
||||
EXPECT_EQ(allocator.getBufferCount(*comm1), 1);
|
||||
EXPECT_EQ(allocator.getBufferCount(*comm2), 1);
|
||||
EXPECT_NE(buffer1.ptr, buffer2.ptr); // Different buffers from different comms
|
||||
|
||||
allocator.releaseBuffer(*comm1, buffer1.ptr);
|
||||
allocator.releaseBuffer(*comm2, buffer2.ptr);
|
||||
|
||||
// Clean up comms
|
||||
comm1.reset();
|
||||
comm2.reset();
|
||||
}
|
||||
|
||||
#if ENABLE_MULTI_DEVICE && BUILD_PYT
|
||||
//==============================================================================
|
||||
// createNCCLWindowTensor Tests
|
||||
//==============================================================================
|
||||
|
||||
class CreateNCCLWindowTensorTest : public ::testing::Test
|
||||
{
|
||||
protected:
|
||||
void SetUp() override
|
||||
{
|
||||
auto& comm = mpi::MpiComm::world();
|
||||
mWorldSize = comm.getSize();
|
||||
mRank = comm.getRank();
|
||||
|
||||
if (mWorldSize < 2)
|
||||
{
|
||||
GTEST_SKIP() << "Requires at least 2 ranks (got " << mWorldSize << ")";
|
||||
}
|
||||
|
||||
// Set CUDA device for this rank (required before NCCL initialization)
|
||||
int deviceCount = 0;
|
||||
TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
|
||||
if (deviceCount > 0)
|
||||
{
|
||||
int deviceId = mRank % deviceCount;
|
||||
TLLM_CUDA_CHECK(cudaSetDevice(deviceId));
|
||||
}
|
||||
|
||||
// Check if NCCL symmetric is supported
|
||||
auto& ncclHelper = nccl_util::NCCLHelper::getInstance();
|
||||
if (!ncclHelper.isLoaded())
|
||||
{
|
||||
GTEST_SKIP() << "NCCL library with symmetric memory support is not available";
|
||||
}
|
||||
|
||||
std::set<int> group;
|
||||
for (int i = 0; i < mWorldSize; ++i)
|
||||
{
|
||||
group.insert(i);
|
||||
}
|
||||
mComm = getComm(group);
|
||||
}
|
||||
|
||||
void TearDown() override
|
||||
{
|
||||
mComm.reset();
|
||||
}
|
||||
|
||||
int mWorldSize;
|
||||
int mRank;
|
||||
std::shared_ptr<ncclComm_t> mComm;
|
||||
};
|
||||
|
||||
TEST_F(CreateNCCLWindowTensorTest, BasicTensorCreation)
|
||||
{
|
||||
using nccl_util::createNCCLWindowTensor;
|
||||
|
||||
// Create a tensor with shape [4, 8] and float32 dtype
|
||||
std::vector<int64_t> shape = {4, 8};
|
||||
auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32);
|
||||
|
||||
// Verify tensor properties
|
||||
EXPECT_TRUE(tensor.defined());
|
||||
EXPECT_EQ(tensor.dtype(), torch::kFloat32);
|
||||
EXPECT_EQ(tensor.device().type(), torch::kCUDA);
|
||||
EXPECT_EQ(tensor.dim(), 2);
|
||||
EXPECT_EQ(tensor.size(0), 4);
|
||||
EXPECT_EQ(tensor.size(1), 8);
|
||||
EXPECT_EQ(tensor.numel(), 4 * 8);
|
||||
|
||||
// Verify buffer properties
|
||||
EXPECT_TRUE(buffer.isValid());
|
||||
EXPECT_NE(buffer.ptr, nullptr);
|
||||
// ncclMemAlloc may allocate more than requested, so check at least the requested size
|
||||
EXPECT_GE(buffer.size, 4 * 8 * sizeof(float));
|
||||
EXPECT_NE(buffer.window, nullptr);
|
||||
|
||||
// Verify tensor data pointer matches buffer pointer
|
||||
EXPECT_EQ(tensor.data_ptr(), buffer.ptr);
|
||||
|
||||
// Tensor should be in use
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 1);
|
||||
}
|
||||
|
||||
TEST_F(CreateNCCLWindowTensorTest, DifferentDtypes)
|
||||
{
|
||||
using nccl_util::createNCCLWindowTensor;
|
||||
|
||||
std::vector<int64_t> shape = {10};
|
||||
|
||||
// Test float32
|
||||
{
|
||||
auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32);
|
||||
EXPECT_EQ(tensor.dtype(), torch::kFloat32);
|
||||
// ncclMemAlloc may allocate more than requested, so check at least the requested size
|
||||
EXPECT_GE(buffer.size, 10 * sizeof(float));
|
||||
EXPECT_EQ(tensor.data_ptr(), buffer.ptr);
|
||||
}
|
||||
|
||||
// Test float16
|
||||
{
|
||||
auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat16);
|
||||
EXPECT_EQ(tensor.dtype(), torch::kFloat16);
|
||||
// ncclMemAlloc may allocate more than requested, so check at least the requested size
|
||||
EXPECT_GE(buffer.size, 10 * sizeof(at::Half));
|
||||
EXPECT_EQ(tensor.data_ptr(), buffer.ptr);
|
||||
}
|
||||
|
||||
// Test int32
|
||||
{
|
||||
auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kInt32);
|
||||
EXPECT_EQ(tensor.dtype(), torch::kInt32);
|
||||
// ncclMemAlloc may allocate more than requested, so check at least the requested size
|
||||
EXPECT_GE(buffer.size, 10 * sizeof(int32_t));
|
||||
EXPECT_EQ(tensor.data_ptr(), buffer.ptr);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(CreateNCCLWindowTensorTest, DifferentShapes)
|
||||
{
|
||||
using nccl_util::createNCCLWindowTensor;
|
||||
|
||||
// 1D tensor
|
||||
{
|
||||
std::vector<int64_t> shape = {100};
|
||||
auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32);
|
||||
EXPECT_EQ(tensor.dim(), 1);
|
||||
EXPECT_EQ(tensor.size(0), 100);
|
||||
// ncclMemAlloc may allocate more than requested, so check at least the requested size
|
||||
EXPECT_GE(buffer.size, 100 * sizeof(float));
|
||||
}
|
||||
|
||||
// 3D tensor
|
||||
{
|
||||
std::vector<int64_t> shape = {2, 3, 4};
|
||||
auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32);
|
||||
EXPECT_EQ(tensor.dim(), 3);
|
||||
EXPECT_EQ(tensor.size(0), 2);
|
||||
EXPECT_EQ(tensor.size(1), 3);
|
||||
EXPECT_EQ(tensor.size(2), 4);
|
||||
// ncclMemAlloc may allocate more than requested, so check at least the requested size
|
||||
EXPECT_GE(buffer.size, 2 * 3 * 4 * sizeof(float));
|
||||
}
|
||||
|
||||
// 4D tensor
|
||||
{
|
||||
std::vector<int64_t> shape = {1, 2, 3, 4};
|
||||
auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32);
|
||||
EXPECT_EQ(tensor.dim(), 4);
|
||||
EXPECT_EQ(tensor.numel(), 1 * 2 * 3 * 4);
|
||||
// ncclMemAlloc may allocate more than requested, so check at least the requested size
|
||||
EXPECT_GE(buffer.size, 1 * 2 * 3 * 4 * sizeof(float));
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(CreateNCCLWindowTensorTest, TensorDeleterReleasesBuffer)
|
||||
{
|
||||
using nccl_util::createNCCLWindowTensor;
|
||||
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
|
||||
{
|
||||
std::vector<int64_t> shape = {16, 16};
|
||||
auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32);
|
||||
|
||||
EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 1);
|
||||
EXPECT_TRUE(buffer.isValid());
|
||||
void* bufferPtr = buffer.ptr;
|
||||
|
||||
// Tensor goes out of scope - deleter should release the buffer
|
||||
}
|
||||
|
||||
// Buffer should be released (not in use anymore)
|
||||
EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 0);
|
||||
|
||||
// Buffer should still exist in the pool (for reuse)
|
||||
EXPECT_GE(allocator.getBufferCount(*mComm), 1);
|
||||
}
|
||||
|
||||
TEST_F(CreateNCCLWindowTensorTest, MultipleTensors)
|
||||
{
|
||||
using nccl_util::createNCCLWindowTensor;
|
||||
|
||||
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
|
||||
|
||||
std::vector<int64_t> shape = {8, 8};
|
||||
auto [tensor1, buffer1] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32);
|
||||
auto [tensor2, buffer2] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32);
|
||||
auto [tensor3, buffer3] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32);
|
||||
|
||||
EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 3);
|
||||
EXPECT_NE(buffer1.ptr, buffer2.ptr);
|
||||
EXPECT_NE(buffer2.ptr, buffer3.ptr);
|
||||
EXPECT_NE(buffer1.ptr, buffer3.ptr);
|
||||
|
||||
// All tensors should be valid
|
||||
EXPECT_TRUE(tensor1.defined());
|
||||
EXPECT_TRUE(tensor2.defined());
|
||||
EXPECT_TRUE(tensor3.defined());
|
||||
}
|
||||
|
||||
TEST_F(CreateNCCLWindowTensorTest, TensorStrides)
|
||||
{
|
||||
using nccl_util::createNCCLWindowTensor;
|
||||
|
||||
std::vector<int64_t> shape = {3, 4, 5};
|
||||
auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32);
|
||||
|
||||
// Verify strides are correct (row-major order)
|
||||
EXPECT_EQ(tensor.stride(0), 4 * 5); // stride for first dimension
|
||||
EXPECT_EQ(tensor.stride(1), 5); // stride for second dimension
|
||||
EXPECT_EQ(tensor.stride(2), 1); // stride for third dimension
|
||||
}
|
||||
|
||||
#endif // ENABLE_MULTI_DEVICE && BUILD_PYT
|
||||
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
@ -46,7 +46,7 @@ In this third blog of our scaling Expert Parallelism (EP) series, we push the pe
|
||||
|
||||
The wo GEMM is the final linear layer within the multi-head attention block that produces the final outputs. While DeepSeek R1's MLA modifies the initial projections for keys and values, the wo GEMM operator remains a critical and standard component for finalizing the attention computation. In this term, "wo" stands for the weight matrix of the output projection.
|
||||
|
||||
We've evaluated that quantizing the wo GEMM to FP4 still satisfies the accuracy requirements, maintaining a similar MTP accept rate (AR) while improving end-to-end performance. The [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) team has published checkpoints that additionally quantize the wo module in attention layers to FP4 on HuggingFace:
|
||||
We've evaluated that quantizing the wo GEMM to FP4 still satisfies the accuracy requirements, maintaining a similar MTP accept rate (AR) while improving end-to-end performance. The [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) team has published checkpoints that additionally quantize the wo module in attention layers to FP4 on HuggingFace:
|
||||
* https://huggingface.co/nvidia/DeepSeek-R1-FP4-v2
|
||||
* https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2
|
||||
|
||||
|
||||
@ -67,7 +67,7 @@ We have explored a mixed precision recipe, which provides a better tradeoff betw
|
||||
|
||||
*TensorRT LLM already supports [FP8 Attention](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/deepseek_v3#fp8-kv-cache-and-mla), but for this latency scenario low-precision attention computation does not help performance, so we choose bf16 precision for the attention modules.
|
||||
|
||||
** nvfp4 model checkpoint is generated by the [NVIDIA TensorRT Model Optimizer toolkit](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
|
||||
** nvfp4 model checkpoint is generated by the [NVIDIA Model Optimizer toolkit](https://github.com/NVIDIA/Model-Optimizer).
|
||||
|
||||
*** RouterGEMM uses bf16 inputs/weights with fp32 outputs for numerical stability
|
||||
|
||||
|
||||
@ -29,7 +29,7 @@ The mixed precision recipe for DeepSeek R1 throughput scenario is almost the sam
|
||||
* FP8 KV cache and FP8 attention, rather than BF16 precision.
|
||||
* FP4 Allgather for better communication bandwidth utilization.
|
||||
|
||||
The checkpoint used in this blog is hosted in [nvidia/DeepSeek-R1-FP4](https://huggingface.co/nvidia/DeepSeek-R1-FP4), generated by [NVIDIA Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). The accuracy scores on common datasets for this FP4 checkpoint with the TensorRT LLM implementation are:
The checkpoint used in this blog is hosted in [nvidia/DeepSeek-R1-FP4](https://huggingface.co/nvidia/DeepSeek-R1-FP4), generated by [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer). The accuracy scores on common datasets for this FP4 checkpoint with the TensorRT LLM implementation are:
|
||||
|
||||
| Precision | GPQA Diamond | MATH-500 |
|
||||
| :-- | :-- | :-- |
|
||||
|
||||
@ -423,10 +423,10 @@ checkpoint. For the Llama-3.1 models, TensorRT LLM provides the following checkp
|
||||
- [`nvidia/Llama-3.1-70B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-70B-Instruct-FP8)
|
||||
- [`nvidia/Llama-3.1-405B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-405B-Instruct-FP8)
|
||||
|
||||
To understand more about how to quantize your own checkpoints, refer to ModelOpt [documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/deployment/1_tensorrt_llm.html).
|
||||
To understand more about how to quantize your own checkpoints, refer to ModelOpt [documentation](https://nvidia.github.io/Model-Optimizer/deployment/1_tensorrt_llm.html).
|
||||
|
||||
`trtllm-bench` utilizes the `hf_quant_config.json` file present in the pre-quantized checkpoints above. The configuration
|
||||
file is present in checkpoints quantized with [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer)
|
||||
file is present in checkpoints quantized with [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer)
|
||||
and describes the compute and KV cache quantization that checkpoint was compiled with. For example, from the checkpoints
|
||||
above:
|
||||
|
||||
|
||||
@ -21,7 +21,7 @@ and shows the throughput scenario under maximum load. The reported metric is `To
|
||||
|
||||
The performance numbers below were collected using the steps described in this document.
|
||||
|
||||
Testing was performed on models with weights quantized using [ModelOpt](https://nvidia.github.io/TensorRT-Model-Optimizer/#) and published by NVIDIA on the [Model Optimizer HuggingFace Collection](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4).
|
||||
Testing was performed on models with weights quantized using [ModelOpt](https://nvidia.github.io/Model-Optimizer/#) and published by NVIDIA on the [Model Optimizer HuggingFace Collection](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4).
|
||||
|
||||
*(NEW for v1.0) RTX 6000 Pro Blackwell Server Edition Benchmarks:*
|
||||
|
||||
|
||||
@ -120,7 +120,7 @@ Optimize attention operations with different attention kernel implementations:
|
||||
|
||||
### Precision Support
|
||||
|
||||
AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`TensorRT-Model-Optimizer`](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
|
||||
AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`Model-Optimizer`](https://github.com/NVIDIA/Model-Optimizer).
|
||||
|
||||
**Supported precision types include:**
|
||||
|
||||
|
||||
@ -23,7 +23,7 @@ The default PyTorch backend supports FP4 and FP8 quantization on the latest Blac
|
||||
|
||||
### Running Pre-quantized Models
|
||||
|
||||
TensorRT LLM can directly run [pre-quantized models](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4) generated with the [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
|
||||
TensorRT LLM can directly run [pre-quantized models](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4) generated with the [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer).
|
||||
|
||||
```python
|
||||
from tensorrt_llm import LLM
|
||||
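# A minimal sketch of the rest of this example (assumed checkpoint name; any entry
# from the ModelOpt Hugging Face collection linked above follows the same pattern):
llm = LLM(model="nvidia/Llama-3.1-8B-Instruct-FP8")
llm.generate("Hello, my name is")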
@ -54,8 +54,8 @@ If a pre-quantized model is not available on the [Hugging Face Hub](https://hugg
|
||||
Follow this step-by-step guide to quantize a model:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git
|
||||
cd TensorRT-Model-Optimizer/examples/llm_ptq
|
||||
git clone https://github.com/NVIDIA/Model-Optimizer.git
|
||||
cd Model-Optimizer/examples/llm_ptq
|
||||
scripts/huggingface_example.sh --model <huggingface_model_card> --quant fp8 --export_fmt hf
|
||||
```
|
||||
|
||||
@ -108,4 +108,4 @@ FP8 block wise scaling GEMM kernels for sm100 are using MXFP8 recipe (E4M3 act/w
|
||||
## Quick Links
|
||||
|
||||
- [Pre-quantized Models by ModelOpt](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4)
|
||||
- [ModelOpt Support Matrix](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/0_support_matrix.html)
|
||||
- [ModelOpt Support Matrix](https://nvidia.github.io/Model-Optimizer/guides/0_support_matrix.html)
|
||||
|
||||
@ -662,7 +662,7 @@ checkpoint. For the Llama-3.1 models, TensorRT-LLM provides the following checkp
|
||||
- [`nvidia/Llama-3.1-405B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-405B-Instruct-FP8)
|
||||
|
||||
`trtllm-bench` utilizes the `hf_quant_config.json` file present in the pre-quantized checkpoints above. The configuration
|
||||
file is present in checkpoints quantized with [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer)
|
||||
file is present in checkpoints quantized with [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer)
|
||||
and describes the compute and KV cache quantization that checkpoint was compiled with. For example, from the checkpoints
|
||||
above:
|
||||
|
||||
|
||||
@ -118,7 +118,7 @@ Optimize attention operations with different attention kernel implementations:
|
||||
|
||||
### Precision Support
|
||||
|
||||
AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`TensorRT-Model-Optimizer`](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
|
||||
AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`Model-Optimizer`](https://github.com/NVIDIA/Model-Optimizer).
|
||||
|
||||
**Supported precision types include:**
|
||||
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
# Quantization
|
||||
|
||||
The PyTorch backend supports FP8 and NVFP4 quantization. You can pass quantized models in HF model hub,
|
||||
which are generated by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
|
||||
which are generated by [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer).
|
||||
|
||||
```python
|
||||
from tensorrt_llm._torch import LLM
|
||||
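# Minimal sketch (assumed checkpoint name): load a Model Optimizer FP8 checkpoint
# from the HF model hub with the PyTorch-backend LLM class imported above.
llm = LLM(model="nvidia/Llama-3.1-8B-Instruct-FP8")
llm.generate("Hello, my name is")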
@ -12,7 +12,7 @@ llm.generate("Hello, my name is")
|
||||
Or you can run the following commands to quantize a model yourself:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git
|
||||
cd TensorRT-Model-Optimizer/examples/llm_ptq
|
||||
git clone https://github.com/NVIDIA/Model-Optimizer.git
|
||||
cd Model-Optimizer/examples/llm_ptq
|
||||
scripts/huggingface_example.sh --model <huggingface_model_card> --quant fp8 --export_fmt hf
|
||||
```
|
||||
|
||||
@ -90,16 +90,16 @@ python lm_eval_ad.py \
|
||||
--model autodeploy --model_args model=meta-llama/Meta-Llama-3.1-8B-Instruct,world_size=2 --tasks mmlu
|
||||
```
|
||||
|
||||
### Mixed-precision Quantization using TensorRT Model Optimizer
|
||||
### Mixed-precision Quantization using Model Optimizer
|
||||
|
||||
The TensorRT Model Optimizer [AutoQuantize](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) algorithm is a PTQ algorithm from ModelOpt that quantizes a model by searching for the best quantization format per layer while meeting the performance constraint specified by the user. This way, `AutoQuantize` lets you trade off model accuracy for performance.
|
||||
The Model Optimizer [AutoQuantize](https://nvidia.github.io/Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) algorithm is a PTQ algorithm from ModelOpt that quantizes a model by searching for the best quantization format per layer while meeting the performance constraint specified by the user. This way, `AutoQuantize` lets you trade off model accuracy for performance.
|
||||
|
||||
Currently `AutoQuantize` supports only `effective_bits` as the performance constraint (for both weight-only quantization and weight & activation quantization). See
|
||||
[AutoQuantize documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) for more details.
|
||||
[AutoQuantize documentation](https://nvidia.github.io/Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) for more details.
|
||||
|
||||
#### 1. Quantize a model with ModelOpt
|
||||
|
||||
Refer to [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/examples/llm_autodeploy/README.md) for generating a quantized model checkpoint.
|
||||
Refer to [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer/blob/main/examples/llm_autodeploy/README.md) for generating a quantized model checkpoint.
|
||||
|
||||
#### 2. Deploy the quantized model with AutoDeploy
|
||||
|
||||
|
||||
@ -212,7 +212,7 @@ In disaggregated serving, the context workers and generation workers have differ
|
||||
### Prerequisites
|
||||
|
||||
To enable mixed precision serving, you will need:
|
||||
1. A quantized checkpoint created with [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer)
|
||||
1. A quantized checkpoint created with [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer)
|
||||
2. The original unquantized checkpoint (Can also be quantized)
|
||||
3. Both checkpoints must use the same KV cache dtype to ensure compatibility during transfer
|
||||
|
||||
|
||||
@ -29,7 +29,7 @@ def run_medusa_decoding(use_modelopt_ckpt=False, model_dir=None):
|
||||
llm_kwargs = {}
|
||||
|
||||
if use_modelopt_ckpt:
|
||||
# This is a Llama-3.1-8B combined with Medusa heads provided by TensorRT Model Optimizer.
|
||||
# This is a Llama-3.1-8B combined with Medusa heads provided by Model Optimizer.
|
||||
# Both the base model (except lm_head) and Medusa heads have been quantized in FP8.
|
||||
model = model_dir or "nvidia/Llama-3.1-8B-Medusa-FP8"
|
||||
|
||||
@ -85,7 +85,7 @@ if __name__ == '__main__':
|
||||
parser.add_argument(
|
||||
'--use_modelopt_ckpt',
|
||||
action='store_true',
|
||||
help="Use FP8-quantized checkpoint from TensorRT Model Optimizer.")
|
||||
help="Use FP8-quantized checkpoint from Model Optimizer.")
|
||||
# TODO: remove this arg after ModelOpt ckpt is public on HF
|
||||
parser.add_argument('--model_dir', type=Path, default=None)
|
||||
args = parser.parse_args()
|
||||
|
||||
@ -9,7 +9,7 @@ def main():
|
||||
build_config.max_num_tokens = 1024
|
||||
|
||||
# Model could accept HF model name, a path to local HF model,
|
||||
# or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
|
||||
# or Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
|
||||
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||
build_config=build_config)
|
||||
|
||||
|
||||
@ -7,7 +7,7 @@ from tensorrt_llm import LLM, SamplingParams
|
||||
def main():
|
||||
|
||||
# Model could accept HF model name, a path to local HF model,
|
||||
# or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
|
||||
# or Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
|
||||
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
|
||||
|
||||
# Sample prompts.
|
||||
|
||||
@ -4,7 +4,7 @@ from tensorrt_llm import LLM, SamplingParams
|
||||
def main():
|
||||
|
||||
# Model could accept HF model name, a path to local HF model,
|
||||
# or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
|
||||
# or Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
|
||||
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
|
||||
|
||||
# Sample prompts.
|
||||
|
||||
@ -19,7 +19,7 @@ For more info about Medusa visit [speculative decoding documentation](https://nv
|
||||
The TensorRT LLM Medusa example code is located in [`examples/medusa`](./). There is one [`convert_checkpoint.py`](./convert_checkpoint.py) file to convert and build the [TensorRT](https://developer.nvidia.com/tensorrt) engine(s) needed to run models with Medusa decoding support.
|
||||
In this example, we demonstrate the usage of two models:
|
||||
1. The Vicuna 7B model from Hugging Face [`FasterDecoding/medusa-vicuna-7b-v1.3`](https://huggingface.co/FasterDecoding/medusa-vicuna-7b-v1.3) with its Medusa heads [`medusa-vicuna-7b-v1.3`](https://huggingface.co/FasterDecoding/medusa-vicuna-7b-v1.3).
|
||||
2. The quantized checkpoint [`nvidia/Llama-3.1-8B-Medusa-FP8`](https://huggingface.co/nvidia/Llama-3.1-8B-Medusa-FP8) on Hugging Face by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) (ModelOpt). This model is based on [Llama-3.1 8B](https://huggingface.co/meta-llama/Llama-3.1-8B) and enhanced with Medusa heads, with both the base model (except lm_head) and Medusa heads already quantized in FP8.
|
||||
2. The quantized checkpoint [`nvidia/Llama-3.1-8B-Medusa-FP8`](https://huggingface.co/nvidia/Llama-3.1-8B-Medusa-FP8) on Hugging Face by [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) (ModelOpt). This model is based on [Llama-3.1 8B](https://huggingface.co/meta-llama/Llama-3.1-8B) and enhanced with Medusa heads, with both the base model (except lm_head) and Medusa heads already quantized in FP8.
|
||||
|
||||
### Build TensorRT engine(s)
|
||||
Get the weights by downloading base model [`vicuna-7b-v1.3`](https://huggingface.co/lmsys/vicuna-7b-v1.3) and Medusa Heads [`medusa-vicuna-7b-v1.3`](https://huggingface.co/FasterDecoding/medusa-vicuna-7b-v1.3) from HF.
|
||||
|
||||
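One way to fetch both sets of weights is through the `huggingface_hub` Python API; a minimal sketch (the local directory names are arbitrary choices):

```python
from huggingface_hub import snapshot_download

# Download the Vicuna base model and the Medusa heads referenced above.
snapshot_download(repo_id="lmsys/vicuna-7b-v1.3", local_dir="vicuna-7b-v1.3")
snapshot_download(repo_id="FasterDecoding/medusa-vicuna-7b-v1.3",
                  local_dir="medusa-vicuna-7b-v1.3")
```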
@ -773,7 +773,7 @@ You can enable FP8 MLA through either of these methods:
|
||||
|
||||
**Option 1: Checkpoint config**
|
||||
|
||||
TensorRT LLM automatically detects the `hf_quant_config.json` file in the model directory, which configures both GEMM and KV cache quantization. For example, see the FP4 DeepSeek-R1 checkpoint [configuration](https://huggingface.co/nvidia/DeepSeek-R1-FP4/blob/main/hf_quant_config.json) provided by [ModelOpt](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
|
||||
TensorRT LLM automatically detects the `hf_quant_config.json` file in the model directory, which configures both GEMM and KV cache quantization. For example, see the FP4 DeepSeek-R1 checkpoint [configuration](https://huggingface.co/nvidia/DeepSeek-R1-FP4/blob/main/hf_quant_config.json) provided by [ModelOpt](https://github.com/NVIDIA/Model-Optimizer).
|
||||
|
||||
To enable FP8 MLA, modify the `kv_cache_quant_algo` property. The following shows the config for DeepSeek's block-wise FP8 GEMM quantization + FP8 MLA:
|
||||
|
||||
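An indicative sketch of the relevant fields, written here as a Python dict (the actual `hf_quant_config.json` is JSON, and the algorithm name `FP8_BLOCK_SCALES` is an assumption to verify against the linked checkpoint config):

```python
# Indicative quantization section for block-wise FP8 GEMM + FP8 MLA (KV cache).
hf_quant_config = {
    "quantization": {
        "quant_algo": "FP8_BLOCK_SCALES",  # assumed name for DeepSeek block-wise FP8 GEMM
        "kv_cache_quant_algo": "FP8",      # FP8 MLA / FP8 KV cache
    }
}
```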
@ -808,14 +808,14 @@ Or you can follow the steps to generate one by yourselves.
|
||||
|
||||
#### Activation calibration
|
||||
|
||||
[ModelOpt](https://github.com/NVIDIA/TensorRT-Model-Optimizer) is used for calibrating activations of MoE layers. We provide a calibrated file at [HF model hub](https://huggingface.co/Barrrrry/DeepSeek-R1-W4AFP8/blob/main/act_scales.safetensors) or you can run the following commands to generate by yourselves.
|
||||
[ModelOpt](https://github.com/NVIDIA/Model-Optimizer) is used for calibrating activations of MoE layers. We provide a calibrated file at [HF model hub](https://huggingface.co/Barrrrry/DeepSeek-R1-W4AFP8/blob/main/act_scales.safetensors) or you can run the following commands to generate by yourselves.
|
||||
|
||||
```bash
|
||||
# Make sure there are enough GPU resources (8x H200) to run the following commands
|
||||
PATH_OF_DEEPSEEK_R1=/llm-models/DeepSeek-R1/DeepSeek-R1
|
||||
|
||||
# Install ModelOpt from source
|
||||
git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git && cd TensorRT-Model-Optimizer
|
||||
git clone https://github.com/NVIDIA/Model-Optimizer.git && cd Model-Optimizer
|
||||
pip install "nvidia-modelopt[all]" -U --extra-index-url https://pypi.nvidia.com
|
||||
|
||||
# Clone DeepSeek-V3 (base model of R1) Github repository for FP8 inference,
|
||||
|
||||
@ -85,17 +85,17 @@ The output will be like:
|
||||
|
||||
#### PyTorch flow Quantization
|
||||
|
||||
For PyTorch flow, TRT-LLM supports quantized format generated by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
|
||||
For PyTorch flow, TRT-LLM supports quantized format generated by [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer).
|
||||
|
||||
You can either use pre-quantized models from the HF model hub, or generate a quantized model yourself and then run it with the command below:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git
|
||||
cd TensorRT-Model-Optimizer/examples/llm_ptq
|
||||
git clone https://github.com/NVIDIA/Model-Optimizer.git
|
||||
cd Model-Optimizer/examples/llm_ptq
|
||||
scripts/huggingface_example.sh --model hf_models/$MODEL_NAME --quant fp8 --export_fmt hf
|
||||
```
|
||||
|
||||
For more information, please refer to the official [docs](https://github.com/NVIDIA/TensorRT-Model-Optimizer) or the [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) repository.
|
||||
For more information, please refer to the official [docs](https://github.com/NVIDIA/Model-Optimizer) or the [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) repository.
|
||||
|
||||
Troubleshooting
|
||||
|
||||
@ -107,7 +107,7 @@ Hint: Move the offending context manager(s) to outside the compiled region.
|
||||
Hint: This graph break may have been caused by an earlier graph break. Resolving the earlier graph break may resolve this one.
|
||||
```
|
||||
|
||||
This error may indicate an incompatibility between `torch.compile()` and the `HybridCache` module of the transformers library. As a result, [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) (ModelOpt) cannot perform PTQ with HybridCache.
|
||||
This error may indicate an incompatibility between `torch.compile()` and the `HybridCache` module of the transformers library. As a result, [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) (ModelOpt) cannot perform PTQ with HybridCache.
|
||||
|
||||
Temporarily switching to `DynamicCache` when creating PTQ models could help address the issue. This can be done by updating the `cache_implementation` field in the `generation_config.json` file located in the model checkpoint directory, for example:
|
||||
```json
|
||||
|
||||
@ -1559,7 +1559,7 @@ Explanation:
|
||||
|
||||
|
||||
### Launch trtllm-serve OpenAI-compatible API server
|
||||
TensorRT LLM supports NVIDIA TensorRT Model Optimizer quantized FP8 checkpoints
|
||||
TensorRT LLM supports NVIDIA Model Optimizer quantized FP8 checkpoints
|
||||
``` bash
|
||||
trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 \
|
||||
--tp_size 8 \
|
||||
|
||||
@ -42,7 +42,7 @@ Explanation:
|
||||
|
||||
|
||||
#### 2. Launch trtllm-serve OpenAI-compatible API server
|
||||
TensorRT LLM supports NVIDIA TensorRT Model Optimizer quantized FP8 checkpoints
|
||||
TensorRT LLM supports NVIDIA Model Optimizer quantized FP8 checkpoints
|
||||
``` bash
|
||||
trtllm-serve nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8 \
|
||||
--max_batch_size 512 \
|
||||
@ -94,7 +94,7 @@ Explanation:
|
||||
|
||||
|
||||
#### 2. Launch trtllm-serve OpenAI-compatible API server
|
||||
TensorRT LLM supports NVIDIA TensorRT Model Optimizer quantized FP8 checkpoints.
|
||||
TensorRT LLM supports NVIDIA Model Optimizer quantized FP8 checkpoints.
|
||||
``` bash
|
||||
trtllm-serve nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8 \
|
||||
--max_batch_size 8 \
|
||||
@ -140,7 +140,7 @@ Explanation:
|
||||
|
||||
|
||||
#### 2. Launch trtllm-serve OpenAI-compatible API server
|
||||
TensorRT LLM supports NVIDIA TensorRT Model Optimizer quantized FP8 checkpoints.
|
||||
TensorRT LLM supports NVIDIA Model Optimizer quantized FP8 checkpoints.
|
||||
``` bash
|
||||
trtllm-serve nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8 \
|
||||
--tp_size 8 \
|
||||
|
||||
@ -663,19 +663,19 @@ trtllm-eval --model=Qwen3-30B-A3B/ --tokenizer=Qwen3-30B-A3B/ --backend=pytorch
|
||||
To quantize the Qwen3 model for use with the PyTorch backend, we'll use NVIDIA's Model Optimizer (ModelOpt) tool. Follow these steps:
|
||||
|
||||
```bash
|
||||
# Clone the TensorRT Model Optimizer (ModelOpt)
|
||||
git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git
|
||||
pushd TensorRT-Model-Optimizer
|
||||
# Clone the Model Optimizer (ModelOpt)
|
||||
git clone https://github.com/NVIDIA/Model-Optimizer.git
|
||||
pushd Model-Optimizer
|
||||
|
||||
# install the ModelOpt
|
||||
pip install -e .
|
||||
|
||||
# Quantize the Qwen3-235B-A22B model by nvfp4
|
||||
# By default, the checkpoint would be stored in `TensorRT-Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/`.
|
||||
# By default, the checkpoint would be stored in `Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/`.
|
||||
./examples/llm_ptq/scripts/huggingface_example.sh --model Qwen3-235B-A22B/ --quant nvfp4 --export_fmt hf
|
||||
|
||||
# Quantize the Qwen3-32B model by fp8_pc_pt
|
||||
# By default, the checkpoint would be stored in `TensorRT-Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-32B_fp8_pc_pt_hf/`.
|
||||
# By default, the checkpoint would be stored in `Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-32B_fp8_pc_pt_hf/`.
|
||||
./examples/llm_ptq/scripts/huggingface_example.sh --model Qwen3-32B/ --quant fp8_pc_pt --export_fmt hf
|
||||
popd
|
||||
```
|
||||
@ -687,7 +687,7 @@ To run the benchmark, we suggest using the `trtllm-bench` tool. Please refer to
|
||||
```bash
|
||||
#!/bin/bash
|
||||
|
||||
folder_model=TensorRT-Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/
|
||||
folder_model=Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/
|
||||
path_config=extra-llm-api-config.yml
|
||||
num_gpus=8
|
||||
ep_size=8
|
||||
@ -727,7 +727,7 @@ trtllm-bench --model ${folder_model} --model_path ${folder_model} throughput \
|
||||
We suggest benchmarking with a real dataset, which prevents improperly distributed tokens in the MoE. Here, we use the `aa_prompt_isl_1k_osl_2k_qwen3_10000samples.txt` dataset, which has 10000 samples with an average input length of 1024 and an average output length of 2048. If you don't have a dataset (this one or another) and you want to run the benchmark, you can use the following command to generate a random dataset:
|
||||
|
||||
```bash
|
||||
folder_model=TensorRT-Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/
|
||||
folder_model=Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/
|
||||
min_input_len=1024
|
||||
min_output_len=2048
|
||||
concurrency=128
|
||||
|
||||
@ -11,7 +11,7 @@ The detailed LLM quantization recipe is distributed to the README.md of the corr
|
||||
|
||||
## Installation
|
||||
|
||||
The NVIDIA TensorRT Model Optimizer quantization toolkit is installed automatically as a dependency of TensorRT-LLM.
|
||||
The NVIDIA Model Optimizer quantization toolkit is installed automatically as a dependency of TensorRT-LLM.
|
||||
|
||||
```bash
|
||||
# Install the additional requirements
|
||||
|
||||
@ -1126,7 +1126,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
|
||||
def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, gpuCount=1, nodeCount=1, runWithSbatch=false, skipInstallWheel=false, cpver="cp312")
|
||||
{
|
||||
echo "Run Slurm job with native sbatch: $runWithSbatch"
|
||||
if(nodeCount > 1 || runWithSbatch) {
|
||||
if (nodeCount > 1 || runWithSbatch) {
|
||||
runLLMTestlistWithSbatch(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, gpuCount, nodeCount, skipInstallWheel, cpver)
|
||||
} else {
|
||||
runLLMTestlistWithAgent(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, gpuCount, skipInstallWheel, cpver)
|
||||
@ -1639,6 +1639,7 @@ def launchTestListCheck(pipeline)
|
||||
sh "tar -zxf ${tarName}"
|
||||
def llmPath = sh (script: "realpath .", returnStdout: true).trim()
|
||||
def llmSrc = "${llmPath}/TensorRT-LLM/src"
|
||||
trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install -r ${llmSrc}/requirements-dev.txt")
|
||||
sh "NVIDIA_TRITON_SERVER_VERSION=25.10 LLM_ROOT=${llmSrc} LLM_BACKEND_ROOT=${llmSrc}/triton_backend python3 ${llmSrc}/scripts/check_test_list.py --l0 --qa --waive"
|
||||
} catch (InterruptedException e) {
|
||||
throw e
|
||||
@ -2492,7 +2493,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
|
||||
error "Some tests still failed after rerun attempts, please check the test report."
|
||||
}
|
||||
|
||||
if (perfMode) {
|
||||
if (perfMode && !stageName.contains("Perf-Sanity")) {
|
||||
basePerfFilename = stageName.contains("PyTorch") ? "base_perf_pytorch.csv" : "base_perf.csv"
|
||||
basePerfPath = "${llmSrc}/tests/integration/defs/perf/${basePerfFilename}"
|
||||
stage("Check perf result") {
|
||||
@ -2903,12 +2904,14 @@ def launchTestJobs(pipeline, testFilter)
|
||||
"DGX_B200-4_GPUs-PyTorch-2": ["b200-x4", "l0_dgx_b200", 2, 2, 4],
|
||||
"DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4", "l0_dgx_b200", 1, 1, 4],
|
||||
"DGX_B200-8_GPUs-PyTorch-1": ["b200-x8", "l0_dgx_b200", 1, 1, 8],
|
||||
"DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 1, 4, 1, true],
|
||||
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 1, 4],
|
||||
"DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 2, 4, 1, true],
|
||||
"DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-trtllm", "l0_dgx_b200", 2, 2, 4, 1, true],
|
||||
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
|
||||
"DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
|
||||
// Perf sanity post merge test
|
||||
// Disable perf stages due to https://nvbugs/5643646
|
||||
// "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "perf_sanity_l0_dgx_b200", 1, 1, 4],
|
||||
// "DGX_B300-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b300-x4", "perf_sanity_l0_dgx_b300", 1, 1, 4],
|
||||
// "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 1, 4],
|
||||
// "DGX_B200-8_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 1, 8],
|
||||
// "DGX_B300-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b300-x4", "l0_dgx_b300_perf_sanity", 1, 1, 4],
|
||||
]
|
||||
fullSet += x86SlurmTestConfigs.keySet()
|
||||
|
||||
@ -2933,8 +2936,11 @@ def launchTestJobs(pipeline, testFilter)
|
||||
fullSet += SBSATestConfigs.keySet()
|
||||
|
||||
SBSASlurmTestConfigs = [
|
||||
"GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
|
||||
"GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 2, 4],
|
||||
"GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4],
|
||||
"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
|
||||
// Perf sanity post merge test
|
||||
"GB200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 1, 4],
|
||||
// Disable GB300 stages due to nodes will be offline temporarily.
|
||||
// "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
|
||||
// "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
|
||||
@ -2949,6 +2955,8 @@ def launchTestJobs(pipeline, testFilter)
|
||||
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
|
||||
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
|
||||
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
|
||||
// Perf sanity post merge test
|
||||
"GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_perf_sanity", 1, 1, 8, 2],
|
||||
]
|
||||
fullSet += multiNodesSBSAConfigs.keySet()
|
||||
|
||||
|
||||
@ -29,10 +29,14 @@ set_value_in_command() {
|
||||
echo "$result"
|
||||
}
|
||||
|
||||
# Only the first process will save the job ID
|
||||
# Only the first process will save the job ID and set the git config
|
||||
if [ $SLURM_PROCID -eq 0 ]; then
|
||||
# Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
|
||||
echo $SLURM_JOB_ID > $jobWorkspace/slurm_job_id.txt
|
||||
# Update HOME/.gitconfig
|
||||
if ! git config --global --get-all safe.directory | grep -Fxq "*"; then
|
||||
git config --global --add safe.directory "*"
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $SLURM_LOCALID -eq 0 ]; then
|
||||
@ -47,7 +51,6 @@ if [ $SLURM_LOCALID -eq 0 ]; then
|
||||
fi
|
||||
cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt
|
||||
cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
|
||||
git config --global --add safe.directory "*"
|
||||
gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
|
||||
hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
|
||||
echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
|
||||
@ -106,7 +109,7 @@ echo "Full Command: $pytestCommand"
|
||||
eval $pytestCommand
|
||||
echo "Rank${SLURM_PROCID} Pytest finished execution"
|
||||
|
||||
if [ "$perfMode" = "true" ]; then
|
||||
if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Perf-Sanity* ]]; then
|
||||
if [[ "$stageName" == *PyTorch* ]]; then
|
||||
basePerfFilename="base_perf_pytorch.csv"
|
||||
else
|
||||
|
||||
@ -23,10 +23,9 @@ MARKER_LIST_IN_TEST = [" TIMEOUT"]
|
||||
|
||||
|
||||
def install_python_dependencies(llm_src):
|
||||
subprocess.run(
|
||||
f"cd {llm_src} && pip3 install --retries 1 -r requirements-dev.txt",
|
||||
shell=True,
|
||||
check=True)
|
||||
subprocess.run(f"cd {llm_src} && pip3 install -r requirements-dev.txt",
|
||||
shell=True,
|
||||
check=True)
|
||||
subprocess.run(
|
||||
f"pip3 install --force-reinstall --no-deps {llm_src}/../tensorrt_llm-*.whl",
|
||||
shell=True,
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
{
|
||||
"commit_hash": "e4c707845ff58fcc0b1d87afb4dd0e64885c780a",
|
||||
"timestamp": "2025-12-07T02:39:14Z"
|
||||
"commit_hash": "8e27ce7084d9fab1051e88fc945732e59689761b",
|
||||
"timestamp": "2025-12-08T02:39:23Z"
|
||||
}
|
||||
|
||||
@ -511,6 +511,11 @@ class AutoTunerProfilingCache:
|
||||
cache = {}
|
||||
cache_data = serializable_cache["cache_data"]
|
||||
|
||||
def lists_to_tuples(obj):
|
||||
if isinstance(obj, list):
|
||||
return tuple(lists_to_tuples(x) for x in obj)
|
||||
return obj
|
||||
|
||||
for key_str, value in cache_data.items():
|
||||
# Reconstruct the tuple key safely
|
||||
try:
|
||||
@ -521,7 +526,7 @@ class AutoTunerProfilingCache:
|
||||
continue
|
||||
|
||||
runner_id = value["runner_id"]
|
||||
tactic = value["tactic"]
|
||||
tactic = lists_to_tuples(value["tactic"])
|
||||
min_time = value["min_time"]
|
||||
|
||||
cache[key] = (runner_id, tactic, min_time)
|
||||
|
||||
@ -486,10 +486,10 @@ class CublasLtFP4GemmRunner(TunableRunner):
|
||||
self.cublaslt_runner = CublasLtFP4GemmRunner.runner_dict[instance_key]
|
||||
|
||||
def unique_id(self):
|
||||
return hash((
|
||||
return (
|
||||
self.to_userbuffers,
|
||||
self.output_dtype,
|
||||
))
|
||||
)
|
||||
|
||||
def get_valid_tactics(self, inputs: List[torch.Tensor],
|
||||
profile: OptimizationProfile, **kwargs) -> List[int]:
|
||||
|
||||
@ -170,18 +170,23 @@ class ConfigurableMoE(MoE):
|
||||
# ConfigurableMoE's super().__init__() was called with real layer_idx and initialized load balancer.
|
||||
# Backend was created with init_load_balancer=False and without_comm=True to avoid
|
||||
# duplicate initialization. Now sync all attributes from ConfigurableMoE to backend.
|
||||
self.backend.aux_stream_dict = self.aux_stream_dict
|
||||
self.backend.layer_idx = self.layer_idx
|
||||
self.backend.layer_idx_str = self.layer_idx_str
|
||||
self.backend.num_slots = self.num_slots
|
||||
self.backend.layer_load_balancer = self.layer_load_balancer
|
||||
self.backend.repeat_count = self.repeat_count
|
||||
self.backend.repeat_idx = self.repeat_idx
|
||||
self.backend.initial_local_expert_ids = self.initial_local_expert_ids
|
||||
self.backend.initial_global_assignments = self.initial_global_assignments
|
||||
self.backend.slot_start = self.slot_start
|
||||
self.backend.slot_end = self.slot_end
|
||||
self.backend.expert_size_per_partition = self.expert_size_per_partition
|
||||
if self.backend is not None:
|
||||
# Add a check to WAR the issue that the backend is none during torch.compile
|
||||
assert not torch.compiler.is_compiling(), (
|
||||
"Backend should not be none if not in torch.compile"
|
||||
)
|
||||
self.backend.aux_stream_dict = self.aux_stream_dict
|
||||
self.backend.layer_idx = self.layer_idx
|
||||
self.backend.layer_idx_str = self.layer_idx_str
|
||||
self.backend.num_slots = self.num_slots
|
||||
self.backend.layer_load_balancer = self.layer_load_balancer
|
||||
self.backend.repeat_count = self.repeat_count
|
||||
self.backend.repeat_idx = self.repeat_idx
|
||||
self.backend.initial_local_expert_ids = self.initial_local_expert_ids
|
||||
self.backend.initial_global_assignments = self.initial_global_assignments
|
||||
self.backend.slot_start = self.slot_start
|
||||
self.backend.slot_end = self.slot_end
|
||||
self.backend.expert_size_per_partition = self.expert_size_per_partition
|
||||
|
||||
# Create weights here, because the backend needs the layer_load_balancer info to create weights
|
||||
model_config._frozen = False
|
||||
|
||||
@ -2844,11 +2844,17 @@ class PyTorchModelEngine(ModelEngine):
|
||||
# Disable UB for unsupported platforms
|
||||
if not ub.ub_supported():
|
||||
return False
|
||||
use_nccl_symmetric = self.llm_args.allreduce_strategy == "NCCL_SYMMETRIC"
|
||||
ub.initialize_userbuffers_manager(
|
||||
self.mapping.tp_size, self.mapping.pp_size, self.mapping.cp_size,
|
||||
self.mapping.rank, self.mapping.gpus_per_node,
|
||||
hidden_size * self.max_num_tokens * 2, use_nccl_symmetric)
|
||||
# NCCL_SYMMETRIC strategy no longer requires UserBuffer allocator initialization.
|
||||
# It uses NCCLWindowAllocator from ncclUtils directly.
|
||||
if self.llm_args.allreduce_strategy == "NCCL_SYMMETRIC":
|
||||
# Skip UB initialization for NCCL_SYMMETRIC - it uses NCCLWindowAllocator directly
|
||||
return False
|
||||
ub.initialize_userbuffers_manager(self.mapping.tp_size,
|
||||
self.mapping.pp_size,
|
||||
self.mapping.cp_size,
|
||||
self.mapping.rank,
|
||||
self.mapping.gpus_per_node,
|
||||
hidden_size * self.max_num_tokens * 2)
|
||||
|
||||
return True
|
||||
|
||||
|
||||
@ -4020,7 +4020,10 @@ def create_allreduce_plugin(
|
||||
pfc = trt.PluginFieldCollection(pfc)
|
||||
ar_plug = allreduce_plg_creator.create_plugin("allreduce", pfc)
|
||||
plug_inputs = [tensor]
|
||||
if all_reduce_params.strategy != AllReduceStrategy.NCCL and all_reduce_params.strategy != AllReduceStrategy.UB:
|
||||
if all_reduce_params.strategy not in {
|
||||
AllReduceStrategy.NCCL, AllReduceStrategy.UB,
|
||||
AllReduceStrategy.NCCL_SYMMETRIC
|
||||
}:
|
||||
plug_inputs.append(workspace)
|
||||
if all_reduce_params.fusion_op != AllReduceFusionOp.NONE:
|
||||
if all_reduce_params.has_bias() == 1:
|
||||
@ -4092,7 +4095,7 @@ def allreduce(
|
||||
workspace = None
|
||||
if all_reduce_params.strategy != AllReduceStrategy.NCCL and all_reduce_params.strategy != AllReduceStrategy.UB:
|
||||
if current_all_reduce_helper().workspace is None:
|
||||
all_reduce_params.strategy = AllReduceStrategy.NCCL
|
||||
all_reduce_params.strategy = AllReduceStrategy.NCCL_SYMMETRIC
|
||||
else:
|
||||
workspace = current_all_reduce_helper().workspace.trt_tensor
|
||||
if all_reduce_params.strategy == AllReduceStrategy.UB:
|
||||
|
||||
@ -13,9 +13,37 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import os
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from mpi4py.futures import MPIPoolExecutor
|
||||
|
||||
|
||||
def patch_mpi_pool_session_for_env(mocker, env_vars: dict):
|
||||
"""
|
||||
Patch MpiPoolSession._start_mpi_pool to propagate environment variables to MPI child processes.
|
||||
|
||||
Uses MPIPoolExecutor's built-in `env` parameter instead of `initializer` to avoid
|
||||
segfault issues during process cleanup (UCX memory cache conflicts with PyTorch
|
||||
tensor cleanup during Py_FinalizeEx).
|
||||
|
||||
Args:
|
||||
mocker: pytest-mock mocker fixture
|
||||
env_vars: Dictionary of environment variable name -> value to propagate
|
||||
"""
|
||||
from tensorrt_llm.llmapi.mpi_session import MpiPoolSession
|
||||
|
||||
def patched_start_mpi_pool(self):
|
||||
assert not self.mpi_pool, 'MPI session already started'
|
||||
self.mpi_pool = MPIPoolExecutor(max_workers=self.n_workers,
|
||||
path=sys.path,
|
||||
env=env_vars)
|
||||
|
||||
mocker.patch.object(MpiPoolSession, '_start_mpi_pool',
|
||||
patched_start_mpi_pool)
|
||||
|
||||
|
||||
from defs.conftest import get_sm_version, is_sm_100f
|
||||
|
||||
from tensorrt_llm import LLM
|
||||
@ -1830,9 +1858,24 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
|
||||
ids=["tp4", "ep4", "tp2pp2", "pp4"])
|
||||
@parametrize_with_ids("mtp_nextn", [0, 2])
|
||||
@parametrize_with_ids("moe_backend", ["CUTLASS", "TRTLLM", "CUTEDSL"])
|
||||
@pytest.mark.parametrize("enable_configurable_moe", [0, 1],
|
||||
ids=lambda x: ""
|
||||
if x == 0 else "enable_configurable_moe")
|
||||
def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
|
||||
overlap_scheduler, tp_size, pp_size, ep_size,
|
||||
torch_compile, mtp_nextn, moe_backend):
|
||||
torch_compile, mtp_nextn, moe_backend,
|
||||
enable_configurable_moe, mocker):
|
||||
# Handle ENABLE_CONFIGURABLE_MOE environment variable
|
||||
if enable_configurable_moe == 1 and moe_backend != "TRTLLM":
|
||||
pytest.skip(
|
||||
f"ENABLE_CONFIGURABLE_MOE=1 is only supported with TRTLLM backend, "
|
||||
f"current backend is {moe_backend}")
|
||||
|
||||
# Patch MpiPoolSession to propagate env vars to MPI worker processes
|
||||
env_value = "1" if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0"
|
||||
patch_mpi_pool_session_for_env(mocker,
|
||||
{"ENABLE_CONFIGURABLE_MOE": env_value})
|
||||
|
||||
if moe_backend == "TRTLLM" and (get_sm_version() == 120
|
||||
or get_sm_version() == 121):
|
||||
pytest.skip(
|
||||
@ -3452,9 +3495,23 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness):
|
||||
ids=["latency", "ep2", "ep4"])
|
||||
@pytest.mark.parametrize("activation_dtype", ["static_fp8", "mxfp8"],
|
||||
ids=["fp8", "mxfp8"])
|
||||
@pytest.mark.parametrize("enable_configurable_moe", [0, 1],
|
||||
ids=lambda x: ""
|
||||
if x == 0 else "enable_configurable_moe")
|
||||
def test_w4a8_mxfp4(self, moe_backend, tp_size, pp_size, ep_size,
|
||||
attention_dp, cuda_graph, overlap_scheduler,
|
||||
activation_dtype):
|
||||
activation_dtype, enable_configurable_moe, mocker):
|
||||
# Handle ENABLE_CONFIGURABLE_MOE environment variable
|
||||
if enable_configurable_moe == 1 and moe_backend != "TRTLLM":
|
||||
pytest.skip(
|
||||
f"ENABLE_CONFIGURABLE_MOE=1 is only supported with TRTLLM backend, "
|
||||
f"current backend is {moe_backend}")
|
||||
|
||||
# Patch MpiPoolSession to propagate env vars to MPI worker processes
|
||||
env_value = "1" if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0"
|
||||
patch_mpi_pool_session_for_env(mocker,
|
||||
{"ENABLE_CONFIGURABLE_MOE": env_value})
|
||||
|
||||
if moe_backend == "TRITON":
|
||||
if not IS_TRITON_KERNELS_AVAILABLE:
|
||||
pytest.skip("TRITON moe backend is not available.")
|
||||
@ -3906,9 +3963,23 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
|
||||
(4, 1, 4, True, True, True),
|
||||
],
|
||||
ids=["tp4", "ep4", "dp4"])
|
||||
@pytest.mark.parametrize("enable_configurable_moe", [0, 1],
|
||||
ids=lambda x: ""
|
||||
if x == 0 else "enable_configurable_moe")
|
||||
def test_w4_4gpus(self, kv_cache_dtype, moe_backend, tp_size, pp_size,
|
||||
ep_size, attention_dp, cuda_graph, overlap_scheduler,
|
||||
mocker):
|
||||
enable_configurable_moe, mocker):
|
||||
# Handle ENABLE_CONFIGURABLE_MOE environment variable
|
||||
if enable_configurable_moe == 1 and moe_backend != "TRTLLM":
|
||||
pytest.skip(
|
||||
f"ENABLE_CONFIGURABLE_MOE=1 is only supported with TRTLLM backend, "
|
||||
f"current backend is {moe_backend}")
|
||||
|
||||
# Patch MpiPoolSession to propagate env vars to MPI worker processes
|
||||
env_value = "1" if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0"
|
||||
patch_mpi_pool_session_for_env(mocker,
|
||||
{"ENABLE_CONFIGURABLE_MOE": env_value})
|
||||
|
||||
if moe_backend == "TRITON":
|
||||
if not IS_TRITON_KERNELS_AVAILABLE:
|
||||
pytest.skip("Triton kernels are not available")
|
||||
@ -3925,7 +3996,8 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
|
||||
|
||||
pytorch_config = dict(
|
||||
disable_overlap_scheduler=not overlap_scheduler,
|
||||
cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
|
||||
cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
|
||||
moe_config=MoeConfig(backend=moe_backend))
|
||||
|
||||
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
|
||||
dtype=kv_cache_dtype)
|
||||
@ -3939,8 +4011,7 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
|
||||
max_seq_len=max_seq_len,
|
||||
max_batch_size=720,
|
||||
**pytorch_config,
|
||||
enable_attention_dp=attention_dp,
|
||||
moe_config=MoeConfig(backend=moe_backend))
|
||||
enable_attention_dp=attention_dp)
|
||||
|
||||
with llm:
|
||||
model_name = "GPT-OSS/120B-MXFP4"
|
||||
@ -4252,8 +4323,17 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
|
||||
@pytest.mark.parametrize(
|
||||
"kv_cache_dtype",
|
||||
["auto", pytest.param("fp8", marks=skip_pre_blackwell)])
|
||||
def test_w4_4gpus_online_eplb(self, kv_cache_dtype, mocker):
|
||||
@pytest.mark.parametrize("enable_configurable_moe", [0, 1],
|
||||
ids=lambda x: ""
|
||||
if x == 0 else "enable_configurable_moe")
|
||||
def test_w4_4gpus_online_eplb(self, kv_cache_dtype, enable_configurable_moe,
|
||||
mocker):
|
||||
"""Test GPTOSS with online expert parallel load balancer using TRTLLM backend and attention DP."""
|
||||
# Patch MpiPoolSession to propagate env vars to MPI worker processes
|
||||
env_value = "1" if enable_configurable_moe == 1 else "0"
|
||||
patch_mpi_pool_session_for_env(mocker,
|
||||
{"ENABLE_CONFIGURABLE_MOE": env_value})
|
||||
|
||||
mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
|
||||
mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
|
||||
{"scores_filter": "exact_match,flexible-extract"})
|
||||
|
||||
@ -2209,6 +2209,94 @@ def pytest_generate_tests(metafunc: pytest.Metafunc):
|
||||
metafunc.parametrize("case", uts, ids=lambda x: x)
|
||||
|
||||
|
||||
# Test cases that use enable_configurable_moe parameter and need ID conversion
|
||||
TESTS_WITH_CONFIGURABLE_MOE = [
|
||||
"TestDeepSeekV3Lite::test_nvfp4_4gpus",
|
||||
"TestGPTOSS::test_w4_4gpus",
|
||||
"TestGPTOSS::test_w4_4gpus_online_eplb",
|
||||
"TestQwen3_30B_A3B::test_w4a8_mxfp4",
|
||||
]
|
||||
|
||||
|
||||
def _convert_clean_to_original_moe_test_id(test_id):
|
||||
"""Convert clean MoE test ID back to original format for pytest collection.
|
||||
|
||||
Example: "test_llm_api_pytorch.py::test_foo[param]" -> "test_llm_api_pytorch.py::test_foo[-param]"
|
||||
|
||||
This is needed because the `enable_configurable_moe` parameter uses an empty string
|
||||
as its ID when the value is 0, resulting in test IDs like "test_foo[-param]".
|
||||
We clean these up in pytest_collection_modifyitems, but pytest filters tests
|
||||
during collection using the original IDs. So when the user runs with the clean test name,
|
||||
we need to convert it back to match the original.
|
||||
"""
|
||||
if "test_llm_api_pytorch.py" not in test_id:
|
||||
return test_id
|
||||
|
||||
# Match pattern like "test_name[params]" and add leading dash after "["
|
||||
# But only if params don't already start with "-" or "enable_configurable_moe"
|
||||
match = re.search(r"\[([^\]]+)\]", test_id)
|
||||
if match:
|
||||
params = match.group(1)
|
||||
# Skip if already has leading dash or starts with enable_configurable_moe
|
||||
if not params.startswith("-") and not params.startswith(
|
||||
"enable_configurable_moe"):
|
||||
# Add leading dash to params
|
||||
new_params = "-" + params
|
||||
test_id = test_id.replace(f"[{params}]", f"[{new_params}]")
|
||||
|
||||
return test_id
|
||||
|
||||
|
||||
def pytest_sessionstart(session):
|
||||
"""Convert clean MoE test IDs in config.args to original format for collection.
|
||||
|
||||
This is needed because pytest filters tests during collection using original IDs.
|
||||
When the user runs with the clean test name, we convert it back to match the original.
|
||||
"""
|
||||
args = session.config.args
|
||||
for i, arg in enumerate(args):
|
||||
if "test_llm_api_pytorch.py" in arg and "[" in arg:
|
||||
# Only apply conversion to specific tests that use enable_configurable_moe
|
||||
should_convert = any(test_name in arg
|
||||
for test_name in TESTS_WITH_CONFIGURABLE_MOE)
|
||||
if should_convert:
|
||||
args[i] = _convert_clean_to_original_moe_test_id(arg)
|
||||
|
||||
|
||||
def _clean_moe_test_ids(items):
|
||||
"""Clean up test IDs by removing leading/trailing dashes from parameter IDs.
|
||||
|
||||
This is needed because the `enable_configurable_moe` parameter ID can be empty,
|
||||
resulting in ugly test IDs like "test_foo[-True]" or "test_foo[--abc]".
|
||||
We clean these up to "test_foo[True]" or "test_foo[abc]" so that:
|
||||
1. Test names in waive files and test lists remain unchanged
|
||||
2. Test reports look cleaner
|
||||
"""
|
||||
for item in items:
|
||||
if "test_llm_api_pytorch.py" in item.nodeid and "[" in item.nodeid:
|
||||
# Only apply cleanup to specific tests that use enable_configurable_moe
|
||||
should_cleanup = any(test_name in item.nodeid
|
||||
for test_name in TESTS_WITH_CONFIGURABLE_MOE)
|
||||
if should_cleanup:
|
||||
original_nodeid = item.nodeid
|
||||
original_name = item.name
|
||||
nodeid = item.nodeid
|
||||
name = item.name
|
||||
|
||||
# Clean up leading/trailing dashes in nodeid
|
||||
nodeid = nodeid.replace("[-", "[")
|
||||
nodeid = nodeid.replace("-]", "]")
|
||||
|
||||
# Clean up leading/trailing dashes in name
|
||||
name = name.replace("[-", "[")
|
||||
name = name.replace("-]", "]")
|
||||
|
||||
if nodeid != original_nodeid:
|
||||
item._nodeid = nodeid
|
||||
if name != original_name:
|
||||
item.name = name
|
||||
|
||||
|
||||
@pytest.hookimpl(tryfirst=True, hookwrapper=True)
|
||||
def pytest_collection_modifyitems(session, config, items):
|
||||
testlist_path = config.getoption("--test-list")
|
||||
@ -2217,6 +2305,10 @@ def pytest_collection_modifyitems(session, config, items):
|
||||
perf_test = config.getoption("--perf")
|
||||
test_model_suites = config.getoption("--test-model-suites")
|
||||
|
||||
# TODO Once the MoE refactor is complete, this should be removed.
|
||||
# This is a temporary WAR to minimize the impact of the MoE refactor on the existing test lists.
|
||||
_clean_moe_test_ids(items)
|
||||
|
||||
if perf_test:
|
||||
global ALL_PYTEST_ITEMS
|
||||
ALL_PYTEST_ITEMS = None
|
||||
|
||||
@ -127,6 +127,24 @@ def run_user_buffer_tests(build_dir: _pl.Path, nprocs=2, timeout=300):
|
||||
timeout=timeout)
|
||||
|
||||
|
||||
def run_nccl_utils_tests(build_dir: _pl.Path, nprocs=2, timeout=300):
|
||||
tests_dir = build_dir / "tests" / "unit_tests" / "multi_gpu"
|
||||
mgpu_env = get_multi_gpu_env()
|
||||
|
||||
nccl_utils_test = [
|
||||
"mpirun",
|
||||
"-n",
|
||||
f"{nprocs}",
|
||||
"--allow-run-as-root",
|
||||
"ncclUtilsTest",
|
||||
]
|
||||
|
||||
_cpp.run_command(nccl_utils_test,
|
||||
cwd=tests_dir,
|
||||
env=mgpu_env,
|
||||
timeout=timeout)
|
||||
|
||||
|
||||
def run_llama_executor_leader_tests(build_dir: _pl.Path, timeout=1500):
|
||||
tests_dir = build_dir / "tests" / "e2e_tests"
|
||||
|
||||
@ -505,6 +523,15 @@ def test_user_buffer(build_google_tests, nprocs, build_dir):
|
||||
run_user_buffer_tests(build_dir=build_dir, nprocs=nprocs, timeout=300)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
|
||||
indirect=True)
|
||||
@pytest.mark.parametrize("nprocs", [2, 8], ids=["2proc", "8proc"])
|
||||
def test_nccl_utils(build_google_tests, nprocs, build_dir):
|
||||
|
||||
if platform.system() != "Windows":
|
||||
run_nccl_utils_tests(build_dir=build_dir, nprocs=nprocs, timeout=300)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
|
||||
indirect=True)
|
||||
@pytest.mark.parametrize("multi_gpu_model", ["t5"], indirect=True)
|
||||
|
||||
@ -114,13 +114,16 @@ def validate_timing_metrics(perf_metrics_item, request_context=""):
|
||||
)), f"gen server_first_token_time is not numeric in {request_context}"
|
||||
assert gen_server_arrival <= gen_server_first_token, f"gen server_arrival_time > server_first_token_time in {request_context}"
|
||||
|
||||
# Network Time Protocol can ensure ms-level accuracy on a LAN
|
||||
ntp_tolerance = 1e-3
|
||||
|
||||
# Validate timing relationships between different levels
|
||||
# Disaggregated server should receive request before individual servers
|
||||
assert disagg_arrival <= ctx_server_arrival, f"disagg_arrival > ctx_server_arrival in {request_context}"
|
||||
assert disagg_arrival <= gen_server_arrival, f"disagg_arrival > gen_server_arrival in {request_context}"
|
||||
assert disagg_arrival - ntp_tolerance <= ctx_server_arrival, f"disagg_arrival > ctx_server_arrival in {request_context}"
|
||||
assert disagg_arrival - ntp_tolerance <= gen_server_arrival, f"disagg_arrival > gen_server_arrival in {request_context}"
|
||||
|
||||
# Context should complete before generation starts
|
||||
assert ctx_server_first_token <= gen_server_arrival, f"ctx_server_first_token > gen_server_arrival in {request_context}"
|
||||
assert ctx_server_first_token - ntp_tolerance <= gen_server_arrival, f"ctx_server_first_token > gen_server_arrival in {request_context}"
|
||||
|
||||
# Validate internal timing consistency
|
||||
ctx_arrival_time = ctx_metrics["arrival_time"]
|
||||
|
||||
@ -249,8 +249,8 @@ class JobManager:
|
||||
logger.error(f"Job submission exception: {error_msg}")
|
||||
# Clean up temporary file on exception
|
||||
temp_config_path = test_config.temp_config_path
|
||||
if os.path.exists(temp_config_path):
|
||||
os.remove(temp_config_path)
|
||||
# if os.path.exists(temp_config_path):
|
||||
# os.remove(temp_config_path)
|
||||
return False, error_msg
|
||||
|
||||
@staticmethod
|
||||
|
||||
@ -12,6 +12,8 @@ No complex process tree cleanup is needed because:
|
||||
import subprocess
|
||||
from typing import Optional
|
||||
|
||||
from utils.logger import logger
|
||||
|
||||
|
||||
def exec_cmd(*popenargs, timeout: Optional[float] = None, **kwargs) -> int:
|
||||
"""Execute command and return exit code.
|
||||
@ -54,4 +56,10 @@ def exec_cmd_with_output(*popenargs, timeout: Optional[float] = None, **kwargs)
|
||||
check=True,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
# Log stderr if it exists
|
||||
if result.stderr:
|
||||
stderr_output = result.stderr.decode()
|
||||
logger.error(f"Command stderr: {stderr_output}")
|
||||
|
||||
return result.stdout.decode()
|
||||
|
||||
@ -14,6 +14,7 @@ slurm:
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: "--gres=gpu:4"
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
@ -36,6 +37,8 @@ environment:
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
|
||||
server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
|
||||
@ -14,6 +14,7 @@ slurm:
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: "--gres=gpu:4"
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
@ -36,6 +37,8 @@ environment:
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
|
||||
server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
|
||||
@ -14,6 +14,7 @@ slurm:
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: "--gres=gpu:4"
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
@ -36,6 +37,8 @@ environment:
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
|
||||
server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
|
||||
@ -14,6 +14,7 @@ slurm:
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: "--gres=gpu:4"
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
@ -36,6 +37,8 @@ environment:
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
|
||||
server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
|
||||
@ -14,6 +14,7 @@ slurm:
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: "--gres=gpu:4"
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
@ -36,6 +37,8 @@ environment:
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
|
||||
server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
|
||||
@ -14,6 +14,7 @@ slurm:
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: "--gres=gpu:4"
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
@ -36,6 +37,8 @@ environment:
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
|
||||
server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
|
||||
@ -14,6 +14,7 @@ slurm:
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: "--gres=gpu:4"
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
@ -36,6 +37,8 @@ environment:
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
|
||||
server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
|
||||
@ -14,6 +14,7 @@ slurm:
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: "--gres=gpu:4"
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
@ -36,6 +37,8 @@ environment:
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
|
||||
server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
|
||||
@ -0,0 +1,95 @@
# nvbugs: 5561153
metadata:
  model_name: Qwen3-235B-A22B-FP8
  precision: fp8
  model_dir_name: Qwen3-235B-A22B-FP8
  supported_gpus:
    - GB200
    - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 21
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: "--gres=gpu:4"
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 8
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: 1 2 4 8 16 36
  input_length: 1024
  output_length: 1024
  dataset_file: <dataset_file>
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    enable_attention_dp: false
    pipeline_parallel_size: 1
    max_batch_size: 64
    max_num_tokens: 2048
    max_seq_len: 2051
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 128
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: true
      free_gpu_memory_fraction: 0.7
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 2048
      backend: NIXL
    stream_interval: 20
    num_postprocess_workers: 4
    allreduce_strategy: MNNVL
    disable_overlap_scheduler: false
  ctx:
    max_batch_size: 32
    max_num_tokens: 2048
    max_seq_len: 2051
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    enable_attention_dp: false
    pipeline_parallel_size: 1
    print_iter_log: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: true
      free_gpu_memory_fraction: 0.7
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 2048
      backend: NIXL
@ -0,0 +1,95 @@
# nvbugs: 5561153
metadata:
  model_name: Qwen3-235B-A22B-FP8
  precision: fp8
  model_dir_name: Qwen3-235B-A22B-FP8
  supported_gpus:
    - GB200
    - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 21
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: "--gres=gpu:4"
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 8
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: 1 2 4 8 16 36
  input_length: 1024
  output_length: 1024
  dataset_file: <dataset_file>
hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    enable_attention_dp: false
    pipeline_parallel_size: 1
    max_batch_size: 64
    max_num_tokens: 2048
    max_seq_len: 2051
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 128
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: true
      free_gpu_memory_fraction: 0.7
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 2048
      backend: UCX
    stream_interval: 20
    num_postprocess_workers: 4
    allreduce_strategy: MNNVL
    disable_overlap_scheduler: false
  ctx:
    max_batch_size: 32
    max_num_tokens: 2048
    max_seq_len: 2051
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    enable_attention_dp: false
    pipeline_parallel_size: 1
    print_iter_log: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: true
      free_gpu_memory_fraction: 0.7
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 2048
      backend: UCX
@ -14,6 +14,7 @@ slurm:
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: "--gres=gpu:4"
  numa_bind: true
benchmark:
  mode: e2e

@ -37,6 +38,8 @@ environment:
  build_wheel: false
  trtllm_wheel_path: ''
  work_dir: <full_path_to_work_dir>
  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
profiling:
  nsys_on: false
accuracy:

@ -14,6 +14,7 @@ slurm:
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: "--gres=gpu:4"
  numa_bind: true
benchmark:
  mode: e2e

@ -36,6 +37,8 @@ environment:
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
profiling:
  nsys_on: false
accuracy:
@ -21,6 +21,7 @@ slurm:
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: "--gres=gpu:4"
  numa_bind: true
benchmark:
  mode: gen_only

@ -15,6 +15,7 @@ slurm:
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: "--gres=gpu:4"
  numa_bind: true
benchmark:
  mode: gen_only

@ -37,6 +38,8 @@ environment:
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
profiling:
  nsys_on: false
accuracy:
@ -0,0 +1,110 @@
# nvbugs: 5422621
metadata:
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-V2
  supported_gpus:
    - GB200
    - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
  config_index: 7
  dataset_file: datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: "--gres=gpu:4"
  numa_bind: true
benchmark:
  mode: gen_only
  use_nv_sa_benchmark: false
  multi_round: 8
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '12288'
  input_length: 1024
  output_length: 1024
  dataset_file: <dataset_file>
hardware:
  gpus_per_node: 4
  num_ctx_servers: 2
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
  model: local-completions
  tasks: gsm8k
  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
  gen:
    enable_layerwise_nvtx_marker: true
    tensor_parallel_size: 48
    moe_expert_parallel_size: 48
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    pipeline_parallel_size: 1
    max_batch_size: 1024
    max_num_tokens: 1024
    max_seq_len: 2176
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
        - 1
        - 2
        - 4
        - 8
        - 16
        - 32
        - 64
        - 128
        - 256
        - 512
        - 768
        - 1024
        - 2048
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.7
      dtype: fp8
    moe_config:
      backend: WIDEEP
      load_balancer:
        num_slots: 288
        layer_updates_per_iter: 1
    cache_transceiver_config:
      max_tokens_in_buffer: 8320
      backend: DEFAULT
    stream_interval: 20
  ctx:
    enable_layerwise_nvtx_marker: true
    max_batch_size: 4
    max_num_tokens: 4480
    max_seq_len: 2176
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    enable_attention_dp: true
    pipeline_parallel_size: 1
    print_iter_log: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.85
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 8320
      backend: DEFAULT
@ -15,6 +15,7 @@ slurm:
  account: <account>
  job_time: 02:00:00
  job_name: disaggr-test
  extra_args: "--gres=gpu:4"
  numa_bind: true
hardware:
  gpus_per_node: 4

@ -37,6 +38,8 @@ environment:
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
profiling:
  nsys_on: false
accuracy:
@ -16,6 +16,8 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-NIXL]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-UCX]
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-NIXL]
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-NIXL]
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-UCX]

@ -7,6 +7,7 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_
test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-UCX]
test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL]
test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL]
test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT]
# test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL]
# test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL]
# test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-NIXL]
@ -1,6 +1,7 @@
"""Disaggregated Benchmark Configuration."""

import os
from datetime import datetime

SESSION_COLLECT_CMD_TYPE = "session_collect"

@ -169,7 +170,8 @@ def extract_config_fields(config_data: dict) -> dict:

    # Generate derived fields
    dep_flag = "dep" if gen_enable_dp else "tep"
    log_base = f"{isl}-{osl}"
    date_prefix = datetime.now().strftime("%Y%m%d")
    log_base = f"{date_prefix}/{isl}-{osl}"
    context_dir = (
        f"ctx{ctx_num}_gen{gen_num}_{dep_flag}{gen_tp_size}_"
        f"batch{gen_batch_size}_eplb{eplb_slots}_mtp{mtp_size}"
@ -20,6 +20,7 @@ import os
import re
import sys
import time
from datetime import datetime

from defs.trt_test_alternative import print_info

@ -32,40 +33,6 @@ from jenkins.scripts.open_search_db import OpenSearchDB
PROJECT_ROOT = "sandbox-temp-trtllm-ci-perf-v1"  # "sandbox-trtllm-ci-perf"
TEST_INFO_PROJECT_NAME = f"{PROJECT_ROOT}-test_info"

# Server config fields to compare
SERVER_FIELDS = [
    "s_model_name",
    "l_gpus",
    "l_tp",
    "l_ep",
    "l_pp",
    "l_max_num_tokens",
    "b_enable_chunked_prefill",
    "b_disable_overlap_scheduler",
    "s_attention_backend",
    "s_moe_backend",
    "l_moe_max_num_tokens",
    "l_stream_interval",
    "b_enable_attention_dp",
    "b_attention_dp_balance",
    "l_batching_wait_iters",
    "l_timeout_iters",
    "s_kv_cache_dtype",
    "b_enable_block_reuse",
    "d_free_gpu_memory_fraction",
    "l_max_batch_size",
    "b_enable_padding",
]

# Client config fields to compare
CLIENT_FIELDS = [
    "l_concurrency",
    "l_iterations",
    "l_isl",
    "l_osl",
    "d_random_range_ratio",
]

# Metrics where larger is better
MAXIMIZE_METRICS = [
    "d_seq_throughput",

@ -137,6 +104,7 @@ def get_job_info():
    trigger_mr_link = ""
    trigger_mr_id = ""
    trigger_mr_commit = ""
    artifact_url = ""
    if is_pr_job:
        # Get PR info from github_pr_api_url
        github_pr_api_url = global_vars.get("github_pr_api_url", "")

@ -162,6 +130,9 @@ def get_job_info():

        # Set trigger_mr_commit to commit
        trigger_mr_commit = commit
        artifact_url = f"https://urm.nvidia.com/artifactory/sw-tensorrt-generic/llm-artifacts/LLM/main/L0_PostMerge/{job_id}" if job_id else ""
    else:
        artifact_url = f"https://urm.nvidia.com/artifactory/sw-tensorrt-generic/llm-artifacts/LLM/main/L0_PostMerge/{job_id}" if job_id else ""

    return {
        "b_is_baseline": False,

@ -185,11 +156,12 @@ def get_job_info():
        "s_trigger_mr_link": trigger_mr_link,
        "s_trigger_mr_id": trigger_mr_id,
        "s_trigger_mr_commit": trigger_mr_commit,
        "s_artifact_url": artifact_url,
        "b_is_regression": False,
    }


def query_history_data():
def query_history_data(gpu_type):
    """
    Query post-merge data with specific gpu type and model name
    """

@ -209,6 +181,16 @@ def query_history_data():
                        "b_is_post_merge": True
                    }
                },
                {
                    "term": {
                        "b_is_regression": False
                    }
                },
                {
                    "term": {
                        "s_gpu_type": gpu_type
                    }
                },
                {
                    "range": {
                        "ts_created": {

@ -263,30 +245,38 @@ def query_history_data():
    return []


def match(history_data, new_data):
def match(history_data, new_data, match_keys):
    """
    Check if the server and client config of history data matches the new data
    """
    # Combine all fields to compare (excluding log links)
    fields_to_compare = SERVER_FIELDS + CLIENT_FIELDS

    def is_empty(value):
        """Check if a value is empty (None, empty string, etc.)"""
        return value is None or value == ""

    # Compare each field
    for field in fields_to_compare:
        history_value = history_data.get(field)
        new_value = new_data.get(field)
    def should_skip_field(field):
        # Skip fields starting with @, _, ts_
        if field.startswith('@') or field.startswith('_') or field.startswith(
                'ts_'):
            return True
        # Skip log links and speculative_model_dir and job configs
        if field in [
                's_speculative_model_dir', 's_server_log_link',
                's_ctx_server_log_link', 's_gen_server_log_link',
                's_client_log_link'
        ]:
            return True
        return False

    # If both are empty, consider them equal
    for field in match_keys:
        # Skip excluded fields
        if should_skip_field(field):
            continue
        history_value = history_data.get(field, None)
        new_value = new_data.get(field, None)
        if is_empty(history_value) and is_empty(new_value):
            continue

        # If values don't match, return False
        if history_value != new_value:
            return False

    return True


@ -339,27 +329,44 @@ def calculate_best_perf_result(history_data_list, new_data):
    return best_metrics


def get_history_data(new_data_dict):
def get_history_data(new_data_dict, gpu_type, match_keys):
    """
    Query history post-merge data for each cmd_idx
    """

    def get_latest_data(data_list):
        if not data_list:
            return None
        time_format = "%b %d, %Y @ %H:%M:%S.%f"
        # Find the item with the maximum ts_created value
        latest_data = max(
            data_list,
            key=lambda x: datetime.strptime(x["ts_created"], time_format))
        return latest_data

    history_baseline_dict = {}
    history_data_dict = {}
    cmd_idxs = new_data_dict.keys()
    for cmd_idx in cmd_idxs:
        history_data_dict[cmd_idx] = []
        history_baseline_dict[cmd_idx] = None
    history_data_list = query_history_data()
        history_baseline_dict[cmd_idx] = []
    history_data_list = []
    if cmd_idxs:
        history_data_list = query_history_data(gpu_type)
    if history_data_list:
        for history_data in history_data_list:
            for cmd_idx in cmd_idxs:
                if match(history_data, new_data_dict[cmd_idx]):
                if match(history_data, new_data_dict[cmd_idx], match_keys):
                    if history_data.get("b_is_baseline") and history_data.get(
                            "b_is_baseline") == True:
                        history_baseline_dict[cmd_idx] = history_data
                        history_baseline_dict[cmd_idx].append(history_data)
                    else:
                        history_data_dict[cmd_idx].append(history_data)
                    break
    # Sometime database has several baselines and we only use the latest baseline one
    for cmd_idx, baseline_list in history_baseline_dict.items():
        latest_baseline = get_latest_data(baseline_list)
        history_baseline_dict[cmd_idx] = latest_baseline
    return history_baseline_dict, history_data_dict


@ -477,6 +484,8 @@ def post_new_perf_data(new_baseline_data_dict, new_data_dict,
    # Only post regressive test cases when post-merge.
    if new_baseline_data_dict:
        data_list.extend(regressive_data_list)
    if not data_list:
        return
    try:
        print_info(
            f"Ready to post {len(data_list)} data to {TEST_INFO_PROJECT_NAME}")
File diff suppressed because it is too large.
Some files were not shown because too many files have changed in this diff.