From 7c6c49399361e025c8a68474f82f2a20ba5aa1d8 Mon Sep 17 00:00:00 2001 From: Emma Qiao Date: Sun, 7 Dec 2025 22:26:47 +0800 Subject: [PATCH 01/10] [None][infra] Waive failed cases for main branch on 12/07 (#9769) Signed-off-by: qqiao --- tests/integration/test_lists/waives.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 8ee57361c3..2d7bfd20d7 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -432,3 +432,8 @@ disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nix disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_overlap_cuda_graph[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5719561) disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5719561) disaggregated/test_workers.py::test_workers_kv_cache_aware_router_deepseek_v3_lite_bf16[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5719561) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5721661) +accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] SKIP (https://nvbugs/5715568) +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] SKIP (https://nvbugs/5721672) +unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[CUTLASS] SKIP (https://nvbugs/5721912) +unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py::test_flashinfer_attention_op_context_input_pos[cuda-dtype0-4-8-seq6] SKIP (https://nvbugs/5721907) From f59d64e6c7af038141b77df1d67d37caf53fcb6b Mon Sep 17 00:00:00 2001 From: Yanchao Lu Date: Sun, 7 Dec 2025 23:07:59 +0800 Subject: [PATCH 02/10] [None][fix] Several minor fixes to CI setting (#9765) Signed-off-by: Yanchao Lu --- jenkins/L0_Test.groovy | 10 +++++++--- jenkins/scripts/slurm_run.sh | 7 +++++-- scripts/check_test_list.py | 7 +++---- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index fe4434a86c..41c66a7887 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -1639,6 +1639,7 @@ def launchTestListCheck(pipeline) sh "tar -zxf ${tarName}" def llmPath = sh (script: "realpath .", returnStdout: true).trim() def llmSrc = "${llmPath}/TensorRT-LLM/src" + trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install -r ${llmSrc}/requirements-dev.txt") sh "NVIDIA_TRITON_SERVER_VERSION=25.10 LLM_ROOT=${llmSrc} LLM_BACKEND_ROOT=${llmSrc}/triton_backend python3 ${llmSrc}/scripts/check_test_list.py --l0 --qa --waive" } catch (InterruptedException e) { throw e @@ -2903,8 +2904,10 @@ def launchTestJobs(pipeline, testFilter) "DGX_B200-4_GPUs-PyTorch-2": ["b200-x4", "l0_dgx_b200", 2, 2, 4], "DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4", "l0_dgx_b200", 1, 1, 4], "DGX_B200-8_GPUs-PyTorch-1": ["b200-x8", "l0_dgx_b200", 1, 1, 8], - "DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 1, 4, 1, true], - "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 1, 4], + "DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 2, 4, 1, true], + "DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-trtllm", "l0_dgx_b200", 2, 2, 4, 1, true], + "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": 
["b300-x4", "l0_dgx_b300", 1, 2, 4], + "DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4], // Perf sanity post merge test // Disable perf stages due to https://nvbugs/5643646 // "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "perf_sanity_l0_dgx_b200", 1, 1, 4], @@ -2933,7 +2936,8 @@ def launchTestJobs(pipeline, testFilter) fullSet += SBSATestConfigs.keySet() SBSASlurmTestConfigs = [ - "GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4], + "GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 2, 4], + "GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4], "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4], // Disable GB300 stages due to nodes will be offline temporarily. // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1], diff --git a/jenkins/scripts/slurm_run.sh b/jenkins/scripts/slurm_run.sh index 49368b94c0..8f191b3edb 100755 --- a/jenkins/scripts/slurm_run.sh +++ b/jenkins/scripts/slurm_run.sh @@ -29,10 +29,14 @@ set_value_in_command() { echo "$result" } -# Only the first process will save the job ID +# Only the first process will save the job ID and set the git config if [ $SLURM_PROCID -eq 0 ]; then # Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve echo $SLURM_JOB_ID > $jobWorkspace/slurm_job_id.txt + # Update HOME/.gitconfig + if ! git config --global --get-all safe.directory | grep -Fxq "*"; then + git config --global --add safe.directory "*" + fi fi if [ $SLURM_LOCALID -eq 0 ]; then @@ -47,7 +51,6 @@ if [ $SLURM_LOCALID -eq 0 ]; then fi cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl - git config --global --add safe.directory "*" gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true) hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}" echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName" diff --git a/scripts/check_test_list.py b/scripts/check_test_list.py index c7b5357d25..c799d433fc 100755 --- a/scripts/check_test_list.py +++ b/scripts/check_test_list.py @@ -23,10 +23,9 @@ MARKER_LIST_IN_TEST = [" TIMEOUT"] def install_python_dependencies(llm_src): - subprocess.run( - f"cd {llm_src} && pip3 install --retries 1 -r requirements-dev.txt", - shell=True, - check=True) + subprocess.run(f"cd {llm_src} && pip3 install -r requirements-dev.txt", + shell=True, + check=True) subprocess.run( f"pip3 install --force-reinstall --no-deps {llm_src}/../tensorrt_llm-*.whl", shell=True, From d252101a769e730907b13f559844e28d4b6fcdcd Mon Sep 17 00:00:00 2001 From: Chenjie Luo <108829653+cjluo-nv@users.noreply.github.com> Date: Sun, 7 Dec 2025 07:14:05 -0800 Subject: [PATCH 03/10] [OMNIML-3036][doc] Re-branding TensorRT-Model-Optimizer as Nvidia Model-Optimizer (#9679) Signed-off-by: Chenjie Luo --- ATTRIBUTIONS-Python.md | 4 ++-- README.md | 4 ++-- ...ing_Expert_Parallelism_in_TensorRT-LLM_part3.md | 2 +- ..._DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md | 2 +- ...pSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md | 2 +- docs/source/developer-guide/perf-benchmarking.md | 4 ++-- docs/source/developer-guide/perf-overview.md | 2 +- docs/source/features/auto_deploy/support_matrix.md | 2 +- docs/source/features/quantization.md | 8 ++++---- .../source/legacy/performance/perf-benchmarking.md | 2 +- docs/source/torch/auto_deploy/support_matrix.md | 2 
+- docs/source/torch/features/quantization.md | 6 +++--- examples/auto_deploy/README.md | 8 ++++---- examples/disaggregated/README.md | 2 +- .../_tensorrt_engine/llm_medusa_decoding.py | 4 ++-- .../llm-api/_tensorrt_engine/quickstart_example.py | 2 +- examples/llm-api/llm_inference.py | 2 +- examples/llm-api/quickstart_example.py | 2 +- examples/medusa/README.md | 2 +- examples/models/core/deepseek_v3/README.md | 6 +++--- examples/models/core/exaone/README.md | 10 +++++----- examples/models/core/llama/README.md | 2 +- examples/models/core/llama4/README.md | 6 +++--- examples/models/core/qwen/README.md | 14 +++++++------- examples/quantization/README.md | 2 +- .../examples/models/core/mllama/poetry.lock | 2 +- security_scanning/poetry.lock | 2 +- 27 files changed, 53 insertions(+), 53 deletions(-) diff --git a/ATTRIBUTIONS-Python.md b/ATTRIBUTIONS-Python.md index f7360a7e93..4e350512a2 100644 --- a/ATTRIBUTIONS-Python.md +++ b/ATTRIBUTIONS-Python.md @@ -25486,7 +25486,7 @@ limitations under the License. ``` ### URLs - - `Homepage`: https://github.com/NVIDIA/TensorRT-Model-Optimizer + - `Homepage`: https://github.com/NVIDIA/Model-Optimizer ## nvidia-modelopt-core (0.33.1) @@ -25513,7 +25513,7 @@ limitations under the License. ``` ### URLs - - `Homepage`: https://github.com/NVIDIA/TensorRT-Model-Optimizer + - `Homepage`: https://github.com/NVIDIA/Model-Optimizer ## nvidia-nccl-cu12 (2.27.3) diff --git a/README.md b/README.md index f09c61783d..208767b037 100644 --- a/README.md +++ b/README.md @@ -164,7 +164,7 @@ state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs.< [➡️ link](https://www.bentoml.com/blog/tuning-tensor-rt-llm-for-optimal-serving-with-bentoml) -* [2024/08/20] 🏎️SDXL with #TensorRT Model Optimizer ⏱️⚡ 🏁 cache diffusion 🏁 quantization aware training 🏁 QLoRA 🏁 #Python 3.12 +* [2024/08/20] 🏎️SDXL with #Model Optimizer ⏱️⚡ 🏁 cache diffusion 🏁 quantization aware training 🏁 QLoRA 🏁 #Python 3.12 [➡️ link](https://developer.nvidia.com/blog/nvidia-tensorrt-model-optimizer-v0-15-boosts-inference-performance-and-expands-model-support/) * [2024/08/13] 🐍 DIY Code Completion with #Mamba ⚡ #TensorRT #LLM for speed 🤖 NIM for ease ☁️ deploy anywhere @@ -209,7 +209,7 @@ Technical Deep Dive for serious coders ✅+99% compression ✅1 set of weights * [2024/05/21] ✨@modal_labs has the codes for serverless @AIatMeta Llama 3 on #TensorRT #LLM ✨👀 📚 Marvelous Modal Manual: Serverless TensorRT LLM (LLaMA 3 8B) | Modal Docs [➡️ link](https://modal.com/docs/examples/trtllm_llama) -* [2024/05/08] NVIDIA TensorRT Model Optimizer -- the newest member of the #TensorRT ecosystem is a library of post-training and training-in-the-loop model optimization techniques ✅quantization ✅sparsity ✅QAT [➡️ blog](https://developer.nvidia.com/blog/accelerate-generative-ai-inference-performance-with-nvidia-tensorrt-model-optimizer-now-publicly-available/) +* [2024/05/08] NVIDIA Model Optimizer -- the newest member of the #TensorRT ecosystem is a library of post-training and training-in-the-loop model optimization techniques ✅quantization ✅sparsity ✅QAT [➡️ blog](https://developer.nvidia.com/blog/accelerate-generative-ai-inference-performance-with-nvidia-tensorrt-model-optimizer-now-publicly-available/) * [2024/05/07] 🦙🦙🦙 24,000 tokens per second 🛫Meta Llama 3 takes off with #TensorRT #LLM 📚[➡️ link](https://blogs.nvidia.com/blog/meta-llama3-inference-acceleration/) diff --git a/docs/source/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.md 
b/docs/source/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.md index 4b80603e29..800c406bd2 100644 --- a/docs/source/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.md +++ b/docs/source/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.md @@ -46,7 +46,7 @@ In this third blog of our scaling Expert Parallelism (EP) series, we push the pe The wo GEMM is the final linear layer within the multi-head attention block that produces the final outputs. While DeepSeek R1's MLA modifies the initial projections for keys and values, the wo GEMM operator remains a critical and standard component for finalizing the attention computation. In the term, "wo" is the abbreviation for the weight matrix for the output. -We've evaluated that quantizing the wo GEMM to FP4 still satisfies the accuracy requirements, maintaining a similar MTP accept rate (AR) while improving end-to-end performance. The [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) team has published checkpoints that additionally quantize the wo module in attention layers to FP4 on HuggingFace: +We've evaluated that quantizing the wo GEMM to FP4 still satisfies the accuracy requirements, maintaining a similar MTP accept rate (AR) while improving end-to-end performance. The [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) team has published checkpoints that additionally quantize the wo module in attention layers to FP4 on HuggingFace: * https://huggingface.co/nvidia/DeepSeek-R1-FP4-v2 * https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2 diff --git a/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md b/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md index cd55d049d4..b5e3e6558a 100644 --- a/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md +++ b/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md @@ -67,7 +67,7 @@ We have explored a mixed precision recipe, which provides a better tradeoff betw *TensorRT LLM already supports [FP8 Attention](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/deepseek_v3#fp8-kv-cache-and-mla) while for this latency scenario low-precision attention computation doesn't help with performance so we choose to use bf16 precision for the Attention Modules. -** nvfp4 model checkpoint is generated by the [NVIDIA TensorRT Model Optimizer toolkit](https://github.com/NVIDIA/TensorRT-Model-Optimizer). +** nvfp4 model checkpoint is generated by the [NVIDIA Model Optimizer toolkit](https://github.com/NVIDIA/Model-Optimizer). *** RouterGEMM uses bf16 inputs/weights with fp32 outputs for numerical stability diff --git a/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md b/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md index 2da07411a8..d2483af3f3 100644 --- a/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md +++ b/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md @@ -29,7 +29,7 @@ The mixed precision recipe for DeepSeek R1 throughput scenario is almost the sam * FP8 KV cache and FP8 attention, rather than BF16 precision. 
* FP4 Allgather for better communication bandwidth utilization. -The checkpoint used in this blog is hosted in [nvidia/DeepSeek-R1-FP4](https://huggingface.co/nvidia/DeepSeek-R1-FP4), generated by [NVIDIA Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). The accuracy score of common dataset on this FP4 checkpoint and TensorRT LLM implementations are: +The checkpoint used in this blog is hosted in [nvidia/DeepSeek-R1-FP4](https://huggingface.co/nvidia/DeepSeek-R1-FP4), generated by [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer). The accuracy score of common dataset on this FP4 checkpoint and TensorRT LLM implementations are: | Precision | GPQA Diamond | MATH-500 | :-- | :-- | :-- | diff --git a/docs/source/developer-guide/perf-benchmarking.md b/docs/source/developer-guide/perf-benchmarking.md index 4e4e3ca421..57ef00d8f6 100644 --- a/docs/source/developer-guide/perf-benchmarking.md +++ b/docs/source/developer-guide/perf-benchmarking.md @@ -423,10 +423,10 @@ checkpoint. For the Llama-3.1 models, TensorRT LLM provides the following checkp - [`nvidia/Llama-3.1-70B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-70B-Instruct-FP8) - [`nvidia/Llama-3.1-405B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-405B-Instruct-FP8) -To understand more about how to quantize your own checkpoints, refer to ModelOpt [documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/deployment/1_tensorrt_llm.html). +To understand more about how to quantize your own checkpoints, refer to ModelOpt [documentation](https://nvidia.github.io/Model-Optimizer/deployment/1_tensorrt_llm.html). `trtllm-bench` utilizes the `hf_quant_config.json` file present in the pre-quantized checkpoints above. The configuration -file is present in checkpoints quantized with [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) +file is present in checkpoints quantized with [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) and describes the compute and KV cache quantization that checkpoint was compiled with. For example, from the checkpoints above: diff --git a/docs/source/developer-guide/perf-overview.md b/docs/source/developer-guide/perf-overview.md index 0a144a58d4..aefa91fd43 100644 --- a/docs/source/developer-guide/perf-overview.md +++ b/docs/source/developer-guide/perf-overview.md @@ -21,7 +21,7 @@ and shows the throughput scenario under maximum load. The reported metric is `To The performance numbers below were collected using the steps described in this document. -Testing was performed on models with weights quantized using [ModelOpt](https://nvidia.github.io/TensorRT-Model-Optimizer/#) and published by NVIDIA on the [Model Optimizer HuggingFace Collection](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4). +Testing was performed on models with weights quantized using [ModelOpt](https://nvidia.github.io/Model-Optimizer/#) and published by NVIDIA on the [Model Optimizer HuggingFace Collection](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4). 
*(NEW for v1.0) RTX 6000 Pro Blackwell Server Edition Benchmarks:* diff --git a/docs/source/features/auto_deploy/support_matrix.md b/docs/source/features/auto_deploy/support_matrix.md index 26c07b308b..fec6d841af 100644 --- a/docs/source/features/auto_deploy/support_matrix.md +++ b/docs/source/features/auto_deploy/support_matrix.md @@ -120,7 +120,7 @@ Optimize attention operations with different attention kernel implementations: ### Precision Support -AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`TensorRT-Model-Optimizer`](https://github.com/NVIDIA/TensorRT-Model-Optimizer). +AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`Model-Optimizer`](https://github.com/NVIDIA/Model-Optimizer). **Supported precision types include:** diff --git a/docs/source/features/quantization.md b/docs/source/features/quantization.md index 8a0e160529..e057a91b39 100644 --- a/docs/source/features/quantization.md +++ b/docs/source/features/quantization.md @@ -23,7 +23,7 @@ The default PyTorch backend supports FP4 and FP8 quantization on the latest Blac ### Running Pre-quantized Models -TensorRT LLM can directly run [pre-quantized models](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4) generated with the [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). +TensorRT LLM can directly run [pre-quantized models](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4) generated with the [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer). ```python from tensorrt_llm import LLM @@ -54,8 +54,8 @@ If a pre-quantized model is not available on the [Hugging Face Hub](https://hugg Follow this step-by-step guide to quantize a model: ```bash -git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git -cd TensorRT-Model-Optimizer/examples/llm_ptq +git clone https://github.com/NVIDIA/Model-Optimizer.git +cd Model-Optimizer/examples/llm_ptq scripts/huggingface_example.sh --model --quant fp8 --export_fmt hf ``` @@ -108,4 +108,4 @@ FP8 block wise scaling GEMM kernels for sm100 are using MXFP8 recipe (E4M3 act/w ## Quick Links - [Pre-quantized Models by ModelOpt](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4) -- [ModelOpt Support Matrix](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/0_support_matrix.html) +- [ModelOpt Support Matrix](https://nvidia.github.io/Model-Optimizer/guides/0_support_matrix.html) diff --git a/docs/source/legacy/performance/perf-benchmarking.md b/docs/source/legacy/performance/perf-benchmarking.md index 55caef07ba..5efd6625f0 100644 --- a/docs/source/legacy/performance/perf-benchmarking.md +++ b/docs/source/legacy/performance/perf-benchmarking.md @@ -662,7 +662,7 @@ checkpoint. For the Llama-3.1 models, TensorRT-LLM provides the following checkp - [`nvidia/Llama-3.1-405B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-405B-Instruct-FP8) `trtllm-bench` utilizes the `hf_quant_config.json` file present in the pre-quantized checkpoints above. The configuration -file is present in checkpoints quantized with [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) +file is present in checkpoints quantized with [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) and describes the compute and KV cache quantization that checkpoint was compiled with. 
For example, from the checkpoints above: diff --git a/docs/source/torch/auto_deploy/support_matrix.md b/docs/source/torch/auto_deploy/support_matrix.md index c8780cbca1..f0158253dd 100644 --- a/docs/source/torch/auto_deploy/support_matrix.md +++ b/docs/source/torch/auto_deploy/support_matrix.md @@ -118,7 +118,7 @@ Optimize attention operations with different attention kernel implementations: ### Precision Support -AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`TensorRT-Model-Optimizer`](https://github.com/NVIDIA/TensorRT-Model-Optimizer). +AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`Model-Optimizer`](https://github.com/NVIDIA/Model-Optimizer). **Supported precision types include:** diff --git a/docs/source/torch/features/quantization.md b/docs/source/torch/features/quantization.md index a2b6c48be2..47cc745165 100644 --- a/docs/source/torch/features/quantization.md +++ b/docs/source/torch/features/quantization.md @@ -1,7 +1,7 @@ # Quantization The PyTorch backend supports FP8 and NVFP4 quantization. You can pass quantized models in HF model hub, -which are generated by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). +which are generated by [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer). ```python from tensorrt_llm._torch import LLM @@ -12,7 +12,7 @@ llm.generate("Hello, my name is") Or you can try the following commands to get a quantized model by yourself: ```bash -git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git -cd TensorRT-Model-Optimizer/examples/llm_ptq +git clone https://github.com/NVIDIA/Model-Optimizer.git +cd Model-Optimizer/examples/llm_ptq scripts/huggingface_example.sh --model --quant fp8 --export_fmt hf ``` diff --git a/examples/auto_deploy/README.md b/examples/auto_deploy/README.md index c89c1a552c..5343d88999 100644 --- a/examples/auto_deploy/README.md +++ b/examples/auto_deploy/README.md @@ -90,16 +90,16 @@ python lm_eval_ad.py \ --model autodeploy --model_args model=meta-llama/Meta-Llama-3.1-8B-Instruct,world_size=2 --tasks mmlu ``` -### Mixed-precision Quantization using TensorRT Model Optimizer +### Mixed-precision Quantization using Model Optimizer -TensorRT Model Optimizer [AutoQuantize](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) algorithm is a PTQ algorithm from ModelOpt which quantizes a model by searching for the best quantization format per-layer while meeting the performance constraint specified by the user. This way, `AutoQuantize` enables to trade-off model accuracy for performance. +Model Optimizer [AutoQuantize](https://nvidia.github.io/Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) algorithm is a PTQ algorithm from ModelOpt which quantizes a model by searching for the best quantization format per-layer while meeting the performance constraint specified by the user. This way, `AutoQuantize` enables to trade-off model accuracy for performance. Currently `AutoQuantize` supports only `effective_bits` as the performance constraint (for both weight-only quantization and weight & activation quantization). 
See -[AutoQuantize documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) for more details. +[AutoQuantize documentation](https://nvidia.github.io/Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) for more details. #### 1. Quantize a model with ModelOpt -Refer to [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/examples/llm_autodeploy/README.md) for generating quantized model checkpoint. +Refer to [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer/blob/main/examples/llm_autodeploy/README.md) for generating quantized model checkpoint. #### 2. Deploy the quantized model with AutoDeploy diff --git a/examples/disaggregated/README.md b/examples/disaggregated/README.md index 511bce3619..8b99f8845f 100644 --- a/examples/disaggregated/README.md +++ b/examples/disaggregated/README.md @@ -212,7 +212,7 @@ In disaggregated serving, the context workers and generation workers have differ ### Prerequisites To enable mixed precision serving, you will need: -1. A quantized checkpoint created with [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) +1. A quantized checkpoint created with [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) 2. The original unquantized checkpoint (Can also be quantized) 3. Both checkpoints must use the same KV cache dtype to ensure compatibility during transfer diff --git a/examples/llm-api/_tensorrt_engine/llm_medusa_decoding.py b/examples/llm-api/_tensorrt_engine/llm_medusa_decoding.py index b6d7f90c0f..f45411b233 100644 --- a/examples/llm-api/_tensorrt_engine/llm_medusa_decoding.py +++ b/examples/llm-api/_tensorrt_engine/llm_medusa_decoding.py @@ -29,7 +29,7 @@ def run_medusa_decoding(use_modelopt_ckpt=False, model_dir=None): llm_kwargs = {} if use_modelopt_ckpt: - # This is a Llama-3.1-8B combined with Medusa heads provided by TensorRT Model Optimizer. + # This is a Llama-3.1-8B combined with Medusa heads provided by Model Optimizer. # Both the base model (except lm_head) and Medusa heads have been quantized in FP8. model = model_dir or "nvidia/Llama-3.1-8B-Medusa-FP8" @@ -85,7 +85,7 @@ if __name__ == '__main__': parser.add_argument( '--use_modelopt_ckpt', action='store_true', - help="Use FP8-quantized checkpoint from TensorRT Model Optimizer.") + help="Use FP8-quantized checkpoint from Model Optimizer.") # TODO: remove this arg after ModelOpt ckpt is public on HF parser.add_argument('--model_dir', type=Path, default=None) args = parser.parse_args() diff --git a/examples/llm-api/_tensorrt_engine/quickstart_example.py b/examples/llm-api/_tensorrt_engine/quickstart_example.py index a6ba9ec559..d02f55c46b 100644 --- a/examples/llm-api/_tensorrt_engine/quickstart_example.py +++ b/examples/llm-api/_tensorrt_engine/quickstart_example.py @@ -9,7 +9,7 @@ def main(): build_config.max_num_tokens = 1024 # Model could accept HF model name, a path to local HF model, - # or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF. + # or Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF. 
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", build_config=build_config) diff --git a/examples/llm-api/llm_inference.py b/examples/llm-api/llm_inference.py index 5146504d25..6c806f0768 100644 --- a/examples/llm-api/llm_inference.py +++ b/examples/llm-api/llm_inference.py @@ -7,7 +7,7 @@ from tensorrt_llm import LLM, SamplingParams def main(): # Model could accept HF model name, a path to local HF model, - # or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF. + # or Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF. llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0") # Sample prompts. diff --git a/examples/llm-api/quickstart_example.py b/examples/llm-api/quickstart_example.py index 400a241c0e..2d6f14012b 100644 --- a/examples/llm-api/quickstart_example.py +++ b/examples/llm-api/quickstart_example.py @@ -4,7 +4,7 @@ from tensorrt_llm import LLM, SamplingParams def main(): # Model could accept HF model name, a path to local HF model, - # or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF. + # or Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF. llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0") # Sample prompts. diff --git a/examples/medusa/README.md b/examples/medusa/README.md index eb442554ec..7820335cd0 100644 --- a/examples/medusa/README.md +++ b/examples/medusa/README.md @@ -19,7 +19,7 @@ For more info about Medusa visit [speculative decoding documentation](https://nv The TensorRT LLM Medusa example code is located in [`examples/medusa`](./). There is one [`convert_checkpoint.py`](./convert_checkpoint.py) file to convert and build the [TensorRT](https://developer.nvidia.com/tensorrt) engine(s) needed to run models with Medusa decoding support. In this example, we demonstrate the usage of two models: 1. The Vucuna 7B model from Hugging Face [`FasterDecoding/medusa-vicuna-7b-v1.3`](https://huggingface.co/FasterDecoding/medusa-vicuna-7b-v1.3) with its Medusa heads [`medusa-vicuna-7b-v1.3`](https://huggingface.co/FasterDecoding/medusa-vicuna-7b-v1.3). -2. The quantized checkpoint [`nvidia/Llama-3.1-8B-Medusa-FP8`](https://huggingface.co/nvidia/Llama-3.1-8B-Medusa-FP8) on Hugging Face by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) (ModelOpt). This model is based on [Llama-3.1 8B](https://huggingface.co/meta-llama/Llama-3.1-8B) and enhanced with Medusa heads, with both the base model (except lm_head) and Medusa heads already quantized in FP8. +2. The quantized checkpoint [`nvidia/Llama-3.1-8B-Medusa-FP8`](https://huggingface.co/nvidia/Llama-3.1-8B-Medusa-FP8) on Hugging Face by [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) (ModelOpt). This model is based on [Llama-3.1 8B](https://huggingface.co/meta-llama/Llama-3.1-8B) and enhanced with Medusa heads, with both the base model (except lm_head) and Medusa heads already quantized in FP8. ### Build TensorRT engine(s) Get the weights by downloading base model [`vicuna-7b-v1.3`](https://huggingface.co/lmsys/vicuna-7b-v1.3) and Medusa Heads [`medusa-vicuna-7b-v1.3`](https://huggingface.co/FasterDecoding/medusa-vicuna-7b-v1.3) from HF. 
diff --git a/examples/models/core/deepseek_v3/README.md b/examples/models/core/deepseek_v3/README.md index 3e82442563..934db2e493 100644 --- a/examples/models/core/deepseek_v3/README.md +++ b/examples/models/core/deepseek_v3/README.md @@ -773,7 +773,7 @@ You can enable FP8 MLA through either of these methods: **Option 1: Checkpoint config** -TensorRT LLM automatically detects the `hf_quant_config.json` file in the model directory, which configures both GEMM and KV cache quantization. For example, see the FP4 DeepSeek-R1 checkpoint [configuration](https://huggingface.co/nvidia/DeepSeek-R1-FP4/blob/main/hf_quant_config.json) provided by [ModelOpt](https://github.com/NVIDIA/TensorRT-Model-Optimizer). +TensorRT LLM automatically detects the `hf_quant_config.json` file in the model directory, which configures both GEMM and KV cache quantization. For example, see the FP4 DeepSeek-R1 checkpoint [configuration](https://huggingface.co/nvidia/DeepSeek-R1-FP4/blob/main/hf_quant_config.json) provided by [ModelOpt](https://github.com/NVIDIA/Model-Optimizer). To enable FP8 MLA, modify the `kv_cache_quant_algo` property. The following shows the config for DeepSeek's block-wise FP8 GEMM quantization + FP8 MLA: @@ -808,14 +808,14 @@ Or you can follow the steps to generate one by yourselves. #### Activation calibration -[ModelOpt](https://github.com/NVIDIA/TensorRT-Model-Optimizer) is used for calibrating activations of MoE layers. We provide a calibrated file at [HF model hub](https://huggingface.co/Barrrrry/DeepSeek-R1-W4AFP8/blob/main/act_scales.safetensors) or you can run the following commands to generate by yourselves. +[ModelOpt](https://github.com/NVIDIA/Model-Optimizer) is used for calibrating activations of MoE layers. We provide a calibrated file at [HF model hub](https://huggingface.co/Barrrrry/DeepSeek-R1-W4AFP8/blob/main/act_scales.safetensors) or you can run the following commands to generate by yourselves. ```bash # Make sure for enough GPU resources (8xH200s) to run the following commands PATH_OF_DEEPSEEK_R1=/llm-models/DeepSeek-R1/DeepSeek-R1 # Install ModelOpt from source -git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer/ && cd modelopt +git clone https://github.com/NVIDIA/Model-Optimizer/ && cd modelopt pip install "nvidia-modelopt[all]" -U --extra-index-url https://pypi.nvidia.com # Clone DeepSeek-V3 (base model of R1) Github repository for FP8 inference, diff --git a/examples/models/core/exaone/README.md b/examples/models/core/exaone/README.md index 549b83843a..9ea4a9e71d 100644 --- a/examples/models/core/exaone/README.md +++ b/examples/models/core/exaone/README.md @@ -85,17 +85,17 @@ The output will be like: #### PyTorch flow Quantization -For PyTorch flow, TRT-LLM supports quantized format generated by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). +For PyTorch flow, TRT-LLM supports quantized format generated by [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer). 
You can either do pre-quantized models in HF model hub, or can generate quantized model by yourself and then run models with below command: ```bash -git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git -cd TensorRT-Model-Optimizer/examples/llm_ptq +git clone https://github.com/NVIDIA/Model-Optimizer.git +cd Model-Optimizer/examples/llm_ptq scripts/huggingface_example.sh --model hf_models/$MODEL_NAME --quant fp8 --export_fmt hf ``` -For more information, please refer to official [docs](https://github.com/NVIDIA/TensorRT-Model-Optimizer) or [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). +For more information, please refer to official [docs](https://github.com/NVIDIA/Model-Optimizer) or [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer). Troubleshooting @@ -107,7 +107,7 @@ Hint: Move the offending context manager(s) to outside the compiled region. Hint: This graph break may have been caused by an earlier graph break. Resolving the earlier graph break may resolve this one. ``` -This error may indicate an incompatibility between `torch.compile()` and the `HybridCache` module of the transformers library. As a result, [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) (ModelOpt) cannot perform PTQ with HybridCache. +This error may indicate an incompatibility between `torch.compile()` and the `HybridCache` module of the transformers library. As a result, [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) (ModelOpt) cannot perform PTQ with HybridCache. Temporarily switching to `DynamicCache` when creating PTQ models could help address the issue. This can be done by updating the `cache_implementation` field in the `generation_config.json` file located in the model checkpoint directory, for example: ```json diff --git a/examples/models/core/llama/README.md b/examples/models/core/llama/README.md index 464fe8bdf3..df26ac1ad6 100644 --- a/examples/models/core/llama/README.md +++ b/examples/models/core/llama/README.md @@ -1559,7 +1559,7 @@ Explanation: ### Launch trtllm-serve OpenAI-compatible API server -TensorRT LLM supports nvidia TensorRT Model Optimizer quantized FP8 checkpoint +TensorRT LLM supports nvidia Model Optimizer quantized FP8 checkpoint ``` bash trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 \ --tp_size 8 \ diff --git a/examples/models/core/llama4/README.md b/examples/models/core/llama4/README.md index 93e3778864..a6c02070e9 100644 --- a/examples/models/core/llama4/README.md +++ b/examples/models/core/llama4/README.md @@ -42,7 +42,7 @@ Explanation: #### 2. Launch trtllm-serve OpenAI-compatible API server -TensorRT LLM supports nvidia TensorRT Model Optimizer quantized FP8 checkpoint +TensorRT LLM supports nvidia Model Optimizer quantized FP8 checkpoint ``` bash trtllm-serve nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8 \ --max_batch_size 512 \ @@ -94,7 +94,7 @@ Explanation: #### 2. Launch trtllm-serve OpenAI-compatible API server -TensorRT LLM supports nvidia TensorRT Model Optimizer quantized FP8 checkpoint. +TensorRT LLM supports nvidia Model Optimizer quantized FP8 checkpoint. ``` bash trtllm-serve nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8 \ --max_batch_size 8 \ @@ -140,7 +140,7 @@ Explanation: #### 2. Launch trtllm-serve OpenAI-compatible API server -TensorRT LLM supports nvidia TensorRT Model Optimizer quantized FP8 checkpoint. +TensorRT LLM supports nvidia Model Optimizer quantized FP8 checkpoint. 
``` bash trtllm-serve nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8 \ --tp_size 8 \ diff --git a/examples/models/core/qwen/README.md b/examples/models/core/qwen/README.md index 52a5ecb481..1d3d97b267 100644 --- a/examples/models/core/qwen/README.md +++ b/examples/models/core/qwen/README.md @@ -663,19 +663,19 @@ trtllm-eval --model=Qwen3-30B-A3B/ --tokenizer=Qwen3-30B-A3B/ --backend=pytorch To quantize the Qwen3 model for use with the PyTorch backend, we'll use NVIDIA's Model Optimizer (ModelOpt) tool. Follow these steps: ```bash -# Clone the TensorRT Model Optimizer (ModelOpt) -git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git -pushd TensorRT-Model-Optimizer +# Clone the Model Optimizer (ModelOpt) +git clone https://github.com/NVIDIA/Model-Optimizer.git +pushd Model-Optimizer # install the ModelOpt pip install -e . # Quantize the Qwen3-235B-A22B model by nvfp4 -# By default, the checkpoint would be stored in `TensorRT-Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/`. +# By default, the checkpoint would be stored in `Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/`. ./examples/llm_ptq/scripts/huggingface_example.sh --model Qwen3-235B-A22B/ --quant nvfp4 --export_fmt hf # Quantize the Qwen3-32B model by fp8_pc_pt -# By default, the checkpoint would be stored in `TensorRT-Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-32B_fp8_pc_pt_hf/`. +# By default, the checkpoint would be stored in `Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-32B_fp8_pc_pt_hf/`. ./examples/llm_ptq/scripts/huggingface_example.sh --model Qwen3-32B/ --quant fp8_pc_pt --export_fmt hf popd ``` @@ -687,7 +687,7 @@ To run the benchmark, we suggest using the `trtllm-bench` tool. Please refer to ```bash #!/bin/bash -folder_model=TensorRT-Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/ +folder_model=Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/ path_config=extra-llm-api-config.yml num_gpus=8 ep_size=8 @@ -727,7 +727,7 @@ trtllm-bench --model ${folder_model} --model_path ${folder_model} throughput \ We suggest benchmarking with a real dataset. It will prevent from having improperly distributed tokens in the MoE. Here, we use the `aa_prompt_isl_1k_osl_2k_qwen3_10000samples.txt` dataset. It has 10000 samples with an average input length of 1024 and an average output length of 2048. If you don't have a dataset (this or an other) and you want to run the benchmark, you can use the following command to generate a random dataset: ```bash -folder_model=TensorRT-Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/ +folder_model=Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/ min_input_len=1024 min_output_len=2048 concurrency=128 diff --git a/examples/quantization/README.md b/examples/quantization/README.md index e74736b61b..b3b2e35b20 100644 --- a/examples/quantization/README.md +++ b/examples/quantization/README.md @@ -11,7 +11,7 @@ The detailed LLM quantization recipe is distributed to the README.md of the corr ## Installation -The NVIDIA TensorRT Model Optimizer quantization toolkit is installed automatically as a dependency of TensorRT-LLM. +The NVIDIA Model Optimizer quantization toolkit is installed automatically as a dependency of TensorRT-LLM. 
```bash # Install the additional requirements diff --git a/security_scanning/examples/models/core/mllama/poetry.lock b/security_scanning/examples/models/core/mllama/poetry.lock index 11e0ed3ccb..c58e7c12b5 100644 --- a/security_scanning/examples/models/core/mllama/poetry.lock +++ b/security_scanning/examples/models/core/mllama/poetry.lock @@ -708,7 +708,7 @@ files = [ [[package]] name = "nvidia-modelopt" version = "0.21.1" -description = "Nvidia TensorRT Model Optimizer: a unified model optimization and deployment toolkit." +description = "Nvidia Model Optimizer: a unified model optimization and deployment toolkit." optional = false python-versions = "<3.13,>=3.8" files = [ diff --git a/security_scanning/poetry.lock b/security_scanning/poetry.lock index 18ed93657e..e5959abf84 100644 --- a/security_scanning/poetry.lock +++ b/security_scanning/poetry.lock @@ -2793,7 +2793,7 @@ files = [ [[package]] name = "nvidia-modelopt" version = "0.37.0" -description = "Nvidia TensorRT Model Optimizer: a unified model optimization and deployment toolkit." +description = "Nvidia Model Optimizer: a unified model optimization and deployment toolkit." optional = false python-versions = "<3.13,>=3.10" files = [ From 41ce14ab0445cb35d4b7d3ac715dffd0a2ae03fb Mon Sep 17 00:00:00 2001 From: Ludwig Schneider Date: Sun, 7 Dec 2025 11:43:26 -0600 Subject: [PATCH 04/10] [None][feat] Enable NCCL_SYMMETRIC as default fallback for AllReduce (#9314) Signed-off-by: Ludwig Schneider --- .../common/customAllReduceUtils.h | 5 +- cpp/tensorrt_llm/common/ncclUtils.cpp | 585 ++++++++++++++ cpp/tensorrt_llm/common/ncclUtils.h | 397 ++++++++++ cpp/tensorrt_llm/common/opUtils.cpp | 25 +- .../kernels/userbuffers/ub_allocator.cpp | 175 +--- .../kernels/userbuffers/ub_allocator.h | 56 -- .../userbuffers/userbuffersManager.cpp | 15 +- .../kernels/userbuffers/userbuffersManager.h | 9 +- .../plugins/ncclPlugin/allreducePlugin.cpp | 62 +- cpp/tensorrt_llm/thop/allreduceOp.cpp | 471 ++++++++--- cpp/tests/unit_tests/multi_gpu/CMakeLists.txt | 6 + .../unit_tests/multi_gpu/ncclUtilsTest.cpp | 745 ++++++++++++++++++ .../_torch/pyexecutor/model_engine.py | 16 +- tensorrt_llm/functional.py | 7 +- tests/integration/defs/cpp/test_multi_gpu.py | 27 + tests/microbenchmarks/all_reduce.py | 4 + .../allreduce_heuristic_code_gen.py | 5 +- .../_torch/multi_gpu/test_allreduce.py | 2 +- .../_torch/multi_gpu/test_mnnvl_allreduce.py | 2 +- .../_torch/multi_gpu/test_user_buffers.py | 3 +- 20 files changed, 2225 insertions(+), 392 deletions(-) create mode 100644 cpp/tensorrt_llm/common/ncclUtils.cpp create mode 100644 cpp/tensorrt_llm/common/ncclUtils.h create mode 100644 cpp/tests/unit_tests/multi_gpu/ncclUtilsTest.cpp diff --git a/cpp/tensorrt_llm/common/customAllReduceUtils.h b/cpp/tensorrt_llm/common/customAllReduceUtils.h index 0a6c2d9d32..9a466512e4 100644 --- a/cpp/tensorrt_llm/common/customAllReduceUtils.h +++ b/cpp/tensorrt_llm/common/customAllReduceUtils.h @@ -81,7 +81,6 @@ inline AllReduceStrategyType SelectStrategyLP(size_t seq_len, size_t hidden_size { return AllReduceStrategyType::ONESHOT; } - return AllReduceStrategyType::NCCL; } // use 1D vector to store the best strategy instead of a map for each sm version @@ -143,7 +142,7 @@ inline AllReduceStrategyType selectStrategyLookUpTable( sm_version = 100; } - // Check if the entry is out of bounds, otherwise return NCCL as fallback + // Check if the entry is out of bounds, otherwise return NCCL_SYMMETRIC as fallback if (AllReduceBestStrategyTable.find(sm_version) == AllReduceBestStrategyTable.end() 
|| tp_index >= AllReduceBestStrategyTable.at(sm_version).size() || fusion_op_index >= AllReduceBestStrategyTable.at(sm_version).at(tp_index).size() @@ -151,7 +150,7 @@ inline AllReduceStrategyType selectStrategyLookUpTable( || num_token_index >= AllReduceBestStrategyTable.at(sm_version).at(tp_index).at(fusion_op_index).at(hidden_size_index).size()) { - return AllReduceStrategyType::NCCL; + return AllReduceStrategyType::NCCL_SYMMETRIC; } return static_cast( diff --git a/cpp/tensorrt_llm/common/ncclUtils.cpp b/cpp/tensorrt_llm/common/ncclUtils.cpp new file mode 100644 index 0000000000..76406fd806 --- /dev/null +++ b/cpp/tensorrt_llm/common/ncclUtils.cpp @@ -0,0 +1,585 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/common/ncclUtils.h" + +#if ENABLE_MULTI_DEVICE + +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/common/logger.h" +#include +#include + +namespace tensorrt_llm::common::nccl_util +{ + +//============================================================================== +// NcclCommResourceManager Implementation +//============================================================================== + +NcclCommResourceManager& NcclCommResourceManager::getInstance() noexcept +{ + static NcclCommResourceManager instance; + return instance; +} + +void NcclCommResourceManager::registerResource(ncclComm_t comm, ResourceCleanupFunc cleanup, char const* debugName) +{ + if (!comm) + { + TLLM_LOG_WARNING("[NCCLUtil] Attempted to register resource for null NCCL comm"); + return; + } + + std::lock_guard lock(mMutex); + auto& resources = mCommResources[comm]; + resources.emplace_back(std::move(cleanup), debugName ? debugName : "unnamed"); + + TLLM_LOG_TRACE("[NCCLUtil] Registered resource '%s' for NCCL comm %p (total: %zu)", + debugName ? 
debugName : "unnamed", static_cast(comm), resources.size()); +} + +void NcclCommResourceManager::cleanupResources(ncclComm_t comm) noexcept +{ + if (!comm) + { + return; + } + + std::vector resourcesToClean; + + { + std::lock_guard lock(mMutex); + auto it = mCommResources.find(comm); + if (it == mCommResources.end()) + { + // Nothing registered for this comm, nothing to clean up + return; + } + + // Move resources out (preserves order) and remove from map + resourcesToClean = std::move(it->second); + mCommResources.erase(it); + + TLLM_LOG_TRACE( + "[NCCLUtil] Cleaning up %zu resources for NCCL comm %p", resourcesToClean.size(), static_cast(comm)); + } + + // Clean up outside the lock to avoid deadlocks if cleanup functions try to access the manager + // Order is preserved: resources are cleaned up in registration order + for (auto& [cleanup, name] : resourcesToClean) + { + try + { + TLLM_LOG_TRACE( + "[NCCLUtil] Cleaning up resource '%s' for NCCL comm %p", name.c_str(), static_cast(comm)); + cleanup(); + } + catch (std::exception const& e) + { + TLLM_LOG_ERROR("[NCCLUtil] Exception during cleanup of resource '%s' for NCCL comm %p: %s", name.c_str(), + static_cast(comm), e.what()); + } + catch (...) + { + TLLM_LOG_ERROR("[NCCLUtil] Unknown exception during cleanup of resource '%s' for NCCL comm %p", + name.c_str(), static_cast(comm)); + } + } +} + +bool NcclCommResourceManager::hasResources(ncclComm_t comm) const noexcept +{ + std::lock_guard lock(mMutex); + return mCommResources.find(comm) != mCommResources.end(); +} + +size_t NcclCommResourceManager::getResourceCount(ncclComm_t comm) const noexcept +{ + std::lock_guard lock(mMutex); + auto it = mCommResources.find(comm); + return it != mCommResources.end() ? it->second.size() : 0; +} + +//============================================================================== +// NCCLHelper Implementation +//============================================================================== + +NCCLHelper& NCCLHelper::getInstance() +{ + static NCCLHelper instance; + return instance; +} + +NCCLHelper::NCCLHelper() + : mLibraryHandle(nullptr) + , mNCCLCommWindowRegister(nullptr) + , mNCCLMemAlloc(nullptr) + , mIsLoaded(false) +{ + loadNCCLLibrary(); +} + +NCCLHelper::~NCCLHelper() +{ + if (mLibraryHandle) + { +#ifdef _WIN32 + FreeLibrary(mLibraryHandle); +#else + dlclose(mLibraryHandle); +#endif + mLibraryHandle = nullptr; + } +} + +void NCCLHelper::loadNCCLLibrary() +{ + try + { +#ifdef _WIN32 + char const* libraryNames[] = {"nccl.dll"}; +#else + char const* libraryNames[] = {"libnccl.so"}; +#endif + + for (auto const* name : libraryNames) + { + mLibraryHandle = loadLibraryHandle(name); + if (mLibraryHandle) + { + TLLM_LOG_INFO("Successfully loaded NCCL library: %s", name); + break; + } + } + + if (!mLibraryHandle) + { + TLLM_LOG_WARNING("Failed to load NCCL library"); + return; + } + + // Load the required symbols + mNCCLCommWindowRegister + = reinterpret_cast(getSymbolAddress(mLibraryHandle, "ncclCommWindowRegister")); + + mNCCLMemAlloc = reinterpret_cast(getSymbolAddress(mLibraryHandle, "ncclMemAlloc")); + + if (mNCCLCommWindowRegister == nullptr) + { + TLLM_LOG_WARNING("Failed to load ncclCommWindowRegister symbol, NCCL symmetric will not be supported."); + } + + if (mNCCLMemAlloc == nullptr) + { + TLLM_LOG_WARNING("Failed to load ncclMemAlloc symbol, NCCL symmetric will not be supported."); + } + + if (mNCCLCommWindowRegister != nullptr && mNCCLMemAlloc != nullptr) + { + mIsLoaded = true; + } + else + { + TLLM_LOG_WARNING( + "Failed to load required 
NCCL symbols (both ncclCommWindowRegister and ncclMemAlloc are required)"); + } + } + catch (std::exception const& e) + { + TLLM_LOG_WARNING("Exception while loading NCCL library: %s", e.what()); + } +} + +void* NCCLHelper::loadLibraryHandle(char const* libName) +{ +#ifdef _WIN32 + return LoadLibraryA(libName); +#else + return dlopen(libName, RTLD_LAZY | RTLD_GLOBAL); +#endif +} + +void* NCCLHelper::getSymbolAddress(void* handle, char const* symbolName) +{ + if (!handle) + { + return nullptr; + } + +#ifdef _WIN32 + return GetProcAddress(static_cast(handle), symbolName); +#else + return dlsym(handle, symbolName); +#endif +} + +NCCLHelper::ncclCommWindowRegisterFunc NCCLHelper::getNCCLCommWindowRegister() +{ + return mNCCLCommWindowRegister; +} + +NCCLHelper::ncclMemAllocFunc NCCLHelper::getNCCLMemAlloc() +{ + return mNCCLMemAlloc; +} + +bool NCCLHelper::isLoaded() const +{ + return mIsLoaded; +} + +//============================================================================== +// NCCLWindowAllocator Implementation +//============================================================================== + +NCCLWindowAllocator& NCCLWindowAllocator::getInstance() +{ + static NCCLWindowAllocator instance; + return instance; +} + +NCCLWindowBuffer NCCLWindowAllocator::requestBuffer(ncclComm_t comm, size_t size) +{ + TLLM_CHECK_WITH_INFO(comm != nullptr, "NCCL communicator cannot be null"); + TLLM_CHECK_WITH_INFO(size > 0, "Buffer size must be greater than 0"); + + std::lock_guard lock(mMutex); + + // Register cleanup callback for this communicator if not already registered + // This is cheap even if no buffers exist yet - cleanup will just return early + registerBufferCleanup(comm); + + // Check if we have an available buffer of at least the requested size for this communicator + // Use best-fit: find the smallest buffer that's >= requested size + auto& commBuffers = mBufferPool[comm]; + auto bestFit = commBuffers.end(); + size_t bestFitSize = std::numeric_limits::max(); + + for (auto it = commBuffers.begin(); it != commBuffers.end(); ++it) + { + if (!it->inUse && it->buffer.size >= size && it->buffer.size < bestFitSize) + { + bestFit = it; + bestFitSize = it->buffer.size; + } + } + + if (bestFit != commBuffers.end()) + { + bestFit->inUse = true; + TLLM_LOG_TRACE( + "[NCCLUtil] Reusing NCCL window buffer for comm %p: handle=%d, ptr=%p, size=%zu (requested: %zu)", + static_cast(comm), bestFit->buffer.handle, bestFit->buffer.ptr, bestFit->buffer.size, size); + return bestFit->buffer; + } + + // No available buffer found, allocate a new one + TLLM_LOG_TRACE( + "[NCCLUtil] Allocating new NCCL window buffer for comm %p, size=%zu", static_cast(comm), size); + int handle = static_cast(commBuffers.size()); + NCCLWindowBuffer buffer = allocateAndRegisterBuffer(comm, size, handle); + commBuffers.push_back({buffer, true}); + + return buffer; +} + +NCCLWindowBuffer NCCLWindowAllocator::searchBuffer(ncclComm_t comm, void* ptr) const +{ + if (!comm || !ptr) + { + return NCCLWindowBuffer(); + } + + std::lock_guard lock(mMutex); + return searchBufferLocked(comm, ptr); +} + +void NCCLWindowAllocator::releaseBuffer(ncclComm_t comm, void* ptr) +{ + if (!comm || !ptr) + { + return; + } + + std::lock_guard lock(mMutex); + auto commIt = mBufferPool.find(comm); + if (commIt == mBufferPool.end()) + { + TLLM_LOG_WARNING( + "[NCCLUtil] Attempted to release buffer %p for unknown comm %p", ptr, static_cast(comm)); + return; + } + + for (auto& entry : commIt->second) + { + if (entry.buffer.ptr == ptr) + { + entry.inUse = false; + 
TLLM_LOG_TRACE("[NCCLUtil] Released NCCL window buffer for comm %p: ptr=%p", static_cast(comm), ptr); + return; + } + } + + TLLM_LOG_WARNING("[NCCLUtil] Attempted to release unknown buffer %p for comm %p", ptr, static_cast(comm)); +} + +ncclWindow_t NCCLWindowAllocator::getWindow(ncclComm_t comm, void* ptr) const +{ + std::lock_guard lock(mMutex); + NCCLWindowBuffer buffer = searchBufferLocked(comm, ptr); + return buffer.isValid() ? buffer.window : nullptr; +} + +size_t NCCLWindowAllocator::getSize(ncclComm_t comm, void* ptr) const +{ + std::lock_guard lock(mMutex); + NCCLWindowBuffer buffer = searchBufferLocked(comm, ptr); + return buffer.isValid() ? buffer.size : 0; +} + +NCCLWindowBuffer NCCLWindowAllocator::getBufferInfo(ncclComm_t comm, void* ptr) const +{ + std::lock_guard lock(mMutex); + return searchBufferLocked(comm, ptr); +} + +size_t NCCLWindowAllocator::getBufferCount(ncclComm_t comm) const +{ + std::lock_guard lock(mMutex); + auto commIt = mBufferPool.find(comm); + return commIt != mBufferPool.end() ? commIt->second.size() : 0; +} + +size_t NCCLWindowAllocator::getBufferInUseCount(ncclComm_t comm) const +{ + std::lock_guard lock(mMutex); + auto commIt = mBufferPool.find(comm); + if (commIt == mBufferPool.end()) + { + return 0; + } + + size_t count = 0; + for (auto const& entry : commIt->second) + { + if (entry.inUse) + { + ++count; + } + } + return count; +} + +bool NCCLWindowAllocator::isCommValid(ncclComm_t comm) const noexcept +{ + // Simply check for null - all non-null comms are valid + // We don't track cleaned-up comms because NCCL can reuse memory addresses, + // making pointer-based tracking unreliable. New comms will be registered when used. + return comm != nullptr; +} + +NCCLWindowBuffer NCCLWindowAllocator::allocateAndRegisterBuffer(ncclComm_t comm, size_t size, int handle) +{ + NCCLWindowBuffer buffer; + buffer.handle = handle; + + // Get NCCL helper for dynamic symbol loading + auto& ncclHelper = NCCLHelper::getInstance(); + if (!ncclHelper.isLoaded()) + { + TLLM_THROW("NCCL library could not be loaded for dynamic symbol access"); + } + + auto ncclMemAllocFunc = ncclHelper.getNCCLMemAlloc(); + auto ncclCommWindowRegisterFunc = ncclHelper.getNCCLCommWindowRegister(); + + // Defensive checks: both function pointers must be non-null + if (ncclMemAllocFunc == nullptr) + { + TLLM_THROW("ncclMemAlloc function pointer is null, cannot allocate NCCL window buffer"); + } + + if (ncclCommWindowRegisterFunc == nullptr) + { + TLLM_THROW("ncclCommWindowRegister function pointer is null, cannot register NCCL window buffer"); + } + + // Allocate device memory using ncclMemAlloc + ncclResult_t allocResult = ncclMemAllocFunc(&buffer.ptr, size); + if (allocResult != ncclSuccess) + { + TLLM_THROW("ncclMemAlloc failed with error: %d", allocResult); + } + buffer.size = size; + + // Register the buffer with NCCL as a window + ncclResult_t regResult + = ncclCommWindowRegisterFunc(comm, buffer.ptr, size, &buffer.window, NCCL_WIN_COLL_SYMMETRIC); + if (regResult != ncclSuccess) + { + ncclMemFree(buffer.ptr); + TLLM_THROW("ncclCommWindowRegister failed with error: %d", regResult); + } + + TLLM_LOG_TRACE("[NCCLUtil] Allocated and registered NCCL window buffer: handle=%d, ptr=%p, size=%zu, window=%p", + handle, buffer.ptr, size, static_cast(buffer.window)); + + return buffer; +} + +NCCLWindowBuffer NCCLWindowAllocator::searchBufferLocked(ncclComm_t comm, void* ptr) const +{ + auto commIt = mBufferPool.find(comm); + if (commIt == mBufferPool.end()) + { + return NCCLWindowBuffer(); + } + + 
for (auto const& entry : commIt->second) + { + if (entry.buffer.ptr == ptr) + { + return entry.buffer; + } + } + + return NCCLWindowBuffer(); +} + +void NCCLWindowAllocator::registerBufferCleanup(ncclComm_t comm) +{ + // Don't register if already registered + if (mRegisteredComms.find(comm) != mRegisteredComms.end()) + { + return; + } + + mRegisteredComms.insert(comm); + + // Register cleanup with the resource manager + NcclCommResourceManager::getInstance().registerResource( + comm, [this, comm]() { this->cleanupBuffersForComm(comm); }, "NCCLWindowAllocator"); +} + +void NCCLWindowAllocator::cleanupBuffersForComm(ncclComm_t comm) noexcept +{ + if (!comm) + { + return; + } + + // Synchronize CUDA to ensure all operations using these buffers are complete + // before we deregister windows and free memory + cudaError_t cudaErr = cudaDeviceSynchronize(); + if (cudaErr != cudaSuccess) + { + TLLM_LOG_WARNING("[NCCLUtil] cudaDeviceSynchronize failed with error: %d before cleanup for comm %p", cudaErr, + static_cast(comm)); + // Continue anyway - the sync failure might be from a previous error + } + + std::lock_guard lock(mMutex); + + // Check if we've already cleaned up this communicator + if (mRegisteredComms.find(comm) == mRegisteredComms.end()) + { + // Already cleaned up or never registered + return; + } + + auto commIt = mBufferPool.find(comm); + if (commIt == mBufferPool.end()) + { + // No buffers to clean up, but mark as cleaned + mRegisteredComms.erase(comm); + return; + } + + TLLM_LOG_TRACE( + "[NCCLUtil] Cleaning up %zu NCCL window buffers for comm %p", commIt->second.size(), static_cast(comm)); + + // Check for buffers still in use - this shouldn't happen if cleanup is called properly, + // but we log a warning if it does + size_t inUseCount = 0; + for (auto const& entry : commIt->second) + { + if (entry.inUse) + { + ++inUseCount; + } + } + if (inUseCount > 0) + { + TLLM_LOG_WARNING( + "[NCCLUtil] Cleaning up %zu buffers still marked as in-use for comm %p. " + "This may indicate buffers weren't properly released before cleanup.", + inUseCount, static_cast(comm)); + } + + for (auto& entry : commIt->second) + { + if (entry.buffer.isValid()) + { + // Deregister the window - the communicator is still valid at this point + // (cleanup happens before ncclCommDestroy), but we need to be careful + // if buffers are still in use by active operations + if (entry.buffer.window && comm) + { + // Note: Even if buffer is marked inUse, we must deregister since + // the communicator is being destroyed. The communicator is valid, + // but we should handle potential errors gracefully. + ncclResult_t result = ncclCommWindowDeregister(comm, entry.buffer.window); + if (result != ncclSuccess) + { + TLLM_LOG_WARNING( + "[NCCLUtil] ncclCommWindowDeregister failed with error: %d for comm %p, " + "window %p (buffer inUse: %d)", + result, static_cast(comm), static_cast(entry.buffer.window), entry.inUse); + } + } + + // Free device memory using ncclMemFree + // This should be safe even if deregister failed + if (entry.buffer.ptr) + { + try + { + ncclResult_t ncclResult = ncclMemFree(entry.buffer.ptr); + if (ncclResult != ncclSuccess) + { + TLLM_LOG_WARNING("[NCCLUtil] ncclMemFree failed with error: %d", ncclResult); + } + } + catch (...) 
+ { + TLLM_LOG_ERROR("[NCCLUtil] Exception during ncclMemFree for ptr %p", entry.buffer.ptr); + } + } + + TLLM_LOG_TRACE( + "[NCCLUtil] Freed NCCL window buffer: ptr=%p, size=%zu", entry.buffer.ptr, entry.buffer.size); + } + } + + mBufferPool.erase(commIt); + mRegisteredComms.erase(comm); +} + +} // namespace tensorrt_llm::common::nccl_util + +#endif // ENABLE_MULTI_DEVICE diff --git a/cpp/tensorrt_llm/common/ncclUtils.h b/cpp/tensorrt_llm/common/ncclUtils.h new file mode 100644 index 0000000000..d128741e0a --- /dev/null +++ b/cpp/tensorrt_llm/common/ncclUtils.h @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/common/logger.h" + +#if ENABLE_MULTI_DEVICE +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if ENABLE_MULTI_DEVICE + +#ifdef _WIN32 +#include +#else +#include +#endif + +namespace tensorrt_llm::common::nccl_util +{ + +//============================================================================== +// NCCL Helper - Dynamic Library Loading +//============================================================================== + +// Helper class for dynamically loading NCCL symbols (ncclMemAlloc, ncclCommWindowRegister) +// This allows the code to work with NCCL libraries that may or may not have these symbols +class NCCLHelper +{ +public: + static NCCLHelper& getInstance(); + + // Dynamic loading function type definition + using ncclCommWindowRegisterFunc = ncclResult_t (*)(ncclComm_t, void*, size_t, ncclWindow_t*, int); + using ncclMemAllocFunc = ncclResult_t (*)(void**, size_t); + + // Get function pointer for ncclCommWindowRegister + ncclCommWindowRegisterFunc getNCCLCommWindowRegister(); + + // Get function pointer for ncclMemAlloc + ncclMemAllocFunc getNCCLMemAlloc(); + + // Check if NCCL library is successfully loaded + bool isLoaded() const; + + NCCLHelper(NCCLHelper const&) = delete; + NCCLHelper& operator=(NCCLHelper const&) = delete; + NCCLHelper(NCCLHelper&&) = delete; + NCCLHelper& operator=(NCCLHelper&&) = delete; + +private: + NCCLHelper(); + ~NCCLHelper(); + + void loadNCCLLibrary(); + void* loadLibraryHandle(char const* libName); + void* getSymbolAddress(void* handle, char const* symbolName); + +#ifdef _WIN32 + HMODULE mLibraryHandle; +#else + void* mLibraryHandle; +#endif + + ncclCommWindowRegisterFunc mNCCLCommWindowRegister; + ncclMemAllocFunc mNCCLMemAlloc; + bool mIsLoaded; +}; + +//============================================================================== +// NCCL Resource Management +//============================================================================== + +// Resource cleanup function type. Called before the NCCL communicator is destroyed. 
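+// A cleanup function is typically a small lambda bound to the communicator; for example,
+// NCCLWindowAllocator below registers one that deregisters and frees all of its window buffers.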
+using ResourceCleanupFunc = std::function<void()>;
+
+// Manages resources associated with NCCL communicators. Thread-safe singleton that maintains
+// a pool of resources per NCCL comm. Resources are automatically cleaned up when the
+// communicator is destroyed.
+class NcclCommResourceManager
+{
+public:
+    static NcclCommResourceManager& getInstance() noexcept;
+
+    // Register a resource cleanup function for a specific NCCL communicator.
+    // The cleanup function will be called before ncclCommDestroy.
+    // Thread-safe: Uses global mutex to serialize all operations.
+    void registerResource(ncclComm_t comm, ResourceCleanupFunc cleanup, char const* debugName = nullptr);
+
+    // Cleanup all resources associated with a communicator. Called automatically by
+    // the shared_ptr deleter before ncclCommDestroy.
+    // Thread-safe: Uses global mutex to serialize cleanup operations.
+    // Order-preserving: Resources are cleaned up in registration order.
+    void cleanupResources(ncclComm_t comm) noexcept;
+
+    // Check if a communicator has registered resources.
+    bool hasResources(ncclComm_t comm) const noexcept;
+
+    // Get the number of resources registered for a communicator.
+    size_t getResourceCount(ncclComm_t comm) const noexcept;
+
+    NcclCommResourceManager(NcclCommResourceManager const&) = delete;
+    NcclCommResourceManager& operator=(NcclCommResourceManager const&) = delete;
+    NcclCommResourceManager(NcclCommResourceManager&&) = delete;
+    NcclCommResourceManager& operator=(NcclCommResourceManager&&) = delete;
+
+private:
+    NcclCommResourceManager() = default;
+    ~NcclCommResourceManager() = default;
+
+    using ResourceEntry = std::pair<ResourceCleanupFunc, std::string>;
+
+    mutable std::mutex mMutex;
+    std::unordered_map<ncclComm_t, std::vector<ResourceEntry>> mCommResources;
+};
+
+// RAII helper to register a resource with a NCCL communicator.
+// Automatically registers cleanup function on construction.
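+// Before the RAII helper itself, a minimal illustrative sketch of the registration pattern it
+// wraps. The helper function and the scratch allocation below are hypothetical and not part of
+// this header; only NcclCommResourceManager above is assumed.
+inline void* allocateScratchTiedToComm(ncclComm_t comm, size_t bytes)
+{
+    void* scratch = nullptr;
+    TLLM_CUDA_CHECK(cudaMalloc(&scratch, bytes));
+    // Tie the allocation's lifetime to the communicator: the lambda runs before ncclCommDestroy.
+    NcclCommResourceManager::getInstance().registerResource(
+        comm, [scratch]() { cudaFree(scratch); }, "HypotheticalScratchBuffer");
+    return scratch;
+}
+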
+template +class NcclCommResource +{ +public: + NcclCommResource(ncclComm_t comm, ResourceType&& resource, std::function cleanup, + char const* debugName = nullptr) + : mComm(comm) + , mResource(std::forward(resource)) + , mCleanup(std::move(cleanup)) + , mRegistered(true) + { + // Register with the manager + NcclCommResourceManager::getInstance().registerResource( + comm, + [this]() + { + if (mCleanup) + { + mCleanup(mResource); + } + }, + debugName); + } + + ResourceType& get() + { + return mResource; + } + + ResourceType const& get() const + { + return mResource; + } + + NcclCommResource(NcclCommResource const&) = delete; + NcclCommResource& operator=(NcclCommResource const&) = delete; + NcclCommResource(NcclCommResource&&) = delete; + NcclCommResource& operator=(NcclCommResource&&) = delete; + +private: + ncclComm_t mComm; + ResourceType mResource; + std::function mCleanup; + bool mRegistered; +}; + +//============================================================================== +// NCCL Window Buffer Allocation +//============================================================================== + +// Represents a buffer with an associated NCCL window +struct NCCLWindowBuffer +{ + void* ptr; // Device pointer (same as UBBuffer.addr) + int handle; // Buffer handle/index (for compatibility with UB interface) + size_t size; // Size in bytes + ncclWindow_t window; // NCCL window handle + + NCCLWindowBuffer(void* p = nullptr, int h = -1, size_t s = 0, ncclWindow_t w = nullptr) + : ptr(p) + , handle(h) + , size(s) + , window(w) + { + } + + [[nodiscard]] bool isValid() const + { + return ptr != nullptr && handle >= 0 && size > 0 && window != nullptr; + } + + [[nodiscard]] bool invalid() const + { + return !isValid(); + } + + // Alias for compatibility with UBBuffer interface + void* addr() const + { + return ptr; + } +}; + +// Manages NCCL window-registered buffers with pooling and automatic cleanup. +// Buffers are tied to the lifetime of their associated NCCL communicator. +class NCCLWindowAllocator +{ +public: + static NCCLWindowAllocator& getInstance(); + + // Request a buffer for the given communicator and size. + // If an unused buffer of at least the requested size exists for this communicator, it will be reused. + // Uses best-fit strategy: selects the smallest available buffer that meets the size requirement. + // Otherwise, a new buffer is allocated and registered. + NCCLWindowBuffer requestBuffer(ncclComm_t comm, size_t size); + + // Search for a buffer by pointer. Returns an invalid buffer if not found. + // This matches the UBManager.search_buffer() interface. + NCCLWindowBuffer searchBuffer(ncclComm_t comm, void* ptr) const; + + // Release a buffer back to the pool for potential reuse + void releaseBuffer(ncclComm_t comm, void* ptr); + + // Get the window handle for a specific buffer pointer + ncclWindow_t getWindow(ncclComm_t comm, void* ptr) const; + + // Get the size of a specific buffer pointer + size_t getSize(ncclComm_t comm, void* ptr) const; + + // Get buffer info by pointer + NCCLWindowBuffer getBufferInfo(ncclComm_t comm, void* ptr) const; + + // Get the number of buffers allocated for a communicator + size_t getBufferCount(ncclComm_t comm) const; + + // Get the number of buffers in use for a communicator + size_t getBufferInUseCount(ncclComm_t comm) const; + + // Check if a communicator is valid (non-null) + // Note: We don't track cleaned-up comms because NCCL can reuse memory addresses. 
+ // All non-null comms are considered valid and will be registered when first used. + bool isCommValid(ncclComm_t comm) const noexcept; + + NCCLWindowAllocator(NCCLWindowAllocator const&) = delete; + NCCLWindowAllocator& operator=(NCCLWindowAllocator const&) = delete; + NCCLWindowAllocator(NCCLWindowAllocator&&) = delete; + NCCLWindowAllocator& operator=(NCCLWindowAllocator&&) = delete; + +private: + NCCLWindowAllocator() = default; + ~NCCLWindowAllocator() = default; + + // Allocate a new buffer and register it with NCCL as a window + NCCLWindowBuffer allocateAndRegisterBuffer(ncclComm_t comm, size_t size, int handle); + + // Search for a buffer by pointer (assumes mMutex is already locked) + NCCLWindowBuffer searchBufferLocked(ncclComm_t comm, void* ptr) const; + + // Register cleanup function for all buffers associated with a communicator + void registerBufferCleanup(ncclComm_t comm); + + // Cleanup all buffers for a specific communicator + void cleanupBuffersForComm(ncclComm_t comm) noexcept; + + struct BufferEntry + { + NCCLWindowBuffer buffer; + bool inUse; + }; + + mutable std::mutex mMutex; + std::unordered_map> mBufferPool; + std::unordered_set mRegisteredComms; +}; + +// RAII wrapper for NCCL window buffers +class ScopedNCCLWindowBuffer +{ +public: + ScopedNCCLWindowBuffer(ncclComm_t comm, size_t size) + : mComm(comm) + , mBuffer(NCCLWindowAllocator::getInstance().requestBuffer(comm, size)) + { + } + + ~ScopedNCCLWindowBuffer() + { + if (mBuffer.isValid()) + { + NCCLWindowAllocator::getInstance().releaseBuffer(mComm, mBuffer.ptr); + } + } + + void* getPtr() const + { + return mBuffer.ptr; + } + + size_t getSize() const + { + return mBuffer.size; + } + + ncclWindow_t getWindow() const + { + return mBuffer.window; + } + + NCCLWindowBuffer const& getBuffer() const + { + return mBuffer; + } + + ScopedNCCLWindowBuffer(ScopedNCCLWindowBuffer const&) = delete; + ScopedNCCLWindowBuffer& operator=(ScopedNCCLWindowBuffer const&) = delete; + ScopedNCCLWindowBuffer(ScopedNCCLWindowBuffer&&) = delete; + ScopedNCCLWindowBuffer& operator=(ScopedNCCLWindowBuffer&&) = delete; + +private: + ncclComm_t mComm; + NCCLWindowBuffer mBuffer; +}; + +// Creates a PyTorch tensor backed by an NCCL window buffer. +// The tensor will automatically release the buffer back to the pool when destroyed. +// This is analogous to torch_ext::create_userbuffers_tensor() but for NCCLWindowAllocator. 
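+// Before the tensor helper below, a minimal sketch of the raw allocator round-trip it builds on.
+// The function name and the way the buffer is used are illustrative only; the allocator and the
+// RAII wrapper above are the pieces actually declared in this header.
+inline void exampleWindowBufferRoundTrip(ncclComm_t comm)
+{
+    // Scoped wrapper: requests (or reuses) a window-registered buffer and releases it on scope exit.
+    ScopedNCCLWindowBuffer scratch(comm, 1 << 20);
+    TLLM_CHECK_WITH_INFO(scratch.getBuffer().isValid(), "NCCL window buffer allocation failed");
+
+    // The same device pointer can be looked up later, e.g. to recover the ncclWindow_t handle.
+    auto& allocator = NCCLWindowAllocator::getInstance();
+    ncclWindow_t window = allocator.getWindow(comm, scratch.getPtr());
+    TLLM_CHECK_WITH_INFO(window != nullptr, "NCCL window lookup failed");
+}
+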
+inline std::pair createNCCLWindowTensor( + ncclComm_t comm, at::IntArrayRef shape, torch::ScalarType dtype) +{ + // Calculate buffer size + int64_t buffer_size + = std::accumulate(shape.begin(), shape.end(), 1LL, std::multiplies()) * torch::elementSize(dtype); + + // Calculate strides + std::vector strides_vec(shape.size()); + if (!shape.empty()) + { + strides_vec[shape.size() - 1] = 1; + for (int64_t i = static_cast(shape.size()) - 1; i >= 1; --i) + { + strides_vec[i - 1] = strides_vec[i] * shape[i]; + } + } + + // Request buffer from allocator + auto& allocator = NCCLWindowAllocator::getInstance(); + auto buffer = allocator.requestBuffer(comm, buffer_size); + + // Defensive validation: ensure buffer is valid before proceeding + if (!buffer.isValid()) + { + std::ostringstream oss; + oss << "Failed to allocate NCCL window buffer: invalid buffer returned from requestBuffer " + << "(comm=" << static_cast(comm) << ", buffer_size=" << buffer_size << ")"; + throw std::runtime_error(oss.str()); + } + + // Create custom deleter that releases the buffer + auto deleter = [comm, ptr = buffer.ptr](void*) { NCCLWindowAllocator::getInstance().releaseBuffer(comm, ptr); }; + + // Create tensor from the buffer + auto tensor = torch::from_blob(buffer.ptr, shape, strides_vec, deleter, torch::dtype(dtype).device(torch::kCUDA)); + + return std::make_pair(tensor, buffer); +} + +} // namespace tensorrt_llm::common::nccl_util + +#endif // ENABLE_MULTI_DEVICE diff --git a/cpp/tensorrt_llm/common/opUtils.cpp b/cpp/tensorrt_llm/common/opUtils.cpp index 736cd1c48d..72d966e43d 100644 --- a/cpp/tensorrt_llm/common/opUtils.cpp +++ b/cpp/tensorrt_llm/common/opUtils.cpp @@ -16,6 +16,7 @@ */ #include "tensorrt_llm/common/opUtils.h" +#include "tensorrt_llm/common/ncclUtils.h" #include "tensorrt_llm/runtime/utils/mpiTags.h" #include "tensorrt_llm/runtime/utils/mpiUtils.h" @@ -112,7 +113,29 @@ std::shared_ptr getComm(std::set const& group) std::shared_ptr ncclComm(new ncclComm_t, [](ncclComm_t* comm) { - ncclCommDestroy(*comm); + if (!comm) + { + return; + } + + // STEP 1: Clean up resources and destroy NCCL communicator if it's valid + if (*comm) + { + // Clean up all registered resources FIRST + tensorrt_llm::common::nccl_util::NcclCommResourceManager::getInstance().cleanupResources(*comm); + + // Now destroy the NCCL communicator + ncclResult_t result = ncclCommDestroy(*comm); + if (result != ncclSuccess) + { + TLLM_LOG_WARNING("ncclCommDestroy failed with error: %d", result); + } + + // Clear the communicator value before freeing the pointer + *comm = nullptr; + } + + // STEP 2: Always free the pointer memory (regardless of whether *comm was valid) delete comm; }); #if defined(_WIN32) diff --git a/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.cpp b/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.cpp index e0f2d5cce2..2e3e6dde66 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.cpp +++ b/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.cpp @@ -22,16 +22,8 @@ namespace tensorrt_llm::runtime::ub { UserBufferAllocator& UserBufferAllocator::Instance() { - if (use_nccl_symmetric) - { - static NCCLUserBufferAllocator _; - return _; - } - else - { - static UserBufferAllocator _; - return _; - } + static UserBufferAllocator _; + return _; } void UserBufferAllocator::initialize(tensorrt_llm::runtime::WorldConfig const& worldConfig) @@ -83,167 +75,4 @@ communicator* UserBufferAllocator::comm() return mUbComm; } -void NCCLUserBufferAllocator::initialize(tensorrt_llm::runtime::WorldConfig const& worldConfig) -{ - 
if (!isInitialized()) - { - TLLM_LOG_INFO("Initializing NCCLUserBufferAllocator"); - std::set group; - for (int i = 0; i < worldConfig.getSize(); i++) - { - group.insert(i); - } - mComm = getComm(group); - mIsInitialized = true; - } -} - -UBBuffer NCCLUserBufferAllocator::registerUBBuffer(size_t bytes) -{ - TLLM_CHECK(isInitialized()); - UBBuffer ub_buffer; - - auto& ncclHelper = getNCCLHelper(); - if (!ncclHelper.isLoaded()) - { - TLLM_THROW("NCCL library could not be loaded for dynamic symbol access"); - } - - auto ncclMemAllocFunc = ncclHelper.getNCCLMemAlloc(); - auto ncclCommWindowRegisterFunc = ncclHelper.getNCCLCommWindowRegister(); - - NCCLCHECK(ncclMemAllocFunc(&ub_buffer.addr, bytes)); - NCCLCHECK(ncclCommWindowRegisterFunc((*mComm), ub_buffer.addr, bytes, &ub_buffer.window, NCCL_WIN_COLL_SYMMETRIC)); - ub_buffer.handle = 5; - ub_buffer.size = bytes; - return ub_buffer; -} - -// Static member definitions -std::unique_ptr NCCLUserBufferAllocator::mNCCLHelper = nullptr; - -NCCLHelper& NCCLUserBufferAllocator::getNCCLHelper() -{ - if (!mNCCLHelper) - { - mNCCLHelper = std::make_unique(); - } - return *mNCCLHelper; -} - -// NCCLHelper implementation -NCCLHelper::NCCLHelper() - : mLibraryHandle(nullptr) - , mNCCLCommWindowRegister(nullptr) - , mNCCLMemAlloc(nullptr) - , mIsLoaded(false) -{ - loadNCCLLibrary(); -} - -NCCLHelper::~NCCLHelper() -{ - if (mLibraryHandle) - { -#ifdef _WIN32 - FreeLibrary(mLibraryHandle); -#else - dlclose(mLibraryHandle); -#endif - mLibraryHandle = nullptr; - } -} - -void NCCLHelper::loadNCCLLibrary() -{ - try - { -#ifdef _WIN32 - char const* libraryNames[] = {"nccl.dll"}; -#else - char const* libraryNames[] = {"libnccl.so"}; -#endif - - for (int i = 0; libraryNames[i] != nullptr; ++i) - { - mLibraryHandle = loadLibraryHandle(libraryNames[i]); - if (mLibraryHandle) - { - TLLM_LOG_INFO("Successfully loaded NCCL library: %s", libraryNames[i]); - break; - } - } - - if (!mLibraryHandle) - { - TLLM_LOG_WARNING("Failed to load NCCL library"); - return; - } - - // Load the required symbols - mNCCLCommWindowRegister - = reinterpret_cast(getSymbolAddress(mLibraryHandle, "ncclCommWindowRegister")); - - mNCCLMemAlloc = reinterpret_cast(getSymbolAddress(mLibraryHandle, "ncclMemAlloc")); - - if (mNCCLCommWindowRegister == nullptr) - { - TLLM_LOG_WARNING("Failed to load ncclCommWindowRegister symbol, NCCL symmetric will not be supported."); - } - - if (mNCCLMemAlloc) - { - mIsLoaded = true; - } - else - { - TLLM_LOG_WARNING("Failed to load required NCCL symbols"); - } - } - catch (std::exception const& e) - { - TLLM_LOG_WARNING("Exception while loading NCCL library: %s", e.what()); - } -} - -void* NCCLHelper::loadLibraryHandle(char const* libName) -{ -#ifdef _WIN32 - return LoadLibraryA(libName); -#else - return dlopen(libName, RTLD_LAZY | RTLD_GLOBAL); -#endif -} - -void* NCCLHelper::getSymbolAddress(void* handle, char const* symbolName) -{ - if (!handle) - { - return nullptr; - } - -#ifdef _WIN32 - return GetProcAddress(static_cast(handle), symbolName); -#else - return dlsym(handle, symbolName); -#endif -} - -NCCLHelper::ncclCommWindowRegisterFunc NCCLHelper::getNCCLCommWindowRegister() -{ - return mNCCLCommWindowRegister; -} - -NCCLHelper::ncclMemAllocFunc NCCLHelper::getNCCLMemAlloc() -{ - return mNCCLMemAlloc; -} - -bool NCCLHelper::isLoaded() const -{ - return mIsLoaded; -} - -bool UserBufferAllocator::use_nccl_symmetric = false; - }; // namespace tensorrt_llm::runtime::ub diff --git a/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.h 
b/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.h index 4cc9149705..05a4b6dd4e 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.h +++ b/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.h @@ -19,11 +19,6 @@ #if ENABLE_MULTI_DEVICE #include "nccl.h" #include "userbuffers.h" -#ifdef _WIN32 -#include -#else -#include -#endif #else using ncclWindow_t = void*; #endif @@ -69,8 +64,6 @@ public: communicator* comm(); virtual UBBuffer registerUBBuffer(size_t bytes); - static bool use_nccl_symmetric; - private: communicator* mUbComm; @@ -80,55 +73,6 @@ protected: tensorrt_llm::runtime::WorldConfig mWorldConfig; }; -class NCCLHelper -{ -public: - NCCLHelper(); - ~NCCLHelper(); - - // Dynamic loading function type definition - using ncclCommWindowRegisterFunc = ncclResult_t (*)(ncclComm_t, void*, size_t, ncclWindow_t*, int); - using ncclMemAllocFunc = ncclResult_t (*)(void**, size_t); - - // Get function pointer for ncclCommWindowRegister - ncclCommWindowRegisterFunc getNCCLCommWindowRegister(); - - // Get function pointer for ncclMemAlloc - ncclMemAllocFunc getNCCLMemAlloc(); - - // Check if NCCL library is successfully loaded - bool isLoaded() const; - -private: - void loadNCCLLibrary(); - void* loadLibraryHandle(char const* libName); - void* getSymbolAddress(void* handle, char const* symbolName); - -#ifdef _WIN32 - HMODULE mLibraryHandle; -#else - void* mLibraryHandle; -#endif - - ncclCommWindowRegisterFunc mNCCLCommWindowRegister; - ncclMemAllocFunc mNCCLMemAlloc; - bool mIsLoaded; -}; - -class NCCLUserBufferAllocator : public UserBufferAllocator -{ -public: - void initialize(tensorrt_llm::runtime::WorldConfig const& world_config) override; - UBBuffer registerUBBuffer(size_t bytes) override; - - // Get shared NCCLHelper instance - static NCCLHelper& getNCCLHelper(); - -private: - std::shared_ptr mComm; - static std::unique_ptr mNCCLHelper; -}; - #else using communicator = void; #endif diff --git a/cpp/tensorrt_llm/kernels/userbuffers/userbuffersManager.cpp b/cpp/tensorrt_llm/kernels/userbuffers/userbuffersManager.cpp index a1fcd3c01f..df2a549b8d 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/userbuffersManager.cpp +++ b/cpp/tensorrt_llm/kernels/userbuffers/userbuffersManager.cpp @@ -14,6 +14,7 @@ * limitations under the License. 
*/ #include "userbuffersManager.h" +#include "tensorrt_llm/common/logger.h" namespace tensorrt_llm::runtime::ub { @@ -29,14 +30,11 @@ UserBuffersManager& UserBuffersManager::get_instance() return allocator; } -void UserBuffersManager::initialize(int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, - int64_t gpus_per_node, int64_t buffer_size, bool use_nccl_symmetric) +void UserBuffersManager::initialize( + int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, int64_t gpus_per_node, int64_t buffer_size) { std::lock_guard lock(mutex_); tensorrt_llm::runtime::WorldConfig world_config(tp_size, pp_size, cp_size, rank, gpus_per_node); -#if ENABLE_MULTI_DEVICE - UserBufferAllocator::Instance().use_nccl_symmetric = use_nccl_symmetric; -#endif tensorrt_llm::runtime::ub::ub_initialize(world_config); TLLM_CHECK(tensorrt_llm::runtime::ub::ub_is_initialized()); buffer_size_ = buffer_size; @@ -98,11 +96,10 @@ tensorrt_llm::runtime::ub::communicator* UserBuffersManager::comm() return tensorrt_llm::runtime::ub::ub_comm(); } -void initialize_userbuffers_manager(int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, - int64_t gpus_per_node, int64_t buffer_size, bool use_nccl_symmetric) +void initialize_userbuffers_manager( + int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, int64_t gpus_per_node, int64_t buffer_size) { - UserBuffersManager::get_instance().initialize( - tp_size, pp_size, cp_size, rank, gpus_per_node, buffer_size, use_nccl_symmetric); + UserBuffersManager::get_instance().initialize(tp_size, pp_size, cp_size, rank, gpus_per_node, buffer_size); } } // namespace tensorrt_llm::runtime::ub diff --git a/cpp/tensorrt_llm/kernels/userbuffers/userbuffersManager.h b/cpp/tensorrt_llm/kernels/userbuffers/userbuffersManager.h index 1b34f8e8a1..7ec39db602 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/userbuffersManager.h +++ b/cpp/tensorrt_llm/kernels/userbuffers/userbuffersManager.h @@ -46,9 +46,8 @@ public: //! @param gpus_per_node The number of GPUs per node. //! @param buffer_size The size of the buffer to allocate. All buffers allocated by this manager will have this //! size. - //! @param use_nccl_symmetric Whether to use NCCL symmetric communication. - void initialize(int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, int64_t gpus_per_node, - int64_t buffer_size, bool use_nccl_symmetric); + void initialize( + int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, int64_t gpus_per_node, int64_t buffer_size); //! @brief Create a UB tensor from the given shape, strides and data type. The function will choose available UB //! buffer or create a new one if no available buffer is found. 
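// A minimal usage sketch of the simplified initializer after this change (values illustrative;
// the former use_nccl_symmetric flag is simply dropped from the call):
//
//     // tp_size=8, pp_size=1, cp_size=1, rank=0, 8 GPUs per node, 64 MiB per UB buffer
//     tensorrt_llm::runtime::ub::initialize_userbuffers_manager(8, 1, 1, 0, 8, 64ll * 1024 * 1024);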
@@ -76,7 +75,7 @@ private: int64_t buffer_size_; }; -void initialize_userbuffers_manager(int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, - int64_t gpus_per_node, int64_t buffer_size, bool use_nccl_symmetric); +void initialize_userbuffers_manager( + int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, int64_t gpus_per_node, int64_t buffer_size); } // namespace tensorrt_llm::runtime::ub diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp index 4241cf8d85..112364400d 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp @@ -137,13 +137,12 @@ bool AllreducePlugin::supportsFormatCombination( int pos, nvinfer1::PluginTensorDesc const* inOut, int nbInputs, int nbOutputs) noexcept { int base_inputs = 0; - if (mStrategy == AllReduceStrategyType::NCCL || mStrategy == AllReduceStrategyType::UB) + switch (mStrategy) { - base_inputs = 1; - } - else - { - base_inputs = 2; + case AllReduceStrategyType::NCCL: + case AllReduceStrategyType::UB: + case AllReduceStrategyType::NCCL_SYMMETRIC: base_inputs = 1; break; + default: base_inputs = 2; break; } int fusion_op_extra_inputs = 0; int scale_idx = 0; @@ -169,9 +168,15 @@ bool AllreducePlugin::supportsFormatCombination( TLLM_CHECK(nbInputs == (base_inputs + fusion_op_extra_inputs)); - if (mStrategy != AllReduceStrategyType::NCCL && mStrategy != AllReduceStrategyType::UB && pos == 1) + if (pos == 1) { - return (inOut[pos].type == nvinfer1::DataType::kINT64) && (inOut[pos].format == TensorFormat::kLINEAR); + switch (mStrategy) + { + case AllReduceStrategyType::NCCL: + case AllReduceStrategyType::UB: + case AllReduceStrategyType::NCCL_SYMMETRIC: break; + default: return (inOut[pos].type == nvinfer1::DataType::kINT64) && (inOut[pos].format == TensorFormat::kLINEAR); + } } if (mStrategy == AllReduceStrategyType::UB) { @@ -222,25 +227,26 @@ AllReduceStrategyType AllreducePlugin::selectImplementation( { if (!isAuto) { - TLLM_LOG_INFO("Since Peer to Peer not supported, fallback to AllReduceStrategy: NCCL"); + TLLM_LOG_INFO("Since Peer to Peer not supported, fallback to AllReduceStrategy: NCCL_SYMMETRIC"); } else if (forceDeterministic) { TLLM_LOG_WARNING( - "Since Peer to Peer not supported, fallback to AllReduceStrategy: NCCL. NCCL might produce " + "Since Peer to Peer not supported, fallback to AllReduceStrategy: NCCL_SYMMETRIC. 
NCCL_SYMMETRIC might " + "produce " "non-deterministic results."); } - return AllReduceStrategyType::NCCL; + return AllReduceStrategyType::NCCL_SYMMETRIC; } if (isAuto && !mIsNVLINKSupported && !forceDeterministic) { - return AllReduceStrategyType::NCCL; + return AllReduceStrategyType::NCCL_SYMMETRIC; } auto const maxWorkspaceSize = utils::customAllReduceUtils::getMaxRequiredWorkspaceSize(worldSize); - AllReduceStrategyType strat = AllReduceStrategyType::NCCL; + AllReduceStrategyType strat = AllReduceStrategyType::NCCL_SYMMETRIC; auto const messageSizeBytes = messageSize * common::getDTypeSize(type); if (messageSizeBytes <= maxWorkspaceSize) @@ -268,7 +274,7 @@ AllReduceStrategyType AllreducePlugin::selectImplementation( } else { - strat = AllReduceStrategyType::NCCL; + strat = AllReduceStrategyType::NCCL_SYMMETRIC; } } else @@ -279,7 +285,7 @@ AllReduceStrategyType AllreducePlugin::selectImplementation( } else { - strat = AllReduceStrategyType::NCCL; + strat = AllReduceStrategyType::NCCL_SYMMETRIC; } } @@ -287,30 +293,31 @@ AllReduceStrategyType AllreducePlugin::selectImplementation( { if (!isAuto) { - TLLM_LOG_WARNING("Since not aligned, fallback to AllReduceStrategy: NCCL"); + TLLM_LOG_WARNING("Since not aligned, fallback to AllReduceStrategy: NCCL_SYMMETRIC"); } else if (forceDeterministic) { TLLM_LOG_WARNING( - "Since not aligned, fallback to AllReduceStrategy: NCCL. NCCL might produce " + "Since not aligned, fallback to AllReduceStrategy: NCCL_SYMMETRIC. NCCL_SYMMETRIC might produce " "non-deterministic results."); } - strat = AllReduceStrategyType::NCCL; + strat = AllReduceStrategyType::NCCL_SYMMETRIC; } } else { if (!isAuto) { - TLLM_LOG_WARNING("Since messageSize > maxWorkspace, fallback to AllReduceStrategy: NCCL"); + TLLM_LOG_WARNING("Since messageSize > maxWorkspace, fallback to AllReduceStrategy: NCCL_SYMMETRIC"); } else if (forceDeterministic) { TLLM_LOG_WARNING( - "Since messageSize > maxWorkspace, fallback to AllReduceStrategy: NCCL. NCCL might produce " + "Since messageSize > maxWorkspace, fallback to AllReduceStrategy: NCCL_SYMMETRIC. 
NCCL_SYMMETRIC might " + "produce " "non-deterministic results."); } - strat = AllReduceStrategyType::NCCL; + strat = AllReduceStrategyType::NCCL_SYMMETRIC; } return strat; @@ -337,6 +344,10 @@ int AllreducePlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfe { runtimeStrategy = AllReduceStrategyType::NCCL; } + else if (mStrategy == AllReduceStrategyType::NCCL_SYMMETRIC) + { + runtimeStrategy = AllReduceStrategyType::NCCL_SYMMETRIC; + } else if (mStrategy == AllReduceStrategyType::UB) { runtimeStrategy = AllReduceStrategyType::UB; @@ -355,6 +366,11 @@ int AllreducePlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfe TLLM_LOG_DEBUG("AllReducePlugin strategy for rank %d: NCCL", rank); break; } + case AllReduceStrategyType::NCCL_SYMMETRIC: + { + TLLM_LOG_DEBUG("AllReducePlugin strategy for rank %d: NCCL_SYMMETRIC", rank); + break; + } case AllReduceStrategyType::ONESHOT: { TLLM_LOG_DEBUG("AllReducePlugin strategy for rank %d: ONESHOT", rank); @@ -373,14 +389,14 @@ int AllreducePlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfe default: break; } - if (runtimeStrategy == AllReduceStrategyType::NCCL) + if (runtimeStrategy == AllReduceStrategyType::NCCL || runtimeStrategy == AllReduceStrategyType::NCCL_SYMMETRIC) { if (mOp == AllReduceFusionOp::RESIDUAL_RMS_NORM || mOp == AllReduceFusionOp::RESIDUAL_RMS_PREPOST_NORM) { NCCLCHECK(ncclAllReduce(inputs[0], outputs[1], size, (*getDtypeMap())[mType], ncclSum, *mNcclComm, stream)); tensorrt_llm::kernels::AllReduceParams params; int fusion_ptr_idx = 0; - if (mStrategy == AllReduceStrategyType::NCCL) + if (mStrategy == AllReduceStrategyType::NCCL || mStrategy == AllReduceStrategyType::NCCL_SYMMETRIC) { fusion_ptr_idx = 1; } diff --git a/cpp/tensorrt_llm/thop/allreduceOp.cpp b/cpp/tensorrt_llm/thop/allreduceOp.cpp index 21018e241d..fbd60d1ec5 100644 --- a/cpp/tensorrt_llm/thop/allreduceOp.cpp +++ b/cpp/tensorrt_llm/thop/allreduceOp.cpp @@ -15,10 +15,12 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/cudaDriverWrapper.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/customAllReduceUtils.h" #include "tensorrt_llm/common/dataType.h" #include "tensorrt_llm/common/mcastDevMemUtils.h" +#include "tensorrt_llm/common/ncclUtils.h" #include "tensorrt_llm/common/opUtils.h" #include "tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h" #include "tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.h" @@ -39,6 +41,7 @@ #if ENABLE_MULTI_DEVICE #include #include +#include #include #include #include @@ -51,6 +54,7 @@ #include #include +#include #include // using namespace nvinfer1; @@ -238,6 +242,9 @@ public: AllreduceOp( std::set group, nvinfer1::DataType type, AllReduceStrategyType strategy, AllReduceFusionOp op, float eps) : mGroup(std::move(group)) + , mIsNVLINKSupported(false) + , mIsP2PSupported(false) + , mIsMNNVLSupported(false) , mType(type) , mStrategy(strategy) , mOp(op) @@ -248,6 +255,9 @@ public: AllreduceOp(std::set group, c10::intrusive_ptr const& process_group_, nvinfer1::DataType type, AllReduceStrategyType strategy, AllReduceFusionOp op, float eps) : mGroup(std::move(group)) + , mIsNVLINKSupported(false) + , mIsP2PSupported(false) + , mIsMNNVLSupported(false) , mType(type) , mStrategy(strategy) , mOp(op) @@ -437,44 +447,117 @@ private: torch::optional const& residual, torch::optional const& norm_weight, torch::optional const& scale, torch::optional const& bias) { + // Handle ProcessGroup path first - cannot extract NCCL comm for window registration + // Use ProcessGroup's allreduce directly and return early + if (mNcclComm.index() == 1) + { + auto torchPg = std::get<1>(mNcclComm); + + torch::Tensor reduceOutput = input.clone(); + std::vector tensors{reduceOutput}; + PGCHECK_THROW(torchPg->allreduce(tensors, {c10d::ReduceOp::SUM})); + + if (mOp == AllReduceFusionOp::NONE) + { + return {reduceOutput}; + } + + // Treat any other patterns as fallback cases. + return fallbackRunSubsequentOps(input, residual, norm_weight, scale, bias, reduceOutput); + } + + // From here on, we have a raw NCCL comm - can proceed with window registration + auto rawComm = std::get<0>(mNcclComm); + ncclComm_t comm = *rawComm; + TLLM_CHECK_WITH_INFO(comm != nullptr, "NCCL communicator is null"); + TLLM_LOG_DEBUG("[runNCCLAllReduceSymmetric] Using raw NCCL comm path (not ProcessGroup)"); + + using tensorrt_llm::common::nccl_util::NCCLWindowAllocator; + using tensorrt_llm::common::nccl_util::createNCCLWindowTensor; auto stream = at::cuda::getCurrentCUDAStream(input.get_device()); int size = input.numel(); - auto& ub_manager = tensorrt_llm::runtime::ub::UserBuffersManager::get_instance(); - auto ub_tensor0 = input; - auto ub_buffer0 = ub_manager.search_buffer(input.data_ptr()); - if (ub_buffer0.invalid()) + size_t bufferSizeBytes = size * input.element_size(); + + // Using unregistered input buffers with NCCL symmetric, requires a memcpy + // This is an overhead introduced with using NCCL_SYMMTRIC over NCCL. + // Both the memcpy and the perf benefit from using NCCL_SYMMETRIC scale linear with the message size. + // But a local memcpy is cheaper than the remote operations, so with larger message sizes the benefit is + // stronger. Additionally, the perf benefit scales with the number of ranks, since multimem enables O(const.) + // versus O(N) complexity. Hence we model this cutoff with a linear model. The numbers below were obtained on + // GB200, scanning different message sizes and ranks. 
You can determine the regression onset for each number of + // ranks to a single message size. And the following formula was obtained by fitting a linear model to the + // regression onset. It is possible to override this empirical heuristic with the TLLM_NCCL_MIN_REGISTRATION + // environment variable. + double const a = -4986.43478503; + double const b = 156716.52177552; + int nRanks; + NCCLCHECK_THROW(ncclCommCount(comm, &nRanks)); + size_t minRegistrationThreshold = static_cast(std::max(0.0, a * nRanks + b)) * input.element_size(); + // Disable window registration if neither NVLink nor MNNVL is supported + // TODO replace in NCCL 2.29 with comm query + if (!mIsNVLINKSupported && !mIsMNNVLSupported) { - auto [symmetric_input, symmetric_ub_buffer0] - = torch_ext::create_userbuffers_tensor(input.sizes(), input.scalar_type()); - cudaMemcpyAsync(symmetric_ub_buffer0.addr, input.data_ptr(), size * input.element_size(), - cudaMemcpyDeviceToDevice, stream); - ub_buffer0 = symmetric_ub_buffer0; - ub_tensor0 = symmetric_input; + minRegistrationThreshold = std::numeric_limits::max(); + } + char const* envThreshold = std::getenv("TLLM_NCCL_MIN_REGISTRATION"); + if (envThreshold != nullptr) + { + minRegistrationThreshold = static_cast(std::atoi(envThreshold)) * input.element_size(); } - TLLM_CHECK(!ub_buffer0.invalid()); - auto [norm_out, ub_buffer1] = torch_ext::create_userbuffers_tensor(input.sizes(), input.scalar_type()); + // Search for existing buffer + auto& allocator = NCCLWindowAllocator::getInstance(); + auto windowBuffer0 = allocator.searchBuffer(comm, input.data_ptr()); - std::visit(overloaded{[&, norm_out_ = norm_out](std::shared_ptr& rawComm) - { - NCCLCHECK_THROW(ncclAllReduce(ub_buffer0.addr, norm_out_.mutable_data_ptr(), size, - (*getDtypeMap())[mType], ncclSum, *rawComm, stream)); - }, - [&, norm_out_ = norm_out](c10::intrusive_ptr& torchPg) - { - PGCHECK_THROW(PgHelper{torchPg}.allreduce(ub_tensor0, {c10d::ReduceOp::SUM})); - std::ignore = norm_out_.copy_(ub_tensor0, true); - }}, - mNcclComm); + torch::Tensor inputTensor = input; + void* inputPtr = input.data_ptr(); + + // If buffer is not registered, decide whether to register based on size + if (!windowBuffer0.isValid()) + { + if (bufferSizeBytes < minRegistrationThreshold) + { + // Small buffer: use input directly without window registration + TLLM_LOG_DEBUG( + "[runNCCLAllReduceSymmetric] Buffer size %zu bytes < threshold %zu bytes, " + "skipping window registration", + bufferSizeBytes, minRegistrationThreshold); + // inputTensor and inputPtr remain pointing to original input + } + else + { + // Large buffer: create window buffer and copy input (can swap inputTensor reference) + auto [symmetricInput, symmetricBuffer0] + = createNCCLWindowTensor(comm, input.sizes(), input.scalar_type()); + TLLM_CUDA_CHECK(cudaMemcpyAsync( + symmetricBuffer0.ptr, input.data_ptr(), bufferSizeBytes, cudaMemcpyDeviceToDevice, stream)); + windowBuffer0 = symmetricBuffer0; + inputTensor = symmetricInput; // Swap to window-backed tensor + inputPtr = windowBuffer0.ptr; + } + } + else + { + // Buffer already registered - use it directly + inputPtr = windowBuffer0.ptr; + } + + // Use window-backed output buffer + auto [normOut, windowBuffer1] = createNCCLWindowTensor(comm, input.sizes(), input.scalar_type()); + torch::Tensor outputTensor = normOut; + void* outputPtr = windowBuffer1.ptr; + + // Perform allreduce + NCCLCHECK_THROW(ncclAllReduce(inputPtr, outputPtr, size, (*getDtypeMap())[mType], ncclSum, comm, stream)); if (mOp == AllReduceFusionOp::NONE) 
{ - return {norm_out}; + return {outputTensor}; } // Treat any other patterns as fallback cases. - return fallbackRunSubsequentOps(input, residual, norm_weight, scale, bias, norm_out); + return fallbackRunSubsequentOps(input, residual, norm_weight, scale, bias, outputTensor); } std::vector runLowPrecisionAllReduce(torch::Tensor const& input, @@ -799,16 +882,104 @@ private: void initGroupTopology() { - static std::map, std::tuple> cache; + static std::map, std::tuple> cache; if (cache.find(mGroup) != cache.end()) { - auto [is_NVLINK_supported, is_P2P_supported] = cache[mGroup]; + auto [is_NVLINK_supported, is_P2P_supported, is_MNNVL_supported] = cache[mGroup]; mIsNVLINKSupported = is_NVLINK_supported; mIsP2PSupported = is_P2P_supported; + mIsMNNVLSupported = is_MNNVL_supported; return; } setGroupTopology(); - cache[mGroup] = {mIsNVLINKSupported, mIsP2PSupported}; + cache[mGroup] = {mIsNVLINKSupported, mIsP2PSupported, mIsMNNVLSupported}; + } + + bool checkMNNVLSupport(int device_id) + { +#if ENABLE_MULTI_DEVICE + // 1. Check CUDA driver version (needs >= 12.0.10) + int cuda_driver_version = -1; + TLLM_CUDA_CHECK(cudaDriverGetVersion(&cuda_driver_version)); + if (cuda_driver_version < 12010) + { + TLLM_LOG_DEBUG("MNNVL check: CUDA Driver version %d < 12010", cuda_driver_version); + return false; + } + + // 2. Check multicast support + CUdevice cu_device; + TLLM_CU_CHECK(cuDeviceGet(&cu_device, device_id)); + auto cuda_driver = tensorrt_llm::common::CUDADriverWrapper::getInstance(); + + int multicast_supported = 0; + TLLM_CU_CHECK(cuda_driver->cuDeviceGetAttribute( + &multicast_supported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, cu_device)); + if (!multicast_supported) + { + TLLM_LOG_DEBUG("MNNVL check: Device %d does not support multicast", device_id); + return false; + } + + // 3. Check fabric handle support + int fabric_handle_supported = 0; + TLLM_CU_CHECK(cuda_driver->cuDeviceGetAttribute( + &fabric_handle_supported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, cu_device)); + if (!fabric_handle_supported) + { + TLLM_LOG_DEBUG("MNNVL check: Device %d does not support fabric handles", device_id); + return false; + } + + // 4. Check NVML GPU Fabric Info + nvmlDevice_t nvml_device; + NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(device_id, &nvml_device)); + + nvmlGpuFabricInfo_t fabric_info; + NVML_CHECK_THROW(nvmlDeviceGetGpuFabricInfo(nvml_device, &fabric_info)); + + // Check if fabric is fully initialized + if (fabric_info.state != NVML_GPU_FABRIC_STATE_COMPLETED || fabric_info.status != NVML_SUCCESS) + { + TLLM_LOG_DEBUG( + "MNNVL check: Fabric state not complete - state=%u status=%u", fabric_info.state, fabric_info.status); + return false; + } + + // 5. 
Check NVLink links are active (similar to Python support_nvlink(True)) + unsigned int active_links = 0; + unsigned int available_links = 0; + + for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++) + { + unsigned int cap_p2p = 0; + nvmlReturn_t cap_result + = nvmlDeviceGetNvLinkCapability(nvml_device, link, NVML_NVLINK_CAP_P2P_SUPPORTED, &cap_p2p); + if (cap_result == NVML_SUCCESS && cap_p2p) + { + available_links++; + nvmlEnableState_t link_state; + if (nvmlDeviceGetNvLinkState(nvml_device, link, &link_state) == NVML_SUCCESS + && link_state == NVML_FEATURE_ENABLED) + { + active_links++; + } + } + } + + bool all_links_up = (active_links == available_links && available_links > 0); + if (!all_links_up) + { + TLLM_LOG_DEBUG( + "MNNVL check: Not all NVLink links active - active=%u available=%u", active_links, available_links); + return false; + } + + TLLM_LOG_INFO("MNNVL check: Device %d supports MNNVL (fabric_clique=%u)", device_id, fabric_info.cliqueId); + return true; +#else + return false; +#endif } void setGroupTopology() @@ -820,108 +991,190 @@ private: [&](c10::intrusive_ptr& torchPg) { return getLocalGroupTorch(mGroup); }}, mNcclComm); - if (mGroup.size() != local_group.size()) - { - mIsP2PSupported = false; - mIsNVLINKSupported = false; - TLLM_LOG_INFO("Found inter-node TP group for rank %d", rank); - return; - } - TLLM_LOG_INFO("TP group is intra-node for rank %d", rank); + bool is_inter_node = (mGroup.size() != local_group.size()); NvmlManager nvml_manager; mIsP2PSupported = true; mIsNVLINKSupported = true; + mIsMNNVLSupported = false; - // TODO(ytong): Should we provide group topology info instead of querying it here? - // Use cudaDeviceCanAccessPeer to determine whether p2p is supported, - // and use nvml to determine whether there are nvlink links between ranks. 
- for (int first_device_id : local_group) + // First, check NVLink within local group (intra-node) + if (!local_group.empty()) { - for (int second_device_id : local_group) + for (int first_device_id : local_group) { - if (first_device_id >= second_device_id) + for (int second_device_id : local_group) { - continue; - } - - int can_access_peer = 0; - TLLM_CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, first_device_id, second_device_id)); - - if (!can_access_peer) - { - mIsP2PSupported = false; - mIsNVLINKSupported = false; - - return; - } - - nvmlDevice_t first_device; - NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(first_device_id, &first_device)); - - bool is_NVLINK = false; - - for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++) - { - nvmlPciInfo_t remote_pci_info; - if (nvmlDeviceGetNvLinkRemotePciInfo_v2(first_device, link, &remote_pci_info) != NVML_SUCCESS) + if (first_device_id >= second_device_id) { continue; } - nvmlDevice_t remote_device; - auto const result = nvmlDeviceGetHandleByPciBusId_v2(remote_pci_info.busId, &remote_device); + int can_access_peer = 0; + TLLM_CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, first_device_id, second_device_id)); - if (result == NVML_SUCCESS) + if (!can_access_peer) { - // Two GPUs are connected directly through nvlink - unsigned int remote_device_id; - NVML_CHECK_THROW(nvmlDeviceGetIndex(remote_device, &remote_device_id)); - - if (remote_device_id == static_cast(second_device_id)) - { - is_NVLINK = true; - } + mIsP2PSupported = false; + mIsNVLINKSupported = false; + TLLM_LOG_INFO( + "P2P not supported between local devices %d and %d", first_device_id, second_device_id); + // Continue checking other pairs, but mark as not supported + continue; } - else if (result == NVML_ERROR_NOT_FOUND) + + nvmlDevice_t first_device; + NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(first_device_id, &first_device)); + + bool is_NVLINK = false; + + for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++) { - // Maybe Two GPUs are connected via nvswitch, - // now remotePciInfo represents the pci information of nvswitch, - // determine whether nvlink is supported by whether two GPUs are connected to the same - // nvswitch. - nvmlDevice_t second_device; - NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(second_device_id, &second_device)); - - for (unsigned int second_link = 0; second_link < NVML_NVLINK_MAX_LINKS; second_link++) + nvmlPciInfo_t remote_pci_info; + if (nvmlDeviceGetNvLinkRemotePciInfo_v2(first_device, link, &remote_pci_info) != NVML_SUCCESS) { - nvmlPciInfo_t second_remote_pci_info; - if (nvmlDeviceGetNvLinkRemotePciInfo_v2(second_device, second_link, &second_remote_pci_info) - != NVML_SUCCESS) - { - continue; - } + continue; + } - if (strcmp(remote_pci_info.busId, second_remote_pci_info.busId) == 0) + nvmlDevice_t remote_device; + auto const result = nvmlDeviceGetHandleByPciBusId_v2(remote_pci_info.busId, &remote_device); + + if (result == NVML_SUCCESS) + { + // Two GPUs are connected directly through nvlink + unsigned int remote_device_id; + NVML_CHECK_THROW(nvmlDeviceGetIndex(remote_device, &remote_device_id)); + + if (remote_device_id == static_cast(second_device_id)) { is_NVLINK = true; - break; } } - } - else - { - NVML_CHECK_THROW(result); + else if (result == NVML_ERROR_NOT_FOUND) + { + // Maybe Two GPUs are connected via nvswitch, + // now remotePciInfo represents the pci information of nvswitch, + // determine whether nvlink is supported by whether two GPUs are connected to the same + // nvswitch. 
+ nvmlDevice_t second_device; + NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(second_device_id, &second_device)); + + for (unsigned int second_link = 0; second_link < NVML_NVLINK_MAX_LINKS; second_link++) + { + nvmlPciInfo_t second_remote_pci_info; + if (nvmlDeviceGetNvLinkRemotePciInfo_v2( + second_device, second_link, &second_remote_pci_info) + != NVML_SUCCESS) + { + continue; + } + + if (strcmp(remote_pci_info.busId, second_remote_pci_info.busId) == 0) + { + is_NVLINK = true; + break; + } + } + } + else + { + NVML_CHECK_THROW(result); + } + + if (is_NVLINK) + { + break; + } } - if (is_NVLINK) - { - break; - } + mIsNVLINKSupported &= is_NVLINK; } - - mIsNVLINKSupported &= is_NVLINK; } } + + // For inter-node groups, check MNNVL support + if (is_inter_node) + { + TLLM_LOG_INFO("Found inter-node TP group for rank %d, checking MNNVL support", rank); + + // Check MNNVL support on local device(s) + bool local_mnnvl_supported = false; + if (!local_group.empty()) + { + // Check MNNVL on first device in local group (all devices on same node should have same MNNVL status) + int check_device = *local_group.begin(); + local_mnnvl_supported = checkMNNVLSupport(check_device); + } + + // Gather MNNVL status from all ranks in the group + int local_mnnvl_status = local_mnnvl_supported ? 1 : 0; + std::vector all_mnnvl_status(mGroup.size()); + + std::visit(overloaded{[&](std::shared_ptr& comm_ptr) + { + // For NCCL comm, use MPI to gather status + // Use MPI allgather to collect MNNVL status + // Create a sub-communicator for the group + std::vector group_ranks(mGroup.begin(), mGroup.end()); + MPI_Group world_group, new_group; + MPI_Comm group_comm; + MPI_Comm_group(COMM_SESSION, &world_group); + MPI_Group_incl(world_group, group_ranks.size(), group_ranks.data(), &new_group); + MPI_Comm_create_group(COMM_SESSION, new_group, 0, &group_comm); + + if (group_comm != MPI_COMM_NULL) + { + MPI_Allgather(&local_mnnvl_status, 1, MPI_INT, all_mnnvl_status.data(), 1, MPI_INT, + group_comm); + MPI_Comm_free(&group_comm); + } + MPI_Group_free(&new_group); + MPI_Group_free(&world_group); + }, + [&](c10::intrusive_ptr& torchPg) + { + // For ProcessGroup, use allgather directly + // Note: This assumes the ProcessGroup is already set up for the correct group + std::vector input_tensors + = {torch::tensor({local_mnnvl_status}, torch::kInt32)}; + std::vector> output_tensors(1); + output_tensors[0].resize(mGroup.size()); + auto work = torchPg->allgather(output_tensors, input_tensors); + if (work) + { + work->wait(); + for (size_t i = 0; i < mGroup.size(); ++i) + { + all_mnnvl_status[i] = output_tensors[0][i].item(); + } + } + }}, + mNcclComm); + + // Check if all ranks support MNNVL + bool all_ranks_support_mnnvl = true; + for (int status : all_mnnvl_status) + { + if (status == 0) + { + all_ranks_support_mnnvl = false; + break; + } + } + + // For inter-node: MNNVL support means all nodes have MNNVL + // Also need local NVLink for optimal performance + mIsMNNVLSupported = mIsNVLINKSupported && all_ranks_support_mnnvl; + mIsP2PSupported = false; // P2P doesn't work across nodes + + TLLM_LOG_INFO("Inter-node topology: local_NVLink=%d, local_MNNVL=%d, all_ranks_MNNVL=%d, final_MNNVL=%d", + mIsNVLINKSupported ? 1 : 0, local_mnnvl_status, all_ranks_support_mnnvl ? 1 : 0, + mIsMNNVLSupported ? 
1 : 0); + } + else + { + TLLM_LOG_INFO("TP group is intra-node for rank %d", rank); + } } AllReduceStrategyType selectImplementation(size_t seq_len, size_t hidden_size) @@ -951,12 +1204,12 @@ private: if (ifFallbackToNCCL(seq_len, message_size_bytes, max_workspace_size)) { - return AllReduceStrategyType::NCCL; + return AllReduceStrategyType::NCCL_SYMMETRIC; } - // This rule based heuristic only chooses between NCCL and MIN_LATENCY strategies. - // From this point, all fusion patterns are supported by all these strategies: NCCL, ONESHOT, TWOSHOT and - // MIN_LATENCY. + // This rule based heuristic only chooses between NCCL_SYMMETRIC and MIN_LATENCY strategies. + // From this point, all fusion patterns are supported by all these strategies: NCCL_SYMMETRIC, ONESHOT, TWOSHOT + // and MIN_LATENCY. if (mStrategy != AllReduceStrategyType::AUTO) { // Check TWOSHOT constraint: seq_len >= tp_size @@ -973,12 +1226,11 @@ private: return tensorrt_llm::utils::customAllReduceUtils::selectStrategyLookUpTable( seq_len, hidden_size, mOp, mGroup.size()); } - return AllReduceStrategyType::NCCL; } bool ifFallbackToNCCL(size_t seq_len, size_t message_size_bytes, size_t max_workspace_size) { - // If messageSize is less than maxWorkspaceSize, use NCCL, regardless of the fusion type. + // If messageSize is greater than maxWorkspaceSize or topology is unsuitable, use NCCL_SYMMETRIC fallback. if (message_size_bytes > max_workspace_size || !mIsP2PSupported || !mIsNVLINKSupported) { return true; @@ -1006,6 +1258,7 @@ private: std::set mGroup; bool mIsNVLINKSupported; bool mIsP2PSupported; + bool mIsMNNVLSupported; nvinfer1::DataType mType; AllReduceStrategyType mStrategy; AllReduceFusionOp mOp; diff --git a/cpp/tests/unit_tests/multi_gpu/CMakeLists.txt b/cpp/tests/unit_tests/multi_gpu/CMakeLists.txt index 5fb79c766c..44b8e30577 100644 --- a/cpp/tests/unit_tests/multi_gpu/CMakeLists.txt +++ b/cpp/tests/unit_tests/multi_gpu/CMakeLists.txt @@ -20,3 +20,9 @@ target_link_libraries(cacheTransceiverTest PRIVATE ${Python3_LIBRARIES}) add_gtest(mpiUtilsTest mpiUtilsTest.cpp) add_gtest(userBufferTest userBufferTest.cpp) +add_gtest(ncclUtilsTest ncclUtilsTest.cpp) +target_link_libraries(ncclUtilsTest PRIVATE ${Python3_LIBRARIES}) +if(BUILD_PYT) + target_compile_definitions(ncclUtilsTest PUBLIC BUILD_PYT) + target_link_libraries(ncclUtilsTest PUBLIC ${TORCH_LIBRARIES}) +endif() diff --git a/cpp/tests/unit_tests/multi_gpu/ncclUtilsTest.cpp b/cpp/tests/unit_tests/multi_gpu/ncclUtilsTest.cpp new file mode 100644 index 0000000000..bf4ddd2141 --- /dev/null +++ b/cpp/tests/unit_tests/multi_gpu/ncclUtilsTest.cpp @@ -0,0 +1,745 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "tensorrt_llm/common/ncclUtils.h" +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/common/opUtils.h" +#include "tensorrt_llm/runtime/utils/mpiUtils.h" + +#include +#include +#include +#include + +#if ENABLE_MULTI_DEVICE && BUILD_PYT +#include +#endif + +#if ENABLE_MULTI_DEVICE + +namespace mpi = tensorrt_llm::mpi; +namespace tr = tensorrt_llm::runtime; +namespace nccl_util = tensorrt_llm::common::nccl_util; + +using ::getComm; + +// Helper function to create a split communicator for testing +// This allows us to test cleanup behavior explicitly by controlling the lifetime +std::shared_ptr createSplitComm(ncclComm_t parentComm, int color, int key) +{ + ncclComm_t newComm; + ncclResult_t result = ncclCommSplit(parentComm, color, key, &newComm, nullptr); + if (result != ncclSuccess) + { + TLLM_THROW("ncclCommSplit failed with error: %d", result); + } + + // Create a shared_ptr with custom deleter that cleans up resources first + return std::shared_ptr(new ncclComm_t(newComm), + [](ncclComm_t* comm) + { + if (comm && *comm) + { + // STEP 1: Clean up all registered resources FIRST + tensorrt_llm::common::nccl_util::NcclCommResourceManager::getInstance().cleanupResources(*comm); + + // STEP 2: Now destroy the NCCL communicator + ncclResult_t result = ncclCommDestroy(*comm); + if (result != ncclSuccess) + { + TLLM_LOG_WARNING("ncclCommDestroy failed with error: %d", result); + } + + // STEP 3: Free the memory + delete comm; + } + }); +} + +//============================================================================== +// NcclCommResourceManager Tests +//============================================================================== + +class NcclCommResourceManagerTest : public ::testing::Test +{ +protected: + void SetUp() override + { + auto& comm = mpi::MpiComm::world(); + mWorldSize = comm.getSize(); + mRank = comm.getRank(); + + if (mWorldSize < 2) + { + GTEST_SKIP() << "Requires at least 2 ranks (got " << mWorldSize << ")"; + } + + // Set CUDA device for this rank (required before NCCL initialization) + int deviceCount = 0; + TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount)); + if (deviceCount > 0) + { + int deviceId = mRank % deviceCount; + TLLM_CUDA_CHECK(cudaSetDevice(deviceId)); + } + + // Create a communicator for testing + std::set group; + for (int i = 0; i < mWorldSize; ++i) + { + group.insert(i); + } + mComm = getComm(group); + } + + void TearDown() override + { + // Communicator cleanup happens automatically via shared_ptr deleter + mComm.reset(); + } + + int mWorldSize; + int mRank; + std::shared_ptr mComm; +}; + +TEST_F(NcclCommResourceManagerTest, ResourceRegistration) +{ + auto& manager = nccl_util::NcclCommResourceManager::getInstance(); + + // Create a separate comm using split for this test + auto testComm = createSplitComm(*mComm, 0, mRank); + + // Register a resource + bool cleanupCalled = false; + manager.registerResource( + *testComm, [&cleanupCalled]() { cleanupCalled = true; }, "TestResource"); + + EXPECT_TRUE(manager.hasResources(*testComm)); + EXPECT_EQ(manager.getResourceCount(*testComm), 1); + EXPECT_FALSE(cleanupCalled); // Cleanup not called yet + + // Store the raw comm value before destruction + ncclComm_t rawComm = *testComm; + + // Cleanup should be called when comm is destroyed + testComm.reset(); + + // Verify cleanup was called + EXPECT_TRUE(cleanupCalled); + + // Verify cleanup: check that the old comm (now destroyed) no longer has 
resources + // Note: The comm is destroyed, but we can still check the manager's internal state + // The cleanup should have removed all resources for this comm + EXPECT_FALSE(manager.hasResources(rawComm)); + EXPECT_EQ(manager.getResourceCount(rawComm), 0); +} + +TEST_F(NcclCommResourceManagerTest, MultipleResources) +{ + auto& manager = nccl_util::NcclCommResourceManager::getInstance(); + + // Create a separate comm using split for this test + auto testComm = createSplitComm(*mComm, 0, mRank); + + std::vector cleanupOrder; + manager.registerResource( + *testComm, [&cleanupOrder]() { cleanupOrder.push_back(1); }, "Resource1"); + manager.registerResource( + *testComm, [&cleanupOrder]() { cleanupOrder.push_back(2); }, "Resource2"); + manager.registerResource( + *testComm, [&cleanupOrder]() { cleanupOrder.push_back(3); }, "Resource3"); + + EXPECT_EQ(manager.getResourceCount(*testComm), 3); + + // Cleanup order should be preserved - destroy comm and verify order + testComm.reset(); + + // Verify cleanup order was preserved (1, 2, 3) + EXPECT_EQ(cleanupOrder.size(), 3); + EXPECT_EQ(cleanupOrder[0], 1); + EXPECT_EQ(cleanupOrder[1], 2); + EXPECT_EQ(cleanupOrder[2], 3); +} + +TEST_F(NcclCommResourceManagerTest, ResourceCount) +{ + auto& manager = nccl_util::NcclCommResourceManager::getInstance(); + + // Create a separate comm using split for this test + auto testComm = createSplitComm(*mComm, 0, mRank); + + EXPECT_FALSE(manager.hasResources(*testComm)); + EXPECT_EQ(manager.getResourceCount(*testComm), 0); + + manager.registerResource( + *testComm, []() {}, "Test1"); + EXPECT_EQ(manager.getResourceCount(*testComm), 1); + + manager.registerResource( + *testComm, []() {}, "Test2"); + EXPECT_EQ(manager.getResourceCount(*testComm), 2); + + testComm.reset(); +} + +//============================================================================== +// NCCLWindowAllocator Tests +//============================================================================== + +class NCCLWindowAllocatorTest : public ::testing::Test +{ +protected: + void SetUp() override + { + auto& comm = mpi::MpiComm::world(); + mWorldSize = comm.getSize(); + mRank = comm.getRank(); + + if (mWorldSize < 2) + { + GTEST_SKIP() << "Requires at least 2 ranks (got " << mWorldSize << ")"; + } + + // Set CUDA device for this rank (required before NCCL initialization) + int deviceCount = 0; + TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount)); + if (deviceCount > 0) + { + int deviceId = mRank % deviceCount; + TLLM_CUDA_CHECK(cudaSetDevice(deviceId)); + } + + // Check if NCCL symmetric is supported + auto& ncclHelper = nccl_util::NCCLHelper::getInstance(); + if (!ncclHelper.isLoaded()) + { + GTEST_SKIP() << "NCCL library with symmetric memory support is not available"; + } + + std::set group; + for (int i = 0; i < mWorldSize; ++i) + { + group.insert(i); + } + mComm = getComm(group); + } + + void TearDown() override + { + // Cleanup happens automatically + mComm.reset(); + } + + int mWorldSize; + int mRank; + std::shared_ptr mComm; +}; + +TEST_F(NCCLWindowAllocatorTest, BasicAllocation) +{ + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + + const size_t bufferSize = 1024 * 1024; // 1MB + auto buffer = allocator.requestBuffer(*mComm, bufferSize); + + EXPECT_TRUE(buffer.isValid()); + EXPECT_NE(buffer.ptr, nullptr); + EXPECT_NE(buffer.window, nullptr); + EXPECT_EQ(buffer.size, bufferSize); + EXPECT_GE(buffer.handle, 0); + + // Verify we can search for it + auto found = allocator.searchBuffer(*mComm, buffer.ptr); + 
EXPECT_TRUE(found.isValid()); + EXPECT_EQ(found.ptr, buffer.ptr); + + // Release the buffer + allocator.releaseBuffer(*mComm, buffer.ptr); +} + +TEST_F(NCCLWindowAllocatorTest, BufferReuse) +{ + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + + const size_t bufferSize = 512 * 1024; // 512KB + + // Allocate first buffer + auto buffer1 = allocator.requestBuffer(*mComm, bufferSize); + EXPECT_TRUE(buffer1.isValid()); + void* ptr1 = buffer1.ptr; + + // Release it + allocator.releaseBuffer(*mComm, ptr1); + + // Request another buffer of the same size - should reuse + auto buffer2 = allocator.requestBuffer(*mComm, bufferSize); + EXPECT_TRUE(buffer2.isValid()); + EXPECT_EQ(buffer2.ptr, ptr1); // Should be the same buffer + + allocator.releaseBuffer(*mComm, buffer2.ptr); +} + +TEST_F(NCCLWindowAllocatorTest, BestFitReuse) +{ + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + + // Allocate buffers of different sizes + auto buffer1MB = allocator.requestBuffer(*mComm, 1024 * 1024); + auto buffer2MB = allocator.requestBuffer(*mComm, 2 * 1024 * 1024); + auto buffer512KB = allocator.requestBuffer(*mComm, 512 * 1024); + + void* ptr1MB = buffer1MB.ptr; + void* ptr2MB = buffer2MB.ptr; + void* ptr512KB = buffer512KB.ptr; + + // Release all + allocator.releaseBuffer(*mComm, ptr1MB); + allocator.releaseBuffer(*mComm, ptr2MB); + allocator.releaseBuffer(*mComm, ptr512KB); + + // Request 768KB - should reuse 1MB (best fit, smallest that fits) + auto buffer768KB = allocator.requestBuffer(*mComm, 768 * 1024); + EXPECT_TRUE(buffer768KB.isValid()); + EXPECT_EQ(buffer768KB.ptr, ptr1MB); // Should reuse 1MB buffer + EXPECT_EQ(buffer768KB.size, 1024 * 1024); // Original size + + allocator.releaseBuffer(*mComm, buffer768KB.ptr); +} + +TEST_F(NCCLWindowAllocatorTest, MultipleBuffers) +{ + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + + const size_t bufferSize = 256 * 1024; + std::vector ptrs; + + // Allocate multiple buffers + for (int i = 0; i < 5; ++i) + { + auto buffer = allocator.requestBuffer(*mComm, bufferSize); + EXPECT_TRUE(buffer.isValid()); + ptrs.push_back(buffer.ptr); + } + + EXPECT_EQ(allocator.getBufferCount(*mComm), 5); + EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 5); + + // Release all + for (auto* ptr : ptrs) + { + allocator.releaseBuffer(*mComm, ptr); + } + + EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 0); + EXPECT_EQ(allocator.getBufferCount(*mComm), 5); // Buffers still exist, just not in use +} + +TEST_F(NCCLWindowAllocatorTest, SearchBuffer) +{ + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + + const size_t bufferSize = 128 * 1024; + auto buffer = allocator.requestBuffer(*mComm, bufferSize); + + // Test searchBuffer + auto found = allocator.searchBuffer(*mComm, buffer.ptr); + EXPECT_TRUE(found.isValid()); + EXPECT_EQ(found.ptr, buffer.ptr); + // Compare against actual allocated size (ncclMemAlloc may allocate more than requested) + EXPECT_EQ(found.size, buffer.size); + EXPECT_GE(found.size, bufferSize); // At least the requested size + + // Test search for non-existent buffer + void* fakePtr = reinterpret_cast(0xDEADBEEF); + auto notFound = allocator.searchBuffer(*mComm, fakePtr); + EXPECT_FALSE(notFound.isValid()); + + allocator.releaseBuffer(*mComm, buffer.ptr); +} + +TEST_F(NCCLWindowAllocatorTest, GetWindowAndSize) +{ + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + + const size_t bufferSize = 64 * 1024; + auto buffer = allocator.requestBuffer(*mComm, bufferSize); + + // Test getWindow + 
auto window = allocator.getWindow(*mComm, buffer.ptr); + EXPECT_NE(window, nullptr); + EXPECT_EQ(window, buffer.window); + + // Test getSize - compare against actual allocated size (ncclMemAlloc may allocate more than requested) + auto size = allocator.getSize(*mComm, buffer.ptr); + EXPECT_EQ(size, buffer.size); + EXPECT_GE(size, bufferSize); // At least the requested size + + // Test with invalid pointer + void* fakePtr = reinterpret_cast(0xDEADBEEF); + EXPECT_EQ(allocator.getWindow(*mComm, fakePtr), nullptr); + EXPECT_EQ(allocator.getSize(*mComm, fakePtr), 0); + + allocator.releaseBuffer(*mComm, buffer.ptr); +} + +TEST_F(NCCLWindowAllocatorTest, GetBufferInfo) +{ + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + + const size_t bufferSize = 32 * 1024; + auto buffer = allocator.requestBuffer(*mComm, bufferSize); + + auto info = allocator.getBufferInfo(*mComm, buffer.ptr); + EXPECT_TRUE(info.isValid()); + EXPECT_EQ(info.ptr, buffer.ptr); + EXPECT_EQ(info.size, buffer.size); + EXPECT_EQ(info.handle, buffer.handle); + EXPECT_EQ(info.window, buffer.window); + + allocator.releaseBuffer(*mComm, buffer.ptr); +} + +TEST_F(NCCLWindowAllocatorTest, ScopedBuffer) +{ + const size_t bufferSize = 16 * 1024; + + { + nccl_util::ScopedNCCLWindowBuffer scopedBuffer(*mComm, bufferSize); + EXPECT_TRUE(scopedBuffer.getBuffer().isValid()); + EXPECT_NE(scopedBuffer.getPtr(), nullptr); + // Compare against actual allocated size (ncclMemAlloc may allocate more than requested) + EXPECT_EQ(scopedBuffer.getSize(), scopedBuffer.getBuffer().size); + EXPECT_GE(scopedBuffer.getSize(), bufferSize); // At least the requested size + EXPECT_NE(scopedBuffer.getWindow(), nullptr); + + // Buffer should be in use + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 1); + } + + // Buffer should be released when scoped buffer goes out of scope + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 0); +} + +TEST_F(NCCLWindowAllocatorTest, CleanupOnCommDestroy) +{ + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + + // Create a separate comm using split for this test + auto testComm = createSplitComm(*mComm, 0, mRank); + + // Store the raw comm value before destruction + ncclComm_t rawComm = *testComm; + + // Allocate some buffers + const size_t bufferSize = 8 * 1024; + auto buffer1 = allocator.requestBuffer(*testComm, bufferSize); + auto buffer2 = allocator.requestBuffer(*testComm, bufferSize * 2); + + EXPECT_EQ(allocator.getBufferCount(*testComm), 2); + EXPECT_EQ(allocator.getBufferInUseCount(*testComm), 2); + + // Verify buffers are valid + EXPECT_TRUE(buffer1.isValid()); + EXPECT_TRUE(buffer2.isValid()); + + // Manually release buffers before cleanup to avoid warnings + allocator.releaseBuffer(*testComm, buffer1.ptr); + allocator.releaseBuffer(*testComm, buffer2.ptr); + + // Verify buffers are released but still exist in pool + EXPECT_EQ(allocator.getBufferInUseCount(*testComm), 0); + EXPECT_EQ(allocator.getBufferCount(*testComm), 2); // Buffers still exist, just not in use + + // Destroy the communicator - buffers should be cleaned up automatically + testComm.reset(); + + // Verify cleanup: check that the old comm (now destroyed) no longer has buffers + // Note: The comm is destroyed, but we can still check the allocator's internal state + // The cleanup should have removed all buffers for this comm + EXPECT_EQ(allocator.getBufferCount(rawComm), 0); + 
EXPECT_EQ(allocator.getBufferInUseCount(rawComm), 0); + // Note: isCommValid only checks for null, not cleaned-up state, because NCCL can reuse addresses + // The real check is that buffers are gone, which we verify above +} + +TEST_F(NCCLWindowAllocatorTest, CommValidity) +{ + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + + // Valid comm should be valid + EXPECT_TRUE(allocator.isCommValid(*mComm)); + + // Null comm should be invalid + EXPECT_FALSE(allocator.isCommValid(nullptr)); +} + +//============================================================================== +// Integration Tests +//============================================================================== + +TEST_F(NCCLWindowAllocatorTest, MultipleComms) +{ + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + + // Create two different communicators using split (different colors) + auto comm1 = createSplitComm(*mComm, 0, mRank); + auto comm2 = createSplitComm(*mComm, 1, mRank); + + const size_t bufferSize = 4 * 1024; + + // Allocate buffers from both comms + auto buffer1 = allocator.requestBuffer(*comm1, bufferSize); + auto buffer2 = allocator.requestBuffer(*comm2, bufferSize); + + EXPECT_TRUE(buffer1.isValid()); + EXPECT_TRUE(buffer2.isValid()); + + // Buffers should be tracked separately per comm + EXPECT_EQ(allocator.getBufferCount(*comm1), 1); + EXPECT_EQ(allocator.getBufferCount(*comm2), 1); + EXPECT_NE(buffer1.ptr, buffer2.ptr); // Different buffers from different comms + + allocator.releaseBuffer(*comm1, buffer1.ptr); + allocator.releaseBuffer(*comm2, buffer2.ptr); + + // Clean up comms + comm1.reset(); + comm2.reset(); +} + +#if ENABLE_MULTI_DEVICE && BUILD_PYT +//============================================================================== +// createNCCLWindowTensor Tests +//============================================================================== + +class CreateNCCLWindowTensorTest : public ::testing::Test +{ +protected: + void SetUp() override + { + auto& comm = mpi::MpiComm::world(); + mWorldSize = comm.getSize(); + mRank = comm.getRank(); + + if (mWorldSize < 2) + { + GTEST_SKIP() << "Requires at least 2 ranks (got " << mWorldSize << ")"; + } + + // Set CUDA device for this rank (required before NCCL initialization) + int deviceCount = 0; + TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount)); + if (deviceCount > 0) + { + int deviceId = mRank % deviceCount; + TLLM_CUDA_CHECK(cudaSetDevice(deviceId)); + } + + // Check if NCCL symmetric is supported + auto& ncclHelper = nccl_util::NCCLHelper::getInstance(); + if (!ncclHelper.isLoaded()) + { + GTEST_SKIP() << "NCCL library with symmetric memory support is not available"; + } + + std::set group; + for (int i = 0; i < mWorldSize; ++i) + { + group.insert(i); + } + mComm = getComm(group); + } + + void TearDown() override + { + mComm.reset(); + } + + int mWorldSize; + int mRank; + std::shared_ptr mComm; +}; + +TEST_F(CreateNCCLWindowTensorTest, BasicTensorCreation) +{ + using nccl_util::createNCCLWindowTensor; + + // Create a tensor with shape [4, 8] and float32 dtype + std::vector shape = {4, 8}; + auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32); + + // Verify tensor properties + EXPECT_TRUE(tensor.defined()); + EXPECT_EQ(tensor.dtype(), torch::kFloat32); + EXPECT_EQ(tensor.device().type(), torch::kCUDA); + EXPECT_EQ(tensor.dim(), 2); + EXPECT_EQ(tensor.size(0), 4); + EXPECT_EQ(tensor.size(1), 8); + EXPECT_EQ(tensor.numel(), 4 * 8); + + // Verify buffer properties + EXPECT_TRUE(buffer.isValid()); + 
EXPECT_NE(buffer.ptr, nullptr); + // ncclMemAlloc may allocate more than requested, so check at least the requested size + EXPECT_GE(buffer.size, 4 * 8 * sizeof(float)); + EXPECT_NE(buffer.window, nullptr); + + // Verify tensor data pointer matches buffer pointer + EXPECT_EQ(tensor.data_ptr(), buffer.ptr); + + // Tensor should be in use + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 1); +} + +TEST_F(CreateNCCLWindowTensorTest, DifferentDtypes) +{ + using nccl_util::createNCCLWindowTensor; + + std::vector shape = {10}; + + // Test float32 + { + auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32); + EXPECT_EQ(tensor.dtype(), torch::kFloat32); + // ncclMemAlloc may allocate more than requested, so check at least the requested size + EXPECT_GE(buffer.size, 10 * sizeof(float)); + EXPECT_EQ(tensor.data_ptr(), buffer.ptr); + } + + // Test float16 + { + auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat16); + EXPECT_EQ(tensor.dtype(), torch::kFloat16); + // ncclMemAlloc may allocate more than requested, so check at least the requested size + EXPECT_GE(buffer.size, 10 * sizeof(at::Half)); + EXPECT_EQ(tensor.data_ptr(), buffer.ptr); + } + + // Test int32 + { + auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kInt32); + EXPECT_EQ(tensor.dtype(), torch::kInt32); + // ncclMemAlloc may allocate more than requested, so check at least the requested size + EXPECT_GE(buffer.size, 10 * sizeof(int32_t)); + EXPECT_EQ(tensor.data_ptr(), buffer.ptr); + } +} + +TEST_F(CreateNCCLWindowTensorTest, DifferentShapes) +{ + using nccl_util::createNCCLWindowTensor; + + // 1D tensor + { + std::vector shape = {100}; + auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32); + EXPECT_EQ(tensor.dim(), 1); + EXPECT_EQ(tensor.size(0), 100); + // ncclMemAlloc may allocate more than requested, so check at least the requested size + EXPECT_GE(buffer.size, 100 * sizeof(float)); + } + + // 3D tensor + { + std::vector shape = {2, 3, 4}; + auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32); + EXPECT_EQ(tensor.dim(), 3); + EXPECT_EQ(tensor.size(0), 2); + EXPECT_EQ(tensor.size(1), 3); + EXPECT_EQ(tensor.size(2), 4); + // ncclMemAlloc may allocate more than requested, so check at least the requested size + EXPECT_GE(buffer.size, 2 * 3 * 4 * sizeof(float)); + } + + // 4D tensor + { + std::vector shape = {1, 2, 3, 4}; + auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32); + EXPECT_EQ(tensor.dim(), 4); + EXPECT_EQ(tensor.numel(), 1 * 2 * 3 * 4); + // ncclMemAlloc may allocate more than requested, so check at least the requested size + EXPECT_GE(buffer.size, 1 * 2 * 3 * 4 * sizeof(float)); + } +} + +TEST_F(CreateNCCLWindowTensorTest, TensorDeleterReleasesBuffer) +{ + using nccl_util::createNCCLWindowTensor; + + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + + { + std::vector shape = {16, 16}; + auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32); + + EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 1); + EXPECT_TRUE(buffer.isValid()); + void* bufferPtr = buffer.ptr; + + // Tensor goes out of scope - deleter should release the buffer + } + + // Buffer should be released (not in use anymore) + EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 0); + + // Buffer should still exist in the pool (for reuse) + EXPECT_GE(allocator.getBufferCount(*mComm), 1); +} + 
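+// Minimal usage sketch (comment only; `comm` is an ncclComm_t, API as exercised by the tests above):
+//   auto [tensor, buffer] = nccl_util::createNCCLWindowTensor(comm, {16, 16}, torch::kFloat32);
+//   // ... use `tensor` as the input/output of an NCCL symmetric-memory collective ...
+//   // When `tensor` is destroyed, its deleter releases `buffer` back to the
+//   // NCCLWindowAllocator pool, so a later request of a compatible size can reuse it.
+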
+TEST_F(CreateNCCLWindowTensorTest, MultipleTensors) +{ + using nccl_util::createNCCLWindowTensor; + + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + + std::vector shape = {8, 8}; + auto [tensor1, buffer1] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32); + auto [tensor2, buffer2] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32); + auto [tensor3, buffer3] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32); + + EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 3); + EXPECT_NE(buffer1.ptr, buffer2.ptr); + EXPECT_NE(buffer2.ptr, buffer3.ptr); + EXPECT_NE(buffer1.ptr, buffer3.ptr); + + // All tensors should be valid + EXPECT_TRUE(tensor1.defined()); + EXPECT_TRUE(tensor2.defined()); + EXPECT_TRUE(tensor3.defined()); +} + +TEST_F(CreateNCCLWindowTensorTest, TensorStrides) +{ + using nccl_util::createNCCLWindowTensor; + + std::vector shape = {3, 4, 5}; + auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32); + + // Verify strides are correct (row-major order) + EXPECT_EQ(tensor.stride(0), 4 * 5); // stride for first dimension + EXPECT_EQ(tensor.stride(1), 5); // stride for second dimension + EXPECT_EQ(tensor.stride(2), 1); // stride for third dimension +} + +#endif // ENABLE_MULTI_DEVICE && BUILD_PYT + +#endif // ENABLE_MULTI_DEVICE diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index 811f11fce5..aaac2256c9 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -2844,11 +2844,17 @@ class PyTorchModelEngine(ModelEngine): # Disable UB for unsupported platforms if not ub.ub_supported(): return False - use_nccl_symmetric = self.llm_args.allreduce_strategy == "NCCL_SYMMETRIC" - ub.initialize_userbuffers_manager( - self.mapping.tp_size, self.mapping.pp_size, self.mapping.cp_size, - self.mapping.rank, self.mapping.gpus_per_node, - hidden_size * self.max_num_tokens * 2, use_nccl_symmetric) + # NCCL_SYMMETRIC strategy no longer requires UserBuffer allocator initialization. + # It uses NCCLWindowAllocator from ncclUtils directly. 
+ if self.llm_args.allreduce_strategy == "NCCL_SYMMETRIC": + # Skip UB initialization for NCCL_SYMMETRIC - it uses NCCLWindowAllocator directly + return False + ub.initialize_userbuffers_manager(self.mapping.tp_size, + self.mapping.pp_size, + self.mapping.cp_size, + self.mapping.rank, + self.mapping.gpus_per_node, + hidden_size * self.max_num_tokens * 2) return True diff --git a/tensorrt_llm/functional.py b/tensorrt_llm/functional.py index 19e31d04ce..b4c986fd6a 100755 --- a/tensorrt_llm/functional.py +++ b/tensorrt_llm/functional.py @@ -4020,7 +4020,10 @@ def create_allreduce_plugin( pfc = trt.PluginFieldCollection(pfc) ar_plug = allreduce_plg_creator.create_plugin("allreduce", pfc) plug_inputs = [tensor] - if all_reduce_params.strategy != AllReduceStrategy.NCCL and all_reduce_params.strategy != AllReduceStrategy.UB: + if all_reduce_params.strategy not in { + AllReduceStrategy.NCCL, AllReduceStrategy.UB, + AllReduceStrategy.NCCL_SYMMETRIC + }: plug_inputs.append(workspace) if all_reduce_params.fusion_op != AllReduceFusionOp.NONE: if all_reduce_params.has_bias() == 1: @@ -4092,7 +4095,7 @@ def allreduce( workspace = None if all_reduce_params.strategy != AllReduceStrategy.NCCL and all_reduce_params.strategy != AllReduceStrategy.UB: if current_all_reduce_helper().workspace is None: - all_reduce_params.strategy = AllReduceStrategy.NCCL + all_reduce_params.strategy = AllReduceStrategy.NCCL_SYMMETRIC else: workspace = current_all_reduce_helper().workspace.trt_tensor if all_reduce_params.strategy == AllReduceStrategy.UB: diff --git a/tests/integration/defs/cpp/test_multi_gpu.py b/tests/integration/defs/cpp/test_multi_gpu.py index 3b384dd58e..7cf92efaad 100644 --- a/tests/integration/defs/cpp/test_multi_gpu.py +++ b/tests/integration/defs/cpp/test_multi_gpu.py @@ -127,6 +127,24 @@ def run_user_buffer_tests(build_dir: _pl.Path, nprocs=2, timeout=300): timeout=timeout) +def run_nccl_utils_tests(build_dir: _pl.Path, nprocs=2, timeout=300): + tests_dir = build_dir / "tests" / "unit_tests" / "multi_gpu" + mgpu_env = get_multi_gpu_env() + + nccl_utils_test = [ + "mpirun", + "-n", + f"{nprocs}", + "--allow-run-as-root", + "ncclUtilsTest", + ] + + _cpp.run_command(nccl_utils_test, + cwd=tests_dir, + env=mgpu_env, + timeout=timeout) + + def run_llama_executor_leader_tests(build_dir: _pl.Path, timeout=1500): tests_dir = build_dir / "tests" / "e2e_tests" @@ -505,6 +523,15 @@ def test_user_buffer(build_google_tests, nprocs, build_dir): run_user_buffer_tests(build_dir=build_dir, nprocs=nprocs, timeout=300) +@pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"], + indirect=True) +@pytest.mark.parametrize("nprocs", [2, 8], ids=["2proc", "8proc"]) +def test_nccl_utils(build_google_tests, nprocs, build_dir): + + if platform.system() != "Windows": + run_nccl_utils_tests(build_dir=build_dir, nprocs=nprocs, timeout=300) + + @pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"], indirect=True) @pytest.mark.parametrize("multi_gpu_model", ["t5"], indirect=True) diff --git a/tests/microbenchmarks/all_reduce.py b/tests/microbenchmarks/all_reduce.py index 837b034812..bd5ceb8826 100644 --- a/tests/microbenchmarks/all_reduce.py +++ b/tests/microbenchmarks/all_reduce.py @@ -176,6 +176,7 @@ def allreduce_benchmark( ] strategies = [ AllReduceStrategy.NCCL, + AllReduceStrategy.NCCL_SYMMETRIC, AllReduceStrategy.ONESHOT, AllReduceStrategy.TWOSHOT, AllReduceStrategy.AUTO, @@ -242,6 +243,9 @@ def allreduce_benchmark( # print the dataframe if mapping.rank == 0: pd.set_option('display.max_rows', 
None) + pd.set_option('display.max_columns', None) + pd.set_option('display.width', None) + pd.set_option('display.max_colwidth', None) print(df) # # save the dataframe to a csv file diff --git a/tests/scripts/allreduce_perf/allreduce_heuristic_code_gen.py b/tests/scripts/allreduce_perf/allreduce_heuristic_code_gen.py index 11c114e9cf..e7aeb994b6 100644 --- a/tests/scripts/allreduce_perf/allreduce_heuristic_code_gen.py +++ b/tests/scripts/allreduce_perf/allreduce_heuristic_code_gen.py @@ -28,6 +28,7 @@ class Constants: tp_size_list = [2, 4, 8] strategy_name_to_enum = { 'NCCL': 0, + 'NCCL_SYMMETRIC': 8, 'ONESHOT': 4, 'TWOSHOT': 5, } @@ -84,10 +85,10 @@ def generate_heuristic_look_up_table(df: pd.DataFrame) -> str: hidden_size_count = len(Constants.hidden_size_list) num_tokens_count = len(Constants.num_tokens_list) - # Initialize lookup table with default values (NCCL = 0) + # Initialize lookup table with default values (NCCL_SYMMETRIC = 8) strategy_table = np.full( (tp_size_count, fusion_count, hidden_size_count, num_tokens_count), - Constants.strategy_name_to_enum['NCCL'], + Constants.strategy_name_to_enum['NCCL_SYMMETRIC'], dtype=int) # Fill the lookup table with best strategies diff --git a/tests/unittest/_torch/multi_gpu/test_allreduce.py b/tests/unittest/_torch/multi_gpu/test_allreduce.py index c01fe9205c..5051998c5a 100644 --- a/tests/unittest/_torch/multi_gpu/test_allreduce.py +++ b/tests/unittest/_torch/multi_gpu/test_allreduce.py @@ -123,7 +123,7 @@ def run_allreduce_op(x: torch.Tensor, residual: torch.Tensor, hidden_size: int, dtype=dtype, mapping=mapping, tensor_parallel_mode=TensorParallelMode.ROW, - allreduce_strategy=AllReduceStrategy.NCCL, + allreduce_strategy=AllReduceStrategy.NCCL_SYMMETRIC, ).cuda() allreduce = AllReduce(mapping=mapping) norm = RMSNorm(hidden_size=hidden_size, eps=eps, dtype=dtype).cuda() diff --git a/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py b/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py index 56cf5a9562..524fed462e 100644 --- a/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py +++ b/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py @@ -108,7 +108,7 @@ def row_linear_residual_norm_fusion_forward( ub.initialize_userbuffers_manager( tensor_parallel_size, 1, 1, tensor_parallel_rank, torch.cuda.device_count(), - x_list[0].nelement() * x_list[0].element_size(), True) + x_list[0].nelement() * x_list[0].element_size()) elif strategy == AllReduceStrategy.MNNVL: os.environ["TLLM_TEST_MNNVL"] = "1" diff --git a/tests/unittest/_torch/multi_gpu/test_user_buffers.py b/tests/unittest/_torch/multi_gpu/test_user_buffers.py index c547c8a3e8..6de03d1908 100644 --- a/tests/unittest/_torch/multi_gpu/test_user_buffers.py +++ b/tests/unittest/_torch/multi_gpu/test_user_buffers.py @@ -43,8 +43,7 @@ def create_tp_mapping(tp_size, rank): def init_userbuffers_allocator(tp_size, rank, max_ub_size): ub.initialize_userbuffers_manager(tp_size, 1, 1, rank, - torch.cuda.device_count(), max_ub_size, - False) + torch.cuda.device_count(), max_ub_size) def create_userbuffers_tensor(shape, dtype): From 383178c00a94fa104155659f2668068061b275fd Mon Sep 17 00:00:00 2001 From: chenfeiz0326 Date: Mon, 8 Dec 2025 09:00:44 +0800 Subject: [PATCH 05/10] [TRTLLM-9000][feat] Add multi-node Perf Tests into CI (#8800) Signed-off-by: Chenfei Zhang --- jenkins/L0_Test.groovy | 14 +- jenkins/scripts/slurm_run.sh | 2 +- .../defs/perf/open_search_db_utils.py | 113 ++- tests/integration/defs/perf/test_perf.py | 952 +++++++++++++----- tests/integration/defs/perf/utils.py | 
348 ++++++- .../test-db/l0_dgx_b200_perf_sanity.yml | 41 + .../test-db/l0_dgx_b300_perf_sanity.yml | 41 + .../l0_gb200_multi_gpus_perf_sanity.yml | 22 + .../l0_gb200_multi_nodes_perf_sanity.yml | 16 + .../test-db/perf_sanity_l0_dgx_b200.yml | 35 - .../test-db/perf_sanity_l0_dgx_b300.yml | 37 - tests/scripts/perf-sanity/README.md | 201 ++-- tests/scripts/perf-sanity/l0_dgx_b200.yaml | 325 +++++- tests/scripts/perf-sanity/l0_dgx_b300.yaml | 226 ++++- .../perf-sanity/l0_gb200_multi_gpus.yaml | 294 ++++++ .../perf-sanity/l0_gb200_multi_nodes.yaml | 71 ++ 16 files changed, 2119 insertions(+), 619 deletions(-) create mode 100644 tests/integration/test_lists/test-db/l0_dgx_b200_perf_sanity.yml create mode 100644 tests/integration/test_lists/test-db/l0_dgx_b300_perf_sanity.yml create mode 100644 tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml create mode 100644 tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity.yml delete mode 100644 tests/integration/test_lists/test-db/perf_sanity_l0_dgx_b200.yml delete mode 100644 tests/integration/test_lists/test-db/perf_sanity_l0_dgx_b300.yml create mode 100644 tests/scripts/perf-sanity/l0_gb200_multi_gpus.yaml create mode 100644 tests/scripts/perf-sanity/l0_gb200_multi_nodes.yaml diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 41c66a7887..26c7716ba8 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -1126,7 +1126,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, gpuCount=1, nodeCount=1, runWithSbatch=false, skipInstallWheel=false, cpver="cp312") { echo "Run Slurm job with native sbatch: $runWithSbatch" - if(nodeCount > 1 || runWithSbatch) { + if (nodeCount > 1 || runWithSbatch) { runLLMTestlistWithSbatch(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, gpuCount, nodeCount, skipInstallWheel, cpver) } else { runLLMTestlistWithAgent(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, gpuCount, skipInstallWheel, cpver) @@ -2493,7 +2493,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO error "Some tests still failed after rerun attempts, please check the test report." } - if (perfMode) { + if (perfMode && !stageName.contains("Perf-Sanity")) { basePerfFilename = stageName.contains("PyTorch") ? 
"base_perf_pytorch.csv" : "base_perf.csv" basePerfPath = "${llmSrc}/tests/integration/defs/perf/${basePerfFilename}" stage("Check perf result") { @@ -2909,9 +2909,9 @@ def launchTestJobs(pipeline, testFilter) "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4], "DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4], // Perf sanity post merge test - // Disable perf stages due to https://nvbugs/5643646 - // "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "perf_sanity_l0_dgx_b200", 1, 1, 4], - // "DGX_B300-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b300-x4", "perf_sanity_l0_dgx_b300", 1, 1, 4], + // "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 1, 4], + // "DGX_B200-8_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 1, 8], + // "DGX_B300-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b300-x4", "l0_dgx_b300_perf_sanity", 1, 1, 4], ] fullSet += x86SlurmTestConfigs.keySet() @@ -2939,6 +2939,8 @@ def launchTestJobs(pipeline, testFilter) "GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 2, 4], "GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4], "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4], + // Perf sanity post merge test + "GB200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 1, 4], // Disable GB300 stages due to nodes will be offline temporarily. // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1], // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4], @@ -2953,6 +2955,8 @@ def launchTestJobs(pipeline, testFilter) "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2], + // Perf sanity post merge test + "GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_perf_sanity", 1, 1, 8, 2], ] fullSet += multiNodesSBSAConfigs.keySet() diff --git a/jenkins/scripts/slurm_run.sh b/jenkins/scripts/slurm_run.sh index 8f191b3edb..717f1be791 100755 --- a/jenkins/scripts/slurm_run.sh +++ b/jenkins/scripts/slurm_run.sh @@ -109,7 +109,7 @@ echo "Full Command: $pytestCommand" eval $pytestCommand echo "Rank${SLURM_PROCID} Pytest finished execution" -if [ "$perfMode" = "true" ]; then +if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Perf-Sanity* ]]; then if [[ "$stageName" == *PyTorch* ]]; then basePerfFilename="base_perf_pytorch.csv" else diff --git a/tests/integration/defs/perf/open_search_db_utils.py b/tests/integration/defs/perf/open_search_db_utils.py index 9f9ebda169..434af387a5 100644 --- a/tests/integration/defs/perf/open_search_db_utils.py +++ b/tests/integration/defs/perf/open_search_db_utils.py @@ -20,6 +20,7 @@ import os import re import sys import time +from datetime import datetime from defs.trt_test_alternative import print_info @@ -32,40 +33,6 @@ from jenkins.scripts.open_search_db import OpenSearchDB PROJECT_ROOT = "sandbox-temp-trtllm-ci-perf-v1" # "sandbox-trtllm-ci-perf" TEST_INFO_PROJECT_NAME = f"{PROJECT_ROOT}-test_info" -# Server config fields to compare -SERVER_FIELDS = [ - "s_model_name", - "l_gpus", - "l_tp", - "l_ep", - "l_pp", - "l_max_num_tokens", - 
"b_enable_chunked_prefill", - "b_disable_overlap_scheduler", - "s_attention_backend", - "s_moe_backend", - "l_moe_max_num_tokens", - "l_stream_interval", - "b_enable_attention_dp", - "b_attention_dp_balance", - "l_batching_wait_iters", - "l_timeout_iters", - "s_kv_cache_dtype", - "b_enable_block_reuse", - "d_free_gpu_memory_fraction", - "l_max_batch_size", - "b_enable_padding", -] - -# Client config fields to compare -CLIENT_FIELDS = [ - "l_concurrency", - "l_iterations", - "l_isl", - "l_osl", - "d_random_range_ratio", -] - # Metrics where larger is better MAXIMIZE_METRICS = [ "d_seq_throughput", @@ -137,6 +104,7 @@ def get_job_info(): trigger_mr_link = "" trigger_mr_id = "" trigger_mr_commit = "" + artifact_url = "" if is_pr_job: # Get PR info from github_pr_api_url github_pr_api_url = global_vars.get("github_pr_api_url", "") @@ -162,6 +130,9 @@ def get_job_info(): # Set trigger_mr_commit to commit trigger_mr_commit = commit + artifact_url = f"https://urm.nvidia.com/artifactory/sw-tensorrt-generic/llm-artifacts/LLM/main/L0_PostMerge/{job_id}" if job_id else "" + else: + artifact_url = f"https://urm.nvidia.com/artifactory/sw-tensorrt-generic/llm-artifacts/LLM/main/L0_PostMerge/{job_id}" if job_id else "" return { "b_is_baseline": False, @@ -185,11 +156,12 @@ def get_job_info(): "s_trigger_mr_link": trigger_mr_link, "s_trigger_mr_id": trigger_mr_id, "s_trigger_mr_commit": trigger_mr_commit, + "s_artifact_url": artifact_url, "b_is_regression": False, } -def query_history_data(): +def query_history_data(gpu_type): """ Query post-merge data with specific gpu type and model name """ @@ -209,6 +181,16 @@ def query_history_data(): "b_is_post_merge": True } }, + { + "term": { + "b_is_regression": False + } + }, + { + "term": { + "s_gpu_type": gpu_type + } + }, { "range": { "ts_created": { @@ -263,30 +245,38 @@ def query_history_data(): return [] -def match(history_data, new_data): +def match(history_data, new_data, match_keys): """ Check if the server and client config of history data matches the new data """ - # Combine all fields to compare (excluding log links) - fields_to_compare = SERVER_FIELDS + CLIENT_FIELDS def is_empty(value): - """Check if a value is empty (None, empty string, etc.)""" return value is None or value == "" - # Compare each field - for field in fields_to_compare: - history_value = history_data.get(field) - new_value = new_data.get(field) + def should_skip_field(field): + # Skip fields starting with @, _, ts_ + if field.startswith('@') or field.startswith('_') or field.startswith( + 'ts_'): + return True + # Skip log links and speculative_model_dir and job configs + if field in [ + 's_speculative_model_dir', 's_server_log_link', + 's_ctx_server_log_link', 's_gen_server_log_link', + 's_client_log_link' + ]: + return True + return False - # If both are empty, consider them equal + for field in match_keys: + # Skip excluded fields + if should_skip_field(field): + continue + history_value = history_data.get(field, None) + new_value = new_data.get(field, None) if is_empty(history_value) and is_empty(new_value): continue - - # If values don't match, return False if history_value != new_value: return False - return True @@ -339,27 +329,44 @@ def calculate_best_perf_result(history_data_list, new_data): return best_metrics -def get_history_data(new_data_dict): +def get_history_data(new_data_dict, gpu_type, match_keys): """ Query history post-merge data for each cmd_idx """ + + def get_latest_data(data_list): + if not data_list: + return None + time_format = "%b %d, %Y @ %H:%M:%S.%f" 
+ # Find the item with the maximum ts_created value + latest_data = max( + data_list, + key=lambda x: datetime.strptime(x["ts_created"], time_format)) + return latest_data + history_baseline_dict = {} history_data_dict = {} cmd_idxs = new_data_dict.keys() for cmd_idx in cmd_idxs: history_data_dict[cmd_idx] = [] - history_baseline_dict[cmd_idx] = None - history_data_list = query_history_data() + history_baseline_dict[cmd_idx] = [] + history_data_list = [] + if cmd_idxs: + history_data_list = query_history_data(gpu_type) if history_data_list: for history_data in history_data_list: for cmd_idx in cmd_idxs: - if match(history_data, new_data_dict[cmd_idx]): + if match(history_data, new_data_dict[cmd_idx], match_keys): if history_data.get("b_is_baseline") and history_data.get( "b_is_baseline") == True: - history_baseline_dict[cmd_idx] = history_data + history_baseline_dict[cmd_idx].append(history_data) else: history_data_dict[cmd_idx].append(history_data) break + # Sometime database has several baselines and we only use the latest baseline one + for cmd_idx, baseline_list in history_baseline_dict.items(): + latest_baseline = get_latest_data(baseline_list) + history_baseline_dict[cmd_idx] = latest_baseline return history_baseline_dict, history_data_dict @@ -477,6 +484,8 @@ def post_new_perf_data(new_baseline_data_dict, new_data_dict, # Only post regressive test cases when post-merge. if new_baseline_data_dict: data_list.extend(regressive_data_list) + if not data_list: + return try: print_info( f"Ready to post {len(data_list)} data to {TEST_INFO_PROJECT_NAME}") diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index 942b3bd878..c8cd559e4d 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -18,6 +18,7 @@ TensorRT LLM perf tests import os import re import shutil +import socket import sys from typing import Dict, List, NamedTuple @@ -34,9 +35,10 @@ from .open_search_db_utils import (add_id, get_history_data, get_job_info, print_regressive_test_cases) from .pytorch_model_config import get_model_yaml_config from .sampler_options_config import get_sampler_options_config -from .utils import (AbstractPerfScriptTestClass, PerfBenchScriptTestCmds, - PerfDisaggScriptTestCmds, PerfMetricType, - PerfServerClientBenchmarkCmds, generate_test_nodes) +from .utils import (AbstractPerfScriptTestClass, PerfAggrScriptTestCmds, + PerfBenchScriptTestCmds, PerfDisaggScriptTestCmds, + PerfMetricType, PerfMultiNodeDisaggScriptTestCmds, + generate_test_nodes) if not hasattr(re, "Pattern"): re.Pattern = type(re.compile("")) @@ -103,6 +105,7 @@ MODEL_PATH_DICT = { "deepseek_r1_nvfp4": "DeepSeek-R1/DeepSeek-R1-FP4", "deepseek_r1_0528_fp8": "DeepSeek-R1/DeepSeek-R1-0528/", "deepseek_r1_0528_fp4": "DeepSeek-R1/DeepSeek-R1-0528-FP4/", + "deepseek_r1_0528_fp4_v2": "DeepSeek-R1/DeepSeek-R1-0528-FP4-v2/", "deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8", "deepseek_v3_lite_nvfp4": "DeepSeek-V3-Lite/nvfp4_moe_only", "qwen2_7b_instruct": "Qwen2-7B-Instruct", @@ -310,7 +313,7 @@ BENCH_PERF_METRIC_LOG_QUERIES = { r"Final KV cache size after resize: ([\d\.]+) GiB).*"), } -SERVER_BENCHMARK_PERF_METRIC_LOG_QUERIES = { +AGGR_SERVER_PERF_METRIC_LOG_QUERIES = { PerfMetricType.SEQ_THROUGHPUT: re.compile(r"Request throughput \(req\/s\):\s+([\d\.]+)"), PerfMetricType.TOKEN_THROUGHPUT: @@ -345,13 +348,6 @@ SERVER_BENCHMARK_PERF_METRIC_LOG_QUERIES = { re.compile(r"P99 E2EL \(ms\):\s+([\d\.]+)"), } -DISAGG_SERVER_METRICS_LOG_QUERIES = { - 
PerfMetricType.DISAGG_SERVER_E2EL: - re.compile(r"Median E2EL \(ms\):\s*(\d+\.?\d*)"), - PerfMetricType.DISAGG_SERVER_TTFT: - re.compile(r"Median TTFT \(ms\):\s*(\d+\.?\d*)"), -} - # (Relative threshold, Absolute threshold) for all metric types PERF_METRIC_THRESHOLD = { PerfMetricType.BUILD_TIME: (0.1, 30), # Ignore build time regression < 30ms @@ -443,7 +439,7 @@ INFERENCE_METRICS = [ PerfMetricType.CONTEXT_GPU_MEMORY, ] -SERVER_BENCHMARK_METRICS = [ +AGGR_SERVER_METRICS = [ PerfMetricType.SEQ_THROUGHPUT, PerfMetricType.TOKEN_THROUGHPUT, PerfMetricType.TOTAL_TOKEN_THROUGHPUT, @@ -496,138 +492,247 @@ class PerfTestMetric(NamedTuple): cmd_idx: int +def to_env_dict(env_vars: str) -> Dict[str, str]: + env = {} + for env_var in env_vars.split(): + if "=" in env_var: + key, value = env_var.split("=", 1) + env[key] = value + return env + + class ServerConfig: """ Configurations of trtllm-server. """ - def __init__( - self, - name: str, - model_name: str, - gpus: int, - tp: int, - ep: int, - max_num_tokens: int, - attention_backend: str, - max_batch_size: int, - pp: int = 1, - enable_chunked_prefill: bool = False, - disable_overlap_scheduler: bool = False, - moe_backend: str = "", - moe_max_num_tokens: int = 0, - stream_interval: int = 10, - enable_attention_dp: bool = False, - attention_dp_balance: bool = False, - batching_wait_iters: int = 10, - timeout_iters: int = 50, - kv_cache_dtype: str = "fp8", - enable_block_reuse: bool = False, - free_gpu_memory_fraction: float = 0.8, - enable_padding: bool = True, - ): - self.name = name - self.model_name = model_name - self.gpus = gpus - self.tp = tp - self.ep = ep - self.pp = pp - self.max_num_tokens = max_num_tokens - self.enable_chunked_prefill = enable_chunked_prefill - self.disable_overlap_scheduler = disable_overlap_scheduler - self.attention_backend = attention_backend - self.moe_backend = moe_backend - self.moe_max_num_tokens = moe_max_num_tokens - self.stream_interval = stream_interval - self.enable_attention_dp = enable_attention_dp - self.attention_dp_balance = attention_dp_balance - self.batching_wait_iters = batching_wait_iters - self.timeout_iters = timeout_iters - self.kv_cache_dtype = kv_cache_dtype - self.enable_block_reuse = enable_block_reuse - self.free_gpu_memory_fraction = free_gpu_memory_fraction - self.max_batch_size = max_batch_size - self.enable_padding = enable_padding - + def __init__(self, server_config_data: dict, env_vars: str = ""): + # Extract required fields + self.name = server_config_data['name'] + self.model_name = server_config_data['model_name'] + self.gpus = server_config_data['gpus'] self.model_path = "" + self.env_vars = env_vars - def to_cmd(self, working_dir: str) -> List[str]: + # Extract optional fields with defaults + self.tp = server_config_data.get('tensor_parallel_size', self.gpus) + self.ep = server_config_data.get('moe_expert_parallel_size', 1) + self.pp = server_config_data.get('pipeline_parallel_size', 1) + self.gpus_per_node = server_config_data.get('gpus_per_node', self.gpus) + self.max_num_tokens = server_config_data.get('max_num_tokens', 2048) + self.max_batch_size = server_config_data.get('max_batch_size', 512) + self.max_seq_len = server_config_data.get('max_seq_len', 0) + self.disable_overlap_scheduler = server_config_data.get( + 'disable_overlap_scheduler', False) + self.num_postprocess_workers = server_config_data.get( + 'num_postprocess_workers', 0) + self.stream_interval = server_config_data.get('stream_interval', 10) + self.attn_backend = server_config_data.get('attn_backend', "TRTLLM") 
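+        # Optional tuning knobs below fall back to their .get() defaults when the
+        # YAML server entry omits them.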
+ self.enable_chunked_prefill = server_config_data.get( + 'enable_chunked_prefill', False) + self.enable_attention_dp = server_config_data.get( + 'enable_attention_dp', False) + self.trust_remote_code = server_config_data.get('trust_remote_code', + False) + + # attention_dp_config + attention_dp_config = server_config_data.get('attention_dp_config', {}) + self.attention_dp_balance = attention_dp_config.get( + 'enable_balance', False) + self.batching_wait_iters = attention_dp_config.get( + 'batching_wait_iters', 0) + self.timeout_iters = attention_dp_config.get('timeout_iters', 60) + + # moe_config + moe_config = server_config_data.get('moe_config', {}) + self.moe_backend = moe_config.get('backend', "") + self.moe_max_num_tokens = moe_config.get('max_num_tokens', 0) + + # cuda_graph_config + cuda_graph_config = server_config_data.get('cuda_graph_config', {}) + self.enable_cuda_graph = False + if cuda_graph_config: + self.enable_cuda_graph = True + self.enable_padding = cuda_graph_config.get('enable_padding', True) + self.cuda_graph_batch_sizes = cuda_graph_config.get( + 'batch_sizes', []) + self.cuda_graph_max_batch_size = cuda_graph_config.get( + 'max_batch_size', 0) + else: + self.enable_padding = True + self.cuda_graph_batch_sizes = [] + self.cuda_graph_max_batch_size = 0 + + # kv_cache_config + kv_cache_config = server_config_data.get('kv_cache_config', {}) + self.kv_cache_dtype = kv_cache_config.get('dtype', "fp8") + self.enable_block_reuse = kv_cache_config.get('enable_block_reuse', + False) + self.free_gpu_memory_fraction = kv_cache_config.get( + 'free_gpu_memory_fraction', 0.8) + + # cache_transceiver_config + cache_transceiver_config = server_config_data.get( + 'cache_transceiver_config', {}) + self.cache_transceiver_backend = cache_transceiver_config.get( + 'backend', "") + self.cache_transceiver_max_tokens_in_buffer = cache_transceiver_config.get( + 'max_tokens_in_buffer', 0) + + # speculative_config + speculative_config = server_config_data.get('speculative_config', {}) + self.spec_decoding_type = speculative_config.get('decoding_type', "") + self.num_nextn_predict_layers = speculative_config.get( + 'num_nextn_predict_layers', 0) + eagle3_value = speculative_config.get('eagle3_layers_to_capture', []) + if isinstance(eagle3_value, int): + self.eagle3_layers_to_capture = [eagle3_value] + elif isinstance(eagle3_value, list): + self.eagle3_layers_to_capture = eagle3_value + else: + self.eagle3_layers_to_capture = [] + self.max_draft_len = speculative_config.get('max_draft_len', 0) + self.speculative_model_dir = speculative_config.get( + 'speculative_model_dir', "") + + # Store filtered config for extra_llm_api_config (exclude name, model_name, gpus, client_configs) + self.extra_llm_api_config_data = { + k: v + for k, v in server_config_data.items() + if k not in ['name', 'model_name', 'gpus', 'client_configs'] + } + + def to_cmd(self, + output_dir: str, + numa_bind: bool = False, + disagg_serving_type: str = "", + hostname: str = "localhost", + port: int = 8000) -> List[str]: model_dir = get_model_dir(self.model_name) self.model_path = model_dir if os.path.exists( model_dir) else self.model_name - config_path = os.path.join(working_dir, - f"extra-llm-api-config.{self.name}.yml") - return [ - "trtllm-serve", self.model_path, "--host", "localhost", "--port", - "8000", "--backend", "pytorch", "--extra_llm_api_options", + config_filename = f"extra-llm-api-config.{self.name}.yml" + if "CTX" in disagg_serving_type: + config_filename = f"extra-llm-api-config.{self.name}.ctx.yml" + elif 
"GEN" in disagg_serving_type: + config_filename = f"extra-llm-api-config.{self.name}.gen.yml" + config_path = os.path.join(output_dir, config_filename) + + numa_bind_cmd = [] + if numa_bind: + numa_bind_cmd = ["numactl", "-m 0,1"] + + cmd = numa_bind_cmd + [ + "trtllm-serve", self.model_path, "--host", hostname, "--port", + str(port), "--backend", "pytorch", "--extra_llm_api_options", config_path ] + return cmd + + def to_env(self) -> Dict[str, str]: + return to_env_dict(self.env_vars) def to_db_data(self) -> dict: - """Convert ServerConfig to Database data""" - return { - "s_model_name": self.model_name.lower(), - "l_gpus": self.gpus, - "l_tp": self.tp, - "l_ep": self.ep, - "l_pp": self.pp, - "l_max_num_tokens": self.max_num_tokens, - "b_enable_chunked_prefill": self.enable_chunked_prefill, - "b_disable_overlap_scheduler": self.disable_overlap_scheduler, - "s_attention_backend": self.attention_backend, - "s_moe_backend": self.moe_backend, - "l_moe_max_num_tokens": self.moe_max_num_tokens, - "l_stream_interval": self.stream_interval, - "b_enable_attention_dp": self.enable_attention_dp, - "b_attention_dp_balance": self.attention_dp_balance, - "l_batching_wait_iters": self.batching_wait_iters, - "l_timeout_iters": self.timeout_iters, - "s_kv_cache_dtype": self.kv_cache_dtype, - "b_enable_block_reuse": self.enable_block_reuse, - "d_free_gpu_memory_fraction": self.free_gpu_memory_fraction, - "l_max_batch_size": self.max_batch_size, - "b_enable_padding": self.enable_padding, - "s_server_log_link": "", + db_data = { + "s_model_name": + self.model_name.lower(), + "l_gpus": + self.gpus, + "l_tp": + self.tp, + "l_ep": + self.ep, + "l_pp": + self.pp, + "l_gpus_per_node": + self.gpus_per_node, + "l_max_num_tokens": + self.max_num_tokens, + "l_max_batch_size": + self.max_batch_size, + "l_max_seq_len": + self.max_seq_len, + "b_disable_overlap_scheduler": + self.disable_overlap_scheduler, + "l_num_postprocess_workers": + self.num_postprocess_workers, + "l_stream_interval": + self.stream_interval, + "s_attn_backend": + self.attn_backend, + "b_enable_chunked_prefill": + self.enable_chunked_prefill, + "b_enable_attention_dp": + self.enable_attention_dp, + "b_trust_remote_code": + self.trust_remote_code, + # attention_dp_config + "b_attention_dp_balance": + self.attention_dp_balance, + "l_batching_wait_iters": + self.batching_wait_iters, + "l_timeout_iters": + self.timeout_iters, + # moe_config + "s_moe_backend": + self.moe_backend, + "l_moe_max_num_tokens": + self.moe_max_num_tokens, + # cuda_graph_config + "b_enable_cuda_graph": + self.enable_cuda_graph, + "b_enable_padding": + self.enable_padding, + "l_cuda_graph_max_batch_size": + self.cuda_graph_max_batch_size, + "s_cuda_graph_batch_sizes": + ",".join(map(str, self.cuda_graph_batch_sizes)), + # kv_cache_config + "s_kv_cache_dtype": + self.kv_cache_dtype, + "b_enable_block_reuse": + self.enable_block_reuse, + "d_free_gpu_memory_fraction": + self.free_gpu_memory_fraction, + # cache_transceiver_config + "s_cache_transceiver_backend": + self.cache_transceiver_backend, + "l_cache_transceiver_max_tokens_in_buffer": + self.cache_transceiver_max_tokens_in_buffer, + # speculative_config + "s_spec_decoding_type": + self.spec_decoding_type, + "l_num_nextn_predict_layers": + self.num_nextn_predict_layers, + "s_eagle3_layers_to_capture": + ",".join(map(str, self.eagle3_layers_to_capture)), + "l_max_draft_len": + self.max_draft_len, + "s_speculative_model_dir": + self.speculative_model_dir, + "s_server_log_link": + "", + "s_server_env_var": + self.env_vars, } + 
return db_data def generate_extra_llm_api_config(self) -> str: """Generate extra-llm-api-config.yml content""" - config_lines = [ - f"tensor_parallel_size: {self.tp}", - f"moe_expert_parallel_size: {self.ep}", - f"pipeline_parallel_size: {self.pp}", - f"max_num_tokens: {self.max_num_tokens}", - f"enable_attention_dp: {str(self.enable_attention_dp).lower()}", - f"disable_overlap_scheduler: {str(self.disable_overlap_scheduler).lower()}", - f"stream_interval: {self.stream_interval}", - f"attn_backend: {self.attention_backend}", - f"enable_chunked_prefill: {str(self.enable_chunked_prefill).lower()}", - "cuda_graph_config:", - f" enable_padding: {str(self.enable_padding).lower()}", - f" max_batch_size: {self.max_batch_size}", - "kv_cache_config:", - f" dtype: {self.kv_cache_dtype}", - f" free_gpu_memory_fraction: {self.free_gpu_memory_fraction}", - f" enable_block_reuse: {str(self.enable_block_reuse).lower()}", - "print_iter_log: false", - ] + # Make a copy to avoid modifying the original + config_data = dict(self.extra_llm_api_config_data) - # Add moe_config if moe_backend is specified - if self.moe_backend: - config_lines.append("moe_config:") - config_lines.append(f" backend: {self.moe_backend}") - if self.moe_max_num_tokens: - config_lines.append( - f" max_num_tokens: {self.moe_max_num_tokens}") + # Handle speculative_model_dir path conversion if it exists + if 'speculative_config' in config_data and 'speculative_model_dir' in config_data[ + 'speculative_config']: + spec_model_dir = config_data['speculative_config'][ + 'speculative_model_dir'] + if spec_model_dir: + config_data['speculative_config'][ + 'speculative_model_dir'] = os.path.join( + llm_models_root(), spec_model_dir) - if self.attention_dp_balance: - config_lines.append("attention_dp_balance:") - config_lines.append(" enable_balance: true") - config_lines.append( - f" batching_wait_iters: {self.batching_wait_iters}") - config_lines.append(f" timeout_iters: {self.timeout_iters}") - - return "\n".join(config_lines) + return yaml.dump(config_data, default_flow_style=False, sort_keys=False) class ClientConfig: @@ -636,28 +741,30 @@ class ClientConfig: """ def __init__(self, - name: str, + client_config_data: dict, model_name: str, - concurrency: int, - iterations: int, - isl: int, - osl: int, - random_range_ratio: float = 0.0): - self.name = name + env_vars: str = ""): + self.name = client_config_data.get('name', '') self.model_name = model_name - self.concurrency = concurrency - self.iterations = iterations - self.isl = isl - self.osl = osl - self.random_range_ratio = random_range_ratio - + self.concurrency = client_config_data.get('concurrency', 1) + self.iterations = client_config_data.get('iterations', 1) + self.isl = client_config_data.get('isl', 1024) + self.osl = client_config_data.get('osl', 1024) + self.random_range_ratio = client_config_data.get( + 'random_range_ratio', 0.0) + self.backend = client_config_data.get('backend', "") + self.use_chat_template = client_config_data.get('use_chat_template', + False) + self.streaming = client_config_data.get('streaming', True) self.model_path = "" + self.env_vars = env_vars - def to_cmd(self, working_dir: str) -> List[str]: + def to_cmd(self, need_hostname: bool = True) -> List[str]: model_dir = get_model_dir(self.model_name) self.model_path = model_dir if os.path.exists( model_dir) else self.model_name - return [ + + benchmark_cmd = [ "python", "-m", "tensorrt_llm.serve.scripts.benchmark_serving", "--model", self.model_path, "--dataset-name", "random", "--random-ids", 
"--num-prompts", @@ -668,17 +775,40 @@ class ClientConfig: "--percentile-metrics", "ttft,tpot,itl,e2el", "--max-concurrency", str(self.concurrency) ] + if need_hostname: + hostname_port = ["--host", "localhost", "--port", "8000"] + benchmark_cmd.extend(hostname_port) + if self.backend: + benchmark_cmd.append("--backend") + benchmark_cmd.append(self.backend) + if self.use_chat_template: + benchmark_cmd.append("--use-chat-template") + if not self.streaming: + benchmark_cmd.append("--non-streaming") + return benchmark_cmd + + def to_env(self) -> Dict[str, str]: + return to_env_dict(self.env_vars) def to_db_data(self) -> dict: """Convert ClientConfig to Database data""" - return { + db_data = { "l_concurrency": self.concurrency, "l_iterations": self.iterations, "l_isl": self.isl, "l_osl": self.osl, "d_random_range_ratio": self.random_range_ratio, + "s_backend": self.backend, + "b_use_chat_template": self.use_chat_template, + "b_streaming": self.streaming, "s_client_log_link": "", + "s_client_env_vars": self.env_vars, } + if self.backend: + db_data["s_backend"] = self.backend + if self.use_chat_template: + db_data["b_use_chat_template"] = self.use_chat_template + return db_data def parse_select_pattern(select_pattern: str): @@ -720,8 +850,8 @@ def parse_select_pattern(select_pattern: str): return execution_plan -def parse_config_file(config_file_path: str, select_pattern: str = None): - """Parse YAML configuration file and create ServerConfig and ClientConfig objects +def parse_aggr_config_file(config_file_path: str, select_pattern: str = None): + """Parse YAML configuration file and create ServerConfig and ClientConfig objects for aggregated server Args: config_file_path: Path to YAML configuration file @@ -742,6 +872,16 @@ def parse_config_file(config_file_path: str, select_pattern: str = None): with open(config_file_path, 'r') as f: config = yaml.safe_load(f) + # Read environment config + environment = config.get('environment', {}) + if not environment: + environment = {} + + # Get environment variables + environment.get('worker_env_var', '') + server_env_var = environment.get('server_env_var', '') + client_env_var = environment.get('client_env_var', '') + server_configs = [] server_client_configs = {} @@ -752,39 +892,8 @@ def parse_config_file(config_file_path: str, select_pattern: str = None): if execution_plan is not None and server_name not in execution_plan: continue - # Create ServerConfig object - server_config = ServerConfig( - name=server_config_data['name'], - model_name=server_config_data['model_name'], - gpus=server_config_data['gpus'], - tp=server_config_data['tp'], - ep=server_config_data['ep'], - pp=server_config_data.get('pp', 1), - attention_backend=server_config_data.get('attention_backend', - 'TRTLLM'), - moe_backend=server_config_data.get('moe_backend', ''), - moe_max_num_tokens=server_config_data.get('moe_max_num_tokens', 0), - stream_interval=server_config_data.get('stream_interval', 10), - enable_attention_dp=server_config_data.get('enable_attention_dp', - False), - attention_dp_balance=server_config_data.get('attention_dp_balance', - False), - batching_wait_iters=server_config_data.get('batching_wait_iters', - 10), - timeout_iters=server_config_data.get('timeout_iters', 50), - enable_chunked_prefill=server_config_data.get( - 'enable_chunked_prefill', False), - max_num_tokens=server_config_data.get('max_num_tokens', 2048), - disable_overlap_scheduler=server_config_data.get( - 'disable_overlap_scheduler', False), - kv_cache_dtype=server_config_data.get('kv_cache_dtype', 
'fp8'), - enable_block_reuse=server_config_data.get('enable_block_reuse', - False), - free_gpu_memory_fraction=server_config_data.get( - 'free_gpu_memory_fraction', 0.8), - max_batch_size=server_config_data.get('max_batch_size', 256), - enable_padding=server_config_data.get('enable_padding', True)) - + # Create ServerConfig object directly from dict + server_config = ServerConfig(server_config_data, server_env_var) server_id = len(server_configs) server_configs.append(server_config) @@ -802,15 +911,9 @@ def parse_config_file(config_file_path: str, select_pattern: str = None): if client_name not in selected_client_names: continue - client_config = ClientConfig( - name=client_config_data['name'], - model_name=server_config_data['model_name'], - concurrency=client_config_data['concurrency'], - iterations=client_config_data.get('iterations', 1), - isl=client_config_data.get('isl', 1024), - osl=client_config_data.get('osl', 1024), - random_range_ratio=client_config_data.get( - 'random_range_ratio', 0.0)) + client_config = ClientConfig(client_config_data, + server_config_data['model_name'], + client_env_var) client_configs.append(client_config) server_client_configs[server_id] = client_configs @@ -818,6 +921,87 @@ def parse_config_file(config_file_path: str, select_pattern: str = None): return execution_plan, server_configs, server_client_configs +def parse_multi_node_disagg_config_file(config_file_path: str, + select_pattern: str = None): + disagg_serving_type = os.environ.get("DISAGG_SERVING_TYPE", "BENCHMARK") + + # Read YAML config file + with open(config_file_path, 'r') as f: + config = yaml.safe_load(f) + + disagg_configs = [] + hardware = config.get('hardware', {}) + benchmark = config.get('benchmark', {}) + environment = config.get('environment', {}) + slurm_config = config.get('slurm', {}) + worker_config = config.get('worker_config', {}) + timeout = slurm_config.get('timeout', 3600) + numa_bind = slurm_config.get('numa_bind', False) + + # Get model name from environment + model_name = environment.get('model_name', '') + assert model_name, "model_name is required in environment section" + + # Get environment variables + worker_env_var = environment.get('worker_env_var', '') + server_env_var = environment.get('server_env_var', '') + client_env_var = environment.get('client_env_var', '') + + # Create ctx_server config data + ctx_server_config_data = { + 'name': 'ctx_server', + 'model_name': model_name, + 'gpus': hardware.get('gpus_per_ctx_server'), + 'gpus_per_node': hardware.get('gpus_per_node'), + **worker_config.get('ctx', {}) + } + + # Create gen_server config data + gen_server_config_data = { + 'name': 'gen_server', + 'model_name': model_name, + 'gpus': hardware.get('gpus_per_gen_server'), + 'gpus_per_node': hardware.get('gpus_per_node'), + **worker_config.get('gen', {}) + } + + # Create client config data + concurrency_str = benchmark.get('concurrency_list', '1') + concurrency = int(concurrency_str) if isinstance(concurrency_str, + str) else concurrency_str + + client_config_data = { + 'name': 'client', + 'concurrency': concurrency, + 'iterations': benchmark.get('multi_round', 1), + 'isl': benchmark.get('input_length', 1024), + 'osl': benchmark.get('output_length', 1024), + 'random_range_ratio': benchmark.get('benchmark_ratio', 0.0), + 'backend': 'openai', + 'use_chat_template': False, + 'streaming': benchmark.get('streaming', True), + } + + # Create disagg_config dict + disagg_config = { + 'disagg_serving_type': disagg_serving_type, + 'hostname': socket.gethostname(), + 
'numa_bind': numa_bind, + 'timeout': timeout, + 'name': 'disagg_config', + 'model_name': model_name, + 'hardware': hardware, + 'ctx_server': ServerConfig(ctx_server_config_data, worker_env_var), + 'gen_server': ServerConfig(gen_server_config_data, worker_env_var), + 'server_env_var': server_env_var, + 'client': ClientConfig(client_config_data, model_name, client_env_var), + } + print_info(f"disagg_config: {disagg_config}") + disagg_configs.append(disagg_config) + + return disagg_configs + + class PerfTestConfig: """ Configurations defining the LLM perf test. @@ -928,15 +1112,16 @@ class PerfTestConfig: self.gen_server_workers = 0 # Used for perf sanity test - # config_file: YAML path, select_pattern: server/client selection string - # server_configs: list[ServerConfig], server_client_configs: dict[server_id -> list[ClientConfig]] self.upload_to_db = False self.config_file = None self.gpu_type = None self.config_path = None self.select_pattern = None + # Aggregated mode self.server_configs = [] self.server_client_configs = {} + # Multi-node disaggregated mode + self.disagg_configs = [] def _to_string_disagg(self, entries: List[str]): entries.append(f"disagg_server") @@ -965,10 +1150,16 @@ class PerfTestConfig: # Used for perf sanity test if self.config_file is not None: entries = ["perf_sanity", self.config_file] - if custom_server_name is not None: - entries.append(f"server:{custom_server_name}") - if custom_client_name is not None: - entries.append(f"client:{custom_client_name}") + if "disagg" in self.config_file: + # For multi-node disagg, add disagg config name + if custom_server_name is not None: + entries.append(f"disagg:{custom_server_name}") + else: + # For aggr_server + if custom_server_name is not None: + entries.append(f"server:{custom_server_name}") + if custom_client_name is not None: + entries.append(f"client:{custom_client_name}") return "-".join(entries) # First, add the model name. @@ -1140,15 +1331,33 @@ class PerfTestConfig: # Extract configs from test param labels. labels = test_param_labels.split("-") + def get_gpu_type(label: str) -> str: + parts = label.split("_") + if len(parts) < 2 or parts[0] != "l0": + return "" + if parts[1] == "dgx": + if len(parts) >= 3: + gpu_type = f"{parts[1]}_{parts[2]}" + else: + gpu_type = "" + else: + gpu_type = parts[1] + return gpu_type.lower() + # Used for perf sanity test if "perf_sanity" in labels[0]: assert len(labels) > 1, "perf_sanity test must have a config file!" - self.runtime = "server-benchmark" self.upload_to_db = "upload" in labels[0] self.config_file = labels[1] - self.gpu_type = labels[1].replace("l0_", "").lower() + if "disagg" in labels[1]: + self.runtime = "multi_node_disagg_server" + else: + self.runtime = "aggr_server" + self.gpu_type = get_gpu_type(labels[1]) + config_folder = os.getenv("TRTLLM_CONFIG_FOLDER", + "tests/scripts/perf-sanity") self.config_path = os.path.join( - "tests/scripts/perf-sanity", f"{labels[1]}.yaml" + config_folder, f"{labels[1]}.yaml" if not labels[1].endswith(".yaml") else labels[1]) self.select_pattern = labels[2] if len(labels) > 2 else None return @@ -1370,14 +1579,21 @@ class PerfTestConfig: [b >= 32 for b in self.batch_sizes] ), f"gpt_350m and bloom_560m with small BS are very unstable! Please increase to at least 32." - def set_server_client_configs(self, llm_root: str) -> None: + def set_aggr_server_configs(self, llm_root: str) -> None: """ Set the server and client configs. 
""" - if self.runtime == "server-benchmark": - config_file_path = os.path.join(llm_root, self.config_path) - _, self.server_configs, self.server_client_configs = parse_config_file( - config_file_path, self.select_pattern) + config_file_path = os.path.join(llm_root, self.config_path) + _, self.server_configs, self.server_client_configs = parse_aggr_config_file( + config_file_path, self.select_pattern) + + def set_multi_node_disagg_server_configs(self, llm_root: str) -> None: + """ + Set the multi-node disaggregated server configs. + """ + config_file_path = os.path.join(llm_root, self.config_path) + self.disagg_configs = parse_multi_node_disagg_config_file( + config_file_path, self.select_pattern) def get_model_family(self) -> str: """ @@ -1464,6 +1680,7 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): def set_runtime_configs(self, llm_root, working_dir, + output_dir, perf_cache_fpath, gpu_clock_lock=None) -> None: if self._config.runtime == "cpp": @@ -1477,11 +1694,14 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): llm_root) elif self._config.runtime == "bench": benchmark_script = "trtllm-bench" - elif self._config.runtime == "server-benchmark": + elif self._config.runtime == "aggr_server": benchmark_script = None - self._config.set_server_client_configs(llm_root) + self._config.set_aggr_server_configs(llm_root) elif self._config.runtime == "disagg_server": benchmark_script = None + elif self._config.runtime == "multi_node_disagg_server": + benchmark_script = None + self._config.set_multi_node_disagg_server_configs(llm_root) else: raise RuntimeError(f"Invalid runtime {self._config.runtime}.") @@ -1490,7 +1710,9 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): if self._config.runtime == "bench": build_script = "trtllm-bench" - elif self._config.runtime == "server-benchmark": + elif self._config.runtime == "aggr_server": + build_script = None + elif self._config.runtime == "multi_node_disagg_server": build_script = None elif self._config.pp_size > 1 or self._config.model_name not in allowed_models: build_script = "trtllm-build" @@ -1502,31 +1724,114 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): self._build_script = build_script self._benchmark_script = benchmark_script self._working_dir = working_dir + self._output_dir = output_dir self._perf_cache_fpath = perf_cache_fpath self._llm_root = llm_root self._gpu_clock_lock = gpu_clock_lock - def get_trtllm_server_client_commands(self): + def get_trtllm_aggr_commands(self, output_dir): server_cmds = [] + server_envs = [] client_cmds = [] + client_envs = [] names = [] for server_idx, client_configs in self._config.server_client_configs.items( ): server_config = self._config.server_configs[server_idx] - server_cmd = server_config.to_cmd(self._working_dir) - server_cmd = " ".join(server_cmd) + server_cmd = server_config.to_cmd(output_dir) + server_env = server_config.to_env() # Generate extra-llm-api-config.yml config_content = server_config.generate_extra_llm_api_config() config_filename = f"extra-llm-api-config.{server_config.name}.yml" - config_path = os.path.join(self._working_dir, config_filename) + config_path = os.path.join(output_dir, config_filename) with open(config_path, 'w') as f: f.write(config_content) for client_config in client_configs: server_cmds.append(server_cmd) - client_cmd = client_config.to_cmd(self._working_dir) + server_envs.append(server_env) + client_cmd = client_config.to_cmd(need_hostname=True) + client_env = client_config.to_env() client_cmds.append(client_cmd) + 
client_envs.append(client_env) names.append(f"{server_config.name}-{client_config.name}") - return server_cmds, client_cmds, names + return server_cmds, server_envs, client_cmds, client_envs, names + + def get_trtllm_multi_node_disagg_commands(self, output_dir): + ctx_server_cmds = [] + ctx_server_envs = [] + gen_server_cmds = [] + gen_server_envs = [] + disagg_server_cmds = [] + disagg_server_envs = [] + benchmark_cmds = [] + benchmark_envs = [] + # Create hostnames directory + hostnames_dir = os.path.join(output_dir, "hostnames") + if not os.path.exists(hostnames_dir): + os.makedirs(hostnames_dir, exist_ok=True) + + for disagg_config in self._config.disagg_configs: + disagg_serving_type = disagg_config['disagg_serving_type'] + hostname = disagg_config['hostname'] + numa_bind = disagg_config['numa_bind'] + ctx_server_cmd = None + ctx_server_env = None + gen_server_cmd = None + gen_server_env = None + disagg_server_cmd = None + disagg_server_env = None + benchmark_cmd = None + benchmark_env = None + if "CTX" in disagg_serving_type or "GEN" in disagg_serving_type: + # Write hostname to hostnames folder + hostname_file = os.path.join(hostnames_dir, + f"{disagg_serving_type}.txt") + with open(hostname_file, 'w') as f: + f.write(hostname) + # Generate CTX or GEN server commands if this is a CTX or GEN node + is_ctx = "CTX" in disagg_serving_type + server_config = disagg_config[ + 'ctx_server'] if is_ctx else disagg_config['gen_server'] + server_cmd = server_config.to_cmd(output_dir, numa_bind, + disagg_serving_type, hostname, + 8336) + server_env = server_config.to_env() + if is_ctx: + ctx_server_cmd = server_cmd + ctx_server_env = server_env + else: + gen_server_cmd = server_cmd + gen_server_env = server_env + # Generate extra-llm-api-config.yml + config_content = server_config.generate_extra_llm_api_config() + config_filename = f"extra-llm-api-config.{server_config.name}.{'ctx' if is_ctx else 'gen'}.yml" + config_path = os.path.join(output_dir, config_filename) + with open(config_path, 'w') as f: + f.write(config_content) + elif "DISAGG_SERVER" in disagg_serving_type: + timeout = disagg_config['timeout'] + # Generate DISAGG server command if this is the DISAGG server node + disagg_server_cmd = [ + "trtllm-serve", "disaggregated", "-c", + f"{output_dir}/server_config.yaml", "-t", + str(timeout), "-r", + str(timeout) + ] + disagg_server_env = to_env_dict(disagg_config['server_env_var']) + elif "BENCHMARK" in disagg_serving_type: + # Generate benchmark command if this is the BENCHMARK server node + benchmark_cmd = disagg_config['client'].to_cmd( + need_hostname=False) + benchmark_env = disagg_config['client'].to_env() + ctx_server_cmds.append(ctx_server_cmd) + ctx_server_envs.append(ctx_server_env) + gen_server_cmds.append(gen_server_cmd) + gen_server_envs.append(gen_server_env) + disagg_server_cmds.append(disagg_server_cmd) + disagg_server_envs.append(disagg_server_env) + benchmark_cmds.append(benchmark_cmd) + benchmark_envs.append(benchmark_env) + return ctx_server_cmds, ctx_server_envs, gen_server_cmds, gen_server_envs, disagg_server_cmds, disagg_server_envs, benchmark_cmds, benchmark_envs def get_trtllm_build_command(self, engine_dir, checkpoint_dir) -> list: build_cmd = [ @@ -1793,25 +2098,26 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): return benchmark_cmd def get_commands(self): - # Whether this is python or cpp runtime perf test. 
is_python = self._config.runtime == "python" num_gpus = self._config.num_gpus - is_server_benchmark = self._config.runtime == "server-benchmark" + is_aggr = self._config.runtime == "aggr_server" is_disagg = self._config.runtime == "disagg_server" - - if is_server_benchmark: - perf_sanity_working_dir = os.path.join(self._working_dir, - "perf-sanity") - if not os.path.exists(perf_sanity_working_dir): - os.makedirs(perf_sanity_working_dir, exist_ok=True) - server_cmds, client_cmds, names = self.get_trtllm_server_client_commands( - ) - return PerfServerClientBenchmarkCmds( - server_cmds=server_cmds, - client_cmds=client_cmds, - names=names, - working_dir=perf_sanity_working_dir) + is_multi_node_disagg = self._config.runtime == "multi_node_disagg_server" + perf_sanity_output_dir = os.path.join(self._output_dir, + self._test_param_labels) + if is_aggr: + if not os.path.exists(perf_sanity_output_dir): + os.makedirs(perf_sanity_output_dir, exist_ok=True) + server_cmds, server_envs, client_cmds, client_envs, names = self.get_trtllm_aggr_commands( + perf_sanity_output_dir) + return PerfAggrScriptTestCmds(server_cmds=server_cmds, + server_envs=server_envs, + client_cmds=client_cmds, + client_envs=client_envs, + names=names, + timeout=3600, + output_dir=perf_sanity_output_dir) if is_disagg: ctx_cmd, gen_cmd = self._get_disagg_worker_deploy_command() @@ -1821,6 +2127,30 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): return PerfDisaggScriptTestCmds(ctx_cmd, gen_cmd, server_cmd, client_cmd, benchmark_cmd) + if is_multi_node_disagg: + if not os.path.exists(perf_sanity_output_dir): + os.makedirs(perf_sanity_output_dir, exist_ok=True) + ctx_server_cmds, ctx_server_envs, gen_server_cmds, gen_server_envs, disagg_server_cmds, disagg_server_envs, benchmark_cmds, benchmark_envs = self.get_trtllm_multi_node_disagg_commands( + perf_sanity_output_dir) + return PerfMultiNodeDisaggScriptTestCmds( + ctx_server_cmds=ctx_server_cmds, + ctx_server_envs=ctx_server_envs, + gen_server_cmds=gen_server_cmds, + gen_server_envs=gen_server_envs, + disagg_server_cmds=disagg_server_cmds, + disagg_server_envs=disagg_server_envs, + benchmark_cmds=benchmark_cmds, + benchmark_envs=benchmark_envs, + timeout=self._config.disagg_configs[0]['timeout'], + hostname=self._config.disagg_configs[0]['hostname'], + disagg_serving_type=self._config.disagg_configs[0] + ['disagg_serving_type'], + num_ctx_servers=self._config.disagg_configs[0]['hardware'] + ['num_ctx_servers'], + num_gen_servers=self._config.disagg_configs[0]['hardware'] + ['num_gen_servers'], + output_dir=perf_sanity_output_dir) + if is_python and num_gpus > 1: # TODO: Fix https://nvbugs/4449875 pytest.skip( @@ -1976,7 +2306,6 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): Run through the commands and parse multiple perf metrics from the logs. """ #print info to separate cases - print_info(f"Running perf test for case: {self._short_test_name}") self._current_cmd_idx = 0 metrics = self._get_metrics() outputs = {} @@ -2078,8 +2407,19 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): """ Upload the test results and baseline to database. """ - # Currently only server-benchmark need to store the test result. 
- if self._config.runtime == "server-benchmark": + + def prefix_server_config_dict(config_dict: dict, + prefix_name: str) -> dict: + prefixed_dict = {} + for key, value in config_dict.items(): + type_prefix = key[0:2] # 'l_', 's_', 'b_', 'd_' + rest = key[2:] + prefixed_dict[f"{type_prefix}{prefix_name}_{rest}"] = value + return prefixed_dict + + match_keys = [] + # Only aggr_server and multi_node_disagg_server will upload. + if self._config.runtime == "aggr_server": job_config = get_job_info() job_config["s_gpu_type"] = self._config.gpu_type is_post_merge = job_config["b_is_post_merge"] @@ -2094,49 +2434,115 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): # If cmd_idx not in self._test_results or some metrics missing, skip this cmd_idx if cmd_idx not in self._test_results or not all( metric_type in self._test_results[cmd_idx] - for metric_type in SERVER_BENCHMARK_METRICS): + for metric_type in AGGR_SERVER_METRICS): print_info( f"Skipped posting command {cmd_idx} 's test results since some metrics are missing in test results." ) cmd_idx += 1 continue - new_data = {} + new_data = { + "s_runtime": + "multi_node_aggr_server" if server_config.gpus + != server_config.gpus_per_node else "aggr_server" + } new_data.update(job_config) new_data.update(server_config_dict) new_data.update(client_config_dict) - for metric_type in SERVER_BENCHMARK_METRICS: + for metric_type in AGGR_SERVER_METRICS: new_data[ f"d_{PERF_METRIC_STRING[metric_type]}"] = self._test_results[ cmd_idx][metric_type] add_id(new_data) new_data_dict[cmd_idx] = new_data cmd_idx += 1 + if not match_keys: + match_keys.append("s_runtime") + match_keys.extend(server_config_dict.keys()) + match_keys.extend(client_config_dict.keys()) - # Get history data for each cmd_idx - history_baseline_dict, history_data_dict = get_history_data( - new_data_dict) - # Prepare regressive test cases - regressive_data_list = prepare_regressive_test_cases( - history_baseline_dict, new_data_dict) - - if is_post_merge: - # Prepare new baseline data for post-merge - new_baseline_data_dict = prepare_baseline_data( - history_baseline_dict, history_data_dict, new_data_dict) - else: - # Pre-merge does not need to upload baseline data - new_baseline_data_dict = None - - if self._config.upload_to_db: - # Upload the new perf data and baseline data to database - post_new_perf_data(new_baseline_data_dict, new_data_dict, - regressive_data_list) - - # Print regressive test cases - print_regressive_test_cases(regressive_data_list) + elif self._config.runtime == "multi_node_disagg_server": + if self._config.disagg_configs[0][ + 'disagg_serving_type'] != "BENCHMARK": + return + job_config = get_job_info() + job_config["s_gpu_type"] = self._config.gpu_type + is_post_merge = job_config["b_is_post_merge"] + new_data_dict = {} + cmd_idx = 0 + for disagg_config in self._config.disagg_configs: + # If cmd_idx not in self._test_results or some metrics missing, skip this cmd_idx + if cmd_idx not in self._test_results or not all( + metric_type in self._test_results[cmd_idx] + for metric_type in AGGR_SERVER_METRICS): + print_info( + f"Skipped posting command {cmd_idx} 's test results since some metrics are missing in test results." 
+ ) + cmd_idx += 1 + continue + # Get ctx_server and gen_server configs with prefixed keys + ctx_server_config_dict = disagg_config['ctx_server'].to_db_data( + ) + gen_server_config_dict = disagg_config['gen_server'].to_db_data( + ) + ctx_server_config_dict = prefix_server_config_dict( + ctx_server_config_dict, 'ctx') + gen_server_config_dict = prefix_server_config_dict( + gen_server_config_dict, 'gen') + client_config_dict = disagg_config['client'].to_db_data() + # Build new_data + new_data = { + "s_runtime": "multi_node_disagg_server", + "s_server_env_var": disagg_config['server_env_var'] + } + new_data.update(job_config) + new_data.update(ctx_server_config_dict) + new_data.update(gen_server_config_dict) + new_data.update(client_config_dict) + # Add hardware information + hardware = disagg_config.get('hardware', {}) + new_data["l_num_ctx_servers"] = hardware.get( + 'num_ctx_servers', 0) + new_data["l_num_gen_servers"] = hardware.get( + 'num_gen_servers', 0) + # Add metrics from test results + for metric_type in AGGR_SERVER_METRICS: + new_data[ + f"d_{PERF_METRIC_STRING[metric_type]}"] = self._test_results[ + cmd_idx][metric_type] + add_id(new_data) + new_data_dict[cmd_idx] = new_data + cmd_idx += 1 + if not match_keys: + match_keys.extend( + ["s_runtime", "l_num_ctx_servers", "l_num_gen_servers"]) + match_keys.extend(ctx_server_config_dict.keys()) + match_keys.extend(gen_server_config_dict.keys()) + match_keys.extend(client_config_dict.keys()) else: return + # Get history data for each cmd_idx + history_baseline_dict, history_data_dict = get_history_data( + new_data_dict, self._config.gpu_type, match_keys) + # Prepare regressive test cases + regressive_data_list = prepare_regressive_test_cases( + history_baseline_dict, new_data_dict) + + if is_post_merge: + # Prepare new baseline data for post-merge + new_baseline_data_dict = prepare_baseline_data( + history_baseline_dict, history_data_dict, new_data_dict) + else: + # Pre-merge does not need to upload baseline data + new_baseline_data_dict = None + + if self._config.upload_to_db: + # Upload the new perf data and baseline data to database + post_new_perf_data(new_baseline_data_dict, new_data_dict, + regressive_data_list) + + print_regressive_test_cases(regressive_data_list) + def _get_engine_dir(self) -> str: """ Get the engine directory to store the engine. @@ -2150,13 +2556,13 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): Generate all the metric configs for the current test. 
""" metrics = [] - if self._config.runtime == "server-benchmark": + if self._config.runtime == "aggr_server": cmd_idx = 0 for server_idx, client_configs in self._config.server_client_configs.items( ): server_name = self._config.server_configs[server_idx].name for client_config in client_configs: - for metric_type in SERVER_BENCHMARK_METRICS: + for metric_type in AGGR_SERVER_METRICS: metrics.append( PerfTestMetric( original_test_name=self._full_test_name, @@ -2193,6 +2599,28 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): )) return metrics + if self._config.runtime == "multi_node_disagg_server": + cmd_idx = 0 + for disagg_config in self._config.disagg_configs: + config_name = disagg_config['name'] + for metric_type in AGGR_SERVER_METRICS: + metrics.append( + PerfTestMetric( + original_test_name=self._full_test_name, + metric_name=self._get_metric_name( + metric_type=metric_type, + disagg_config_name=config_name), + metric_type=metric_type, + metric_regex=self._get_metric_regex(metric_type), + metric_threshold=self._get_metric_threshold( + metric_type), + metric_abs_threshold=self._get_metric_abs_threshold( + metric_type), + cmd_idx=cmd_idx, + )) + cmd_idx += 1 + return metrics + # Build command is the first command. cmd_idx = 0 if self._config.runtime != "bench" else 1 if self._config.runtime == "bench": @@ -2264,7 +2692,8 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): input_len: int = None, output_len: int = None, server_name: str = None, - client_name: str = None) -> str: + client_name: str = None, + disagg_config_name: str = None) -> str: """ Construct the metric name for given metric_type, bs, input_len, and output_len. """ @@ -2278,11 +2707,14 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): if metric_type in BUILDER_METRICS: # We build one engine for all benchmark runs, so add all bs and seq lens to the metric name. metric_label = self._config.to_string(device_subtype=device_subtype) - elif self._config.runtime == "server-benchmark": + elif self._config.runtime == "aggr_server": metric_label = self._config.to_string( custom_server_name=server_name, custom_client_name=client_name, ) + elif self._config.runtime == "multi_node_disagg_server": + metric_label = self._config.to_string( + custom_server_name=disagg_config_name) else: # Otherwise, generate per-bs and per-seqlen label. 
metric_label = self._config.to_string( @@ -2303,10 +2735,14 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): if metric_type not in BENCH_PERF_METRIC_LOG_QUERIES: raise ValueError(f"Unexpected metric_type: {metric_type}") return BENCH_PERF_METRIC_LOG_QUERIES[metric_type] - elif self._config.runtime == "server-benchmark": - if metric_type not in SERVER_BENCHMARK_PERF_METRIC_LOG_QUERIES: + elif self._config.runtime == "aggr_server": + if metric_type not in AGGR_SERVER_PERF_METRIC_LOG_QUERIES: raise ValueError(f"Unexpected metric_type: {metric_type}") - return SERVER_BENCHMARK_PERF_METRIC_LOG_QUERIES[metric_type] + return AGGR_SERVER_PERF_METRIC_LOG_QUERIES[metric_type] + elif self._config.runtime == "multi_node_disagg_server": + if metric_type not in AGGR_SERVER_PERF_METRIC_LOG_QUERIES: + raise ValueError(f"Unexpected metric_type: {metric_type}") + return AGGR_SERVER_PERF_METRIC_LOG_QUERIES[metric_type] else: pytest.skip("only support trtllm-bench runtime for now") @@ -2491,7 +2927,7 @@ def run_perf_test(perf_case_name, trt_performance_cache_fpath, """ working_dir = llm_venv.get_working_directory() test_runner = MultiMetricPerfTest(perf_case_name) - test_runner.set_runtime_configs(llm_root, working_dir, + test_runner.set_runtime_configs(llm_root, working_dir, output_dir, trt_performance_cache_fpath, trt_gpu_clock_lock) test_runner.run_metrics(llm_venv, trt_gpu_clock_lock, diff --git a/tests/integration/defs/perf/utils.py b/tests/integration/defs/perf/utils.py index f6296e0b30..d3c38ddb2d 100644 --- a/tests/integration/defs/perf/utils.py +++ b/tests/integration/defs/perf/utils.py @@ -26,6 +26,7 @@ from pathlib import Path from typing import Dict, List, NamedTuple, Optional import requests +import yaml from _pytest.nodes import Item from _pytest.python import Function from defs.trt_test_alternative import (check_output, popen, print_error, @@ -235,54 +236,80 @@ class PerfBenchScriptTestCmds(NamedTuple): return cmd_str -class PerfServerClientBenchmarkCmds(NamedTuple): - server_cmds: List[str] +class PerfAggrScriptTestCmds(NamedTuple): + server_cmds: List[List[str]] + server_envs: List[Dict[str, str]] client_cmds: List[List[str]] + client_envs: List[Dict[str, str]] names: List[str] - working_dir: str + timeout: int + output_dir: str - def wait_for_endpoint_ready(self, url: str, timeout: int = 5400): + def wait_for_endpoint_ready(self, url: str, timeout: int = 7200): start = time.monotonic() - while time.monotonic() - start < timeout: + while True: + elapsed_time = time.monotonic() - start + if elapsed_time > timeout: + print_error( + f"Timeout waiting for endpoint {url} to be ready after {timeout} seconds" + ) + break try: - time.sleep(10) + print_info( + f"Waiting for endpoint {url} to be ready, elapsed time: {elapsed_time}s" + ) + time.sleep(1) if requests.get(url).status_code == 200: - print(f"endpoint {url} is ready") + print_info(f"endpoint {url} is ready") return except Exception as err: - print(f"endpoint {url} is not ready, with exception: {err}") + print_info( + f"endpoint {url} is not ready, with exception: {err}") print_error( f"Endpoint {url} did not become ready within {timeout} seconds") def run_cmd(self, cmd_idx: int, venv) -> str: output = "" + server_proc = None server_file_path = os.path.join( - self.working_dir, f"trtllm-serve.{self.names[cmd_idx]}.log") + self.output_dir, f"trtllm-serve.{self.names[cmd_idx]}.log") client_file_path = os.path.join( - self.working_dir, f"trtllm-benchmark.{self.names[cmd_idx]}.log") + self.output_dir, 
f"trtllm-benchmark.{self.names[cmd_idx]}.log") try: - with ( # Start server process - open(server_file_path, 'w') as server_ctx, - popen(self.server_cmds[cmd_idx], - stdout=server_ctx, - stderr=subprocess.STDOUT, - env=venv._new_env, - shell=True) as server_proc): - self.wait_for_endpoint_ready( - "http://localhost:8000/v1/models", - timeout=7200) # 120 minutes for large models - output += subprocess.check_output(self.client_cmds[cmd_idx], - env=venv._new_env).decode() - # Write output to client file path - with open(client_file_path, 'w') as client_ctx: - client_ctx.write(output) + server_envs = copy.deepcopy(os.environ) + # server_envs.update(self.server_envs[cmd_idx]) + print_info( + f"Starting server. cmd is {self.server_cmds[cmd_idx]} envs are {server_envs}" + ) + with open(server_file_path, 'w') as server_ctx: + server_proc = subprocess.Popen( + self.server_cmds[cmd_idx], + stdout=server_ctx, + stderr=subprocess.STDOUT, + env=server_envs, + ) + self.wait_for_endpoint_ready("http://localhost:8000/health", + timeout=self.timeout) + client_envs = copy.deepcopy(os.environ) + # client_envs.update(self.client_envs[cmd_idx]) + print_info( + f"Starting client. cmd is {self.client_cmds[cmd_idx]} envs are {client_envs}" + ) + output = subprocess.check_output( + self.client_cmds[cmd_idx], + env=client_envs, + stderr=subprocess.STDOUT, + ).decode() + + with open(client_file_path, 'w') as client_ctx: + client_ctx.write(output) finally: server_proc.terminate() server_proc.wait() return output def get_cmd_str(self, cmd_idx) -> List[str]: - return ["server-benchmark tests, please check config files"] + return ["aggr_server tests, please check config files"] class PerfDisaggScriptTestCmds(NamedTuple): @@ -347,6 +374,259 @@ class PerfDisaggScriptTestCmds(NamedTuple): return ["disaggregated server tests, please check config files"] +class PerfMultiNodeDisaggScriptTestCmds(NamedTuple): + ctx_server_cmds: List[List[str]] + ctx_server_envs: List[Dict[str, str]] + gen_server_cmds: List[List[str]] + gen_server_envs: List[Dict[str, str]] + disagg_server_cmds: List[List[str]] + disagg_server_envs: List[Dict[str, str]] + benchmark_cmds: List[List[str]] + benchmark_envs: List[Dict[str, str]] + timeout: int + hostname: str + disagg_serving_type: str + num_ctx_servers: int + num_gen_servers: int + output_dir: str + + def _generate_disagg_server_config(self, + cmd_idx: int, + ctx_gen_port: int = 8336, + disagg_server_port: int = 8333) -> str: + print_info( + f"Generating disagg server config for command index {cmd_idx}") + # Wait for all hostname files to be created + hostnames_folder = os.path.join(self.output_dir, "hostnames") + print_info(f"Waiting for hostnames folder: {hostnames_folder}") + + expected_count = self.num_ctx_servers + self.num_gen_servers + start_time = time.time() + hostnames = [] + while True: + elapsed_time = time.time() - start_time + print_info( + f"Waiting for hostnames in {hostnames_folder}, elapsed time: {elapsed_time}s, current: {len(hostnames)}, expected: {expected_count}" + ) + if elapsed_time > self.timeout: + print_error( + f"Time out. 
Hostnames files are not ready after {self.timeout}s" + ) + time.sleep(10) + if not os.path.exists(hostnames_folder): + continue + hostnames = os.listdir(hostnames_folder) + if len(hostnames) >= expected_count: + break + print_info( + f"All hostnames found in {hostnames_folder} after elapsed time: {elapsed_time}s" + ) + + # Read ctx and gen hostnames + ctx_hostnames = [] + gen_hostnames = [] + for hostname_file in hostnames: + hostname_file_path = os.path.join(hostnames_folder, hostname_file) + with open(hostname_file_path, 'r') as f: + actual_hostname = f.read().strip() + print_info(f"Hostname: {actual_hostname} in {hostname_file}") + if hostname_file.startswith("CTX"): + ctx_hostnames.append(actual_hostname) + elif hostname_file.startswith("GEN"): + gen_hostnames.append(actual_hostname) + print_info(f"ctx_hostnames: {ctx_hostnames}") + print_info(f"gen_hostnames: {gen_hostnames}") + + # Generate server config + server_config = { + 'hostname': self.hostname, + 'port': disagg_server_port, + 'backend': 'pytorch', + 'context_servers': { + 'num_instances': self.num_ctx_servers, + 'urls': [f'{host}:{ctx_gen_port}' for host in ctx_hostnames] + }, + 'generation_servers': { + 'num_instances': self.num_gen_servers, + 'urls': [f'{host}:{ctx_gen_port}' for host in gen_hostnames] + } + } + + config_path = os.path.join(self.output_dir, "server_config.yaml") + with open(config_path, 'w') as f: + yaml.dump(server_config, f) + print_info(f"Server config file {config_path} generated") + + return config_path + + def _get_disagg_server_hostname_and_port(self) -> tuple: + config_path = os.path.join(self.output_dir, "server_config.yaml") + print_info(f"Waiting for server config file: {config_path}") + start_time = time.time() + while True: + if os.path.exists(config_path): + print_info(f"Server config file found: {config_path}") + break + elapsed_time = time.time() - start_time + if elapsed_time > self.timeout: + print_error( + f"Server config file {config_path} not found after {self.timeout}s" + ) + print_info( + f"Waiting for server config file, elapsed time: {elapsed_time}s" + ) + time.sleep(10) # Check every 10 seconds + + # Read server config to get hostname and port + with open(config_path, 'r') as f: + server_config = yaml.safe_load(f) + disagg_server_hostname = server_config['hostname'] + disagg_server_port = str(server_config['port']) + return disagg_server_hostname, disagg_server_port + + def wait_for_benchmark_ready(self, + benchmark_status_file: str, + timeout: int = 7200): + print_info( + f"Server {self.disagg_serving_type} waiting for benchmark status file: {benchmark_status_file}" + ) + start_time = time.time() + while True: + if os.path.exists(benchmark_status_file): + print_info( + f"Benchmark status file found, terminating server {self.disagg_serving_type}" + ) + break + elapsed_time = time.time() - start_time + print_info( + f"Waiting for benchmark status file, elapsed time: {elapsed_time}s" + ) + if elapsed_time > timeout: + print_error( + f"Timeout waiting for benchmark status file after {timeout}s, terminating server {self.disagg_serving_type}" + ) + break + time.sleep(10) # Check every 10 seconds + + def wait_for_endpoint_ready(self, url: str, timeout: int = 7200): + start = time.monotonic() + while True: + elapsed_time = time.monotonic() - start + if elapsed_time > timeout: + print_error( + f"Timeout waiting for endpoint {url} to be ready after {timeout} seconds" + ) + break + print_info( + f"Waiting for endpoint {url} to be ready, elapsed time: {elapsed_time}s" + ) + try: + 
time.sleep(10) + if requests.get(url).status_code == 200: + print_info(f"endpoint {url} is ready") + return + except Exception as err: + print_info( + f"endpoint {url} is not ready, with exception: {err}") + print_error( + f"Endpoint {url} did not become ready within {timeout} seconds") + + def run_cmd(self, cmd_idx: int, venv) -> str: + output = "" + server_proc = None + benchmark_status_file = os.path.join(self.output_dir, + f"benchmark_status.{cmd_idx}.txt") + if "CTX" in self.disagg_serving_type or "GEN" in self.disagg_serving_type: + server_file_path = os.path.join( + self.output_dir, + f"trtllm-serve.{cmd_idx}.{self.disagg_serving_type}.log") + is_ctx = "CTX" in self.disagg_serving_type + server_cmd = self.ctx_server_cmds[ + cmd_idx] if is_ctx else self.gen_server_cmds[cmd_idx] + server_envs = copy.deepcopy(os.environ) + # server_envs.update(self.ctx_server_envs[cmd_idx] + # if is_ctx else self.gen_server_envs[cmd_idx]) + try: + print_info( + f"Starting server. disagg_serving_type: {self.disagg_serving_type} cmd is {server_cmd} envs are {server_envs}" + ) + with open(server_file_path, 'w') as server_ctx: + server_proc = subprocess.Popen( + server_cmd, + stdout=server_ctx, + stderr=subprocess.STDOUT, + env=server_envs, + ) + self.wait_for_benchmark_ready(benchmark_status_file, + timeout=self.timeout) + finally: + print_info(f"Server {self.disagg_serving_type} stopped") + server_proc.terminate() + server_proc.wait() + elif self.disagg_serving_type == "DISAGG_SERVER": + disagg_server_file_path = os.path.join( + self.output_dir, + f"trtllm-serve.{cmd_idx}.{self.disagg_serving_type}.log") + disagg_server_cmd = self.disagg_server_cmds[cmd_idx] + disagg_server_envs = copy.deepcopy(os.environ) + # disagg_server_envs.update(self.disagg_server_envs[cmd_idx]) + try: + # Generate disagg server config (this will wait for all hostnames) + self._generate_disagg_server_config(cmd_idx) + print_info( + f"Starting disagg server. disagg_serving_type: {self.disagg_serving_type} disagg server cmd is {disagg_server_cmd} envs are {disagg_server_envs}" + ) + with open(disagg_server_file_path, 'w') as disagg_server_ctx: + disagg_server_proc = subprocess.Popen( + disagg_server_cmd, + stdout=disagg_server_ctx, + stderr=subprocess.STDOUT, + env=disagg_server_envs, + ) + self.wait_for_benchmark_ready(benchmark_status_file, + timeout=self.timeout) + finally: + print_info(f"Disagg server {self.disagg_serving_type} stopped") + disagg_server_proc.terminate() + disagg_server_proc.wait() + elif self.disagg_serving_type == "BENCHMARK": + benchmark_file_path = os.path.join( + self.output_dir, f"trtllm-benchmark.{cmd_idx}.log") + try: + # Get disagg server's hostname and port + disagg_server_hostname, disagg_server_port = self._get_disagg_server_hostname_and_port( + ) + # Add hostname and port to benchmark command + benchmark_cmd = self.benchmark_cmds[cmd_idx] + [ + '--host', disagg_server_hostname, '--port', + disagg_server_port + ] + benchmark_envs = copy.deepcopy(os.environ) + # benchmark_envs.update(self.benchmark_envs[cmd_idx]) + self.wait_for_endpoint_ready( + f"http://{disagg_server_hostname}:{disagg_server_port}/health", + timeout=self.timeout, + ) + # Run benchmark + print_info( + f"Starting benchmark. 
disagg_serving_type: {self.disagg_serving_type} benchmark cmd is {benchmark_cmd} envs are {benchmark_envs}" + ) + output = subprocess.check_output( + benchmark_cmd, env=benchmark_envs, + stderr=subprocess.STDOUT).decode() + with open(benchmark_file_path, 'w') as benchmark_ctx: + benchmark_ctx.write(output) + finally: + with open(benchmark_status_file, 'w') as status_file: + status_file.write("Done") + return output + + def get_cmd_str(self, cmd_idx) -> List[str]: + return [ + "multi-node disaggregated server tests, please check config files" + ] + + class AbstractPerfScriptTestClass(abc.ABC): """ Abstract class for all script-based perf tests. @@ -453,6 +733,14 @@ class AbstractPerfScriptTestClass(abc.ABC): is_prepare_dataset_cmd = 'prepare_dataset' in commands.get_cmd_str( cmd_idx) + is_perf_sanity_test = "perf_sanity" in full_test_name + + is_disagg_server = False + if self._config.runtime == "multi_node_disagg_server": + disagg_serving_type = self._config.disagg_configs[0][ + 'disagg_serving_type'] + is_disagg_server = disagg_serving_type != "BENCHMARK" + # Start the timer. self._start_timestamp = datetime.utcnow() try: @@ -460,7 +748,8 @@ class AbstractPerfScriptTestClass(abc.ABC): # Capture the stdout from _gpu_clock_lock because the pipeline JUnit update script tries to parse # the log to find the GPU clocks. with io.StringIO() as buf: - if self._gpu_clock_lock: + # Perf-sanity test doesn't lock gpu clock + if self._gpu_clock_lock and not is_perf_sanity_test: # Lock GPU clock and start monitoring. with contextlib.redirect_stdout( buf), self._gpu_clock_lock, tmpDir: @@ -515,9 +804,12 @@ class AbstractPerfScriptTestClass(abc.ABC): # Parse the perf result from the test outputs. if is_prepare_dataset_cmd: print_info( - f"skip writing perf result when calling generating dataset in trtllm-bench" + f"skip writing perf result when calling generating dataset in trtllm-bench." 
) outputs.pop(cmd_idx) + elif is_disagg_server: + print_info( + f"skip writing perf result when running disagg's server.") else: self._perf_result = self.get_perf_result(outputs) diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_dgx_b200_perf_sanity.yml new file mode 100644 index 0000000000..d4470fe1a4 --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_dgx_b200_perf_sanity.yml @@ -0,0 +1,41 @@ +version: 0.0.1 +l0_dgx_b200_perf_sanity: +- condition: + ranges: + system_gpu_count: + gte: 8 + lte: 8 + wildcards: + gpu: + - '*b200*' + linux_distribution_name: ubuntu* + cpu: x86_64 + terms: + stage: post_merge + backend: pytorch + orchestrator: mpi + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180) + +- condition: + ranges: + system_gpu_count: + gte: 4 + lte: 4 + wildcards: + gpu: + - '*b200*' + linux_distribution_name: ubuntu* + cpu: x86_64 + terms: + stage: post_merge + backend: pytorch + orchestrator: mpi + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-gpt_oss_fp4_dep2,gpt_oss_fp4_dep4] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-gpt_oss_fp4_tp4_eagle3] TIMEOUT (180) diff --git a/tests/integration/test_lists/test-db/l0_dgx_b300_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_dgx_b300_perf_sanity.yml new file mode 100644 index 0000000000..ff0b9eafe3 --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_dgx_b300_perf_sanity.yml @@ -0,0 +1,41 @@ +version: 0.0.1 +l0_dgx_b300_perf_sanity: +- condition: + ranges: + system_gpu_count: + gte: 8 + lte: 8 + wildcards: + gpu: + - '*gb110*' + - '*b300*' + linux_distribution_name: ubuntu* + cpu: x86_64 + terms: + stage: post_merge + backend: pytorch + orchestrator: mpi + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180) + +- condition: + ranges: + system_gpu_count: + gte: 4 + lte: 4 + wildcards: + gpu: + - '*gb110*' + - '*b300*' + linux_distribution_name: ubuntu* + cpu: x86_64 + terms: + stage: post_merge + backend: pytorch + orchestrator: mpi + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml new file mode 100644 index 0000000000..23f4b20f97 --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml @@ -0,0 +1,22 @@ +version: 0.0.1 
+l0_gb200_multi_gpus_perf_sanity: +- condition: + ranges: + system_gpu_count: + gte: 4 + lte: 4 + wildcards: + gpu: + - '*gb200*' + linux_distribution_name: ubuntu* + cpu: aarch64 + terms: + stage: post_merge + backend: pytorch + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_dep4_mtp1_1k1k] + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_tep4_mtp3_1k1k] + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_tp4_mtp3_1k1k] + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_dep4_mtp1_8k1k] + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_tep4_mtp3_8k1k] + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_tp4_mtp3_8k1k] diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity.yml new file mode 100644 index 0000000000..bc7d95b047 --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity.yml @@ -0,0 +1,16 @@ +version: 0.0.1 +l0_gb200_multi_nodes_perf_sanity: +- condition: + ranges: + # 2 nodes with each node has 4 GPUs + system_gpu_count: + gte: 8 + lte: 8 + wildcards: + gpu: + - '*gb200*' + terms: + stage: post_merge + backend: pytorch + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_nodes-r1_fp4_v2_dep8_mtp1] diff --git a/tests/integration/test_lists/test-db/perf_sanity_l0_dgx_b200.yml b/tests/integration/test_lists/test-db/perf_sanity_l0_dgx_b200.yml deleted file mode 100644 index 3fdd60670f..0000000000 --- a/tests/integration/test_lists/test-db/perf_sanity_l0_dgx_b200.yml +++ /dev/null @@ -1,35 +0,0 @@ -version: 0.0.1 -perf_sanity_l0_dgx_b200: -- condition: - ranges: - system_gpu_count: - gte: 4 - lte: 4 - wildcards: - gpu: - - '*b200*' - linux_distribution_name: ubuntu* - cpu: x86_64 - terms: - stage: pre_merge - backend: pytorch - orchestrator: mpi - tests: - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200] - -- condition: - ranges: - system_gpu_count: - gte: 4 - lte: 4 - wildcards: - gpu: - - '*b200*' - linux_distribution_name: ubuntu* - cpu: x86_64 - terms: - stage: post_merge - backend: pytorch - orchestrator: mpi - tests: - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200] diff --git a/tests/integration/test_lists/test-db/perf_sanity_l0_dgx_b300.yml b/tests/integration/test_lists/test-db/perf_sanity_l0_dgx_b300.yml deleted file mode 100644 index ef98b37ef9..0000000000 --- a/tests/integration/test_lists/test-db/perf_sanity_l0_dgx_b300.yml +++ /dev/null @@ -1,37 +0,0 @@ -version: 0.0.1 -perf_sanity_l0_dgx_b300: -- condition: - ranges: - system_gpu_count: - gte: 4 - lte: 4 - wildcards: - gpu: - - '*gb110*' - - '*b300*' - linux_distribution_name: ubuntu* - cpu: x86_64 - terms: - stage: pre_merge - backend: pytorch - orchestrator: mpi - tests: - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b300] - -- condition: - ranges: - system_gpu_count: - gte: 4 - lte: 4 - wildcards: - gpu: - - '*gb110*' - - '*b300*' - linux_distribution_name: ubuntu* - cpu: x86_64 - terms: - stage: post_merge - backend: pytorch - orchestrator: mpi - tests: - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b300] diff --git a/tests/scripts/perf-sanity/README.md b/tests/scripts/perf-sanity/README.md index ee928939c6..66f9a93fc6 100644 --- a/tests/scripts/perf-sanity/README.md +++ b/tests/scripts/perf-sanity/README.md @@ 
-1,134 +1,109 @@ -# TensorRT-LLM Benchmark Test System +# TensorRT-LLM Perf Sanity Test System -Benchmarking scripts for TensorRT-LLM serving performance tests with configuration-driven test cases and CSV report generation. +Performance sanity testing scripts for TensorRT-LLM with configuration-driven test cases supporting single-node, multi-node aggregated, and multi-node disaggregated architectures. ## Overview -- Run performance benchmarks across multiple model configurations +- Run performance sanity benchmarks across multiple model configurations +- Support three deployment architectures: single-node, multi-node aggregated, and multi-node disaggregated - Manage test cases through YAML configuration files -- Support selective execution of specific test cases +- Automated resource calculation and job submission via SLURM -## Scripts Overview +## Configuration File Types -### 1. `benchmark_config.yaml` - Test Case Configuration -**Purpose**: Defines all benchmark test cases in a structured YAML format. +There are three types of YAML configuration files for different deployment architectures: + +### 1. Single-Node Aggregated Test Configuration + +**File Example**: `l0_dgx_b200.yaml` + +**Use Case**: Single-node performance tests on a single server with multiple GPUs. **Structure**: ```yaml server_configs: - - name: "r1_fp4_dep4" - model_name: "deepseek_r1_0528_fp4" - tp: 4 - ep: 4 - pp: 1 + - name: "r1_fp8_dep8_mtp1_1k1k" + model_name: "deepseek_r1_0528_fp8" + gpus: 8 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 8192 attention_backend: "TRTLLM" - moe_backend: "CUTLASS" - moe_max_num_tokens: "" enable_attention_dp: true - enable_chunked_prefill: false - max_num_tokens: 2176 - disable_overlap_scheduler: false - kv_cache_dtype: "fp8" - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - max_batch_size: 256 - enable_padding: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + moe_config: + backend: 'DEEPGEMM' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 1 client_configs: - - name: "con1_iter1_1024_1024" - concurrency: 1 - iterations: 1 + - name: "con4096_iter10_1k1k" + concurrency: 4096 + iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.0 - - name: "con8_iter1_1024_1024" - concurrency: 8 - iterations: 1 - isl: 1024 - osl: 1024 - random_range_ratio: 0.0 + random_range_ratio: 0.8 + backend: "openai" +``` - - name: "r1_fp4_tep4" - model_name: "deepseek_r1_0528_fp4" - tp: 4 - ep: 4 - pp: 1 - attention_backend: "TRTLLM" - moe_backend: "CUTLASS" - moe_max_num_tokens: "" - enable_attention_dp: false - enable_chunked_prefill: false - max_num_tokens: 2176 - disable_overlap_scheduler: false - kv_cache_dtype: "fp8" - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - max_batch_size: 256 - enable_padding: true + +### 2. Multi-Node Aggregated Test Configuration + +**File Example**: `l0_gb200_multi_nodes.yaml` + +**Use Case**: Multi-node aggregated architecture where model runs across multiple nodes with unified execution. 
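+
+Beyond the `server_configs` and `client_configs` blocks shown under **Structure** below, a config file may also carry an optional top-level `environment` section; its `server_env_var` and `client_env_var` strings are attached to the generated `ServerConfig` and `ClientConfig` objects (the client values are also stored with uploaded results as `s_client_env_vars`). A minimal sketch, using placeholder variable names rather than real defaults:
+
+```yaml
+environment:
+  server_env_var: "EXAMPLE_SERVER_FLAG=1"   # placeholder; attached to each ServerConfig
+  client_env_var: "EXAMPLE_CLIENT_FLAG=1"   # placeholder; attached to each ClientConfig
+```
+
+A server entry whose `gpus` value differs from `gpus_per_node` (for example, `gpus: 8` with `gpus_per_node: 4` below) is tagged as `multi_node_aggr_server` rather than `aggr_server` when its results are uploaded.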
+ +**Structure**: +```yaml +# Hardware Config +hardware: + gpus_per_node: 4 + gpus_per_server: 8 + +server_configs: + - name: "r1_fp4_v2_dep8_mtp1" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 8 + gpus_per_node: 4 + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 2112 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + moe_config: + backend: 'CUTLASS' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 client_configs: - - name: "con1_iter1_1024_1024" - concurrency: 1 - iterations: 1 + - name: "con32_iter12_1k1k" + concurrency: 32 + iterations: 12 isl: 1024 osl: 1024 - random_range_ratio: 0.0 - - name: "con8_iter1_1024_1024" - concurrency: 8 - iterations: 1 - isl: 1024 - osl: 1024 - random_range_ratio: 0.0 -``` - -### 2. `run_benchmark_serve.py` - Main Benchmark Runner -**Purpose**: Executes performance benchmarks based on YAML configuration files. - -**Usage**: -```bash -python run_benchmark_serve.py --log_folder --config_file [--select ] [--timeout 5400] -``` - -**Arguments**: -- `--log_folder`: Directory to store benchmark logs (required) -- `--config_file`: Path to YAML configuration file (required) -- `--select`: Select pattern for specific Server and Client Config. (optional, default: all test cases) -- `--timeout`: Timeout for server setup. (optional, default: 3600 seconds) - -**Examples**: -```bash -# Select -python run_benchmark_serve.py --log_folder ./results --config_file benchmark_config.yaml --select "r1_fp4_dep4:con8_iter1_1024_1024,r1_fp4_tep4:con1_iter1_1024_1024" - -``` - -### 3. `parse_benchmark_results.py` - Results Parser -**Purpose**: Print log's perf. - -**Arguments**: -- `--log_folder`: Directory to store benchmark logs (required) - -**Usage**: -```bash -python parse_benchmark_results.py --log_folder -``` - - -### 4. `benchmark-serve.sh` - SLURM Job Script -**Usage**: -```bash -sbatch benchmark-serve.sh [IMAGE] [bench_dir] [log_folder] [select_pattern] -``` - -**Parameters**: -- `IMAGE`: Docker image (default: tensorrt-llm-staging/release:main-x86_64) -- `bench_dir`: Directory containing config file and benchmark scripts (default: current directory) -- `log_folder`: Directory containing output logs and csv. 
(default: current directory) -- `select_pattern`: Select pattern (default: default - all test cases) - -**Examples**: -```bash - -bench_dir="/path/to/benchmark/scripts" -log_folder="/path/to/store/output/files" -sbatch --reservation=RES--COM-3970 --qos=reservation -D ${log_folder} ${bench_dir}/benchmark-serve.sh urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release:main-x86_64 ${bench_dir} ${log_folder} "r1_fp4_dep4:con8_iter1_1024_1024,r1_fp4_tep4:con1_iter1_1024_1024" - + random_range_ratio: 0.8 + backend: "openai" ``` diff --git a/tests/scripts/perf-sanity/l0_dgx_b200.yaml b/tests/scripts/perf-sanity/l0_dgx_b200.yaml index d8fccb78ef..17679d4ac8 100644 --- a/tests/scripts/perf-sanity/l0_dgx_b200.yaml +++ b/tests/scripts/perf-sanity/l0_dgx_b200.yaml @@ -1,58 +1,293 @@ server_configs: - - name: "r1_fp4_dep4" - model_name: "deepseek_r1_0528_fp4" - gpus: 4 - tp: 4 - ep: 4 - pp: 1 - attention_backend: "TRTLLM" - moe_backend: "CUTLASS" + - name: "r1_fp8_dep8_mtp1_1k1k" + model_name: "deepseek_r1_0528_fp8" + gpus: 8 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 8192 + attn_backend: "TRTLLM" enable_attention_dp: true - enable_chunked_prefill: false - max_num_tokens: 2176 - kv_cache_dtype: "fp8" - free_gpu_memory_fraction: 0.8 - max_batch_size: 256 - enable_padding: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + moe_config: + backend: 'DEEPGEMM' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 1 client_configs: - - name: "con1_iter1_1024_1024" - concurrency: 1 - iterations: 1 + - name: "con4096_iter10_1k1k" + concurrency: 4096 + iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.0 - - name: "con8_iter1_1024_1024" - concurrency: 8 - iterations: 1 - isl: 1024 - osl: 1024 - random_range_ratio: 0.0 + random_range_ratio: 0.8 + backend: "openai" - - name: "r1_fp4_tep4" - model_name: "deepseek_r1_0528_fp4" - gpus: 4 - tp: 4 - ep: 4 - pp: 1 - attention_backend: "TRTLLM" - moe_backend: "CUTLASS" + - name: "r1_fp8_tep8_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp8" + gpus: 8 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 8192 + attn_backend: "TRTLLM" enable_attention_dp: false - enable_chunked_prefill: false - max_num_tokens: 2176 - kv_cache_dtype: "fp8" - free_gpu_memory_fraction: 0.8 - max_batch_size: 256 - enable_padding: true + moe_config: + backend: 'DEEPGEMM' + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 client_configs: - - name: "con1_iter1_1024_1024" - concurrency: 1 - iterations: 1 + - name: "con64_iter10_1k1k" + concurrency: 64 + iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.0 - - name: "con8_iter1_1024_1024" + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp8_tp8_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp8" + gpus: 8 + tensor_parallel_size: 8 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + 
max_batch_size: 8 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con8_iter10_1k1k" concurrency: 8 - iterations: 1 + iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.0 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_dep4_mtp1_1k1k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + moe_config: + backend: 'CUTLASS' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 1 + client_configs: + - name: "con2048_iter10_1k1k" + concurrency: 2048 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_tep4_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con32_iter10_1k1k" + concurrency: 32 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_tp4_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con4_iter10_1k1k" + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + - name: "gpt_oss_fp4_dep2_1k1k" + model_name: "gpt_oss_120b_fp4" + gpus: 2 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + max_batch_size: 1024 + max_num_tokens: 20000 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + enable_balance: true + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 1024 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + num_postprocess_workers: 4 + stream_interval: 20 + client_configs: + - name: "con2048_iter5_1k1k" + concurrency: 2048 + iterations: 5 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + - name: "gpt_oss_fp4_dep4_1k1k" + model_name: "gpt_oss_120b_fp4" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 20000 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + enable_balance: 
true + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + num_postprocess_workers: 4 + stream_interval: 20 + client_configs: + - name: "con2048_iter5_1k1k" + concurrency: 2048 + iterations: 5 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + - name: "gpt_oss_fp4_tp4_eagle3_1k1k" + model_name: "gpt_oss_120b_fp4" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 20000 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'Eagle' + eagle3_layers_to_capture: [-1] + max_draft_len: 3 + speculative_model_dir: "gpt_oss/gpt-oss-120b-Eagle3" + stream_interval: 20 + num_postprocess_workers: 4 + client_configs: + - name: "con1_iter32_1k1k" + concurrency: 1 + iterations: 32 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" diff --git a/tests/scripts/perf-sanity/l0_dgx_b300.yaml b/tests/scripts/perf-sanity/l0_dgx_b300.yaml index d8fccb78ef..b19ca77812 100644 --- a/tests/scripts/perf-sanity/l0_dgx_b300.yaml +++ b/tests/scripts/perf-sanity/l0_dgx_b300.yaml @@ -1,58 +1,194 @@ server_configs: - - name: "r1_fp4_dep4" - model_name: "deepseek_r1_0528_fp4" - gpus: 4 - tp: 4 - ep: 4 - pp: 1 - attention_backend: "TRTLLM" - moe_backend: "CUTLASS" + - name: "r1_fp8_dep8_mtp1_1k1k" + model_name: "deepseek_r1_0528_fp8" + gpus: 8 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 8192 + attn_backend: "TRTLLM" enable_attention_dp: true - enable_chunked_prefill: false - max_num_tokens: 2176 - kv_cache_dtype: "fp8" - free_gpu_memory_fraction: 0.8 - max_batch_size: 256 - enable_padding: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + moe_config: + backend: 'DEEPGEMM' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 1 client_configs: - - name: "con1_iter1_1024_1024" - concurrency: 1 - iterations: 1 + - name: "con4096_iter10_1k1k" + concurrency: 4096 + iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.0 - - name: "con8_iter1_1024_1024" - concurrency: 8 - iterations: 1 - isl: 1024 - osl: 1024 - random_range_ratio: 0.0 + random_range_ratio: 0.8 + backend: "openai" - - name: "r1_fp4_tep4" - model_name: "deepseek_r1_0528_fp4" - gpus: 4 - tp: 4 - ep: 4 - pp: 1 - attention_backend: "TRTLLM" - moe_backend: "CUTLASS" + - name: "r1_fp8_tep8_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp8" + gpus: 8 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 8192 + attn_backend: "TRTLLM" enable_attention_dp: false - enable_chunked_prefill: false - max_num_tokens: 2176 - kv_cache_dtype: "fp8" - free_gpu_memory_fraction: 0.8 - max_batch_size: 256 - enable_padding: true + moe_config: + backend: 'DEEPGEMM' + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + 
speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 client_configs: - - name: "con1_iter1_1024_1024" - concurrency: 1 - iterations: 1 + - name: "con64_iter10_1k1k" + concurrency: 64 + iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.0 - - name: "con8_iter1_1024_1024" + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp8_tp8_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp8" + gpus: 8 + tensor_parallel_size: 8 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con8_iter10_1k1k" concurrency: 8 - iterations: 1 + iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.0 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_dep4_mtp1_1k1k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + moe_config: + backend: 'CUTLASS' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 1 + client_configs: + - name: "con2048_iter10_1k1k" + concurrency: 2048 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_tep4_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con32_iter10_1k1k" + concurrency: 32 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_tp4_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con4_iter10_1k1k" + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" diff --git a/tests/scripts/perf-sanity/l0_gb200_multi_gpus.yaml b/tests/scripts/perf-sanity/l0_gb200_multi_gpus.yaml new file mode 100644 index 0000000000..8e8efc1bc3 --- /dev/null +++ b/tests/scripts/perf-sanity/l0_gb200_multi_gpus.yaml @@ -0,0 +1,294 @@ +server_configs: + # 1k1k configs + - name: 
"r1_fp4_v2_dep4_mtp1_1k1k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + moe_config: + backend: 'CUTLASS' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 1 + client_configs: + - name: "con2048_iter10_1k1k" + concurrency: 2048 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_tep4_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con32_iter10_1k1k" + concurrency: 32 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_tp4_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con4_iter10_1k1k" + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + # 8k1k configs + - name: "r1_fp4_v2_dep4_mtp1_8k1k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 10304 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + moe_config: + backend: 'CUTLASS' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 1 + client_configs: + - name: "con2048_iter10_8k1k" + concurrency: 2048 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_tep4_mtp3_8k1k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 10304 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con32_iter10_8k1k" + 
concurrency: 32 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_tp4_mtp3_8k1k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 10304 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con4_iter10_8k1k" + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + # 1k8k configs + - name: "r1_fp4_v2_dep4_mtp1_1k8k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + moe_config: + backend: 'CUTLASS' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 1 + client_configs: + - name: "con2048_iter10_1k8k" + concurrency: 2048 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_tep4_mtp3_1k8k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con32_iter10_1k8k" + concurrency: 32 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_tp4_mtp3_1k8k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con4_iter10_1k8k" + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.8 + backend: "openai" diff --git a/tests/scripts/perf-sanity/l0_gb200_multi_nodes.yaml b/tests/scripts/perf-sanity/l0_gb200_multi_nodes.yaml new file mode 100644 index 0000000000..3dcdc83684 --- /dev/null +++ b/tests/scripts/perf-sanity/l0_gb200_multi_nodes.yaml @@ -0,0 +1,71 @@ +# Hardware Config +hardware: + gpus_per_node: 4 + gpus_per_server: 8 + +server_configs: + - name: "r1_fp4_v2_dep8_mtp1" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 8 + gpus_per_node: 4 + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 
512 + max_num_tokens: 2112 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + moe_config: + backend: 'CUTLASS' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + client_configs: + - name: "con32_iter12_1k1k" + concurrency: 32 + iterations: 12 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_tep8_mtp3" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 8 + gpus_per_node: 4 + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 3136 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: "TRTLLM" + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + free_gpu_memory_fraction: 0.5 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con32_iter12_1k1k" + concurrency: 32 + iterations: 12 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" From 4da0e1473cf244c7035a7bfc8155ad8bb0bc0238 Mon Sep 17 00:00:00 2001 From: Zheng Duan <200704041+zhengd-nv@users.noreply.github.com> Date: Mon, 8 Dec 2025 09:51:10 +0800 Subject: [PATCH 06/10] [None][test] add ntp tolerance in time metrics verification (#9741) Signed-off-by: zhengd-nv <200704041+zhengd-nv@users.noreply.github.com> --- .../integration/defs/disaggregated/test_disaggregated.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py index b2db88f0d2..bb811de4d1 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated.py +++ b/tests/integration/defs/disaggregated/test_disaggregated.py @@ -114,13 +114,16 @@ def validate_timing_metrics(perf_metrics_item, request_context=""): )), f"gen server_first_token_time is not numeric in {request_context}" assert gen_server_arrival <= gen_server_first_token, f"gen server_arrival_time > server_first_token_time in {request_context}" + # Network Time Protocol can ensure ms-level accuracy in LAN + ntp_tolerance = 1e-3 + # Validate timing relationships between different levels # Disaggregated server should receive request before individual servers - assert disagg_arrival <= ctx_server_arrival, f"disagg_arrival > ctx_server_arrival in {request_context}" - assert disagg_arrival <= gen_server_arrival, f"disagg_arrival > gen_server_arrival in {request_context}" + assert disagg_arrival - ntp_tolerance <= ctx_server_arrival, f"disagg_arrival > ctx_server_arrival in {request_context}" + assert disagg_arrival - ntp_tolerance <= gen_server_arrival, f"disagg_arrival > gen_server_arrival in {request_context}" # Context should complete before generation starts - assert ctx_server_first_token <= gen_server_arrival, f"ctx_server_first_token > gen_server_arrival in {request_context}" + assert ctx_server_first_token - ntp_tolerance <= gen_server_arrival, f"ctx_server_first_token > gen_server_arrival in {request_context}" # Validate internal timing consistency ctx_arrival_time = ctx_metrics["arrival_time"] From 8e27ce7084d9fab1051e88fc945732e59689761b Mon Sep 17 00:00:00 2001 From: xxi <95731198+xxi-nv@users.noreply.github.com> Date: Mon, 8 Dec 2025 10:19:40 +0800 Subject: [PATCH 07/10] [TRTLLM-9603][feat] 
Enable ConfigurableMoE test in the CI (#9645) --- .../modules/fused_moe/configurable_moe.py | 29 +++-- .../defs/accuracy/test_llm_api_pytorch.py | 94 ++++++++++++-- tests/integration/defs/conftest.py | 92 ++++++++++++++ .../test_lists/test-db/l0_dgx_b200.yml | 10 ++ tests/unittest/_torch/modules/conftest.py | 118 ++++++++++++++++++ .../unittest/_torch/modules/test_fused_moe.py | 62 ++++++++- 6 files changed, 382 insertions(+), 23 deletions(-) create mode 100644 tests/unittest/_torch/modules/conftest.py diff --git a/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py b/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py index 717d8f78fe..c7df8e1f9a 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py +++ b/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py @@ -170,18 +170,23 @@ class ConfigurableMoE(MoE): # ConfigurableMoE's super().__init__() was called with real layer_idx and initialized load balancer. # Backend was created with init_load_balancer=False and without_comm=True to avoid # duplicate initialization. Now sync all attributes from ConfigurableMoE to backend. - self.backend.aux_stream_dict = self.aux_stream_dict - self.backend.layer_idx = self.layer_idx - self.backend.layer_idx_str = self.layer_idx_str - self.backend.num_slots = self.num_slots - self.backend.layer_load_balancer = self.layer_load_balancer - self.backend.repeat_count = self.repeat_count - self.backend.repeat_idx = self.repeat_idx - self.backend.initial_local_expert_ids = self.initial_local_expert_ids - self.backend.initial_global_assignments = self.initial_global_assignments - self.backend.slot_start = self.slot_start - self.backend.slot_end = self.slot_end - self.backend.expert_size_per_partition = self.expert_size_per_partition + if self.backend is not None: + # Add a check to WAR the issue that the backend is none during torch.compile + assert not torch.compiler.is_compiling(), ( + "Backend should not be none if not in torch.compile" + ) + self.backend.aux_stream_dict = self.aux_stream_dict + self.backend.layer_idx = self.layer_idx + self.backend.layer_idx_str = self.layer_idx_str + self.backend.num_slots = self.num_slots + self.backend.layer_load_balancer = self.layer_load_balancer + self.backend.repeat_count = self.repeat_count + self.backend.repeat_idx = self.repeat_idx + self.backend.initial_local_expert_ids = self.initial_local_expert_ids + self.backend.initial_global_assignments = self.initial_global_assignments + self.backend.slot_start = self.slot_start + self.backend.slot_end = self.slot_end + self.backend.expert_size_per_partition = self.expert_size_per_partition # Create weights here, because the backend needs the layer_load_balancer info to create weights model_config._frozen = False diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index f5396fc8a6..09b1613f75 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -13,9 +13,37 @@ # See the License for the specific language governing permissions and # limitations under the License. import os +import sys import pytest import torch +from mpi4py.futures import MPIPoolExecutor + + +def patch_mpi_pool_session_for_env(mocker, env_vars: dict): + """ + Patch MpiPoolSession._start_mpi_pool to propagate environment variables to MPI child processes. 
+ + Uses MPIPoolExecutor's built-in `env` parameter instead of `initializer` to avoid + segfault issues during process cleanup (UCX memory cache conflicts with PyTorch + tensor cleanup during Py_FinalizeEx). + + Args: + mocker: pytest-mock mocker fixture + env_vars: Dictionary of environment variable name -> value to propagate + """ + from tensorrt_llm.llmapi.mpi_session import MpiPoolSession + + def patched_start_mpi_pool(self): + assert not self.mpi_pool, 'MPI session already started' + self.mpi_pool = MPIPoolExecutor(max_workers=self.n_workers, + path=sys.path, + env=env_vars) + + mocker.patch.object(MpiPoolSession, '_start_mpi_pool', + patched_start_mpi_pool) + + from defs.conftest import get_sm_version, is_sm_100f from tensorrt_llm import LLM @@ -1830,9 +1858,24 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): ids=["tp4", "ep4", "tp2pp2", "pp4"]) @parametrize_with_ids("mtp_nextn", [0, 2]) @parametrize_with_ids("moe_backend", ["CUTLASS", "TRTLLM", "CUTEDSL"]) + @pytest.mark.parametrize("enable_configurable_moe", [0, 1], + ids=lambda x: "" + if x == 0 else "enable_configurable_moe") def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler, tp_size, pp_size, ep_size, - torch_compile, mtp_nextn, moe_backend): + torch_compile, mtp_nextn, moe_backend, + enable_configurable_moe, mocker): + # Handle ENABLE_CONFIGURABLE_MOE environment variable + if enable_configurable_moe == 1 and moe_backend != "TRTLLM": + pytest.skip( + f"ENABLE_CONFIGURABLE_MOE=1 is only supported with TRTLLM backend, " + f"current backend is {moe_backend}") + + # Patch MpiPoolSession to propagate env vars to MPI worker processes + env_value = "1" if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0" + patch_mpi_pool_session_for_env(mocker, + {"ENABLE_CONFIGURABLE_MOE": env_value}) + if moe_backend == "TRTLLM" and (get_sm_version() == 120 or get_sm_version() == 121): pytest.skip( @@ -3452,9 +3495,23 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness): ids=["latency", "ep2", "ep4"]) @pytest.mark.parametrize("activation_dtype", ["static_fp8", "mxfp8"], ids=["fp8", "mxfp8"]) + @pytest.mark.parametrize("enable_configurable_moe", [0, 1], + ids=lambda x: "" + if x == 0 else "enable_configurable_moe") def test_w4a8_mxfp4(self, moe_backend, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler, - activation_dtype): + activation_dtype, enable_configurable_moe, mocker): + # Handle ENABLE_CONFIGURABLE_MOE environment variable + if enable_configurable_moe == 1 and moe_backend != "TRTLLM": + pytest.skip( + f"ENABLE_CONFIGURABLE_MOE=1 is only supported with TRTLLM backend, " + f"current backend is {moe_backend}") + + # Patch MpiPoolSession to propagate env vars to MPI worker processes + env_value = "1" if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0" + patch_mpi_pool_session_for_env(mocker, + {"ENABLE_CONFIGURABLE_MOE": env_value}) + if moe_backend == "TRITON": if not IS_TRITON_KERNELS_AVAILABLE: pytest.skip("TRITON moe backend is not available.") @@ -3906,9 +3963,23 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): (4, 1, 4, True, True, True), ], ids=["tp4", "ep4", "dp4"]) + @pytest.mark.parametrize("enable_configurable_moe", [0, 1], + ids=lambda x: "" + if x == 0 else "enable_configurable_moe") def test_w4_4gpus(self, kv_cache_dtype, moe_backend, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler, - mocker): + enable_configurable_moe, mocker): + # Handle ENABLE_CONFIGURABLE_MOE environment variable + if enable_configurable_moe == 1 and 
moe_backend != "TRTLLM": + pytest.skip( + f"ENABLE_CONFIGURABLE_MOE=1 is only supported with TRTLLM backend, " + f"current backend is {moe_backend}") + + # Patch MpiPoolSession to propagate env vars to MPI worker processes + env_value = "1" if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0" + patch_mpi_pool_session_for_env(mocker, + {"ENABLE_CONFIGURABLE_MOE": env_value}) + if moe_backend == "TRITON": if not IS_TRITON_KERNELS_AVAILABLE: pytest.skip("Triton kernels are not available") @@ -3925,7 +3996,8 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, - cuda_graph_config=CudaGraphConfig() if cuda_graph else None) + cuda_graph_config=CudaGraphConfig() if cuda_graph else None, + moe_config=MoeConfig(backend=moe_backend)) kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7, dtype=kv_cache_dtype) @@ -3939,8 +4011,7 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): max_seq_len=max_seq_len, max_batch_size=720, **pytorch_config, - enable_attention_dp=attention_dp, - moe_config=MoeConfig(backend=moe_backend)) + enable_attention_dp=attention_dp) with llm: model_name = "GPT-OSS/120B-MXFP4" @@ -4252,8 +4323,17 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): @pytest.mark.parametrize( "kv_cache_dtype", ["auto", pytest.param("fp8", marks=skip_pre_blackwell)]) - def test_w4_4gpus_online_eplb(self, kv_cache_dtype, mocker): + @pytest.mark.parametrize("enable_configurable_moe", [0, 1], + ids=lambda x: "" + if x == 0 else "enable_configurable_moe") + def test_w4_4gpus_online_eplb(self, kv_cache_dtype, enable_configurable_moe, + mocker): """Test GPTOSS with online expert parallel load balancer using TRTLLM backend and attention DP.""" + # Patch MpiPoolSession to propagate env vars to MPI worker processes + env_value = "1" if enable_configurable_moe == 1 else "0" + patch_mpi_pool_session_for_env(mocker, + {"ENABLE_CONFIGURABLE_MOE": env_value}) + mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192) mocker.patch.dict(GSM8K.EVALUATE_KWARGS, {"scores_filter": "exact_match,flexible-extract"}) diff --git a/tests/integration/defs/conftest.py b/tests/integration/defs/conftest.py index ee374ef086..792eca22a7 100644 --- a/tests/integration/defs/conftest.py +++ b/tests/integration/defs/conftest.py @@ -2209,6 +2209,94 @@ def pytest_generate_tests(metafunc: pytest.Metafunc): metafunc.parametrize("case", uts, ids=lambda x: x) +# Test cases that use enable_configurable_moe parameter and need ID conversion +TESTS_WITH_CONFIGURABLE_MOE = [ + "TestDeepSeekV3Lite::test_nvfp4_4gpus", + "TestGPTOSS::test_w4_4gpus", + "TestGPTOSS::test_w4_4gpus_online_eplb", + "TestQwen3_30B_A3B::test_w4a8_mxfp4", +] + + +def _convert_clean_to_original_moe_test_id(test_id): + """Convert clean MoE test ID back to original format for pytest collection. + + Example: "test_llm_api_pytorch.py::test_foo[param]" -> "test_llm_api_pytorch.py::test_foo[-param]" + + This is needed because the `enable_configurable_moe` parameter uses empty string + as ID when value is 0, resulting in test IDs like "test_foo[-param]". + We clean these up in pytest_collection_modifyitems, but pytest filters tests + during collection using the original IDs. So when user runs with clean test name, + we need to convert it back to match the original. 
+ """ + if "test_llm_api_pytorch.py" not in test_id: + return test_id + + # Match pattern like "test_name[params]" and add leading dash after "[" + # But only if params don't already start with "-" or "enable_configurable_moe" + match = re.search(r"\[([^\]]+)\]", test_id) + if match: + params = match.group(1) + # Skip if already has leading dash or starts with enable_configurable_moe + if not params.startswith("-") and not params.startswith( + "enable_configurable_moe"): + # Add leading dash to params + new_params = "-" + params + test_id = test_id.replace(f"[{params}]", f"[{new_params}]") + + return test_id + + +def pytest_sessionstart(session): + """Convert clean MoE test IDs in config.args to original format for collection. + + This is needed because pytest filters tests during collection using original IDs. + When user runs with clean test name, we convert it back to match the original. + """ + args = session.config.args + for i, arg in enumerate(args): + if "test_llm_api_pytorch.py" in arg and "[" in arg: + # Only apply conversion to specific tests that use enable_configurable_moe + should_convert = any(test_name in arg + for test_name in TESTS_WITH_CONFIGURABLE_MOE) + if should_convert: + args[i] = _convert_clean_to_original_moe_test_id(arg) + + +def _clean_moe_test_ids(items): + """Clean up test IDs by removing leading/trailing dashes from parameter IDs. + + This is needed because `enable_configurable_moe` parameter can be empty, + resulting in ugly test IDs like "test_foo[-True]" or "test_foo[--abc]". + We clean these up to "test_foo[True]" or "test_foo[abc]" so that: + 1. Test names in waive files and test lists remain unchanged + 2. Test reports look cleaner + """ + for item in items: + if "test_llm_api_pytorch.py" in item.nodeid and "[" in item.nodeid: + # Only apply cleanup to specific tests that use enable_configurable_moe + should_cleanup = any(test_name in item.nodeid + for test_name in TESTS_WITH_CONFIGURABLE_MOE) + if should_cleanup: + original_nodeid = item.nodeid + original_name = item.name + nodeid = item.nodeid + name = item.name + + # Clean up leading/trailing dashes in nodeid + nodeid = nodeid.replace("[-", "[") + nodeid = nodeid.replace("-]", "]") + + # Clean up leading/trailing dashes in name + name = name.replace("[-", "[") + name = name.replace("-]", "]") + + if nodeid != original_nodeid: + item._nodeid = nodeid + if name != original_name: + item.name = name + + @pytest.hookimpl(tryfirst=True, hookwrapper=True) def pytest_collection_modifyitems(session, config, items): testlist_path = config.getoption("--test-list") @@ -2217,6 +2305,10 @@ def pytest_collection_modifyitems(session, config, items): perf_test = config.getoption("--perf") test_model_suites = config.getoption("--test-model-suites") + # TODO Once the MoE refactor is complete, this should be removed. + # This is a temporary WAR to minimize the impact of the MoE refactor on the existing test lists. 
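+    # Cleaning the IDs here keeps them consistent for the waive/test-list matching
+    # done later in this hook, so entries in waives.txt and the test lists can keep
+    # using names like "test_foo[tp4-...]" rather than "test_foo[-tp4-...]".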
+ _clean_moe_test_ids(items) + if perf_test: global ALL_PYTEST_ITEMS ALL_PYTEST_ITEMS = None diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index 1204d0c8e6..7bac4b180f 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -17,6 +17,10 @@ l0_dgx_b200: tests: - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[DeepEPLowLatency] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[MNNVL] + - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[enable_configurable_moe-TRTLLM-dtype1] + - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[enable_configurable_moe-TRTLLM] + - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_mxfp4_mxfp8[enable_configurable_moe-True-8-64-TRTLLM] + - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_wfp4a16[enable_configurable_moe-TRTLLM-2880-dtype0] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[pp4-attn_backend=TRTLLM-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] @@ -158,6 +162,8 @@ l0_dgx_b200: - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[enable_configurable_moe-moe_backend=TRTLLM-mtp_nextn=0-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[enable_configurable_moe-moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] @@ -191,12 +197,16 @@ l0_dgx_b200: - accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True] - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0] - accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[enable_configurable_moe-tp4-trtllm-fp8] + - 
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[enable_configurable_moe-ep4-trtllm-fp8] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[enable_configurable_moe-dp4-trtllm-fp8] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-triton-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-fp8] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus_online_eplb[enable_configurable_moe-fp8] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-overlap_scheduler] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-no_overlap_scheduler] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-no_overlap_scheduler] diff --git a/tests/unittest/_torch/modules/conftest.py b/tests/unittest/_torch/modules/conftest.py new file mode 100644 index 0000000000..c7e85eeeea --- /dev/null +++ b/tests/unittest/_torch/modules/conftest.py @@ -0,0 +1,118 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TEMPORARY FILE - Will be removed after MoE refactor is complete. +# +# Background: +# The `enable_configurable_moe` parameter is a temporary measure during the MoE +# refactor. The old and new MoE flows will coexist for a period of time. To avoid +# large-scale changes to the existing test lists, we handle the test ID cleanup +# here. Once the refactor is complete and all tests use ConfigurableMoE by default, +# this file will no longer be needed and should be deleted. +# +# Two-phase approach: +# 1. pytest_sessionstart: Convert clean test names in CLI args back to original +# format so pytest can find tests during collection. +# 2. pytest_collection_modifyitems: Clean up the collected test IDs for display +# and waive matching. +import re + +# Test functions that use enable_configurable_moe parameter and need ID conversion +TESTS_WITH_CONFIGURABLE_MOE = [ + "test_fused_moe_nvfp4", + "test_fused_moe_mxfp4_mxfp8", + "test_fused_moe_w4a8_nvfp4_fp8", + "test_fused_moe_wfp4a16", +] + + +def _convert_clean_to_original_moe_test_id(test_id): + """Convert clean MoE test ID back to original format for pytest collection. + + Example: "test_fused_moe.py::test_foo[TRTLLM-dtype0]" -> "test_fused_moe.py::test_foo[-TRTLLM-dtype0]" + + This is needed because the `enable_configurable_moe` parameter uses empty string + as ID when value is 0, resulting in test IDs like "test_foo[-TRTLLM-dtype0]". + We clean these up in pytest_collection_modifyitems, but pytest filters tests + during collection using the original IDs. 
So when user runs with clean test name, + we need to convert it back to match the original. + """ + if "test_fused_moe.py" not in test_id: + return test_id + + # Match pattern like "test_name[params]" and add leading dash after "[" + # But only if params don't already start with "-" or "enable_configurable_moe" + match = re.search(r"\[([^\]]+)\]", test_id) + if match: + params = match.group(1) + # Skip if already has leading dash or starts with enable_configurable_moe + if not params.startswith("-") and not params.startswith("enable_configurable_moe"): + # Add leading dash to params + new_params = "-" + params + test_id = test_id.replace(f"[{params}]", f"[{new_params}]") + + return test_id + + +def pytest_sessionstart(session): + """Convert clean MoE test IDs in config.args to original format for collection. + + This is needed because pytest filters tests during collection using original IDs. + When user runs with clean test name, we convert it back to match the original. + """ + args = session.config.args + for i, arg in enumerate(args): + if "test_fused_moe.py" in arg and "[" in arg: + # Only apply conversion to specific tests that use enable_configurable_moe + should_convert = any(test_name in arg for test_name in TESTS_WITH_CONFIGURABLE_MOE) + if should_convert: + args[i] = _convert_clean_to_original_moe_test_id(arg) + + +def pytest_collection_modifyitems(items): + """Clean up test IDs by removing leading/trailing dashes from parameter IDs. + + This is needed because `enable_configurable_moe` parameter can be empty, + resulting in ugly test IDs like "test_foo[-True]" or "test_foo[--abc]". + We clean these up to "test_foo[True]" or "test_foo[abc]" so that: + 1. Test names in waive files and test lists remain unchanged + 2. Test reports look cleaner + + This runs BEFORE the global conftest applies waives (due to hookwrapper). 
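+
+    Example: "test_fused_moe.py::test_foo[-TRTLLM-dtype0]" is reported (and matched
+    against waives) as "test_fused_moe.py::test_foo[TRTLLM-dtype0]".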
+ """ + for item in items: + if "test_fused_moe.py" in item.nodeid and "[" in item.nodeid: + # Only apply cleanup to specific tests that use enable_configurable_moe + should_cleanup = any( + test_name in item.nodeid for test_name in TESTS_WITH_CONFIGURABLE_MOE + ) + if should_cleanup: + original_nodeid = item.nodeid + original_name = item.name + nodeid = item.nodeid + name = item.name + + # Clean up leading/trailing dashes in nodeid + nodeid = nodeid.replace("[-", "[") + nodeid = nodeid.replace("-]", "]") + + # Clean up leading/trailing dashes in name + name = name.replace("[-", "[") + name = name.replace("-]", "]") + + if nodeid != original_nodeid: + item._nodeid = nodeid + if name != original_name: + item.name = name diff --git a/tests/unittest/_torch/modules/test_fused_moe.py b/tests/unittest/_torch/modules/test_fused_moe.py index 4ba09fa79c..1db2aab76a 100644 --- a/tests/unittest/_torch/modules/test_fused_moe.py +++ b/tests/unittest/_torch/modules/test_fused_moe.py @@ -1356,7 +1356,20 @@ def test_fused_moe_fp8_blockwise_cute_dsl_multi_gpu(ep_size, routing_method, @pytest.mark.parametrize("moe_backend", [ pytest.param("TRTLLM", marks=skip_blackwell_geforce), "CUTLASS", "CUTEDSL" ]) -def test_fused_moe_nvfp4(dtype, moe_backend): +@pytest.mark.parametrize("enable_configurable_moe", [0, 1], + ids=lambda x: "" + if x == 0 else "enable_configurable_moe") +def test_fused_moe_nvfp4(dtype, moe_backend, enable_configurable_moe, mocker): + + if enable_configurable_moe == 1 and moe_backend != "TRTLLM": + pytest.skip("ENABLE_CONFIGURABLE_MOE=1, only TRTLLM backend is enabled") + + mocker.patch.dict( + os.environ, { + "ENABLE_CONFIGURABLE_MOE": + "1" + if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0" + }) if moe_backend == "TRTLLM" and dtype == torch.float16: pytest.skip("TRTLLM NVFP4 MoE backend does not support float16 yet") @@ -1515,7 +1528,20 @@ def test_fused_moe_nvfp4(dtype, moe_backend): @pytest.mark.parametrize( "moe_backend", [pytest.param("TRTLLM", marks=skip_blackwell_geforce), "CUTLASS"]) -def test_fused_moe_w4a8_nvfp4_fp8(moe_backend): +@pytest.mark.parametrize("enable_configurable_moe", [0, 1], + ids=lambda x: "" + if x == 0 else "enable_configurable_moe") +def test_fused_moe_w4a8_nvfp4_fp8(moe_backend, enable_configurable_moe, mocker): + if enable_configurable_moe == 1 and moe_backend != "TRTLLM": + pytest.skip("ENABLE_CONFIGURABLE_MOE=1, only TRTLLM backend is enabled") + + mocker.patch.dict( + os.environ, { + "ENABLE_CONFIGURABLE_MOE": + "1" + if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0" + }) + dtype = torch.bfloat16 mapping = Mapping() mapping.rank = mpi_rank() @@ -1930,7 +1956,21 @@ def test_fused_moe_w4afp8(dtype, weight_loading_mode): @pytest.mark.parametrize("hidden_unpadded", [64, 192, 256]) @pytest.mark.parametrize("seq_len", [8, 128]) @pytest.mark.parametrize("bias", [True, False]) -def test_fused_moe_mxfp4_mxfp8(moe_backend, hidden_unpadded, seq_len, bias): +@pytest.mark.parametrize("enable_configurable_moe", [0, 1], + ids=lambda x: "" + if x == 0 else "enable_configurable_moe") +def test_fused_moe_mxfp4_mxfp8(moe_backend, hidden_unpadded, seq_len, bias, + enable_configurable_moe, mocker): + + if enable_configurable_moe == 1 and moe_backend != "TRTLLM": + pytest.skip("ENABLE_CONFIGURABLE_MOE=1, only TRTLLM backend is enabled") + + mocker.patch.dict( + os.environ, { + "ENABLE_CONFIGURABLE_MOE": + "1" + if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0" + }) if moe_backend == "CUTLASS" and hidden_unpadded % 128 != 
0: pytest.skip() @@ -2191,7 +2231,21 @@ def test_fused_moe_mxfp4_mxfp8(moe_backend, hidden_unpadded, seq_len, bias): marks=[skip_pre_hopper, skip_blackwell, skip_blackwell_geforce]), ], ) -def test_fused_moe_wfp4a16(dtype, hidden_size, moe_backend): +@pytest.mark.parametrize("enable_configurable_moe", [0, 1], + ids=lambda x: "" + if x == 0 else "enable_configurable_moe") +def test_fused_moe_wfp4a16(dtype, hidden_size, moe_backend, + enable_configurable_moe, mocker): + + if enable_configurable_moe == 1 and moe_backend != "TRTLLM": + pytest.skip("ENABLE_CONFIGURABLE_MOE=1, only TRTLLM backend is enabled") + + mocker.patch.dict( + os.environ, { + "ENABLE_CONFIGURABLE_MOE": + "1" + if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0" + }) mapping = Mapping() mapping.rank = mpi_rank() From 9bfb6179ec6dab87cf7f42a1c5a4b39dbf2b8d27 Mon Sep 17 00:00:00 2001 From: fredricz-20070104 <226039983+fredricz-20070104@users.noreply.github.com> Date: Mon, 8 Dec 2025 10:41:40 +0800 Subject: [PATCH 08/10] [https://nvbugs/5422621][test] Add GB 200 WIDEEP test case for RCCA 5422621 (#9506) Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> --- .../defs/perf/disagg/execution/executor.py | 4 +- .../perf/disagg/execution/subprocess_utils.py | 8 ++ ...1_gen1_dep16_bs64_eplb0_mtp3_ccb-NIXL.yaml | 3 + ...x1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX.yaml | 3 + ...1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml | 3 + ...x1_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml | 3 + ...x1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml | 3 + ...tx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml | 3 + ..._gen1_dep16_bs128_eplb0_mtp1_ccb-NIXL.yaml | 3 + ...2_gen1_dep16_bs128_eplb0_mtp1_ccb-UCX.yaml | 3 + ...x1_gen1_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml | 95 +++++++++++++++ ...tx1_gen1_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml | 95 +++++++++++++++ ...1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml | 3 + ...x1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml | 3 + ...x1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml | 3 + ...tx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml | 3 + ...x1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml | 3 + ...tx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml | 3 + ..._gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml | 3 + ...2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml | 3 + ...x1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml | 3 + ...tx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml | 3 + ...x1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml | 3 + ...tx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml | 3 + ...6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml | 3 + ...x6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml | 3 + ...8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml | 3 + ...x8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml | 3 + ...en1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml | 1 + ...gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL.yaml | 3 + ..._gen1_dep16_bs64_eplb288_mtp3_ccb-UCX.yaml | 3 + ...gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml | 3 + ..._gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml | 3 + ...en1_dep16_bs128_eplb288_mtp1_ccb-NIXL.yaml | 3 + ...gen1_dep16_bs128_eplb288_mtp1_ccb-UCX.yaml | 3 + ...gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml | 3 + ..._gen1_dep32_bs32_eplb288_mtp0_ccb-UCX.yaml | 3 + ...en1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml | 3 + ...gen1_dep16_bs128_eplb288_mtp3_ccb-UCX.yaml | 3 + ...1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml | 110 ++++++++++++++++++ ..._dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml | 3 + ...gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml | 3 + ..._gen1_dep16_bs64_eplb288_mtp0_ccb-UCX.yaml | 3 + ...gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml | 3 + 
..._gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml | 3 + .../defs/perf/disagg/testlist/disagg.txt | 2 + .../defs/perf/disagg/testlist/wideep.txt | 1 + .../defs/perf/disagg/utils/common.py | 4 +- tests/integration/defs/pytest.ini | 2 +- 49 files changed, 435 insertions(+), 4 deletions(-) create mode 100644 tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml create mode 100644 tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml create mode 100644 tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml diff --git a/tests/integration/defs/perf/disagg/execution/executor.py b/tests/integration/defs/perf/disagg/execution/executor.py index 170d2f3e35..fc4d6e785e 100644 --- a/tests/integration/defs/perf/disagg/execution/executor.py +++ b/tests/integration/defs/perf/disagg/execution/executor.py @@ -249,8 +249,8 @@ class JobManager: logger.error(f"Job submission exception: {error_msg}") # Clean up temporary file on exception temp_config_path = test_config.temp_config_path - if os.path.exists(temp_config_path): - os.remove(temp_config_path) + # if os.path.exists(temp_config_path): + # os.remove(temp_config_path) return False, error_msg @staticmethod diff --git a/tests/integration/defs/perf/disagg/execution/subprocess_utils.py b/tests/integration/defs/perf/disagg/execution/subprocess_utils.py index a66d190a16..7034254ee0 100644 --- a/tests/integration/defs/perf/disagg/execution/subprocess_utils.py +++ b/tests/integration/defs/perf/disagg/execution/subprocess_utils.py @@ -12,6 +12,8 @@ No complex process tree cleanup is needed because: import subprocess from typing import Optional +from utils.logger import logger + def exec_cmd(*popenargs, timeout: Optional[float] = None, **kwargs) -> int: """Execute command and return exit code. 
@@ -54,4 +56,10 @@ def exec_cmd_with_output(*popenargs, timeout: Optional[float] = None, **kwargs) check=True, **kwargs, ) + + # Log stderr if it exists + if result.stderr: + stderr_output = result.stderr.decode() + logger.error(f"Command stderr: {stderr_output}") + return result.stdout.decode() diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-NIXL.yaml index 841eb55b6f..f2cb1cb438 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-NIXL.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX.yaml index 8fe7d96229..aea7d01c16 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml index 44a93659e7..d49ce13c0d 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml 
b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml index 68c6f5f8c2..2f8c655fc6 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml index cc16b00b1b..786b107f81 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml index 2751424f82..f118685588 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-NIXL.yaml index a44c8e3286..4aea781e7d 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-NIXL.yaml @@ -14,6 +14,7 @@ slurm: account: 
job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-UCX.yaml index 37f884cee5..8ad78695cb 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-UCX.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 0000000000..7b61b2ed53 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,95 @@ +# nvbugs: 5561153 +metadata: + model_name: Qwen3-235B-A22B-FP8 + precision: fp8 + model_dir_name: Qwen3-235B-A22B-FP8 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k + config_index: 21 +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: 1 2 4 8 16 36 + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 2048 + max_seq_len: 2051 + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: true + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: TRTLLM + 
cache_transceiver_config: + max_tokens_in_buffer: 2048 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + disable_overlap_scheduler: false + ctx: + max_batch_size: 32 + max_num_tokens: 2048 + max_seq_len: 2051 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: true + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 2048 + backend: NIXL diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml new file mode 100644 index 0000000000..283755728b --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml @@ -0,0 +1,95 @@ +# nvbugs: 5561153 +metadata: + model_name: Qwen3-235B-A22B-FP8 + precision: fp8 + model_dir_name: Qwen3-235B-A22B-FP8 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k + config_index: 21 +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: 1 2 4 8 16 36 + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 2048 + max_seq_len: 2051 + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: true + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 2048 + backend: UCX + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + disable_overlap_scheduler: false + ctx: + max_batch_size: 32 + max_num_tokens: 2048 + max_seq_len: 2051 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: true + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 2048 + backend: UCX diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml 
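The new FP8 configs above carry worker_env_var and server_env_var as single space-separated KEY=VALUE strings. A hedged sketch of how such a string could be expanded into an environment mapping before launching a worker (parse_env_var_string is a hypothetical helper, not code from this repo):

import os
import shlex

def parse_env_var_string(env_str):
    # Split 'KEY=VAL KEY2=VAL2' into a dict; shlex keeps quoted values intact.
    env = {}
    for item in shlex.split(env_str or ""):
        key, _, value = item.partition("=")
        if key:
            env[key] = value
    return env

worker_env = parse_env_var_string(
    "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1")
launch_env = {**os.environ, **worker_env}  # e.g. passed to subprocess.Popen(env=...)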
b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml index f9dd57dc2c..33ee191ffd 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -37,6 +38,8 @@ environment: build_wheel: false trtllm_wheel_path: '' work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml index 0a704285d4..12ac8edad0 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml index bda1706561..ab5bd6f719 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml index 1526472d23..7d8cb97621 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: 
"--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml index cb0363fdd3..3f9a7d6a2d 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml index 87142ebc06..f2fd2bc21d 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml index 4d8565a190..5d9d739d58 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml 
b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml index 8f54e9d2a5..f97137297b 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml index 400eac6cf8..6b9078ac5a 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml index 0a44b5b2e5..468354c073 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml index ab31d77167..a970ee6de4 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: 
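Each of these configs also gains extra_args: "--gres=gpu:4", which asks Slurm for four GPUs per node. A sketch of how such a free-form option might be spliced into a submission command (assumed for illustration; the real submission logic lives in disaggr_torch.slurm and its surrounding tooling):

import shlex

def build_sbatch_cmd(script, job_time, job_name, extra_args=""):
    cmd = ["sbatch", f"--time={job_time}", f"--job-name={job_name}"]
    # shlex.split keeps quoted arguments such as --gres=gpu:4 intact.
    cmd += shlex.split(extra_args)
    cmd.append(script)
    return cmd

print(build_sbatch_cmd("disaggr_torch.slurm", "02:00:00",
                       "unified-benchmark", "--gres=gpu:4"))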
"--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml index b0581a7e26..22dc90a06b 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml index a67db056a2..a54b0dacd5 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml index 5a6132741f..ab081e78cf 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml 
b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml index 58cb470baf..f4a5d3bc3a 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml index c7ea82b572..9388365383 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml index ba44ed4c10..1eaf479dcc 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml @@ -21,6 +21,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL.yaml index 489b4aeacf..60a221d996 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-UCX.yaml index 5a25ecfc4a..8724f191f5 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-UCX.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml index 28c55ce399..738c720650 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml index e2a9f70588..af30a466be 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-NIXL.yaml index 5cf614ba63..c44b3f6bba 
100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-NIXL.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-UCX.yaml index 872e5c7a1c..b7a79d7434 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-UCX.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml index c6879f3cbb..73a27246c0 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-UCX.yaml index 2f254163c5..e95e71ca15 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-UCX.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ 
environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml index 01362f7853..6055421a27 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-UCX.yaml index 17ffdbd15b..6b47c0fc36 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-UCX.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml new file mode 100644 index 0000000000..1e71708f57 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml @@ -0,0 +1,110 @@ +# nvbugs: 5422621 +metadata: + model_name: deepseek-r1-fp4 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-V2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k + config_index: 7 + dataset_file: datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '12288' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + 
num_ctx_servers: 2 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + enable_layerwise_nvtx_marker: true + tensor_parallel_size: 48 + moe_expert_parallel_size: 48 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 1024 + max_num_tokens: 1024 + max_seq_len: 2176 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: WIDEEP + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 8320 + backend: DEFAULT + stream_interval: 20 + ctx: + enable_layerwise_nvtx_marker: true + max_batch_size: 4 + max_num_tokens: 4480 + max_seq_len: 2176 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8320 + backend: DEFAULT diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml index 52012de6e2..06900691bc 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: disaggr-test + extra_args: "--gres=gpu:4" numa_bind: true hardware: gpus_per_node: 4 @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml index 216c6f8899..13572a6049 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ 
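The gen worker above enables cuda_graph_config padding over a fixed ladder of captured batch sizes. A small sketch of the padding idea, assuming a runtime batch is rounded up to the nearest captured size when one exists (illustrative only, not TensorRT-LLM's implementation):

import bisect

CAPTURED_BATCH_SIZES = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 768, 1024, 2048]

def padded_batch_size(n, captured=CAPTURED_BATCH_SIZES):
    # Round up to the nearest captured CUDA graph batch size; fall back to n
    # if the batch is larger than anything captured.
    i = bisect.bisect_left(captured, n)
    return captured[i] if i < len(captured) else n

assert padded_batch_size(100) == 128
assert padded_batch_size(4096) == 4096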
-37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-UCX.yaml index 104e567525..30e6152302 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-UCX.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml index 9aa8e38d15..55391a698c 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml index d60df72d59..62301215e9 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/testlist/disagg.txt b/tests/integration/defs/perf/disagg/testlist/disagg.txt index 8f06a99961..bd0c10fb4c 100644 --- 
a/tests/integration/defs/perf/disagg/testlist/disagg.txt +++ b/tests/integration/defs/perf/disagg/testlist/disagg.txt @@ -16,6 +16,8 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_ test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX] test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL] test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX] +test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-NIXL] +test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-UCX] # test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-NIXL] # test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-NIXL] # test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-UCX] diff --git a/tests/integration/defs/perf/disagg/testlist/wideep.txt b/tests/integration/defs/perf/disagg/testlist/wideep.txt index 4f1064ec68..55e7bd4721 100644 --- a/tests/integration/defs/perf/disagg/testlist/wideep.txt +++ b/tests/integration/defs/perf/disagg/testlist/wideep.txt @@ -7,6 +7,7 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_ test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-UCX] test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL] test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL] +test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT] # test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL] # test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL] # test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-NIXL] diff --git a/tests/integration/defs/perf/disagg/utils/common.py b/tests/integration/defs/perf/disagg/utils/common.py index b62ef4341c..9fb72fbacb 100644 --- a/tests/integration/defs/perf/disagg/utils/common.py +++ b/tests/integration/defs/perf/disagg/utils/common.py @@ -1,6 +1,7 @@ """Disaggregated Benchmark Configuration.""" import os +from datetime import datetime SESSION_COLLECT_CMD_TYPE = "session_collect" @@ -169,7 +170,8 @@ def extract_config_fields(config_data: dict) -> dict: # Generate derived fields dep_flag = "dep" if gen_enable_dp else "tep" - log_base = f"{isl}-{osl}" + date_prefix = datetime.now().strftime("%Y%m%d") + log_base = f"{date_prefix}/{isl}-{osl}" context_dir = ( f"ctx{ctx_num}_gen{gen_num}_{dep_flag}{gen_tp_size}_" f"batch{gen_batch_size}_eplb{eplb_slots}_mtp{mtp_size}" diff --git a/tests/integration/defs/pytest.ini b/tests/integration/defs/pytest.ini index 6d6237d581..dcca875f03 100644 --- a/tests/integration/defs/pytest.ini +++ 
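The utils/common.py change above prefixes the per-run log directory with the current date, so repeated runs of the same ISL/OSL pair no longer collide. A sketch of the resulting layout, mirroring the diff (the helper name is hypothetical):

from datetime import datetime
from pathlib import Path

def make_log_base(isl, osl):
    date_prefix = datetime.now().strftime("%Y%m%d")
    return Path(date_prefix) / f"{isl}-{osl}"

print(make_log_base(1024, 1024))  # e.g. 20251208/1024-1024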
b/tests/integration/defs/pytest.ini @@ -6,7 +6,7 @@ junit_family=legacy addopts = --ignore-glob="*perf/test_perf.py" --ignore-glob="*perf/disagg/*" --ignore-glob="*test_list_validation.py" --ignore-glob="*llm-test-workspace*" --durations=0 -W ignore::DeprecationWarning pythonpath = ../../../examples/auto_deploy -norecursedirs = ./triton/perf +norecursedirs = ./triton/perf ./perf/disagg markers = skip_less_device: skip when less device detected than the declared skip_less_mpi_world_size: skip when less mpi world size detected than the declared From 8b9ab9a701b706cf934b0b8cb680d2f7e85f57c2 Mon Sep 17 00:00:00 2001 From: Yukun He <23156053+hyukn@users.noreply.github.com> Date: Mon, 8 Dec 2025 10:47:21 +0800 Subject: [PATCH 09/10] [None][fix] Fix two tuning cache miss issues. (#9743) Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com> --- tensorrt_llm/_torch/autotuner.py | 7 ++++++- tensorrt_llm/_torch/custom_ops/torch_custom_ops.py | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tensorrt_llm/_torch/autotuner.py b/tensorrt_llm/_torch/autotuner.py index feecf3d174..609efd1055 100644 --- a/tensorrt_llm/_torch/autotuner.py +++ b/tensorrt_llm/_torch/autotuner.py @@ -511,6 +511,11 @@ class AutoTunerProfilingCache: cache = {} cache_data = serializable_cache["cache_data"] + def lists_to_tuples(obj): + if isinstance(obj, list): + return tuple(lists_to_tuples(x) for x in obj) + return obj + for key_str, value in cache_data.items(): # Reconstruct the tuple key safely try: @@ -521,7 +526,7 @@ class AutoTunerProfilingCache: continue runner_id = value["runner_id"] - tactic = value["tactic"] + tactic = lists_to_tuples(value["tactic"]) min_time = value["min_time"] cache[key] = (runner_id, tactic, min_time) diff --git a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py index 003f137883..fe09758cfe 100644 --- a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py @@ -486,10 +486,10 @@ class CublasLtFP4GemmRunner(TunableRunner): self.cublaslt_runner = CublasLtFP4GemmRunner.runner_dict[instance_key] def unique_id(self): - return hash(( + return ( self.to_userbuffers, self.output_dtype, - )) + ) def get_valid_tactics(self, inputs: List[torch.Tensor], profile: OptimizationProfile, **kwargs) -> List[int]: From 03f89d7aa40f77c6c4d5b9f1416d1ddca2a72427 Mon Sep 17 00:00:00 2001 From: TensorRT LLM <90828364+tensorrt-cicd@users.noreply.github.com> Date: Mon, 8 Dec 2025 03:07:46 +0000 Subject: [PATCH 10/10] [None][infra] Check in most recent lock file from nightly pipeline Signed-off-by: TensorRT LLM <90828364+tensorrt-cicd@users.noreply.github.com> --- security_scanning/examples/models/core/mllama/poetry.lock | 2 +- security_scanning/metadata.json | 4 ++-- security_scanning/poetry.lock | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/security_scanning/examples/models/core/mllama/poetry.lock b/security_scanning/examples/models/core/mllama/poetry.lock index c58e7c12b5..11e0ed3ccb 100644 --- a/security_scanning/examples/models/core/mllama/poetry.lock +++ b/security_scanning/examples/models/core/mllama/poetry.lock @@ -708,7 +708,7 @@ files = [ [[package]] name = "nvidia-modelopt" version = "0.21.1" -description = "Nvidia Model Optimizer: a unified model optimization and deployment toolkit." +description = "Nvidia TensorRT Model Optimizer: a unified model optimization and deployment toolkit." 
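The autotuner fix above restores tuple-typed tactics when reloading the profiling cache (JSON round-trips tuples as lists), and unique_id now returns the underlying tuple instead of its hash. The conversion's behavior in isolation:

def lists_to_tuples(obj):
    # JSON has no tuple type, so nested tuples come back as lists; convert
    # them recursively so cached tactics hash and compare like live ones.
    if isinstance(obj, list):
        return tuple(lists_to_tuples(x) for x in obj)
    return obj

assert lists_to_tuples([1, [2, 3], "a"]) == (1, (2, 3), "a")
assert hash(lists_to_tuples([1, [2, 3]])) == hash((1, (2, 3)))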
optional = false python-versions = "<3.13,>=3.8" files = [ diff --git a/security_scanning/metadata.json b/security_scanning/metadata.json index d160caaae0..0c24542544 100644 --- a/security_scanning/metadata.json +++ b/security_scanning/metadata.json @@ -1,4 +1,4 @@ { - "commit_hash": "e4c707845ff58fcc0b1d87afb4dd0e64885c780a", - "timestamp": "2025-12-07T02:39:14Z" + "commit_hash": "8e27ce7084d9fab1051e88fc945732e59689761b", + "timestamp": "2025-12-08T02:39:23Z" } diff --git a/security_scanning/poetry.lock b/security_scanning/poetry.lock index e5959abf84..18ed93657e 100644 --- a/security_scanning/poetry.lock +++ b/security_scanning/poetry.lock @@ -2793,7 +2793,7 @@ files = [ [[package]] name = "nvidia-modelopt" version = "0.37.0" -description = "Nvidia Model Optimizer: a unified model optimization and deployment toolkit." +description = "Nvidia TensorRT Model Optimizer: a unified model optimization and deployment toolkit." optional = false python-versions = "<3.13,>=3.10" files = [