From 92862232883915834afc9e850640a63398f46790 Mon Sep 17 00:00:00 2001
From: Simeng Liu <109828133+SimengLiu-nv@users.noreply.github.com>
Date: Tue, 21 Oct 2025 18:12:05 -0700
Subject: [PATCH] [https://nvbugs/5515753][ci] Add NCCL_DEBUG=INFO flag to
 collect more info with CI failure. (#8440)

Signed-off-by: Simeng Liu
Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com>
Signed-off-by: Mike Iovine
---
 jenkins/L0_Test.groovy                           | 13 +++++++++++++
 .../_torch/multi_gpu/test_mnnvl_allreduce.py     |  1 -
 .../_torch/multi_gpu_modeling/test_deepseek.py   |  3 ---
 tests/unittest/_torch/thop/parallel/test_moe.py  |  5 +++++
 4 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index dd0cb2b439..74ce61270e 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -2209,6 +2209,19 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
     def noIsolateTests = false
     def rerunFailed = false
 
+    echoNodeAndGpuInfo(pipeline, stageName)
+    sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'
+
+    def extraInternalEnv = ""
+    def pytestTestTimeout = "3600"
+
+    // TRT uses half of the host logic cores for engine building which is bad for multi-GPU machines.
+    extraInternalEnv = "__LUNOWUD=\"-thread_pool_size=${TESTER_CORES}\""
+    // CPP test execution is timing out easily, so we always override its internal timeout to the same value as pytest
+    extraInternalEnv += " CPP_TEST_TIMEOUT_OVERRIDDEN=${pytestTestTimeout}"
+    // Enable NCCL debug information for multi-GPU tests
+    extraInternalEnv += " NCCL_DEBUG=INFO"
+
     def testDBList = renderTestDB(testList, llmSrc, stageName)
 
     // Process shard test list and create separate files for regular and isolate tests
diff --git a/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py b/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py
index 9fee65d30b..56cf5a9562 100644
--- a/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py
+++ b/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py
@@ -194,7 +194,6 @@ def row_linear_residual_norm_fusion_forward(
 )
 def test_row_linear_residual_norm_fusion(seq_len, hidden_size, dtype, strategy,
                                          fusion):
-
     if strategy == AllReduceStrategy.NCCL_SYMMETRIC and 2048 in seq_len:
         pytest.skip("https://nvbugspro.nvidia.com/bug/5573856")
 
diff --git a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py
index a94e89c743..5a38f0d078 100644
--- a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py
+++ b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py
@@ -33,9 +33,6 @@ def test_deepseek_streaming(model_name, backend, quant, tp_size):
     is_fp8 = quant == "fp8"
     is_fp4 = quant == "fp4"
 
-    if tp_size == 4:
-        pytest.skip(f"https://nvbugs/5515753")
-
     if torch.cuda.device_count() < tp_size:
         pytest.skip(f"Not enough GPUs available, need {tp_size} "
                     f"but only have {torch.cuda.device_count()}")
diff --git a/tests/unittest/_torch/thop/parallel/test_moe.py b/tests/unittest/_torch/thop/parallel/test_moe.py
index 77e9299f6a..5fefa69925 100644
--- a/tests/unittest/_torch/thop/parallel/test_moe.py
+++ b/tests/unittest/_torch/thop/parallel/test_moe.py
@@ -1056,6 +1056,7 @@ class TestMoeFp4:
     )
     def test_autotune(self, num_tokens, hidden_size, intermediate_size,
                       routing_info):
+        pytest.skip("https://nvbugs/5575841")
         self.run_moe_fp4_test(num_tokens,
                               hidden_size,
@@ -1138,6 +1139,7 @@ class TestMoeFp4:
                              ids=["use_score_as_input", "use_topk_as_input"])
     def test_no_autotune(self, num_tokens, hidden_size, intermediate_size,
                          routing_info, use_topk_as_input):
+        pytest.skip("https://nvbugs/5575841")
         self.run_moe_fp4_test(num_tokens,
                               hidden_size,
@@ -1234,6 +1236,9 @@ class TestMoeFp4:
         if padding >= 256:
             pytest.skip("Routing kernel requires that padding be less than 256")
 
+        if intermediate_size == 384:
+            pytest.skip("https://nvbugs/5434352")
+
         assert top_k <= num_experts
         assert top_k <= 10
         assert num_experts % 4 == 0