Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-02-22 02:35:21 +08:00)
[https://nvbugs/5515753][ci] Add NCCL_DEBUG=INFO flag to collect more info with CI failure. (#8440)
Signed-off-by: Simeng Liu <simengl@nvidia.com>
Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com>
Signed-off-by: Mike Iovine <miovine@nvidia.com>
parent ee6944bfa2
commit 9286223288
@@ -2209,6 +2209,19 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
    def noIsolateTests = false
    def rerunFailed = false

    echoNodeAndGpuInfo(pipeline, stageName)
    sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'

    def extraInternalEnv = ""
    def pytestTestTimeout = "3600"

    // TRT uses half of the host logic cores for engine building which is bad for multi-GPU machines.
    extraInternalEnv = "__LUNOWUD=\"-thread_pool_size=${TESTER_CORES}\""
    // CPP test execution is timing out easily, so we always override its internal timeout to the same value as pytest
    extraInternalEnv += " CPP_TEST_TIMEOUT_OVERRIDDEN=${pytestTestTimeout}"
    // Enable NCCL debug information for multi-GPU tests
    extraInternalEnv += " NCCL_DEBUG=INFO"

    def testDBList = renderTestDB(testList, llmSrc, stageName)

    // Process shard test list and create separate files for regular and isolate tests
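For context on what the new flag changes at test time: NCCL reads NCCL_DEBUG from the environment of the process that creates the communicator, so injecting it through extraInternalEnv makes every multi-GPU test emit NCCL's version, transport selection, and ring/tree setup lines into the CI log. Below is a minimal standalone sketch of the same effect outside the Jenkins harness; the file name, the torchrun launch line, and the 2-GPU count are illustrative assumptions, not part of this change.

# nccl_debug_demo.py -- illustrative only, not part of the TensorRT-LLM harness.
# Assumed launch (2 GPUs): NCCL_DEBUG=INFO torchrun --nproc_per_node=2 nccl_debug_demo.py
import os

import torch
import torch.distributed as dist


def main():
    # NCCL only honors NCCL_DEBUG if it is set before the communicator is
    # created, which is why the CI change exports it for the whole test process.
    os.environ.setdefault("NCCL_DEBUG", "INFO")

    dist.init_process_group(backend="nccl")
    rank = dist.get_rank()
    torch.cuda.set_device(rank)

    # The first collective initializes the communicator and triggers the
    # INFO-level log lines (NCCL version, transports, ring/tree channels).
    payload = torch.ones(1, device="cuda")
    dist.all_reduce(payload)
    print(f"rank {rank}: all_reduce result = {payload.item()}")

    dist.destroy_process_group()


if __name__ == "__main__":
    main()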
@@ -194,7 +194,6 @@ def row_linear_residual_norm_fusion_forward(
)
def test_row_linear_residual_norm_fusion(seq_len, hidden_size, dtype, strategy,
                                         fusion):

    if strategy == AllReduceStrategy.NCCL_SYMMETRIC and 2048 in seq_len:
        pytest.skip("https://nvbugspro.nvidia.com/bug/5573856")
@@ -33,9 +33,6 @@ def test_deepseek_streaming(model_name, backend, quant, tp_size):
    is_fp8 = quant == "fp8"
    is_fp4 = quant == "fp4"

    if tp_size == 4:
        pytest.skip(f"https://nvbugs/5515753")

    if torch.cuda.device_count() < tp_size:
        pytest.skip(f"Not enough GPUs available, need {tp_size} "
                    f"but only have {torch.cuda.device_count()}")
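The GPU-count guard in this hunk is written as an inline pytest.skip inside the test body. An equivalent way to express the same guard is a skipif marker evaluated at collection time; the sketch below is only a hedged alternative using standard pytest APIs, and the requires_gpus helper is a hypothetical name, not something from the file above.

import pytest
import torch


# Hypothetical helper, not from the repo: skip a test when the node has
# fewer visible CUDA devices than the requested tensor-parallel size.
def requires_gpus(n):
    return pytest.mark.skipif(
        torch.cuda.device_count() < n,
        reason=f"Not enough GPUs available, need {n} "
               f"but only have {torch.cuda.device_count()}",
    )


@requires_gpus(4)
def test_needs_four_gpus():
    # Placeholder body; a real test would build the TP=4 model here.
    assert torch.cuda.device_count() >= 4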
@@ -1056,6 +1056,7 @@ class TestMoeFp4:
    )
    def test_autotune(self, num_tokens, hidden_size, intermediate_size,
                      routing_info):
        pytest.skip("https://nvbugs/5575841")

        self.run_moe_fp4_test(num_tokens,
                              hidden_size,
@@ -1138,6 +1139,7 @@ class TestMoeFp4:
                             ids=["use_score_as_input", "use_topk_as_input"])
    def test_no_autotune(self, num_tokens, hidden_size, intermediate_size,
                         routing_info, use_topk_as_input):
        pytest.skip("https://nvbugs/5575841")

        self.run_moe_fp4_test(num_tokens,
                              hidden_size,
@@ -1234,6 +1236,9 @@ class TestMoeFp4:
        if padding >= 256:
            pytest.skip("Routing kernel requires that padding be less than 256")

        if intermediate_size == 384:
            pytest.skip("https://nvbugs/5434352")

        assert top_k <= num_experts
        assert top_k <= 10
        assert num_experts % 4 == 0
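The nvbugs-linked skips above quarantine individual cases from inside the test body. A related pattern, sketched here only as an assumption-labeled alternative rather than what this commit does, is to attach the skip to the offending parametrization itself so the rest of the matrix keeps running; the list values and test name below are illustrative.

import pytest

# Illustrative values; the real tests parametrize many more fields.
INTERMEDIATE_SIZES = [
    256,
    # Quarantine a single known-bad value and keep the bug URL as the reason,
    # mirroring the style of the skips above (the URL is from the diff, the
    # pytest.param wrapping is the hypothetical part).
    pytest.param(384, marks=pytest.mark.skip(reason="https://nvbugs/5434352")),
    512,
]


@pytest.mark.parametrize("intermediate_size", INTERMEDIATE_SIZES)
def test_moe_shapes(intermediate_size):
    assert intermediate_size % 128 == 0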