Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)

[TRTLLM-7351][infra] Add isolate marker for L0 (#7497)

Signed-off-by: qqiao <qqiao@nvidia.com>
Signed-off-by: Emma Qiao <qqiao@nvidia.com>
Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>

Parent: 9d855f47ad
Commit: 493da020c1
@@ -270,6 +270,250 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
}
}

def runIsolatedTests(preprocessedLists, testCmdLine, llmSrc, stageName) {
    // Run the isolated tests one by one to avoid any potential conflicts
    def isolateTestList = preprocessedLists.isolate
    def isolateTestLines = readFile(file: isolateTestList).readLines()
    def rerunFailed = false

    for (int i = 0; i < isolateTestLines.size(); i++) {
        def isolateTestName = isolateTestLines[i].trim()
        // Create a temporary file for this single isolated test
        def singleTestFile = "${isolateTestList}_isolated_${i}.txt"
        sh "echo '${isolateTestName}' > ${singleTestFile}"
        sh "cat ${singleTestFile}"

        def isolateTestCmdLine = testCmdLine.findAll { cmd ->
            !cmd.contains("--test-list=") &&
            !cmd.contains("--test-prefix=") &&
            !cmd.contains("--csv=") &&
            !cmd.contains("--junit-xml")
        }
        isolateTestCmdLine += ["--test-list=${singleTestFile}"]
        isolateTestCmdLine += ["--test-prefix=${stageName}"]
        isolateTestCmdLine += ["--csv=${WORKSPACE}/${stageName}/report_isolated_${i}.csv"]
        isolateTestCmdLine += ["--junit-xml ${WORKSPACE}/${stageName}/results_isolated_${i}.xml"]
        isolateTestCmdLine += ["--cov-append"] // Append coverage data to avoid overwriting previous data

        try {
            sh """
                cd ${llmSrc}/tests/integration/defs && \
                ${isolateTestCmdLine.join(" ")}
            """
        } catch (InterruptedException e) {
            throw e
        } catch (Exception e) {
            def isRerunFailed = rerunFailedTests(stageName, llmSrc, isolateTestCmdLine, "results_isolated_${i}.xml", "isolated_${i}")
            if (isRerunFailed) {
                // Mark that at least one isolated test failed, but continue processing other tests
                rerunFailed = true
                echo "Isolated test ${i} (${isolateTestName}) failed after rerun attempt, continuing with remaining tests"
            }
        } finally {
            // Clean up the temporary test file
            sh "rm -f ${singleTestFile}"
        }
    }

    // After processing all isolated tests, set stage failure if any test failed
    if (rerunFailed) {
        catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') {
            error "One or more isolated tests failed after rerun attempts"
        }
    }

    return rerunFailed // Return the updated value
}

def processShardTestList(llmSrc, testDBList, splitId, splits, perfMode=false) {
    // Preprocess testDBList to extract ISOLATION markers
    echo "Preprocessing testDBList to extract ISOLATION markers..."

    def originalTestLines = readFile(file: testDBList).readLines()
    def cleanedTestLines = []
    def isolationTestLines = []

    originalTestLines.each { originalLine ->
        def trimmedLine = originalLine.trim()
        if (trimmedLine && trimmedLine.contains('ISOLATION')) {
            // Remove ISOLATION marker and nearby comma from the line
            def cleanedLine = trimmedLine

            // Handle different comma patterns around ISOLATION
            if (trimmedLine.contains('ISOLATION,')) {
                // Case: "ISOLATION,OTHER_MARKER" -> remove "ISOLATION,"
                cleanedLine = cleanedLine.replace('ISOLATION,', '').trim()
            } else if (trimmedLine.contains(',ISOLATION')) {
                // Case: "OTHER_MARKER,ISOLATION" -> remove ",ISOLATION"
                cleanedLine = cleanedLine.replace(',ISOLATION', '').trim()
            } else {
                // Case: standalone "ISOLATION" -> remove " ISOLATION"
                cleanedLine = cleanedLine.replace(' ISOLATION', '').trim()
            }

            // Add the cleaned line to isolationTestLines if original line had ISOLATION
            isolationTestLines.add(cleanedLine)
            cleanedTestLines.add(cleanedLine)

        } else if (trimmedLine) {
            // Line doesn't contain ISOLATION, add as-is
            cleanedTestLines.add(originalLine.trim())
        }
    }

    // Create cleaned testDBList file (without ISOLATION markers)
    def cleanedTestDBList = testDBList.replaceAll('\\.txt$', '_cleaned.txt')
    if (cleanedTestLines.size() > 0) {
        def cleanedContent = cleanedTestLines.join('\n')
        sh "echo '${cleanedContent.replace("'", "'\\''")}' > ${cleanedTestDBList}"
        echo "Created cleaned testDBList: ${cleanedTestDBList} with ${cleanedTestLines.size()} lines (ISOLATION markers removed)"
    } else {
        sh "touch ${cleanedTestDBList}"
        echo "No tests found, created empty cleaned testDBList: ${cleanedTestDBList}"
    }

    sh "cat ${cleanedTestDBList}"
    echo "Original testDBList contains ${isolationTestLines.size()} tests that had ISOLATION markers"

    def shardTestList = []

    if (perfMode) {
        // In perfMode, skip pytest collection as it may cause errors with automatically generated testcases
        // Instead, use all tests from the original testDBList
        echo "Performance mode enabled - skipping pytest collection, using all tests from testDBList"
    } else {
        def testListCmd = [
            "LLM_ROOT=${llmSrc}",
            "LLM_BACKEND_ROOT=${llmSrc}/triton_backend",
            "pytest",
            "--collect-only",
            "--splitting-algorithm least_duration",
            "--test-list=${cleanedTestDBList}",
            "--quiet",
            "--splits ${splits}",
            "--group ${splitId}"
        ]

        try {
            // First execute the pytest command and check if it succeeds
            def pytestOutput = sh(
                script: "cd ${llmSrc}/tests/integration/defs && ${testListCmd.join(' ')}",
                returnStdout: true
            ).trim()

            // Debug: Show the raw pytest output
            echo "<<<START_PYTEST_OUTPUT>>>"
            echo "${pytestOutput}"
            echo "<<<END_PYTEST_OUTPUT>>>"

            // Filter the output to get only test lines with '::' that occur after "Running X items in this shard"
            def lines = pytestOutput.split('\n')
            def foundRunningLine = false
            def lineIndex = 0
            shardTestList = lines.findAll { line ->
                lineIndex++

                if (line.matches(/.*Running \d+ items in this shard.*/) || line.matches(/.*\[pytest-split\] Running group.*/)) {
                    foundRunningLine = true
                    return false // Don't include the "Running" line itself
                }

                def hasDoubleColon = line.contains('::')
                def shouldInclude = foundRunningLine && hasDoubleColon
                return shouldInclude
            }
            echo "Filtering complete. shardTestList size: ${shardTestList.size()}"
        } catch (Exception e) {
            echo "Error: Failed to execute pytest command for test collection: ${e.getMessage()}"
            error "Test collection failed for shard ${splitId}/${splits}. Cannot proceed without valid test list."
        }
    }

    if (shardTestList || perfMode) {
        // Split the shard test list into regular and isolate tests
        def shardRegularTests = []
        def shardIsolateTests = []

        if (perfMode) {
            // In perfMode, put all tests in regular and skip isolation
            echo "Performance mode enabled - all tests will run as regular tests (no isolation)"
            shardRegularTests = cleanedTestLines.findAll { it.trim() }
        } else {
            // Process each test from shardTestList
            shardTestList.each { test ->
                def trimmedTest = test.trim()
                if (trimmedTest) {
                    // Process test_unittests.py::test_unittests_v2[xxxx] pattern
                    if (trimmedTest.startsWith('test_unittests.py::test_unittests_v2[') && trimmedTest.endsWith(']')) {
                        // Extract content between [ and ]
                        def startIndex = trimmedTest.indexOf('[') + 1
                        def endIndex = trimmedTest.lastIndexOf(']')
                        trimmedTest = trimmedTest.substring(startIndex, endIndex)
                    }

                    // Check if this test is in the isolation list
                    def isolationTestLine = isolationTestLines.find { it.contains(trimmedTest) }
                    if (isolationTestLine) {
                        // This test needs isolation
                        shardIsolateTests.add(isolationTestLine)
                    } else {
                        // This test is a regular test - find the actual line from cleanedTestLines
                        def cleanedTestLine = cleanedTestLines.find { it.contains(trimmedTest) }
                        shardRegularTests.add(cleanedTestLine)
                    }
                }
            }
        }

        // Define file paths for regular and isolate tests
        def regularTestList = testDBList.replaceAll('\\.txt$', '_regular.txt')
        def isolateTestList = testDBList.replaceAll('\\.txt$', '_isolate.txt')

        // Create shard-specific test files
        if (shardRegularTests.size() > 0) {
            def shardRegularContent = shardRegularTests.join('\n')
            sh "echo '${shardRegularContent.replace("'", "'\\''")}' > ${regularTestList}"
            echo "Created ${regularTestList} with ${shardRegularTests.size()} regular tests for this shard"
        } else {
            sh "touch ${regularTestList}"
            echo "No regular tests in this shard, created empty file: ${regularTestList}"
        }
        sh "cat ${regularTestList}"

        if (shardIsolateTests.size() > 0) {
            def shardIsolateContent = shardIsolateTests.join('\n')
            sh "echo '${shardIsolateContent.replace("'", "'\\''")}' > ${isolateTestList}"
            echo "Created ${isolateTestList} with ${shardIsolateTests.size()} isolate tests for this shard"
        } else {
            sh "touch ${isolateTestList}"
            echo "No isolate tests in this shard, created empty file: ${isolateTestList}"
        }
        sh "cat ${isolateTestList}"

        // Return preprocessed lists object for compatibility
        return [
            regular: regularTestList,
            isolate: isolateTestList,
            regularCount: shardRegularTests.size(),
            isolateCount: shardIsolateTests.size()
        ]
    } else {
        echo "No tests found in current shard or failed to list tests"
        // Create empty files and preprocessed lists object
        def regularTestList = testDBList.replaceAll('\\.txt$', '_regular.txt')
        def isolateTestList = testDBList.replaceAll('\\.txt$', '_isolate.txt')
        sh "touch ${regularTestList}"
        sh "touch ${isolateTestList}"

        return [
            regular: regularTestList,
            isolate: isolateTestList,
            regularCount: 0,
            isolateCount: 0
        ]
    }
}

def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312", runner)
{
    runner {
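The shard preprocessing above can be hard to follow inside the pipeline DSL. As a rough illustration only (a minimal standalone Python sketch, not the Groovy that runs in CI; the sample entries are hypothetical), this is the effect `processShardTestList` has on a test list: strip the `ISOLATION` marker using the same comma patterns, remember which entries carried it, and split the entries into a regular list and an isolate list.

```python
# Minimal sketch of the ISOLATION preprocessing, mirroring the comma patterns
# handled by processShardTestList above (illustration only, not pipeline code).
def strip_isolation(line):
    """Return (cleaned_line, had_isolation)."""
    stripped = line.strip()
    if 'ISOLATION' not in stripped:
        return stripped, False
    if 'ISOLATION,' in stripped:        # "ISOLATION,OTHER_MARKER"
        cleaned = stripped.replace('ISOLATION,', '')
    elif ',ISOLATION' in stripped:      # "OTHER_MARKER,ISOLATION"
        cleaned = stripped.replace(',ISOLATION', '')
    else:                               # standalone " ISOLATION"
        cleaned = stripped.replace(' ISOLATION', '')
    return cleaned.strip(), True


def split_test_list(lines):
    """Split raw test-list lines into (regular, isolate), ISOLATION markers removed."""
    regular, isolate = [], []
    for line in lines:
        cleaned, isolated = strip_isolation(line)
        if cleaned:
            (isolate if isolated else regular).append(cleaned)
    return regular, isolate


if __name__ == "__main__":
    sample = [
        "unittest/llmapi/test_mpi_session.py ISOLATION",
        "accuracy/test_llm_api.py::test_long_running[model] TIMEOUT (60)",
        "accuracy/test_llm_api.py::test_complex_workflow[model] ISOLATION, TIMEOUT (120)",
    ]
    regular, isolate = split_test_list(sample)
    print(regular)  # other markers such as TIMEOUT stay on the line
    print(isolate)  # entries that must run in their own pytest process
```

In the actual pipeline the two buckets are then written to the per-shard `*_regular.txt` and `*_isolate.txt` files shown above.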
@@ -1384,24 +1628,28 @@ def getSSHConnectionPorts(portConfigFile, stageName)
    return [userPort, monitorPort]
}

def rerunFailedTests(stageName, llmSrc, testCmdLine) {
    if (!fileExists("${WORKSPACE}/${stageName}/results.xml")) {
        error "There is not results.xml file, skip the rerun step"
def rerunFailedTests(stageName, llmSrc, testCmdLine, resultFileName="results.xml", testType="regular") {
    if (!fileExists("${WORKSPACE}/${stageName}/${resultFileName}")) {
        error "There is not ${resultFileName} file, skip the rerun step"
    }

    // Create rerun directory structure to avoid conflicts
    def rerunDir = "${WORKSPACE}/${stageName}/rerun/${testType}"
    sh "mkdir -p ${rerunDir}"

    // Generate rerun test lists
    def failSignaturesList = trtllm_utils.getFailSignaturesList().join(",")
    sh """
        python3 ${llmSrc}/jenkins/scripts/test_rerun.py \
            generate_rerun_tests_list \
            --output-dir=${WORKSPACE}/${stageName}/ \
            --input-file=${WORKSPACE}/${stageName}/results.xml \
            --output-dir=${rerunDir}/ \
            --input-file=${WORKSPACE}/${stageName}/${resultFileName} \
            --fail-signatures='${failSignaturesList}'
    """

    // If there are some failed tests that cannot be rerun (e.g. test duration > 10 min and no known failure signatures),
    // fail the stage immediately without attempting any reruns
    def rerunTestList = "${WORKSPACE}/${stageName}/rerun_0.txt"
    def rerunTestList = "${rerunDir}/rerun_0.txt"
    if (fileExists(rerunTestList)) {
        sh "cat ${rerunTestList}"
        error "There are some failed tests that cannot be rerun, skip the rerun step."

@@ -1410,32 +1658,32 @@ def rerunFailedTests(stageName, llmSrc, testCmdLine) {
    // If the stage has more than 5 failed tests, skip the rerun step
    def validLineCount = 0
    for (times in [1, 2]) {
        def currentRerunTestList = "${WORKSPACE}/${stageName}/rerun_${times}.txt"
        def currentRerunTestList = "${rerunDir}/rerun_${times}.txt"
        if (fileExists(currentRerunTestList)) {
            count = sh(
                script: "grep -v '^[[:space:]]*\$' ${currentRerunTestList} | wc -l",
                returnStdout: true
            ).trim().toInteger()
            echo "Found ${count} tests to rerun ${times} time(s)"
            echo "Found ${count} ${testType} tests to rerun ${times} time(s)"
            validLineCount += count
        }
    }
    if (validLineCount > 5) {
        error "There are more than 5 failed tests, skip the rerun step."
        error "There are more than 5 failed ${testType} tests, skip the rerun step."
    } else if (validLineCount == 0) {
        error "No failed tests need to be rerun, skip the rerun step."
        error "No failed ${testType} tests need to be rerun, skip the rerun step."
    }

    // Rerun tests
    def isRerunFailed = false
    for (times in [1, 2]) {
        def currentRerunTestList = "${WORKSPACE}/${stageName}/rerun_${times}.txt"
        def currentRerunTestList = "${rerunDir}/rerun_${times}.txt"
        if (!fileExists(currentRerunTestList)) {
            echo "No failed tests need to be rerun ${times} time(s)"
            echo "No failed ${testType} tests need to be rerun ${times} time(s)"
            continue
        }
        sh "cat ${currentRerunTestList}"
        def xmlFile = "${WORKSPACE}/${stageName}/rerun_results_${times}.xml"
        def xmlFile = "${rerunDir}/rerun_results_${times}.xml"
        // change the testCmdLine for rerun
        def noNeedLine = ["--splitting-algorithm", "--splits", "--group", "--waives-file", "--cov"]
        def needToChangeLine = ["--test-list", "--csv", "--junit-xml"]

@@ -1444,7 +1692,7 @@ def rerunFailedTests(stageName, llmSrc, testCmdLine) {
        }
        newTestCmdLine += [
            "--test-list=${currentRerunTestList}",
            "--csv=${WORKSPACE}/${stageName}/rerun_report_${times}.csv",
            "--csv=${rerunDir}/rerun_report_${times}.csv",
            "--junit-xml ${xmlFile}",
            "--reruns ${times - 1}"
        ]

@@ -1457,45 +1705,124 @@ def rerunFailedTests(stageName, llmSrc, testCmdLine) {
            throw e
        } catch (Exception e) {
            if (!fileExists(xmlFile)) {
                echo "The tests crashed when rerun attempt."
                echo "The ${testType} tests crashed when rerun attempt."
                throw e
            }
            echo "The tests still failed after rerun attempt."
            echo "The ${testType} tests still failed after rerun attempt."
            isRerunFailed = true
        }
    }

    // Specify the stage name correctly
    sh "cd ${WORKSPACE}/${stageName} && sed -i 's/testsuite name=\"pytest\"/testsuite name=\"${stageName}\"/g' *.xml || true"
    echo "isRerunFailed for ${testType}: ${isRerunFailed}"
    return isRerunFailed
}

    // Generate rerun report
    def inputFiles = ["${WORKSPACE}/${stageName}/results.xml",
                      "${WORKSPACE}/${stageName}/rerun_results_1.xml",
                      "${WORKSPACE}/${stageName}/rerun_results_2.xml"]
def generateRerunReport(stageName, llmSrc) {
    echo "Generating comprehensive rerun report for stage: ${stageName}"

    def rerunBaseDir = "${WORKSPACE}/${stageName}/rerun"
    def regularRerunDir = "${rerunBaseDir}/regular"

    // Check if regular rerun directory exists
    def hasRegularReruns = sh(script: "[ -d '${regularRerunDir}' ] && echo 'true' || echo 'false'", returnStdout: true).trim() == 'true'

    // Find all isolated rerun directories (isolated_0, isolated_1, etc.)
    def isolatedRerunDirs = []
    def isolatedDirsOutput = sh(script: "find ${rerunBaseDir} -type d -name 'isolated_*' 2>/dev/null || true", returnStdout: true).trim()
    if (isolatedDirsOutput) {
        isolatedRerunDirs = isolatedDirsOutput.split('\n').findAll { it.trim() }
    }
    def hasIsolatedReruns = isolatedRerunDirs.size() > 0

    echo "Found regular reruns: ${hasRegularReruns}"
    echo "Found isolated rerun directories: ${isolatedRerunDirs}"

    if (!hasRegularReruns && !hasIsolatedReruns) {
        echo "No rerun results found, skipping rerun report generation"
        return
    }

    // Specify the stage name correctly for all result xml files.
    sh "cd ${WORKSPACE}/${stageName} && find . -name '*.xml' -exec sed -i 's/testsuite name=\"pytest\"/testsuite name=\"${stageName}\"/g' {} + || true"

    // Collect all original and rerun result files
    def allInputFiles = []

    // Add original results
    if (fileExists("${WORKSPACE}/${stageName}/results.xml")) {
        allInputFiles.add("${WORKSPACE}/${stageName}/results.xml")
    }

    // Add isolated test results
    def isolatedResults = sh(script: "find ${WORKSPACE}/${stageName} -name 'results_isolated_*.xml' 2>/dev/null || true", returnStdout: true).trim()
    if (isolatedResults) {
        isolatedResults.split('\n').each { file ->
            if (file.trim()) {
                allInputFiles.add(file.trim())
            }
        }
    }

    // Add regular rerun results
    if (hasRegularReruns) {
        for (times in [1, 2]) {
            def rerunFile = "${regularRerunDir}/rerun_results_${times}.xml"
            if (fileExists(rerunFile)) {
                allInputFiles.add(rerunFile)
            }
        }
    }

    // Add isolated rerun results from all isolated directories
    if (hasIsolatedReruns) {
        isolatedRerunDirs.each { isolatedDir ->
            for (times in [1, 2]) {
                def rerunFile = "${isolatedDir}/rerun_results_${times}.xml"
                if (fileExists(rerunFile)) {
                    allInputFiles.add(rerunFile)
                    echo "Added isolated rerun result: ${rerunFile}"
                }
            }
        }
    }

    if (allInputFiles.isEmpty()) {
        echo "No valid input files found for rerun report generation"
        return
    }

    echo "Generating rerun report with input files: ${allInputFiles.join(',')}"

    // Generate comprehensive rerun report
    sh """
        python3 ${llmSrc}/jenkins/scripts/test_rerun.py \
            generate_rerun_report \
            --output-file=${WORKSPACE}/${stageName}/rerun_results.xml \
            --input-files=${inputFiles.join(",")}
            --input-files=${allInputFiles.join(",")}
    """

    // Update original results xml file with rerun results xml files for junit
    // Update original results xml file with all rerun results for junit
    sh """
        python3 ${llmSrc}/jenkins/scripts/test_rerun.py \
            merge_junit_xmls \
            --output-file=${WORKSPACE}/${stageName}/results.xml \
            --input-files=${inputFiles.join(",")} \
            --input-files=${allInputFiles.join(",")} \
            --deduplicate
    """

    trtllm_utils.uploadArtifacts(
        "${WORKSPACE}/${stageName}/rerun_results.html",
        "${UPLOAD_PATH}/rerun_reports/${stageName}_rerun_results.html"
    )
    // Upload rerun report
    if (fileExists("${WORKSPACE}/${stageName}/rerun_results.html")) {
        trtllm_utils.uploadArtifacts(
            "${WORKSPACE}/${stageName}/rerun_results.html",
            "${UPLOAD_PATH}/rerun_reports/${stageName}_rerun_results.html"
        )
        echo "Test rerun report: https://urm.nvidia.com/artifactory/${UPLOAD_PATH}/rerun_reports/${stageName}_rerun_results.html"
    }

    echo "Test rerun report: https://urm.nvidia.com/artifactory/${UPLOAD_PATH}/rerun_reports/${stageName}_rerun_results.html"
    echo "isRerunFailed: ${isRerunFailed}"
    return isRerunFailed
    // Remove isolation results since they are merged into results.xml
    sh "rm -rf ${WORKSPACE}/${stageName}/results_isolated_*.xml || true"

    echo "Rerun report generation completed for stage: ${stageName}"
}

def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312")
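To make the new artifact layout concrete, here is a rough sketch (Python, illustration only; the stage directory name in the example is made up, in CI it is `${WORKSPACE}/${stageName}`) of the files `generateRerunReport` feeds into `test_rerun.py`: the original `results.xml`, any `results_isolated_<i>.xml` from isolated runs, and the per-type rerun results under `rerun/regular/` and `rerun/isolated_<i>/`.

```python
# Minimal sketch (illustration only): gather the JUnit XML files that
# generateRerunReport above merges, given the new rerun directory layout.
from pathlib import Path


def collect_rerun_inputs(stage_dir):
    stage = Path(stage_dir)
    inputs = []

    # Original results from the regular pytest run.
    if (stage / "results.xml").is_file():
        inputs.append(stage / "results.xml")

    # Per-test results from the isolated runs (results_isolated_<i>.xml).
    inputs.extend(sorted(stage.glob("results_isolated_*.xml")))

    # Rerun results: rerun/regular/ plus one rerun/isolated_<i>/ per isolated test.
    rerun_base = stage / "rerun"
    if rerun_base.is_dir():
        rerun_dirs = [rerun_base / "regular", *sorted(rerun_base.glob("isolated_*"))]
        for rerun_dir in rerun_dirs:
            for attempt in (1, 2):
                candidate = rerun_dir / f"rerun_results_{attempt}.xml"
                if candidate.is_file():
                    inputs.append(candidate)

    return [str(path) for path in inputs]


if __name__ == "__main__":
    # "A100-Stage-1" is a made-up stage directory used only for this example.
    print(",".join(collect_rerun_inputs("A100-Stage-1")))
```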
@@ -1668,6 +1995,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO

    def extraInternalEnv = ""
    def pytestTestTimeout = "3600"
    def noRegularTests = false
    def noIsolateTests = false
    def rerunFailed = false

    // TRT uses half of the host logic cores for engine building which is bad for multi-GPU machines.
    extraInternalEnv = "__LUNOWUD=\"-thread_pool_size=${TESTER_CORES}\""

@@ -1675,7 +2005,10 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
    extraInternalEnv += " CPP_TEST_TIMEOUT_OVERRIDDEN=${pytestTestTimeout}"

    def testDBList = renderTestDB(testList, llmSrc, stageName)
    testList = "${testList}_${splitId}"

    // Process shard test list and create separate files for regular and isolate tests
    def preprocessedLists = processShardTestList(llmSrc, testDBList, splitId, splits, perfMode)

    def testCmdLine = [
        "LLM_ROOT=${llmSrc}",
        "LLM_BACKEND_ROOT=${llmSrc}/triton_backend",

@@ -1687,19 +2020,22 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
        testFilter[(DETAILED_LOG)] ? "-s" : "",
        "--timeout-method=thread",
        "--apply-test-list-correction",
        "--splitting-algorithm least_duration",
        "--timeout=${pytestTestTimeout}",
        "--rootdir ${llmSrc}/tests/integration/defs",
        "--test-prefix=${stageName}",
        "--splits ${splits}",
        "--group ${splitId}",
        "--waives-file=${llmSrc}/tests/integration/test_lists/waives.txt",
        "--test-list=${testDBList}",
        "--output-dir=${WORKSPACE}/${stageName}/",
        "--csv=${WORKSPACE}/${stageName}/report.csv",
        "--junit-xml ${WORKSPACE}/${stageName}/results.xml",
        "-o junit_logging=out-err"
    ]

    // Only add --test-list if there are regular tests to run
    if (preprocessedLists.regularCount > 0) {
        // Remove any existing --test-list options and add the new one
        testCmdLine = testCmdLine.findAll { cmd -> !cmd.contains("--test-list=") }
        testCmdLine += ["--test-list=${preprocessedLists.regular}"]
    }
    if (perfMode) {
        testCmdLine += [
            "--perf",

@@ -1751,22 +2087,63 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
    ]) {
        sh "env | sort"
        try {
            sh """
                rm -rf ${stageName}/ && \
                cd ${llmSrc}/tests/integration/defs && \
                ${testCmdLine.join(" ")}
            """
            if (preprocessedLists.regularCount > 0) {
                sh """
                    rm -rf ${stageName}/ && \
                    cd ${llmSrc}/tests/integration/defs && \
                    ${testCmdLine.join(" ")}
                """
            } else {
                echo "No regular tests to run for stage ${stageName}"
                noRegularTests = true
                sh "mkdir -p ${stageName}"
                // Create an empty results.xml file for consistency
                sh """
                    echo '<?xml version="1.0" encoding="UTF-8"?>' > ${stageName}/results.xml
                    echo '<testsuites>' >> ${stageName}/results.xml
                    echo '<testsuite name="${stageName}" errors="0" failures="0" skipped="0" tests="0" time="0.0">' >> ${stageName}/results.xml
                    echo '</testsuite>' >> ${stageName}/results.xml
                    echo '</testsuites>' >> ${stageName}/results.xml
                """
            }
        } catch (InterruptedException e) {
            throw e
        } catch (Exception e) {
            def isRerunFailed = rerunFailedTests(stageName, llmSrc, testCmdLine)
            def isRerunFailed = rerunFailedTests(stageName, llmSrc, testCmdLine, "results.xml", "regular")
            if (isRerunFailed) {
                error "The tests still failed after rerun attempt."
                catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') {
                    error "Regular tests failed after rerun attempt"
                }
                rerunFailed = true
            }
        }
    }
}

    // Run the isolated tests if exists
    if (preprocessedLists.isolateCount > 0) {
        stage ("[${stageName}] Run Pytest (Isolated)") {
            echo "There are ${preprocessedLists.isolateCount} isolated tests to run"
            rerunFailed = runIsolatedTests(preprocessedLists, testCmdLine, llmSrc, stageName) || rerunFailed
        }
    } else {
        echo "No isolated tests to run for stage ${stageName}"
        noIsolateTests = true
    }

    if (noRegularTests && noIsolateTests) {
        error "No tests were executed for stage ${stageName}, please check the test list and test-db rendering result."
    }

    // Generate comprehensive rerun report if any reruns occurred
    stage ("[${stageName}] Generate Report") {
        generateRerunReport(stageName, llmSrc)
    }

    if (rerunFailed) {
        error "Some tests still failed after rerun attempts, please check the test report."
    }

    if (perfMode) {
        basePerfFilename = stageName.contains("PyTorch") ? "base_perf_pytorch.csv" : "base_perf.csv"
        basePerfPath = "${llmSrc}/tests/integration/defs/perf/${basePerfFilename}"
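The per-test command rebuilding that `runIsolatedTests` applies to `testCmdLine` can be summarized as follows (a minimal Python sketch for illustration, not the Groovy that runs in CI; the base command and paths in the example are made up): drop the shared `--test-list`/`--test-prefix`/`--csv`/`--junit-xml` options, then append per-test replacements plus `--cov-append`.

```python
# Minimal sketch (illustration only) of how runIsolatedTests rewrites the
# shared pytest command line for a single isolated test.
def isolated_cmd(test_cmd_line, stage_name, workspace, single_test_file, index):
    shared = ("--test-list=", "--test-prefix=", "--csv=", "--junit-xml")
    cmd = [opt for opt in test_cmd_line if not any(key in opt for key in shared)]
    cmd += [
        f"--test-list={single_test_file}",
        f"--test-prefix={stage_name}",
        f"--csv={workspace}/{stage_name}/report_isolated_{index}.csv",
        f"--junit-xml {workspace}/{stage_name}/results_isolated_{index}.xml",
        "--cov-append",  # keep coverage data from the regular run
    ]
    return cmd


if __name__ == "__main__":
    base = ["pytest", "--timeout=3600", "--test-list=all_regular.txt",
            "--csv=report.csv", "--junit-xml results.xml"]
    print(" ".join(isolated_cmd(base, "StageA", "/workspace", "isolated_0.txt", 0)))
```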
@@ -17,6 +17,9 @@ import argparse
import os
import subprocess

# The markers in our test lists, need to be preprocess before checking
MARKER_LIST_IN_TEST = [" TIMEOUT"]


def install_python_dependencies(llm_src):
    subprocess.run(

@@ -51,9 +54,28 @@ def verify_l0_test_lists(llm_src):
        lines = f.readlines()

    for line in lines:
        # Remove 'TIMEOUT (number)' and strip spaces
        cleaned_line = line.split(" TIMEOUT ", 1)[0].strip()
        cleaned_lines.add(cleaned_line)
        # Remove markers and rest of the line if present
        cleaned_line = line.strip()

        # Handle ISOLATION marker removal (including comma patterns)
        if 'ISOLATION,' in cleaned_line:
            # Case: "ISOLATION,OTHER_MARKER" -> remove "ISOLATION,"
            cleaned_line = cleaned_line.replace('ISOLATION,', '').strip()
        elif ',ISOLATION' in cleaned_line:
            # Case: "OTHER_MARKER,ISOLATION" -> remove ",ISOLATION"
            cleaned_line = cleaned_line.replace(',ISOLATION', '').strip()
        elif ' ISOLATION' in cleaned_line:
            # Case: standalone "ISOLATION" -> remove " ISOLATION"
            cleaned_line = cleaned_line.replace(' ISOLATION', '').strip()

        # Handle other markers (like TIMEOUT) - remove marker and everything after it
        for marker in MARKER_LIST_IN_TEST:
            if marker in cleaned_line and marker != " ISOLATION":
                cleaned_line = cleaned_line.split(marker, 1)[0].strip()
                break

        if cleaned_line:
            cleaned_lines.add(cleaned_line)

    with open(test_list, "w") as f:
        f.writelines(f"{line}\n" for line in sorted(cleaned_lines))
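Given the cleaning logic in the diff above, a few worked examples show what `verify_l0_test_lists` now keeps for lines that carry markers. The snippet below is a slightly simplified re-implementation for illustration only (the test names are invented, and the `marker != " ISOLATION"` guard is omitted since the marker list here contains only `" TIMEOUT"`).

```python
# Simplified re-implementation of the cleaning shown above, with worked examples.
MARKER_LIST_IN_TEST = [" TIMEOUT"]


def clean(line):
    cleaned = line.strip()
    if 'ISOLATION,' in cleaned:
        cleaned = cleaned.replace('ISOLATION,', '').strip()
    elif ',ISOLATION' in cleaned:
        cleaned = cleaned.replace(',ISOLATION', '').strip()
    elif ' ISOLATION' in cleaned:
        cleaned = cleaned.replace(' ISOLATION', '').strip()
    for marker in MARKER_LIST_IN_TEST:
        if marker in cleaned:
            cleaned = cleaned.split(marker, 1)[0].strip()
            break
    return cleaned


assert clean("a/test.py::t[x] TIMEOUT (30)") == "a/test.py::t[x]"
assert clean("a/test.py::t[x] ISOLATION") == "a/test.py::t[x]"
assert clean("a/test.py::t[x] ISOLATION, TIMEOUT (90)") == "a/test.py::t[x]"
assert clean("a/test.py::t[x] TIMEOUT (90), ISOLATION") == "a/test.py::t[x]"
```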
@@ -236,6 +236,55 @@ To set a timeout for specific long-running test cases, follow these steps:
disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] TIMEOUT (30)
```

### Notes:
- The `TIMEOUT` setting ensures that the test case will be terminated if it exceeds the specified time limit.
- This setting is useful for preventing long-running or stuck tests from blocking the pipeline or local testing.
## 6. Set isolated execution for cases individually

Some test cases may experience intermittent failures due to resource conflicts, memory leaks, or state pollution when run together with other tests. The `ISOLATION` marker ensures these cases run in a separate pytest process, avoiding such issues.

### When to use the `ISOLATION` marker:
- Tests that modify global state or environment variables
- Tests with memory-intensive operations that may affect subsequent tests
- Tests that experience intermittent failures only when run with other tests
- Tests that require exclusive access to certain resources (GPU memory, files, etc.)

### Usage:
Add `ISOLATION` to the test case line with proper spacing:

**For CI (test-db YAML files):**
```yaml
- disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] ISOLATION
```

**For Local Testing (TXT files):**
```
disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] ISOLATION
```

## 7. Combining test markers

Multiple markers can be combined for the same test case using commas. Both formats are valid:

```yaml
- test_case.py::test_function[param] ISOLATION, TIMEOUT (90)
- test_case.py::test_function[param] TIMEOUT (90), ISOLATION
```

### Example:
```yaml
# Regular test (runs with other tests)
- accuracy/test_llm_api.py::test_basic_functionality[gpt2]

# Test with timeout only
- accuracy/test_llm_api.py::test_long_running[model] TIMEOUT (60)

# Isolated test (runs in separate process)
- accuracy/test_llm_api.py::test_memory_intensive[large_model] ISOLATION

# Isolated test with timeout
- accuracy/test_llm_api.py::test_complex_workflow[model] ISOLATION, TIMEOUT (120)
```

### Important Notes:
- **TIMEOUT**: Ensures the test terminates if it exceeds the specified time limit (in minutes). Useful for preventing stuck tests from blocking the pipeline.
- **ISOLATION**: Runs the test in a separate pytest process to avoid resource conflicts and state pollution. Use sparingly as it increases execution time.
- Ensure there is **at least one space** before and after each marker keyword
- Both markers are case-sensitive and must be written exactly as `TIMEOUT` and `ISOLATION`
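For completeness, a combined entry like the ones documented above can be parsed mechanically into a test name plus its markers. The helper below is hypothetical (it is not part of the repo) and only illustrates the `name + markers` structure the doc describes; the sample entries are the ones from the doc.

```python
# Hypothetical parser (not part of the repo): split a test-list entry into the
# test name and its markers, e.g. "... ISOLATION, TIMEOUT (90)".
import re


def parse_entry(line):
    name = line.strip()
    markers = {}
    timeout = re.search(r'\bTIMEOUT \((\d+)\)', name)
    if timeout:
        markers['TIMEOUT'] = int(timeout.group(1))  # minutes
        name = name[:timeout.start()] + name[timeout.end():]
    if re.search(r'\bISOLATION\b', name):
        markers['ISOLATION'] = True
        name = re.sub(r'\bISOLATION\b', '', name)
    return name.strip(' ,'), markers


assert parse_entry("test_case.py::test_function[param] ISOLATION, TIMEOUT (90)") == (
    "test_case.py::test_function[param]", {"TIMEOUT": 90, "ISOLATION": True})
assert parse_entry("test_case.py::test_function[param] TIMEOUT (90), ISOLATION") == (
    "test_case.py::test_function[param]", {"TIMEOUT": 90, "ISOLATION": True})
```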
@@ -14,7 +14,7 @@ l0_a100:
backend: "pytorch"
tests:
- unittest/llmapi/test_llm_pytorch.py
- unittest/llmapi/test_mpi_session.py # generic tests
- unittest/llmapi/test_mpi_session.py ISOLATION
- unittest/trt/model_api/test_model_quantization.py
# executor
- unittest/executor/test_base_worker.py

@@ -119,7 +119,7 @@ l0_b200:
tests:
- triton_server/test_triton.py::test_llava[llava]
- triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning]
- triton_server/test_triton.py::test_gpt_2b_ib_lora[gpt-2b-ib-lora]
- triton_server/test_triton.py::test_gpt_2b_ib_lora[gpt-2b-ib-lora] ISOLATION
- condition:
ranges:
system_gpu_count:

@@ -38,7 +38,7 @@ l0_dgx_b200:
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=True]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_trtllm-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True] ISOLATION
- accuracy/test_llm_api_pytorch.py::TestQwen3NextThinking::test_auto_dtype[tp4ep4]
- accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True]
- accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp8ep8-cuda_graph=True]