diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy
index 441f12a444..444f649165 100644
--- a/jenkins/L0_MergeRequest.groovy
+++ b/jenkins/L0_MergeRequest.groovy
@@ -717,6 +717,45 @@ def collectTestResults(pipeline, testFilter)
        junit(testResults: '**/results*.xml', allowEmptyResults : true)
    } // Collect test result stage
+    stage("Rerun report") {
+        sh "rm -rf rerun && mkdir -p rerun"
+        sh "find . -type f -wholename '*/rerun_results.xml' -exec sh -c 'mv \"{}\" \"rerun/\$(basename \$(dirname \"{}\"))_rerun_results.xml\"' \\; || true"
+        sh "find rerun -type f"
+        def rerunFileCount = sh(returnStdout: true, script: 'find rerun -type f | wc -l').replaceAll("\\s", "").toInteger()
+        if (rerunFileCount == 0) {
+            echo "Rerun report is skipped because there is no rerun test data file."
+            return
+        }
+        def xmlFiles = findFiles(glob: 'rerun/**/*.xml')
+        def xmlFileList = xmlFiles.collect { it.path }
+        def inputfiles = xmlFileList.join(',')
+        echo "inputfiles: ${inputfiles}"
+        trtllm_utils.llmExecStepWithRetry(pipeline, script: "apk add python3")
+        trtllm_utils.llmExecStepWithRetry(pipeline, script: "apk add py3-pip")
+        trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 config set global.break-system-packages true")
+        sh """
+            python3 llm/tests/integration/defs/test_rerun.py \
+                generate_rerun_report \
+                --output-file=rerun/rerun_report.xml \
+                --input-files=${inputfiles}
+        """
+        trtllm_utils.uploadArtifacts("rerun/rerun_report.html", "${UPLOAD_PATH}/test-results/")
+        echo "Rerun report: https://urm.nvidia.com/artifactory/${UPLOAD_PATH}/test-results/rerun_report.html"
+        def isOfficialPostMergeJob = (env.JOB_NAME ==~ /.*PostMerge.*/)
+        if (env.alternativeTRT || isOfficialPostMergeJob) {
+            catchError(
+                buildResult: 'FAILURE',
+                stageResult: 'FAILURE') {
+                error "Some failed tests were rerun, please check the rerun report."
+            }
+        } else {
+            catchError(
+                buildResult: 'SUCCESS',
+                stageResult: 'UNSTABLE') {
+                error "Some failed tests were rerun, please check the rerun report."
+            }
+        }
+    } // Rerun report stage
    try {
        stage("Test coverage") {
            sh "ls"
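The find/-exec one-liner in the new stage is dense: it flattens every per-stage rerun_results.xml into rerun/, renaming each file after its parent directory so results from different stages cannot collide. A Python paraphrase of the same step, for illustration only (not part of the change):

# Illustrative paraphrase of the shell rename step above, not part of the diff.
import pathlib
import shutil

rerun_dir = pathlib.Path("rerun")
rerun_dir.mkdir(exist_ok=True)
for xml in pathlib.Path(".").rglob("rerun_results.xml"):
    if xml.parent == rerun_dir:
        continue  # already flattened
    # <stage_dir>/rerun_results.xml -> rerun/<stage_dir>_rerun_results.xml
    shutil.move(str(xml), str(rerun_dir / f"{xml.parent.name}_rerun_results.xml"))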
diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 605db83a31..c02e8927d6 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -876,6 +876,114 @@ def getSSHConnectionPorts(portConfigFile, stageName)
    return [userPort, monitorPort]
 }
+
+def rerunFailedTests(stageName, llmSrc, testCmdLine) {
+    if (!fileExists("${WORKSPACE}/${stageName}/results.xml")) {
+        error "There is no results.xml file, skipping the rerun step."
+    }
+
+    // Generate rerun test lists
+    def failSignaturesList = trtllm_utils.getFailSignaturesList().join(",")
+    sh """
+        python3 ${llmSrc}/tests/integration/defs/test_rerun.py \
+            generate_rerun_tests_list \
+            --output-dir=${WORKSPACE}/${stageName}/ \
+            --input-file=${WORKSPACE}/${stageName}/results.xml \
+            --fail-signatures='${failSignaturesList}'
+    """
+
+    // If some failed tests cannot be rerun (e.g. test duration > 10 min and
+    // no known failure signatures), fail the stage immediately without
+    // attempting any reruns
+    rerunTestList = "${WORKSPACE}/${stageName}/rerun_0.txt"
+    if (fileExists(rerunTestList)) {
+        sh "cat ${rerunTestList}"
+        error "There are failed tests that cannot be rerun, skipping the rerun step."
+    }
+
+    // If the stage has more than 5 failed tests, skip the rerun step
+    def validLineCount = 0
+    for (times in [1, 2]) {
+        rerunTestList = "${WORKSPACE}/${stageName}/rerun_${times}.txt"
+        if (fileExists(rerunTestList)) {
+            count = sh(
+                script: "grep -v '^[[:space:]]*\$' ${rerunTestList} | wc -l",
+                returnStdout: true
+            ).trim().toInteger()
+            echo "Found ${count} tests to rerun ${times} time(s)"
+            validLineCount += count
+        }
+    }
+    if (validLineCount > 5) {
+        error "There are more than 5 failed tests, skipping the rerun step."
+    }
+
+    // Rerun tests
+    isRerunFailed = false
+    for (times in [1, 2]) {
+        rerunTestList = "${WORKSPACE}/${stageName}/rerun_${times}.txt"
+        if (!fileExists(rerunTestList)) {
+            echo "No failed tests need to be rerun ${times} time(s)"
+            continue
+        }
+        sh "cat ${rerunTestList}"
+        xmlFile = "${WORKSPACE}/${stageName}/rerun_results_${times}.xml"
+        // Rebuild testCmdLine for this rerun pass: drop split/coverage flags for
+        // good, and re-point the test list, report outputs, and retry count
+        // (including any stale --reruns flag left over from a previous pass)
+        noNeedLine = ["--splitting-algorithm", "--splits", "--group", "--waives-file", "--cov"]
+        needToChangeLine = ["--test-list", "--csv", "--junit-xml", "--reruns"]
+        testCmdLine = testCmdLine.findAll { cmd ->
+            !noNeedLine.any { line -> cmd.contains(line) } && !needToChangeLine.any { line -> cmd.contains(line) }
+        }
+        testCmdLine += [
+            "--test-list=${rerunTestList}",
+            "--csv=${WORKSPACE}/${stageName}/rerun_report_${times}.csv",
+            "--junit-xml ${xmlFile}",
+            "--reruns ${times - 1}"
+        ]
+        try {
+            sh """
+                cd ${llmSrc}/tests/integration/defs && \
+                ${testCmdLine.join(" ")}
+            """
+        } catch (InterruptedException e) {
+            throw e
+        } catch (Exception e) {
+            if (!fileExists(xmlFile)) {
+                echo "The tests crashed during the rerun attempt."
+                throw e
+            }
+            echo "The tests still failed after the rerun attempt."
+            isRerunFailed = true
+        }
+    }
+
+    // Generate rerun report
+    inputFiles = ["${WORKSPACE}/${stageName}/results.xml",
+                  "${WORKSPACE}/${stageName}/rerun_results_1.xml",
+                  "${WORKSPACE}/${stageName}/rerun_results_2.xml"]
+    sh """
+        python3 ${llmSrc}/tests/integration/defs/test_rerun.py \
+            generate_rerun_report \
+            --output-file=${WORKSPACE}/${stageName}/rerun_results.xml \
+            --input-files=${inputFiles.join(",")}
+    """
+
+    // Update the original results xml file with the rerun results xml files for junit
+    sh """
+        python3 ${llmSrc}/tests/integration/defs/test_rerun.py \
+            merge_junit_xmls \
+            --output-file=${WORKSPACE}/${stageName}/results.xml \
+            --input-files=${inputFiles.join(",")} \
+            --deduplicate
+    """
+
+    trtllm_utils.uploadArtifacts(
+        "${WORKSPACE}/${stageName}/rerun_results.html",
+        "${UPLOAD_PATH}/rerun_reports/${stageName}_rerun_results.html"
+    )
+
+    echo "Test rerun report: https://urm.nvidia.com/artifactory/${UPLOAD_PATH}/rerun_reports/${stageName}_rerun_results.html"
+    return isRerunFailed
+}
+
 def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false,
                                  stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312")
 {
    // Step 1: create LLM_ROOT dir
@@ -1101,16 +1209,21 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
            string(credentialsId: 'llm_evaltool_repo_url', variable: 'EVALTOOL_REPO_URL')
        ]) {
            sh "env | sort"
-            trtllm_utils.llmExecStepWithRetry(
-                pipeline,
-                numRetries: 1,
-                script: """
+            try {
+                sh """
                    rm -rf ${stageName}/ && \
                    cd ${llmSrc}/tests/integration/defs && \
                    ${testCmdLine.join(" ")}
-                """,
-                retryLog: "stageName = ${stageName}, HOST_NODE_NAME = ${env.HOST_NODE_NAME}"
-            )
+                """
+            } catch (InterruptedException e) {
+                throw e
+            } catch (Exception e) {
+                isRerunFailed = rerunFailedTests(stageName, llmSrc, testCmdLine)
+                if (isRerunFailed) {
+                    echo "The tests still failed after the rerun attempt."
+                    throw e
+                }
+            }
        }
    }
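Before each rerun pass, rerunFailedTests rebuilds testCmdLine: split/group/waive/coverage flags are dropped outright, while the test list, CSV and junit outputs, and the retry count are re-pointed at that pass's artifacts. Note the retry semantics: the rerun_1 list runs with --reruns 0 (a single execution), the rerun_2 list with --reruns 1, so pytest-rerunfailures retries once more inside that pass. A Python paraphrase of the rewrite, illustrative only (flag names mirror the Groovy lists; none of this is part of the change):

# Illustrative paraphrase of the Groovy testCmdLine rewrite above.
DROP = ("--splitting-algorithm", "--splits", "--group", "--waives-file",
        "--cov",                                            # dropped for good
        "--test-list", "--csv", "--junit-xml", "--reruns")  # re-added per pass

def rebuild_cmdline(test_cmd_line, workspace, stage_name, times):
    kept = [arg for arg in test_cmd_line
            if not any(flag in arg for flag in DROP)]
    return kept + [
        f"--test-list={workspace}/{stage_name}/rerun_{times}.txt",
        f"--csv={workspace}/{stage_name}/rerun_report_{times}.csv",
        f"--junit-xml {workspace}/{stage_name}/rerun_results_{times}.xml",
        f"--reruns {times - 1}",  # times=1 -> no retry; times=2 -> one retry
    ]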
diff --git a/tests/integration/defs/test_rerun.py b/tests/integration/defs/test_rerun.py
new file mode 100644
index 0000000000..e3af8928f1
--- /dev/null
+++ b/tests/integration/defs/test_rerun.py
@@ -0,0 +1,463 @@
+import argparse
+import os
+import sys
+import xml.etree.ElementTree as ET
+
+
+def parse_name(classname, name, filename):
+    if "test_unittests_v2[unittest/" in name and \
+            filename == "test_unittests.py":
+        return name[name.find("test_unittests_v2[unittest/") + 18:-1]
+    elif filename in name:
+        return name[name.find('/') + 1:]
+    elif filename[:-2].replace("/", ".") in classname:
+        return filename + "::" + classname.split(".")[-1] + "::" + name
+    else:
+        return filename + "::" + name
+
+
+def generate_rerun_tests_list(outdir, xml_filename, failSignaturesList):
+    # Generate rerun test lists:
+    # 1. Parse the test results xml file
+    # 2. For failed tests:
+    #    - If test duration <= 5 min: add to rerun_2.txt (will rerun 2 times)
+    #    - If test duration > 5 min and <= 10 min: add to rerun_1.txt (will rerun 1 time)
+    #    - If test duration > 10 min but the output contains a known fail signature: add to rerun_1.txt
+    #    - If test duration > 10 min and no known failure signatures: add to rerun_0.txt (will not rerun)
+    print(failSignaturesList)
+
+    rerun_0_filename = os.path.join(outdir, 'rerun_0.txt')
+    rerun_1_filename = os.path.join(outdir, 'rerun_1.txt')
+    rerun_2_filename = os.path.join(outdir, 'rerun_2.txt')
+
+    tree = ET.parse(xml_filename)
+    root = tree.getroot()
+    suite = root.find('testsuite')
+
+    with open(rerun_0_filename, 'w') as rerun_0_file, \
+            open(rerun_1_filename, 'w') as rerun_1_file, \
+            open(rerun_2_filename, 'w') as rerun_2_file:
+        for case in suite.findall('testcase'):
+            if case.find('failure') is not None or \
+                    case.find('error') is not None:
+                duration = float(case.attrib.get('time', 0))
+                test_name = parse_name(case.attrib.get('classname', ''),
+                                       case.attrib.get('name', ''),
+                                       case.attrib.get('file', ''))
+                if duration <= 5 * 60:
+                    rerun_2_file.write(test_name + '\n')
+                    print(test_name + " will rerun 2 times")
+                elif duration <= 10 * 60:
+                    rerun_1_file.write(test_name + '\n')
+                    print(test_name + " will rerun 1 time")
+                elif any(failSig.lower() in ET.tostring(
+                        case, encoding='unicode').lower()
+                         for failSig in failSignaturesList):
+                    rerun_1_file.write(test_name + '\n')
+                    print(test_name +
+                          " will rerun 1 time, because of fail signature")
+                else:
+                    rerun_0_file.write(test_name + '\n')
+                    print(test_name + " will not rerun")
+
+    # Remove empty files
+    for filename in [rerun_0_filename, rerun_1_filename, rerun_2_filename]:
+        if os.path.getsize(filename) == 0:
+            os.remove(filename)
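parse_name undoes pytest's junit-XML encoding, and the branch order matters: the test_unittests_v2 wrapper is unwrapped first, names that already embed the file are kept, class-based tests are rebuilt as file::Class::test, and everything else falls back to file::test. A few illustrative calls with hypothetical attribute values (none taken from a real results.xml):

# Branch 1: the unittest wrapper keeps only the bracketed inner path.
parse_name(classname="test_unittests",
           name="test_unittests_v2[unittest/_torch/test_attention.py]",
           filename="test_unittests.py")
# -> "unittest/_torch/test_attention.py"

# Branch 3: class-based tests are rebuilt as file::Class::test.
parse_name(classname="accuracy.test_cli_flow.TestLlama",
           name="test_fp8",
           filename="accuracy/test_cli_flow.py")
# -> "accuracy/test_cli_flow.py::TestLlama::test_fp8"

# Fallback: module-level tests become file::test.
parse_name(classname="defs.test_e2e",
           name="test_build",
           filename="test_e2e.py")
# -> "test_e2e.py::test_build"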
+
+
+def merge_junit_xmls(merged_xml_filename, xml_filenames, deduplicate=False):
+    # Merge junit xml files into one.
+    # If deduplicate is true, remove duplicate test cases (the later file wins).
+    merged_root = ET.Element('testsuites')
+    merged_suite_map = {}
+
+    for xml_filename in xml_filenames:
+        if not os.path.exists(xml_filename):
+            continue
+
+        suites = ET.parse(xml_filename).getroot()
+        suite_list = suites.findall('testsuite')
+        for suite in suite_list:
+            suite_name = suite.attrib.get('name', '')
+            if suite_name not in merged_suite_map:
+                merged_suite_map[suite_name] = suite
+            else:
+                original_suite = merged_suite_map[suite_name]
+                case_list = suite.findall('testcase')
+                for case in case_list:
+                    # Find the duplicate case in original_suite
+                    existing_case = original_suite.find(
+                        f"testcase[@name='{case.attrib['name']}'][@classname='{case.attrib['classname']}']"
+                    )
+                    if existing_case is not None:
+                        if deduplicate:
+                            # Remove the duplicate case in original_suite
+                            original_suite.remove(existing_case)
+                        else:
+                            # Add rerun flag to the new case for the rerun report
+                            case.set('isrerun', 'true')
+                original_suite.extend(case_list)
+
+    # Update suite attributes
+    for suite in merged_suite_map.values():
+        attribs = {'errors': 0, 'failures': 0, 'skipped': 0, 'tests': 0}
+        for case in suite.findall('testcase'):
+            attribs['tests'] += 1
+            if case.find('failure') is not None:
+                attribs['failures'] += 1
+            elif case.find('error') is not None:
+                attribs['errors'] += 1
+            elif case.find('skipped') is not None:
+                attribs['skipped'] += 1
+        for key, value in attribs.items():
+            suite.set(key, str(value))
+
+        # Add suite to merged_root
+        merged_root.append(suite)
+
+    if os.path.exists(merged_xml_filename):
+        os.remove(merged_xml_filename)
+
+    # Write to new file
+    tree = ET.ElementTree(merged_root)
+    tree.write(merged_xml_filename, encoding='utf-8', xml_declaration=True)
+
+
+def escape_html(text):
+    return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
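A subtlety worth noting in merge_junit_xmls: under deduplication the case already present is removed and the incoming one is kept, so the last input file's outcome wins (a test that failed in results.xml but passed in rerun_results_2.xml is counted as passed); without it, both copies survive and the rerun copies carry isrerun="true" for the rerun report. A hypothetical call mirroring how the Jenkins stage drives it:

# Hypothetical paths, for illustration only; mirrors the merge_junit_xmls
# invocation in rerunFailedTests above. Later files win under deduplication.
merge_junit_xmls(
    merged_xml_filename="results.xml",
    xml_filenames=["results.xml", "rerun_results_1.xml", "rerun_results_2.xml"],
    deduplicate=True,
)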
+
+
+def xml_to_html(xml_filename, html_filename, sort_by_name=False):
+    # [HTML template and the remainder of this 463-line file (the body of
+    #  xml_to_html, generate_rerun_report, and the CLI entry point) could not
+    #  be recovered. The surviving template fragments show a summary bar,
+    #  "Tests: {tests_count} | Failed: {failed_tests_count} |
+    #  Skipped: {skipped_tests_count} | Passed: {passed_tests_count}",
+    #  followed by per-test sections that render the failure/error message
+    #  and traceback, the skipped reason, and captured system-out /
+    #  system-err, with every line passed through escape_html().]
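The Jenkins stages drive this file as a command-line tool with three subcommands, but the entry point itself falls inside the unrecovered region above. A minimal sketch of the dispatch those invocations imply; the flag names come from the Groovy calls, while the wiring itself is an assumption, not the shipped code:

# Sketch only: CLI surface implied by the Jenkins invocations above.
# generate_rerun_report is not visible in this excerpt; per the stages it
# writes the given XML plus a sibling .html report.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Rerun-report helper")
    parser.add_argument("command", choices=["generate_rerun_tests_list",
                                            "generate_rerun_report",
                                            "merge_junit_xmls"])
    parser.add_argument("--output-dir")
    parser.add_argument("--input-file")
    parser.add_argument("--fail-signatures")  # comma-separated signatures
    parser.add_argument("--output-file")
    parser.add_argument("--input-files")      # comma-separated XML paths
    parser.add_argument("--deduplicate", action="store_true")
    args = parser.parse_args()

    if args.command == "generate_rerun_tests_list":
        generate_rerun_tests_list(args.output_dir, args.input_file,
                                  args.fail_signatures.split(","))
    elif args.command == "merge_junit_xmls":
        merge_junit_xmls(args.output_file, args.input_files.split(","),
                         deduplicate=args.deduplicate)
    # generate_rerun_report would dispatch similarly to its (unrecovered) function.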