mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-02-21 18:25:20 +08:00
Signed-off-by: Yanchao Lu <yanchaol@nvidia.com> Co-authored-by: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com>
This commit is contained in:
parent
425dad01fd
commit
d1d17dbeba
@ -99,6 +99,8 @@ MODEL_CACHE_DIR="/scratch.trt_llm_data/llm-models"
|
||||
ENABLE_NGC_DEVEL_IMAGE_TEST = params.enableNgcDevelImageTest ?: false
|
||||
ENABLE_NGC_RELEASE_IMAGE_TEST = params.enableNgcReleaseImageTest ?: false
|
||||
|
||||
COMMON_SSH_OPTIONS = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null"
|
||||
|
||||
def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String stageName){
|
||||
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
|
||||
def remote = [
|
||||
@ -113,7 +115,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
|
||||
pipeline.stage('Submit Test Results') {
|
||||
sh "mkdir -p ${stageName}"
|
||||
def resultsFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/results/results.xml"
|
||||
def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/"
|
||||
def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/"
|
||||
def downloadSucceed = sh(script: downloadResultCmd, returnStatus: true) == 0
|
||||
if (downloadSucceed) {
|
||||
sh "ls ${stageName}"
|
||||
@ -239,7 +241,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
|
||||
|
||||
Utils.exec(pipeline, script: "chmod +x ${jenkinsSetupPath}", returnStdout: true)
|
||||
|
||||
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh",)
|
||||
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh",)
|
||||
|
||||
Utils.exec(
|
||||
pipeline,
|
||||
@ -327,7 +329,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
|
||||
|
||||
stage('Prepare Testing') {
|
||||
// Create Job Workspace folder in Frontend Node
|
||||
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' ssh -oStrictHostKeyChecking=no ${remote.user}@${remote.host} 'mkdir ${jobWorkspace}'",)
|
||||
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' ssh ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host} 'mkdir -p ${jobWorkspace}'",)
|
||||
|
||||
// Download and Unzip Tar File
|
||||
trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}")
|
||||
@ -336,11 +338,11 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
|
||||
// Upload slurm_run.sh to Frontend node
|
||||
def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
|
||||
Utils.exec(pipeline, script: "chmod +x ${scriptRunLocalPath}", returnStdout: true)
|
||||
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${scriptRunLocalPath} ${remote.user}@${remote.host}:${scriptRunNode}",)
|
||||
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptRunLocalPath} ${remote.user}@${remote.host}:${scriptRunNode}",)
|
||||
|
||||
// Upload waives.txt to Frontend node
|
||||
def waivesListLocalPath = "${llmSrcLocal}/tests/integration/test_lists/waives.txt"
|
||||
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${waivesListLocalPath} ${remote.user}@${remote.host}:${waivesListPathNode}",)
|
||||
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${waivesListLocalPath} ${remote.user}@${remote.host}:${waivesListPathNode}",)
|
||||
|
||||
// Generate Test List and Upload to Frontend Node
|
||||
def makoArgs = getMakoArgsFromStageName(stageName, true)
|
||||
@ -349,7 +351,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
|
||||
// if the line cannot be split by "=", just ignore that line.
|
||||
def makoOptsJson = transformMakoArgsToJson(["Mako options:"] + makoArgs)
|
||||
def testListPath = renderTestDB(testList, llmSrcLocal, stageName, makoOptsJson)
|
||||
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${testListPath} ${remote.user}@${remote.host}:${testListPathNode}",)
|
||||
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${testListPath} ${remote.user}@${remote.host}:${testListPathNode}",)
|
||||
|
||||
// Generate Multi Node Job Launch Script
|
||||
def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#")
|
||||
@ -393,7 +395,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
|
||||
""".stripIndent()
|
||||
pipeline.writeFile(file: scriptLaunchDestPath, text: scriptContent)
|
||||
Utils.exec(pipeline, script: "chmod +x ${scriptLaunchDestPath}", returnStdout: true)
|
||||
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}",)
|
||||
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}",)
|
||||
}
|
||||
stage('Run Test') {
|
||||
def scriptLaunch = "${jobWorkspace}/slurm_launch.sh"
|
||||
@ -1089,7 +1091,7 @@ def getSSHConnectionPorts(portConfigFile, stageName)
|
||||
usernamePassword(credentialsId: 'tensorrt_llm_infra_debug_vm_01_credentials', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD'),
|
||||
string(credentialsId: 'DEBUG_HOST_NAME', variable: 'HOST_NAME')
|
||||
]) {
|
||||
portUsage = sh(script: "ssh -v ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'netstat -tuln'",returnStdout: true)
|
||||
portUsage = sh(script: "ssh -v ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'netstat -tuln'", returnStdout: true)
|
||||
}
|
||||
echo "Port Usage: ${portUsage}"
|
||||
|
||||
@ -1248,7 +1250,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
|
||||
def llmRootConfig = "${LLM_ROOT}${config}"
|
||||
sh "mkdir ${llmRootConfig}"
|
||||
|
||||
def llmPath = sh (script: "realpath ${llmRootConfig}",returnStdout: true).trim()
|
||||
def llmPath = sh (script: "realpath ${llmRootConfig}", returnStdout: true).trim()
|
||||
def llmSrc = "${llmPath}/TensorRT-LLM/src"
|
||||
echoNodeAndGpuInfo(pipeline, stageName)
|
||||
|
||||
@ -1362,9 +1364,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
|
||||
usernamePassword(credentialsId: 'tensorrt_llm_infra_debug_vm_01_credentials', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD'),
|
||||
string(credentialsId: 'DEBUG_HOST_NAME', variable: 'HOST_NAME')
|
||||
]) {
|
||||
sh "sshpass -p ${PASSWORD} -v ssh ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'cat >> ~/.ssh/authorized_keys' < ~/.ssh/id_rsa.pub"
|
||||
sh "ssh -v ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'echo \"\" > ~/.ssh/known_hosts && cat ~/.ssh/id_rsa.pub' >> ~/.ssh/authorized_keys"
|
||||
sh "ssh -v ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'cat ~/.ssh/ports_config.txt' >> ${portConfigFilePath}"
|
||||
sh "sshpass -p ${PASSWORD} -v ssh ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'cat >> ~/.ssh/authorized_keys' < ~/.ssh/id_rsa.pub"
|
||||
sh "ssh -v ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'echo \"\" > ~/.ssh/known_hosts && cat ~/.ssh/id_rsa.pub' >> ~/.ssh/authorized_keys"
|
||||
sh "ssh -v ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'cat ~/.ssh/ports_config.txt' >> ${portConfigFilePath}"
|
||||
|
||||
def (int userPort, int monitorPort) = getSSHConnectionPorts(portConfigFilePath, stageName)
|
||||
if (userPort == 0) {
|
||||
@ -1373,7 +1375,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
|
||||
return
|
||||
}
|
||||
|
||||
sh "ssh -f -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -L 1111:127.0.0.1:${monitorPort} -R ${monitorPort}:127.0.0.1:1112 -NR ${userPort}:localhost:22 ${USERNAME}@${HOST_NAME}"
|
||||
sh "ssh -f ${COMMON_SSH_OPTIONS} -L 1111:127.0.0.1:${monitorPort} -R ${monitorPort}:127.0.0.1:1112 -NR ${userPort}:localhost:22 ${USERNAME}@${HOST_NAME}"
|
||||
sh "autossh -fNR ${userPort}:localhost:22 ${USERNAME}@${HOST_NAME}"
|
||||
sh "ps aux | grep ssh"
|
||||
try {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user