[None][fix] Add OpenSearch URL in slurm_launch.sh for Multinode Perf Sanity Test (#9990)

Signed-off-by: Chenfei Zhang <chenfeiz@nvidia.com>
This commit is contained in:
chenfeiz0326 2025-12-23 16:02:38 +08:00 committed by GitHub
parent cc1323be24
commit 48c875f8ea
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 32 additions and 1 deletions

View File

@ -1058,12 +1058,31 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
""".replaceAll("(?m)^\\s*", "")
}
// Define environment variables to export
def envVarNames = [
'OPEN_SEARCH_DB_BASE_URL',
'OPEN_SEARCH_DB_CREDENTIALS',
'BUILD_ID',
'BUILD_URL',
'JOB_NAME',
'globalVars',
'gitlabCommit'
]
def envVarsToExport = [:]
envVarNames.each { varName ->
envVarsToExport[varName] = env."${varName}"
}
srunArgs = [
"--container-name=multi_node_test-\${SLURM_JOB_ID}",
"--container-image=$containerImageArg",
"--container-workdir=/home/svc_tensorrt/bloom/scripts",
"--container-mounts=$mounts",
"--container-env=NVIDIA_IMEX_CHANNELS"
]
envVarsToExport.each { varName, varValue ->
srunArgs.add("--container-env=${varName}")
}
if(nodeCount > 1) {
srunArgs.add("--mpi=pmi2")
}
@ -1072,6 +1091,17 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
if (cluster.host.contains("oci-nrt") || cluster.host.contains("oci-hsg") || cluster.host.contains("lbd-lax")) {
exemptionComment = """--comment='{"OccupiedIdleGPUsJobReaper":{"exemptIdleTimeMins":"90","reason":"other","description":"Long data and model loading time and disaggregated serving tests"}}'"""
}
def envExportStatements = envVarsToExport.collect { varName, varValue ->
def escapedValue = varValue?.toString() ?: ''
escapedValue = escapedValue
.replace('\\', '\\\\') // Backslash
.replace('"', '\\"') // Double quote
.replace('$', '\\$') // Dollar sign (prevent variable expansion)
.replace('`', '\\`') // Backtick (prevent command substitution)
"export ${varName}=\"${escapedValue}\""
}.join('\n')
def scriptContent = """#!/bin/bash
#SBATCH ${exemptionComment} --output=${outputPath}
${taskArgs.collect { "#SBATCH $it" }.join('\n')}
@ -1092,6 +1122,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
export coverageConfigFile="$coverageConfigFile"
export NVIDIA_IMEX_CHANNELS=\${NVIDIA_IMEX_CHANNELS:-0}
export NVIDIA_VISIBLE_DEVICES=\${NVIDIA_VISIBLE_DEVICES:-\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=noheader)-1)))}
${envExportStatements}
echo "Env NVIDIA_IMEX_CHANNELS: \$NVIDIA_IMEX_CHANNELS"
echo "Env NVIDIA_VISIBLE_DEVICES: \$NVIDIA_VISIBLE_DEVICES"

View File

@ -145,7 +145,7 @@ def get_job_info():
# Set trigger_mr_commit to commit
trigger_mr_commit = commit
artifact_url = f"https://urm.nvidia.com/artifactory/sw-tensorrt-generic/llm-artifacts/LLM/main/L0_PostMerge/{job_id}" if job_id else ""
artifact_url = f"https://urm.nvidia.com/artifactory/sw-tensorrt-generic/llm-artifacts/LLM/main/L0_MergeRequest_PR/{job_id}" if job_id else ""
else:
artifact_url = f"https://urm.nvidia.com/artifactory/sw-tensorrt-generic/llm-artifacts/LLM/main/L0_PostMerge/{job_id}" if job_id else ""