[TRTINFRA-7328][infra] Consume SlurmCluster scratchPath and cleanup mounts (#9600)

Signed-off-by: Matt Lefebvre <mlefebvre@nvidia.com>
This commit is contained in:
Matt Lefebvre 2025-12-09 13:34:09 -08:00 committed by GitHub
parent 4da3121363
commit 5de4e3f621
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -459,7 +459,7 @@ def cleanUpSlurmResources(def pipeline, SlurmCluster cluster, String jobUID){
Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30")
def cleanupCommands = [
"rm -rf /lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true",
"rm -rf ${cluster.scratchPath}/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true",
"rm -rf ${jobWorkspace} || true",
].join(" && ")
Utils.exec(
@ -510,7 +510,7 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
def entrypoint = SlurmConfig.containerRuntimeToEntrypoint[cluster.containerRuntime]
def cleanupCommands = [
"rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-${entrypoint} || true",
"rm -rf /lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true",
"rm -rf ${cluster.scratchPath}/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true",
].join(" && ")
Utils.exec(
pipeline,
@ -565,12 +565,7 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
Utils.exec(pipeline, script: "echo Sleeping before Slurm job submission; sleep \$((RANDOM % 29 + 1))")
// Specific for OCI machines
def mounts = [
"/lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci:/scratch.trt_llm_data:ro",
"/home/svc_tensorrt:/home/svc_tensorrt",
"/home/svc_tensorrt/.cache:/root/.cache"
].join(",")
def mounts = getMountListForSlurmTest(cluster, false).join(",")
def slurmSubmitOutput = Utils.exec(
pipeline,
timeout: false,
@ -822,6 +817,42 @@ def getPytestBaseCommandLine(
return testCmdLine as String[]
}
def getMountListForSlurmTest(SlurmCluster cluster, boolean useSbatch = false)
{
def mounts = []
// mounts for SLURM job submission and logs
if (useSbatch) {
mounts += [
"/home/svc_tensorrt/bloom/scripts",
]
} else {
mounts += [
"/home/svc_tensorrt/bloom/scripts",
"/home/svc_tensorrt/slurm-logs",
]
}
// data/cache mounts
if (cluster.containerRuntime == ContainerRuntime.DOCKER) {
mounts += [
"/home/scratch.trt_llm_data:/scratch.trt_llm_data:ro",
]
} else if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
if (!cluster.scratchPath) {
throw new Exception("Scratch path is not set for cluster: ${cluster.name}")
}
mounts += [
"${cluster.scratchPath}:/scratch.trt_llm_data:ro",
"/home/svc_tensorrt/.cache:/root/.cache",
]
} else {
throw new Exception("Unsupported container runtime: ${cluster.containerRuntime}")
}
return mounts
}
def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, gpuCount=1, nodeCount=1, skipInstallWheel=false, cpver="cp312")
{
SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition
@ -959,7 +990,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
// Generate Job Launch Script
def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#")
def mounts = "/home/scratch.trt_llm_data:/scratch.trt_llm_data:ro,/home/svc_tensorrt/bloom/scripts:/home/svc_tensorrt/bloom/scripts"
def mounts = getMountListForSlurmTest(cluster, true).join(",")
String[] taskArgs = getNodeArgs(nodeCount, gpuCount)
if (taskArgs == null) {
error "Invalid Slurm test stage name is set"
@ -971,13 +1002,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
def containerImageArg = container
def srunPrologue = ""
if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
mounts = [
"/lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci:/scratch.trt_llm_data:ro",
"/home/svc_tensorrt/bloom/scripts",
"/home/svc_tensorrt/.cache:/root/.cache",
].join(",")
def enrootImagePath = "/lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci/users/svc_tensorrt/containers/container-\${SLURM_JOB_ID}.sqsh"
def enrootImagePath = "${cluster.scratchPath}/users/svc_tensorrt/containers/container-\${SLURM_JOB_ID}.sqsh"
containerImageArg = enrootImagePath
srunPrologue = """