[None][infra] Fix TRT-LLM data scratch mount point for gb10x (#10880)

Signed-off-by: qqiao <qqiao@nvidia.com>
Signed-off-by: Emma Qiao <qqiao@nvidia.com>
Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>
This commit is contained in:
Emma Qiao 2026-01-24 14:00:17 +08:00 committed by GitHub
parent 78a008d61a
commit 9d65b8bf24
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -702,6 +702,13 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
dockerArgs += " --device=/dev/gdrdrv:/dev/gdrdrv"
}
}
if (fileExists('/home/scratch.trt_llm_data_ci')) {
dockerArgs += " -v /home/scratch.trt_llm_data_ci:/scratch.trt_llm_data:ro "
} else if (fileExists('/home/scratch.trt_llm_data')) {
dockerArgs += " -v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro "
} else {
echo "Existing TRT-LLM data scratch mount points cannot be set up in this cluster, ignore..."
}
}
dockerArgs = "${dockerArgs} " +
@ -710,7 +717,6 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
"--entrypoint=\"\" " +
"--security-opt seccomp=unconfined " +
"-u root:root " +
"-v /home/scratch.trt_llm_data_ci:/scratch.trt_llm_data:ro " +
"-v /tmp/ccache:${CCACHE_DIR}:rw " +
"-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
"--cap-add=SYSLOG"
@ -1855,7 +1861,10 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
server: 10.117.145.14
path: /vol/scratch1/scratch.michaeln_blossom
"""
if (type.contains("6000d") || type.contains("gh200")) {
// The Austin FlexCache has been slow and unstable recently, so gh200 is temporarily removed from this check.
// As a result, gh200 nodes will use the default Blossom data scratch.
if (type.contains("6000d")) {
// rtx-pro-6000d and gh200 nodes are located in the Austin DC; we use the FlexCache to speed up data access (gh200 is temporarily excluded).
llmModelVolume = """
- name: scratch-trt-llm-data