mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
[feat] Multi-node CI testing support via Slurm (#4771)
Signed-off-by: Yuanjing Xue <197832395+yuanjingx87@users.noreply.github.com> Signed-off-by: yuanjingx87 <197832395+yuanjingx87@users.noreply.github.com> Signed-off-by: Yanchao Lu <yanchaol@nvidia.com> Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>
This commit is contained in:
parent
e5ee5c5352
commit
a1c5704055
@ -91,6 +91,63 @@ TESTER_MEMORY = "96Gi"
|
||||
CCACHE_DIR="/mnt/sw-tensorrt-pvc/scratch.trt_ccache/llm_ccache"
|
||||
MODEL_CACHE_DIR="/scratch.trt_llm_data/llm-models"
|
||||
|
||||
// Fetch the JUnit results XML produced on the Slurm frontend node and, when
// present, archive it as a build artifact and publish it to the Jenkins UI.
//
// pipeline  - pipeline object carrying the bound credentials
// cluster   - SlurmCluster descriptor (ip/host of the frontend node)
// nodeName  - remote job folder name under /home/svc_tensorrt/bloom/scripts
// stageName - stage label; used as the local download dir and artifact name
def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String stageName){
    withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
        def sshTarget = [
            ip           : cluster.ip,
            host         : cluster.host,
            user         : "${pipeline.USERNAME}",
            passwd       : "${pipeline.PASSWORD}",
            allowAnyHosts: true,
        ]

        // sshpass/scp are required to pull files off the frontend node.
        Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
        pipeline.stage('Submit Test Results') {
            sh "mkdir -p ${stageName}"
            def remoteXml = "/home/svc_tensorrt/bloom/scripts/${nodeName}/results/results.xml"
            def scpCmd = "sshpass -p '${sshTarget.passwd}' scp -r -p -oStrictHostKeyChecking=no ${sshTarget.user}@${sshTarget.host}:${remoteXml} ${stageName}/"
            // A non-zero scp status means the remote job left no results file.
            def fetched = sh(script: scpCmd, returnStatus: true) == 0
            if (!fetched) {
                println("No results xml to submit")
            } else {
                sh "ls ${stageName}"
                echo "Upload test results."
                sh "tar -czvf results-${stageName}.tar.gz ${stageName}/"
                trtllm_utils.uploadArtifacts("results-${stageName}.tar.gz", "${UPLOAD_PATH}/test-results/")
                junit(testResults: "${stageName}/results*.xml")
            }
        }
    }
}
|
||||
|
||||
//TODO: consolidate slurm related code for both multi nodes and single nodes
|
||||
//TODO: consolidate slurm related code for both multi nodes and single nodes
// Remove the per-job workspace directory created on the Slurm frontend node
// for a multi-node run.
//
// pipeline - pipeline object carrying the bound credentials
// cluster  - SlurmCluster descriptor of the frontend node
// jobUID   - unique job id; names the remote folder to delete
def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jobUID){
    withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
        def sshTarget = [
            ip           : cluster.ip,
            host         : cluster.host,
            user         : "${pipeline.USERNAME}",
            passwd       : "${pipeline.PASSWORD}",
            allowAnyHosts: true,
        ]

        Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
        pipeline.stage('Clean up SLURM Agent Resources') {
            // Best-effort wipe; no timeout so slow filesystems don't abort cleanup.
            def wipeCmd = Utils.sshUserCmd(sshTarget, "rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID}")
            Utils.exec(pipeline, timeout: false, script: wipeCmd)
        }
    }
}
|
||||
|
||||
def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName){
|
||||
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
|
||||
def remote = [
|
||||
@ -98,7 +155,6 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName){
|
||||
host : cluster.host,
|
||||
user : "${pipeline.USERNAME}",
|
||||
passwd : "${pipeline.PASSWORD}",
|
||||
password : "${pipeline.PASSWORD}",
|
||||
allowAnyHosts: true,
|
||||
]
|
||||
|
||||
@ -164,7 +220,6 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
|
||||
host : cluster.host,
|
||||
user : "${pipeline.USERNAME}",
|
||||
passwd : "${pipeline.PASSWORD}",
|
||||
password : "${pipeline.PASSWORD}",
|
||||
allowAnyHosts: true,
|
||||
]
|
||||
|
||||
@ -211,6 +266,133 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
|
||||
}
|
||||
}
|
||||
|
||||
// Build the Slurm resource arguments for a multi-node test job.
//
// nodeCount - number of nodes to request (must be positive)
// gpuCount  - total GPUs across all nodes; one task is launched per GPU
//
// Returns a single space-joined string of --nodes/--ntasks/--ntasks-per-node/
// --gpus-per-node flags. GPUs are spread evenly across nodes, rounding up, so
// the last node may be under-filled when gpuCount does not divide evenly.
def getNodeArgs(int nodeCount, int gpuCount) {
    // Integer ceiling division. Replaces the previous BigDecimal round-trip,
    // which relied on the deprecated BigDecimal.ROUND_CEILING constant
    // (deprecated since Java 9 in favor of RoundingMode.CEILING).
    int gpusPerNode = (gpuCount + nodeCount - 1).intdiv(nodeCount)
    return [
        "--nodes=${nodeCount}",
        "--ntasks=${gpuCount}",
        "--ntasks-per-node=${gpusPerNode}",
        "--gpus-per-node=${gpusPerNode}",
    ].join(" ")
}
|
||||
|
||||
// Run a rendered test list on a multi-node Slurm allocation.
//
// Flow: (1) create a per-job workspace on the cluster frontend node over ssh,
// (2) upload slurm_run.sh and the rendered test list, (3) generate and upload
// a launch script that sruns the containerized run across nodeCount nodes
// with gpuCount total GPUs, (4) execute the launch script over ssh, and
// (5) always attempt to collect results and clean the remote workspace, even
// when the run fails.
//
// skipInstallWheel and cpver are currently unused here but kept for signature
// parity with runLLMTestlistOnSlurm.
def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, gpuCount=1, nodeCount=2, skipInstallWheel=false, cpver="cp312")
{
    SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition
    SlurmCluster cluster = SlurmConfig.clusterConfig[partition.clusterName]

    // Unique id; also names the remote workspace folder and the results dir.
    def jobUID = "${cluster.host}-multi_node_test-${UUID.randomUUID().toString()}"

    try {
        // Run ssh command to start node in desired cluster via SLURM
        withCredentials([
            usernamePassword(
                credentialsId: 'svc_tensorrt',
                usernameVariable: 'USERNAME',
                passwordVariable: 'PASSWORD'
            )
        ]) {
            def remote = [
                ip : cluster.ip,
                host : cluster.host,
                user : "${pipeline.USERNAME}",
                passwd : "${pipeline.PASSWORD}",
                allowAnyHosts: true,
            ]
            Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
            def tarName = BUILD_CONFIGS[config][TARNAME]
            def llmTarfile = "https://urm.nvidia.com/artifactory/${ARTIFACT_PATH}/${tarName}"
            def llmPath = sh (script: "realpath .", returnStdout: true).trim()
            def jobWorkspace = "/home/svc_tensorrt/bloom/scripts/${jobUID}"
            def resourcePathNode = "/tmp"
            def llmSrcNode = "${resourcePathNode}/TensorRT-LLM/src"
            def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
            def scriptRunNode = "${jobWorkspace}/slurm_run.sh"
            def testListPathNode = "${jobWorkspace}/${testList}.txt"
            def isAarch64 = config.contains("aarch64")
            def pytestTestTimeout = "7200"

            stage('Prepare Testing') {
                // Create Job Workspace folder in Frontend Node
                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' ssh -oStrictHostKeyChecking=no ${remote.user}@${remote.host} 'mkdir ${jobWorkspace}'",)

                // Download and Unzip Tar File
                trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}")
                sh "cd ${llmPath} && tar -zxf ${BUILD_CONFIGS[config][TARNAME]}"

                // Upload slurm_run_sh to Frontend node
                def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
                Utils.exec(pipeline, script: "chmod +x ${scriptRunLocalPath}", returnStdout: true)
                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${scriptRunLocalPath} ${remote.user}@${remote.host}:${scriptRunNode}",)

                // Generate Test List and Upload to Frontend Node
                def makoArgs = getMakoArgsFromStageName(stageName, true)
                // TODO: currently the options will only be processed if the first
                // line is "Mako options:", maybe we can make it more generic, which
                // if the line cannot be split by "=", just ignore that line.
                def makoOptsJson = transformMakoArgsToJson(["Mako options:"] + makoArgs)
                def testListPath = renderTestDB(testList, llmSrcLocal, stageName, makoOptsJson)
                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${testListPath} ${remote.user}@${remote.host}:${testListPathNode}",)

                // Generate Multi Node Job Launch Script
                // NOTE(review): the '#' form is presumably the enroot/pyxis
                // registry#path image syntax — confirm against cluster config.
                def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#")
                def mounts = "/home/scratch.trt_llm_data:/scratch.trt_llm_data:ro,/home/svc_tensorrt/bloom/scripts:/home/svc_tensorrt/bloom/scripts"
                String taskArgs = getNodeArgs(nodeCount, gpuCount)

                if (taskArgs == null) {
                    error "Invalid multinode task stage name is set"
                }

                taskArgs = [
                    taskArgs,
                    "--exclusive",
                    "--container-image=${container}",
                    "--container-workdir=/home/svc_tensorrt/bloom/scripts",
                    "--container-mounts=${mounts}",
                ].join(" ")

                def scriptLaunch = "/home/svc_tensorrt/bloom/scripts/${jobUID}/slurm_launch.sh"
                def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
                // FIX: was an undeclared assignment, which creates a shared
                // script-binding variable in Groovy; 'def' keeps the temp path
                // local to this invocation (safe for parallel branches).
                def scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
                // The launch script exports everything slurm_run.sh consumes,
                // then hands off to srun.
                def scriptContent = """#!/bin/bash
export jobWorkspace=$jobWorkspace
export tarName=$tarName
export llmTarfile=$llmTarfile
export llmSrcNode=$llmSrcNode
export stageName=$stageName
export testList=$testList
export testListPathNode=$testListPathNode
export pytestTestTimeout=$pytestTestTimeout
export splits=$splits
export splitId=$splitId
export perfMode=$perfMode
export resourcePathNode=$resourcePathNode
export MODEL_CACHE_DIR=$MODEL_CACHE_DIR
chmod +x ${scriptRunNode}
${srunCmd}
""".stripIndent()
                pipeline.writeFile(file: scriptLaunchDestPath, text: scriptContent)
                Utils.exec(pipeline, script: "chmod +x ${scriptLaunchDestPath}", returnStdout: true)
                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}",)
            }
            stage('Run Test') {
                def scriptLaunch = "${jobWorkspace}/slurm_launch.sh"
                Utils.exec(
                    pipeline,
                    timeout: false,
                    script: Utils.sshUserCmd(
                        remote,
                        """bash ${scriptLaunch}"""
                    )
                )
            }
        }
    } finally {
        // Always collect whatever results exist, then wipe the remote workspace.
        uploadResults(pipeline, cluster, jobUID, stageName)
        cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID)
    }
}
|
||||
|
||||
def trimForStageList(stageNameList)
|
||||
{
|
||||
if (stageNameList == null) {
|
||||
@ -686,9 +868,49 @@ def generateStageFailTestResultXml(stageName, subName, failureLog, resultPath) {
|
||||
</failure></testcase></testsuite></testsuites>"""
|
||||
}
|
||||
|
||||
// Convert a list of "key=value" Mako option lines into a JSON string.
//
// Lines are ignored until the sentinel line "Mako options:" has been seen;
// every line after it is parsed. Values "none" (any case) are coerced to
// null, "true"/"false" to Boolean; everything else stays a string.
def transformMakoArgsToJson(optList) {
    def opts = [:]
    def inMakoSection = false
    optList.each { line ->
        if (inMakoSection) {
            def key = null
            def val = null
            try {
                (key, val) = line.split("=")
            } catch (ArrayIndexOutOfBoundsException ex) {
                // Line has no "=": keep the key, leave the value null.
                key = line.split("=")[0]
                val = null
            }

            // Coerce nulls and booleans into their proper types.
            if (val != null) {
                def lowered = val.toLowerCase()
                if (lowered == "none") {
                    echo "Converted mako param '${key}' value '${val}' to 'null'"
                    val = null
                } else if (lowered in ["true", "false"]) {
                    echo "Converted mako param '${key}' value '${val}' to Boolean '${val.toBoolean()}'"
                    val = val.toBoolean()
                }
            }
            opts[(key)] = val
        }
        // The sentinel itself is never parsed; it only arms the parser.
        if (line.equals("Mako options:")) {
            inMakoSection = true
        }
    }

    def optsJson = JsonOutput.toJson(opts)

    // Print and return the Test DB Query as a JSON string
    echo "Test DB Mako opts: ${optsJson}"
    return optsJson
}
|
||||
|
||||
def getMakoOpts(getMakoScript, makoArgs=[]) {
|
||||
// We want to save a map for the Mako opts
|
||||
def makoOpts = [:]
|
||||
def turtleOutput = ""
|
||||
|
||||
// Echo the command
|
||||
@ -723,50 +945,25 @@ def getMakoOpts(getMakoScript, makoArgs=[]) {
|
||||
// Split each line of turtle output into a list
|
||||
def turtleOutList = turtleOutput.split("\n")
|
||||
|
||||
// Extract the mako opts
|
||||
def startedMakoOpts = false
|
||||
def param = null
|
||||
def value = null
|
||||
turtleOutList.each { val ->
|
||||
if (startedMakoOpts) {
|
||||
// Handle case where value is missing
|
||||
param = null
|
||||
value = null
|
||||
try {
|
||||
(param, value) = val.split("=")
|
||||
} catch (ArrayIndexOutOfBoundsException ex) {
|
||||
param = val.split("=")[0]
|
||||
value = null
|
||||
}
|
||||
|
||||
// Try to convert nulls, booleans, and floats into the correct type
|
||||
if (value != null) {
|
||||
if (value.toLowerCase() == "none") {
|
||||
echo "Converted mako param '${param}' value '${value}' to 'null'"
|
||||
value = null
|
||||
} else if (value.toLowerCase() in ["true", "false"]) {
|
||||
echo "Converted mako param '${param}' value '${value}' to Boolean '${value.toBoolean()}'"
|
||||
value = value.toBoolean()
|
||||
}
|
||||
}
|
||||
makoOpts[(param)] = value
|
||||
}
|
||||
if (val.equals("Mako options:")) {
|
||||
startedMakoOpts = true
|
||||
}
|
||||
}
|
||||
|
||||
// Finally, convert the query to a json string
|
||||
def makoOptsJson = JsonOutput.toJson(makoOpts)
|
||||
|
||||
// Print and return the Test DB Query as a JSON string
|
||||
echo "Test DB Mako opts: ${makoOptsJson}"
|
||||
def makoOptsJson = transformMakoArgsToJson(turtleOutList)
|
||||
|
||||
return makoOptsJson
|
||||
}
|
||||
|
||||
def renderTestDB(testContext, llmSrc, stageName) {
|
||||
def scriptPath = "${llmSrc}/tests/integration/defs/sysinfo/get_sysinfo.py"
|
||||
// Extract the multi-node task configuration encoded in a stage name such as
// "GB200-8_GPUs-2_Nodes-PyTorch-[Post-Merge]-1".
//
// Returns a map [gpu:, system_gpu_count:, node_count:] (all strings), or
// null when the stage name does not match the multi-node pattern.
def parseMultiNodeTaskConfigFromStageName(String stageName) {
    def m = (stageName =~ /([^-]+)-(\d+)_GPUs-(\d+)_Nodes/)
    if (!m.find()) {
        return null
    }
    return [
        gpu: "${m.group(1)}",
        system_gpu_count: "${m.group(2)}",
        node_count: "${m.group(3)}" // "node_count" might not be used currently
    ]
}
|
||||
|
||||
def getMakoArgsFromStageName(stageName, parseSysinfo=false) {
|
||||
def makoArgs = []
|
||||
def isPostMerge = stageName.contains("Post-Merge")
|
||||
makoArgs += [isPostMerge ? "stage=post_merge" : "stage=pre_merge"]
|
||||
@ -798,7 +995,27 @@ def renderTestDB(testContext, llmSrc, stageName) {
|
||||
makoArgs += ["auto_trigger=others"]
|
||||
}
|
||||
|
||||
def makoOpts = getMakoOpts(scriptPath, makoArgs)
|
||||
if (parseSysinfo) {
|
||||
def taskConfig = parseMultiNodeTaskConfigFromStageName(stageName)
|
||||
if (taskConfig) {
|
||||
makoArgs += [
|
||||
"gpu=${taskConfig.gpu}",
|
||||
"system_gpu_count=${taskConfig.system_gpu_count}"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
return makoArgs
|
||||
}
|
||||
|
||||
def renderTestDB(testContext, llmSrc, stageName, preDefinedMakoOpts=null) {
|
||||
def makoOpts = preDefinedMakoOpts
|
||||
|
||||
if (!makoOpts) {
|
||||
def scriptPath = "${llmSrc}/tests/integration/defs/sysinfo/get_sysinfo.py"
|
||||
def makoArgs = getMakoArgsFromStageName(stageName)
|
||||
makoOpts = getMakoOpts(scriptPath, makoArgs)
|
||||
}
|
||||
|
||||
sh "pip3 install --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/sw-tensorrt-pypi/simple --ignore-installed trt-test-db==1.8.5+bc6df7"
|
||||
def testDBPath = "${llmSrc}/tests/integration/test_lists/test-db"
|
||||
@ -1600,6 +1817,11 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
|
||||
]
|
||||
fullSet += SBSASlurmTestConfigs.keySet()
|
||||
|
||||
multiNodesSBSAConfigs = [
|
||||
"GB200-8_GPUs-2_Nodes-PyTorch-[Post-Merge]-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 1, 8, 2],
|
||||
]
|
||||
fullSet += multiNodesSBSAConfigs.keySet()
|
||||
|
||||
if (env.targetArch == AARCH64_TRIPLE) {
|
||||
parallelJobs = SBSATestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "arm64"), {
|
||||
runLLMTestlistOnPlatform(pipeline, values[0], values[1], LINUX_AARCH64_CONFIG, false, key, values[2], values[3])
|
||||
@ -1617,6 +1839,20 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
|
||||
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1)
|
||||
}]]}
|
||||
parallelJobs += parallelSlurmJobs
|
||||
|
||||
// Add SBSA multi node Slurm jobs
|
||||
parallelMultiNodesSBSAJobs = multiNodesSBSAConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "slurm", "arm64"), {
|
||||
def config = LINUX_AARCH64_CONFIG
|
||||
if (key.contains("single-device")) {
|
||||
config = SINGLE_DEVICE_CONFIG
|
||||
}
|
||||
if (key.contains("llvm")) {
|
||||
config = LLVM_CONFIG
|
||||
}
|
||||
runLLMTestlistOnSlurm_MultiNodes(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2)
|
||||
}]]}
|
||||
|
||||
parallelJobs += parallelMultiNodesSBSAJobs
|
||||
}
|
||||
|
||||
docBuildSpec = createKubernetesPodConfig(LLM_DOCKER_IMAGE, "a10")
|
||||
|
||||
90
jenkins/scripts/slurm_run.sh
Executable file
90
jenkins/scripts/slurm_run.sh
Executable file
@ -0,0 +1,90 @@
|
||||
#!/bin/bash
# Per-task Slurm run script for multi-node TensorRT-LLM integration tests.
# All configuration (jobWorkspace, tarName, llmTarfile, stageName, testList,
# testListPathNode, pytestTestTimeout, splits, splitId, perfMode,
# resourcePathNode, MODEL_CACHE_DIR) is exported by slurm_launch.sh.
cd $resourcePathNode
llmSrcNode=$resourcePathNode/TensorRT-LLM/src

# generate .coveragerc in workspace; the ---wheel_path--- placeholder is
# patched in below once the installed wheel location is known.
# FIX: coverageConfigFile was referenced later (sed / --cov-config) but never
# defined; define it once and write the heredoc through it.
coverageConfigFile=$jobWorkspace/.coveragerc
cat << EOF > $coverageConfigFile
[run]
branch = True
data_file = $jobWorkspace/.coverage.$stageName
[paths]
source =
    $llmSrcNode/tensorrt_llm/
    ---wheel_path---/tensorrt_llm/
EOF

resultsPath=$jobWorkspace/results
mkdir -p $resultsPath
# Only the first task on each node installs the source tree and wheel; the
# remaining tasks spin on the lock file it touches when done.
if [ $SLURM_LOCALID -eq 0 ]; then
    wget -nv $llmTarfile
    tar -zxf $tarName
    which python3
    python3 --version
    apt-get install -y libffi-dev
    nvidia-smi
    cd $llmSrcNode && pip3 install --retries 1 -r requirements-dev.txt
    cd $resourcePathNode && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
    git config --global --add safe.directory "*"
    gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
    # FIX: log line previously printed "GPU_UUIDS = =$gpuUuids" (doubled '=').
    echo "HOST_NODE_NAME = $HOST_NODE_NAME ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
    touch install_lock.lock
else
    while [ ! -f install_lock.lock ]; do
        sleep 5
    done
fi
# FIX: was testList="$testList_$splitId", which expands the UNDEFINED variable
# 'testList_' (underscore is a valid name character) and dropped the list name.
testList="${testList}_${splitId}"
export CPP_TEST_TIMEOUT_OVERRIDDEN=7200
export LLM_ROOT=$llmSrcNode
export LLM_MODELS_ROOT=$MODEL_CACHE_DIR
export UCX_TLS=^gdr_copy
cd $llmSrcNode/tests/integration/defs
testCmdLines=(
    "$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch"
    "pytest"
    "-v"
    "--timeout=$pytestTestTimeout"
    "--test-list=$testListPathNode"
    "--rootdir $llmSrcNode/tests/integration/defs"
    "--test-prefix=$stageName"
    "--splits $splits"
    "--group $splitId"
    "--output-dir=$jobWorkspace/"
    "--csv=$resultsPath/report.csv"
    "--junit-xml $resultsPath/results.xml"
    "-o junit_logging=out-err"
)
if [ "$perfMode" = "true" ]; then
    testCmdLines+=(
        "--perf"
        "--perf-log-formats csv"
        "--perf-log-formats yaml"
    )
fi
# Resolve the installed wheel location and patch it into the coverage config.
trtllmWhlPath=$(pip3 show tensorrt_llm | grep Location | cut -d ' ' -f 2)
trtllmWhlPath=$(echo "$trtllmWhlPath" | sed 's/[[:space:]]+/_/g')
echo "TRTLLM WHEEL PATH: $trtllmWhlPath"
sed -i "s|---wheel_path---|$trtllmWhlPath|g" "$coverageConfigFile"
testCmdLines+=(
    "--cov=$llmSrcNode/examples/"
    "--cov=$llmSrcNode/tensorrt_llm/"
    "--cov=$trtllmWhlPath/tensorrt_llm/"
    "--cov-report="
    "--cov-config=$coverageConfigFile"
)
# Ensure the wheel's bundled libs dir is on LD_LIBRARY_PATH exactly once.
containerPipLLMLibPath=$(pip3 show tensorrt_llm | grep "Location" | awk -F ":" '{ gsub(/ /, "", $2); print $2"/tensorrt_llm/libs"}')
containerPipLLMLibPath=$(echo "$containerPipLLMLibPath" | sed 's/[[:space:]]+/_/g')
containerLDLibPath=$LD_LIBRARY_PATH
containerLDLibPath=$(echo "$containerLDLibPath" | sed 's/[[:space:]]+/_/g')
if [[ "$containerLDLibPath" != *"$containerPipLLMLibPath"* ]]; then
    containerLDLibPath="$containerPipLLMLibPath:$containerLDLibPath"
    containerLDLibPath="${containerLDLibPath%:}"
fi
export LD_LIBRARY_PATH=$containerLDLibPath
echo "Library Path:"
echo "$LD_LIBRARY_PATH"
env | sort
fullCmd="${testCmdLines[*]}"
# FIX: previously echoed the undefined variable $testCase.
echo "Running test list: $testList"
echo "Full Command: $fullCmd"
eval $fullCmd
|
||||
@ -0,0 +1,16 @@
|
||||
version: 0.0.1
|
||||
l0_gb200_multi_nodes:
|
||||
- condition:
|
||||
ranges:
|
||||
# 2 nodes, each with 4 GPUs
|
||||
system_gpu_count:
|
||||
gte: 8
|
||||
lte: 8
|
||||
wildcards:
|
||||
gpu:
|
||||
- '*gb200*'
|
||||
terms:
|
||||
stage: post_merge
|
||||
backend: pytorch
|
||||
tests:
|
||||
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[latency]
|
||||
Loading…
Reference in New Issue
Block a user