This MR integrates Conan into the build system so that it can be used to fetch dependencies in future changes. It also installs everything in requirements-dev.txt inside a virtualenv instead of into the system Python, since some of Conan's dependencies may conflict with the system packages. Virtualenv is used instead of venv because the Triton server backend container has only virtualenv installed. This also allows developers to cache the requirements-dev.txt packages between container launches.

Signed-off-by: Tyler Burt <195370667+tburt-nv@users.noreply.github.com>
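A minimal sketch of the virtualenv-based install described above, assuming virtualenv is already on PATH; the ".venv-dev" directory name and keeping it on a cacheable volume are illustrative assumptions, not taken from the pipeline below:

    sh '''
        virtualenv .venv-dev    # virtualenv rather than venv: the Triton backend container only ships virtualenv
        . .venv-dev/bin/activate
        pip3 install -r requirements-dev.txt
    '''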
@Library(['bloom-jenkins-shared-lib@main', 'trtllm-jenkins-shared-lib@main']) _

import groovy.transform.Field

// LLM repository configuration
withCredentials([string(credentialsId: 'default-llm-repo', variable: 'DEFAULT_LLM_REPO')]) {
    LLM_REPO = env.gitlabSourceRepoHttpUrl ? env.gitlabSourceRepoHttpUrl : "${DEFAULT_LLM_REPO}"
}
LLM_ROOT = "llm"

ARTIFACT_PATH = env.artifactPath ? env.artifactPath : "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${BUILD_NUMBER}"
UPLOAD_PATH = env.uploadPath ? env.uploadPath : "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${BUILD_NUMBER}"

X86_64_TRIPLE = "x86_64-linux-gnu"
AARCH64_TRIPLE = "aarch64-linux-gnu"

LLM_DOCKER_IMAGE = env.dockerImage

AGENT_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.03-py3-x86_64-ubuntu24.04-trt10.9.0.34-skip-devel-202504250100-3759"

POD_TIMEOUT_SECONDS = env.podTimeoutSeconds ? env.podTimeoutSeconds : "21600"

// Literals for easier access.
@Field
def WHEEL_EXTRA_ARGS = "extraArgs"

@Field
def TARNAME = "tarName"

@Field
def WHEEL_ARCHS = "wheelArchs"

@Field
def CONFIG_LINUX_X86_64_VANILLA = "linux_x86_64_Vanilla"

@Field
def CONFIG_LINUX_X86_64_SINGLE_DEVICE = "linux_x86_64_SingleDevice"

@Field
def CONFIG_LINUX_X86_64_LLVM = "linux_x86_64_LLVM"

@Field
def CONFIG_LINUX_AARCH64 = "linux_aarch64"

@Field
def CONFIG_LINUX_AARCH64_LLVM = "linux_aarch64_LLVM"

@Field
def BUILD_CONFIGS = [
    // Vanilla TARNAME is used for packaging in runLLMPackage
    // cmake-vars cannot be empty, so passing (default) multi-device configuration.
    (CONFIG_LINUX_X86_64_VANILLA) : [
        (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --micro_benchmarks",
        (TARNAME) : "TensorRT-LLM.tar.gz",
        (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;120-real",
    ],
    (CONFIG_LINUX_X86_64_SINGLE_DEVICE) : [
        (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars ENABLE_MULTI_DEVICE=0 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars ENABLE_UCX=0 --micro_benchmarks",
        (TARNAME) : "single-device-TensorRT-LLM.tar.gz",
        (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;120-real",
    ],
    (CONFIG_LINUX_X86_64_LLVM) : [
        (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --micro_benchmarks -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CUDA_HOST_COMPILER=clang -DCMAKE_LINKER_TYPE=LLD",
        (TARNAME) : "llvm-TensorRT-LLM.tar.gz",
        (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;120-real",
    ],
    (CONFIG_LINUX_AARCH64): [
        (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON",
        (TARNAME) : "TensorRT-LLM-GH200.tar.gz",
        (WHEEL_ARCHS): "90-real;100-real;120-real",
    ],
    (CONFIG_LINUX_AARCH64_LLVM) : [
        (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CUDA_HOST_COMPILER=clang -DCMAKE_LINKER_TYPE=LLD",
        (TARNAME) : "llvm-TensorRT-LLM-GH200.tar.gz",
        (WHEEL_ARCHS): "90-real;100-real;120-real",
    ],
]
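
// Illustrative only (an assumption pieced together from runLLMBuild below, not an extra build step):
// one of these entries is consumed roughly as
//   def flags = BUILD_CONFIGS[CONFIG_LINUX_X86_64_VANILLA]
//   sh "cd ${LLM_ROOT} && python3 scripts/build_wheel.py --use_ccache -j ${BUILD_JOBS} " +
//      "-a '${flags[WHEEL_ARCHS]}' ${flags[WHEEL_EXTRA_ARGS]} --benchmarks --extra-make-targets modelSpec"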

// TODO: Move common variables to a unified location
BUILD_CORES_REQUEST = "8"
BUILD_CORES_LIMIT = "8"
BUILD_MEMORY_REQUEST = "48Gi"
BUILD_MEMORY_LIMIT = "64Gi"
BUILD_JOBS = "8"

TESTER_CORES = "12"
TESTER_MEMORY = "96Gi"

CCACHE_DIR="/mnt/sw-tensorrt-pvc/scratch.trt_ccache/llm_ccache"

String getShortenedJobName(String path)
{
    static final nameMapping = [
        "L0_MergeRequest": "l0-mr",
        "L0_Custom": "l0-cus",
        "L0_PostMerge": "l0-pm",
        "L0_PostMergeDocker": "l0-pmd",
        "L1_Custom": "l1-cus",
        "L1_Nightly": "l1-nt",
        "L1_Stable": "l1-stb",
    ]
    def parts = path.split('/')
    // Apply nameMapping to the last part (jobName)
    def jobName = parts[-1]
    boolean replaced = false
    nameMapping.each { key, value ->
        if (jobName.contains(key)) {
            jobName = jobName.replace(key, value)
            replaced = true
        }
    }
    if (!replaced) {
        jobName = jobName.length() > 7 ? jobName.substring(0, 7) : jobName
    }
    // Replace the last part with the transformed jobName
    parts[-1] = jobName
    // Rejoin the parts with '-', convert to lowercase
    return parts.join('-').toLowerCase()
}
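
// Illustrative only (the job path format is an assumption): for a path such as
// "LLM/main/L0_PostMerge" this returns "llm-main-l0-pm", while an unmapped leaf such as
// "Experimental" is truncated to seven characters, giving "llm-main-experim".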

def createKubernetesPodConfig(image, type, arch = "amd64")
{
    def targetCould = "kubernetes-cpu"
    def selectors = """
                  nvidia.com/node_type: builder
                  kubernetes.io/os: linux
                  kubernetes.io/arch: ${arch}"""
    def containerConfig = ""
    def nodeLabelPrefix = ""
    def jobName = getShortenedJobName(env.JOB_NAME)
    def buildID = env.BUILD_ID

    switch(type)
    {
    case "build":
        containerConfig = """
                  - name: trt-llm
                    image: ${image}
                    command: ['sleep', ${POD_TIMEOUT_SECONDS}]
                    volumeMounts:
                    - name: sw-tensorrt-pvc
                      mountPath: "/mnt/sw-tensorrt-pvc"
                      readOnly: false
                    tty: true
                    resources:
                      requests:
                        cpu: ${BUILD_CORES_REQUEST}
                        memory: ${BUILD_MEMORY_REQUEST}
                        ephemeral-storage: 200Gi
                      limits:
                        cpu: ${BUILD_CORES_LIMIT}
                        memory: ${BUILD_MEMORY_LIMIT}
                        ephemeral-storage: 200Gi
                    imagePullPolicy: Always"""
        nodeLabelPrefix = "cpu"
        break
    case "package":
        containerConfig = """
                  - name: trt-llm
                    image: ${image}
                    command: ['cat']
                    tty: true
                    resources:
                      requests:
                        cpu: '2'
                        memory: 10Gi
                        ephemeral-storage: 25Gi
                      limits:
                        cpu: '2'
                        memory: 10Gi
                        ephemeral-storage: 25Gi
                    imagePullPolicy: Always"""
        nodeLabelPrefix = "cpu"
        break
    }
    def nodeLabel = trtllm_utils.appendRandomPostfix("${nodeLabelPrefix}---tensorrt-${jobName}-${buildID}")
    def pvcVolume = """
                - name: sw-tensorrt-pvc
                  persistentVolumeClaim:
                    claimName: sw-tensorrt-pvc
    """
    if (arch == "arm64") {
        // WAR: PVC mount is not set up on GH200 machines, so use a small local cache as a workaround
        pvcVolume = """
                - name: sw-tensorrt-pvc
                  nfs:
                    server: 10.117.145.13
                    path: /vol/scratch1/scratch.svc_tensorrt_blossom
        """
    }
    def podConfig = [
        cloud: targetCould,
        namespace: "sw-tensorrt",
        label: nodeLabel,
        yaml: """
            apiVersion: v1
            kind: Pod
            spec:
                qosClass: Guaranteed
                affinity:
                    nodeAffinity:
                        requiredDuringSchedulingIgnoredDuringExecution:
                            nodeSelectorTerms:
                            - matchExpressions:
                              - key: "tensorrt/taints"
                                operator: DoesNotExist
                              - key: "tensorrt/affinity"
                                operator: NotIn
                                values:
                                - "core"
                nodeSelector: ${selectors}
                containers:
                  ${containerConfig}
                    env:
                    - name: HOST_NODE_NAME
                      valueFrom:
                        fieldRef:
                          fieldPath: spec.nodeName
                  - name: jnlp
                    image: urm.nvidia.com/docker/jenkins/inbound-agent:4.11-1-jdk11
                    args: ['\$(JENKINS_SECRET)', '\$(JENKINS_NAME)']
                    resources:
                      requests:
                        cpu: '2'
                        memory: 10Gi
                        ephemeral-storage: 25Gi
                      limits:
                        cpu: '2'
                        memory: 10Gi
                        ephemeral-storage: 25Gi
                qosClass: Guaranteed
                volumes:
                  ${pvcVolume}
        """.stripIndent(),
    ]

    return podConfig
}

def echoNodeAndGpuInfo(pipeline, stageName)
{
    String hostNodeName = sh(script: 'echo $HOST_NODE_NAME', returnStdout: true)
    String gpuUuids = pipeline.sh(script: "nvidia-smi -q | grep \"GPU UUID\" | awk '{print \$4}' | tr '\n' ',' || true", returnStdout: true)
    pipeline.echo "HOST_NODE_NAME = ${hostNodeName} ; GPU_UUIDS = ${gpuUuids} ; STAGE_NAME = ${stageName}"
}

def downloadArtifacts(stageName, reuseArtifactPath, artifacts, serverId = 'Artifactory')
{
    def reused = true
    stage(stageName) {
        for (downit in artifacts) {
            def uploadpath = downit.key
            try {
                rtDownload(
                    failNoOp: true,
                    serverId: serverId,
                    spec: """{
                        "files": [
                            {
                                "pattern": "${reuseArtifactPath}/${uploadpath}"
                            }
                        ]
                    }""",
                )
            } catch (Exception e) {
                echo "failed downloading ${reuseArtifactPath}/${uploadpath}, need rebuild."
                reused = false
                catchError(buildResult: 'SUCCESS', stageResult: 'UNSTABLE') { throw e }
            }
        }

        if (!reused) {
            return null
        }

        reuseArtifactPath = reuseArtifactPath.substring(reuseArtifactPath.indexOf('/')+1)
        def newArtifacts = [:]
        for (reuseit in artifacts) {
            def uploadpath = reuseit.key
            newArtifacts[reuseit.key] = "${reuseArtifactPath}/${uploadpath}"
        }

        return newArtifacts
    }
}

def uploadArtifacts(artifacts, prefix = UPLOAD_PATH, retryTimes = 2, serverId = 'Artifactory')
{
    for (it in artifacts) {
        def uploadpath = it.key
        def filepath = it.value
        echo "uploading ${filepath} as ${uploadpath}"
        trtllm_utils.llmRetry(retryTimes, "uploadArtifacts", {
            rtUpload (
                serverId: serverId,
                spec: """{
                    "files": [
                        {
                            "pattern": "${filepath}",
                            "target": "${prefix}/${uploadpath}"
                        }
                    ]
                }""",
            )
        })
    }
}
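
// Illustrative only: the artifacts map goes from an upload path (relative to the prefix on
// Artifactory) to a local file path, so a hypothetical call such as
//   uploadArtifacts(["x86_64-linux-gnu/libfoo.so": "llm/cpp/build/libfoo.so"])
// would upload llm/cpp/build/libfoo.so to ${UPLOAD_PATH}/x86_64-linux-gnu/libfoo.so.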

def buildOrCache(pipeline, key, reuseArtifactPath, artifacts, image, k8s_cpu, runner)
{
    if (reuseArtifactPath) {
        stage(key) {
            def newArtifacts = downloadArtifacts("[${key}] Reuse", reuseArtifactPath, artifacts)
            if (newArtifacts != null) {
                uploadArtifacts(newArtifacts)
            } else {
                reuseArtifactPath = null
            }
        }
    }
    if (reuseArtifactPath) {
        return
    }

    trtllm_utils.launchKubernetesPod(pipeline, createKubernetesPodConfig(image, "build", k8s_cpu), "trt-llm", {
        stage(key) {
            stage("[${key}] Run") {
                echoNodeAndGpuInfo(pipeline, key)
                runner()
            }
            stage("Upload") {
                rtServer (
                    id: 'Artifactory',
                    url: 'https://urm.nvidia.com/artifactory',
                    credentialsId: 'urm-artifactory-creds',
                    // If Jenkins is configured to use an http proxy, you can bypass the proxy when using this Artifactory server:
                    bypassProxy: true,
                    // Configure the connection timeout (in seconds).
                    // The default value (if not configured) is 300 seconds:
                    timeout: 300
                )
                uploadArtifacts(artifacts)
            }
        }
    })
}

def prepareLLMBuild(pipeline, config)
{
    def buildFlags = BUILD_CONFIGS[config]
    def tarName = buildFlags[TARNAME]

    def is_linux_x86_64 = config.contains("linux_x86_64")
    def artifacts = ["${tarName}": tarName]
    def runner = {
        runLLMBuild(pipeline, buildFlags, tarName, is_linux_x86_64)
    }

    return [artifacts, runner]

}

def runLLMBuild(pipeline, buildFlags, tarName, is_linux_x86_64)
{
    // Step 1: cloning tekit source code
    sh "pwd && ls -alh"
    sh "env | sort"
    sh "ccache -sv"
    sh "rm -rf **/*.xml *.tar.gz"

    trtllm_utils.checkoutSource(LLM_REPO, env.gitlabCommit, LLM_ROOT, true, true)
    if (env.alternativeTRT) {
        sh "cd ${LLM_ROOT} && sed -i 's#tensorrt~=.*\$#tensorrt#g' requirements.txt && cat requirements.txt"
    }

    sh "mkdir TensorRT-LLM"
    sh "cp -r ${LLM_ROOT}/ TensorRT-LLM/src/"

    // Step 2: building wheels in container
    // Random sleep to avoid resource contention
    sleep(10 * Math.random())
    sh "curl ifconfig.me || true"
    sh "nproc && free -g && hostname"
    sh "cat ${CCACHE_DIR}/ccache.conf"

    sh "env | sort"
    sh "ldconfig --print-cache || true"
    sh "ls -lh /"
    sh "id || true"
    sh "whoami || true"
    echo "Building TensorRT-LLM Python package ..."
    sh "git config --global --add safe.directory \"*\""
    def pipArgs = "--no-cache-dir"
    if (is_linux_x86_64) {
        pipArgs = ""
    }

    // install python package
    trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${LLM_ROOT} && pip3 install -r requirements-dev.txt ${pipArgs}")

    if (env.alternativeTRT) {
        trtllm_utils.replaceWithAlternativeTRT(env.alternativeTRT, "cp312")
    }

    withCredentials([usernamePassword(credentialsId: "urm-artifactory-creds", usernameVariable: 'CONAN_LOGIN_USERNAME', passwordVariable: 'CONAN_PASSWORD')]) {
        sh "cd ${LLM_ROOT} && python3 scripts/build_wheel.py --use_ccache -j ${BUILD_JOBS} -a '${buildFlags[WHEEL_ARCHS]}' ${buildFlags[WHEEL_EXTRA_ARGS]} --benchmarks --extra-make-targets modelSpec"
    }
    if (is_linux_x86_64) {
        sh "cd ${LLM_ROOT} && python3 scripts/build_cpp_examples.py"
    }

    // Step 3: packaging wheels into tarfile
    sh "cp ${LLM_ROOT}/build/tensorrt_llm-*.whl TensorRT-LLM/"

    // Step 4: packaging benchmark and required cpp dependencies into tarfile
    sh "mkdir -p TensorRT-LLM/benchmarks/cpp"
    sh "cp ${LLM_ROOT}/cpp/build/benchmarks/bertBenchmark TensorRT-LLM/benchmarks/cpp"
    sh "cp ${LLM_ROOT}/cpp/build/benchmarks/gptSessionBenchmark TensorRT-LLM/benchmarks/cpp"
    sh "cp ${LLM_ROOT}/cpp/build/benchmarks/gptManagerBenchmark TensorRT-LLM/benchmarks/cpp"
    sh "cp ${LLM_ROOT}/cpp/build/tensorrt_llm/libtensorrt_llm.so TensorRT-LLM/benchmarks/cpp"
    sh "cp ${LLM_ROOT}/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so TensorRT-LLM/benchmarks/cpp"

    // Copy model_spec.so (pybinding for ModelSpec) since various tests will need it.
    sh "mkdir -p TensorRT-LLM/src/cpp/tests/batch_manager"
    sh "cp ${LLM_ROOT}/cpp/build/tests/batch_manager/model_spec.so TensorRT-LLM/src/cpp/tests/batch_manager/"

    if (is_linux_x86_64) {
        sh "rm -rf ${tarName}"
        sh "pigz --version || true"
        sh "bash -c 'tar --use-compress-program=\"pigz -k\" -cf ${tarName} TensorRT-LLM/'"
    } else {
        sh "tar -czvf ${tarName} TensorRT-LLM/"
    }
}
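
// For reference, the ${tarName} archive produced above contains (paths taken from the cp/mkdir
// commands in this function):
//   TensorRT-LLM/src/                       - the repository checkout
//   TensorRT-LLM/tensorrt_llm-*.whl         - the built Python wheel
//   TensorRT-LLM/benchmarks/cpp/            - benchmark binaries and their .so dependencies
//   TensorRT-LLM/src/cpp/tests/batch_manager/model_spec.so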

def buildWheelInContainer(pipeline, libraries=[], triple=X86_64_TRIPLE, clean=false, pre_cxx11abi=false, cpver="312", extra_args="")
{
    // Random sleep to avoid resource contention
    sleep(10 * Math.random())
    sh "curl ifconfig.me || true"
    sh "nproc && free -g && hostname"
    sh "ccache -sv"
    sh "cat ${CCACHE_DIR}/ccache.conf"

    // Step 1: cloning tekit source code
    trtllm_utils.checkoutSource(LLM_REPO, env.gitlabCommit, LLM_ROOT, true, true)
    if (env.alternativeTRT) {
        trtllm_utils.replaceWithAlternativeTRT(env.alternativeTRT, cpver)
        sh "cd ${LLM_ROOT} && sed -i 's#tensorrt~=.*\$#tensorrt#g' requirements.txt && cat requirements.txt"
    }
    // Step 2: building libs in container
    sh "bash -c 'pip3 show tensorrt || true'"

    if (extra_args == "") {
        if (triple == AARCH64_TRIPLE) {
            extra_args = "-a '90-real;100-real;120-real'"
        } else {
            extra_args = "-a '80-real;86-real;89-real;90-real;100-real;120-real'"
        }
    }
    if (pre_cxx11abi) {
        extra_args = extra_args + " -l -D 'USE_CXX11_ABI=0'"
    } else {
        if (libraries.size() != 0) {
            extra_args = extra_args + " -l -D 'USE_CXX11_ABI=1'"
        }
    }
    if (clean) {
        extra_args = extra_args + " --clean"
    }
    sh "bash -c 'git config --global --add safe.directory \"*\"'"
    // Because different architectures involve different macros, a comprehensive check is performed here.
    withCredentials([usernamePassword(credentialsId: "urm-artifactory-creds", usernameVariable: 'CONAN_LOGIN_USERNAME', passwordVariable: 'CONAN_PASSWORD')]) {
        trtllm_utils.llmExecStepWithRetry(pipeline, script: "bash -c \"cd ${LLM_ROOT} && python3 scripts/build_wheel.py --use_ccache -j ${BUILD_JOBS} -D 'WARNING_IS_ERROR=ON' ${extra_args}\"")
    }
}

def prepareBuildLib(pipeline, triple, pre_cxx11abi)
{
    def libraries = [
        "batch_manager",
        "executor",
        "internal_cutlass_kernels",
    ]
    if ((triple == X86_64_TRIPLE && pre_cxx11abi) || (triple == AARCH64_TRIPLE && !pre_cxx11abi)) {
        libraries += [
            "ucx_wrapper",
        ]
    }

    def artifacts = [:]
    for (library_name in libraries) {
        def libdir
        def is_static
        if (library_name == "batch_manager") {
            libdir = "tensorrt_llm/batch_manager"
            is_static = true
        } else if (library_name == "executor") {
            libdir = "tensorrt_llm/executor"
            is_static = true
        } else if (library_name == "internal_cutlass_kernels") {
            libdir = "tensorrt_llm/kernels/internal_cutlass_kernels"
            is_static = true
        } else if (library_name == "ucx_wrapper") {
            libdir = "tensorrt_llm/executor/cache_transmission/ucx_utils"
        }

        def libname = "libtensorrt_llm_" + library_name
        def ext = ".so"
        if (is_static) {
            libname += "_static"
            ext = ".a"
        }
        def filepath = "${LLM_ROOT}/cpp/build/" + libdir + "/" + libname + ext
        def uploadname = libname + ext
        if (is_static && pre_cxx11abi) {
            uploadname = libname + ".pre_cxx11" + ext
        }
        def uploadpath = "${triple}/${uploadname}"
        artifacts[uploadpath] = filepath
    }

    def cpver = "cp312"
    if (triple == X86_64_TRIPLE) {
        cpver = "cp310"
    }

    return [artifacts, {
        buildWheelInContainer(pipeline, libraries, triple, false, pre_cxx11abi, cpver)
    }]
}
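
// Illustrative only: for X86_64_TRIPLE with pre_cxx11abi=false, the returned artifacts map
// contains entries such as
//   "x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a" ->
//       "llm/cpp/build/tensorrt_llm/batch_manager/libtensorrt_llm_batch_manager_static.a"
// i.e. an upload path (triple/library) mapped to the freshly built file under cpp/build.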

def prepareLLMPackage(pipeline, archTriple=X86_64_TRIPLE)
{
    def tarFileName = "TensorRT-LLM.tar.gz"
    def linuxPkgName = "tensorrt-llm-release-src-${env.gitlabCommit}.tar.gz"
    if (archTriple == AARCH64_TRIPLE) {
        tarFileName = "TensorRT-LLM-GH200.tar.gz"
        linuxPkgName = "tensorrt-llm-sbsa-release-src-${env.gitlabCommit}.tar.gz"
    }
    def artifacts = ["${linuxPkgName}": "${LLM_ROOT}/${linuxPkgName}"]
    return [artifacts, { runLLMPackage(pipeline, archTriple, tarFileName, linuxPkgName) }]
}

def runLLMPackage(pipeline, archTriple, tarFileName, linuxPkgName)
{
    // Random sleep to avoid resource contention
    sleep(10 * Math.random())
    sh "curl ifconfig.me || true"
    sh "nproc && free -g && hostname"

    // Step 1: create LLM_ROOT dir and download code
    sh "pwd && ls -alh"
    sh "mkdir ${LLM_ROOT}"
    def llmPath = sh (script: "realpath ${LLM_ROOT}", returnStdout: true).trim()

    // Download tar generated from build jobs
    def llmTarfile = "https://urm.nvidia.com/artifactory/${ARTIFACT_PATH}/${tarFileName}"
    trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}")
    // The path TensorRT-LLM/src is defined in the build job
    sh "cd ${llmPath} && tar -zxf ${tarFileName} TensorRT-LLM/src"
    // Create an additional `pkg/tensorrt_llm` folder to make sure the generated tar.gz contains only one tensorrt_llm folder
    def llmPackage = "${llmPath}/TensorRT-LLM/pkg/"
    sh "rm -rf ${llmPackage}"
    sh "mkdir -p ${llmPackage}"
    sh "mv ${llmPath}/TensorRT-LLM/src ${llmPackage}/tensorrt_llm"

    // download libs
    trtllm_utils.llmExecStepWithRetry(pipeline, script: """
        pip3 install gitignore_parser && \
        python3 ${llmPackage}/tensorrt_llm/scripts/package_trt_llm.py \
            --lib_list oss \
            --arch ${archTriple} \
            --download ${env.gitlabCommit} \
            --addr https://urm.nvidia.com/artifactory/${ARTIFACT_PATH} \
            -v \
            ${llmPackage}/tensorrt_llm
    """)

    // clean the internal files and create one tar package
    sh """cd ${llmPackage}/tensorrt_llm && \
        python3 ${llmPackage}/tensorrt_llm/scripts/package_trt_llm.py \
            --lib_list oss \
            --clean \
            --package ${llmPath}/${linuxPkgName} \
            ${llmPackage}/tensorrt_llm
    """

    sh "cd ${llmPath} && ls -alh"
}

def launchStages(pipeline, cpu_arch, enableFailFast)
{
    stage("Show Environment") {
        sh "env | sort"
        echo "dockerImage: ${env.dockerImage}"
        echo "gitlabSourceRepoHttpUrl: ${env.gitlabSourceRepoHttpUrl}"
        echo "gitlabCommit: ${env.gitlabCommit}"
        echo "alternativeTRT: ${env.alternativeTRT}"
        echo "Using GitLab repo: ${LLM_REPO}. Commit: ${env.gitlabCommit}"
    }

    def wheelDockerImage = env.wheelDockerImage
    if (!wheelDockerImage && cpu_arch == AARCH64_TRIPLE) {
        wheelDockerImage = env.dockerImage
    }

    buildConfigs = [
        "Build TRT-LLM": [LLM_DOCKER_IMAGE] + prepareLLMBuild(
            pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64 : CONFIG_LINUX_X86_64_VANILLA),
        "Build TRT-LLM LLVM": [LLM_DOCKER_IMAGE] + prepareLLMBuild(
            pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64_LLVM : CONFIG_LINUX_X86_64_LLVM),
    ] + [true, false].collectEntries{ cxx11 -> [
        "Build libs (cxx11=${cxx11})".toString(), [wheelDockerImage] + prepareBuildLib(
            pipeline, cpu_arch, !cxx11),
    ]}

    if (cpu_arch == X86_64_TRIPLE) {
        buildConfigs += [
            "Build TRT-LLM SingleDevice": [LLM_DOCKER_IMAGE] + prepareLLMBuild(
                pipeline, CONFIG_LINUX_X86_64_SINGLE_DEVICE),
        ]
    }

    def packageConf = prepareLLMPackage(pipeline, cpu_arch)
    def artifacts = packageConf[0]
    def runner = packageConf[1]

    rtServer (
        id: 'Artifactory',
        url: 'https://urm.nvidia.com/artifactory',
        credentialsId: 'urm-artifactory-creds',
        // If Jenkins is configured to use an http proxy, you can bypass the proxy when using this Artifactory server:
        bypassProxy: true,
        // Configure the connection timeout (in seconds).
        // The default value (if not configured) is 300 seconds:
        timeout: 300
    )
    def reuseArtifactPath = env.reuseArtifactPath
    if (reuseArtifactPath) {
        def stageName = "Reuse Check"
        newArtifacts = downloadArtifacts(stageName, reuseArtifactPath, artifacts)
        if (!newArtifacts) {
            echo "previous package does not exist, rebuild all the artifacts"
            reuseArtifactPath = null
        } else {
            artifacts = newArtifacts
            runner = null
        }
    }

    def k8s_cpu = "amd64"
    if (cpu_arch == AARCH64_TRIPLE) {
        k8s_cpu = "arm64"
    }

    parallelJobs = buildConfigs.collectEntries{key, values -> [key, {
        script {
            buildOrCache(pipeline, key, reuseArtifactPath, values[1], values[0], k8s_cpu, values[2])
        }
    }]}
    parallelJobs.failFast = enableFailFast

    if (cpu_arch == X86_64_TRIPLE && !reuseArtifactPath) {
        def key = "Build with build type Debug"
        parallelJobs += [
            (key): {
                script {
                    trtllm_utils.launchKubernetesPod(pipeline, createKubernetesPodConfig(LLM_DOCKER_IMAGE, "build", k8s_cpu), "trt-llm", {
                        stage(key) {
                            stage("[${key}] Run") {
                                echoNodeAndGpuInfo(pipeline, key)
                                buildWheelInContainer(pipeline, [], X86_64_TRIPLE, false, false, "cp312", "-a '90-real' -b Debug --benchmarks --micro_benchmarks")
                            }
                        }
                    })
                }
            }]
    }

    stage("Build") {
        pipeline.parallel parallelJobs
    } // Build stage
    stage("Package") {
        container("trt-llm") {
            if (!reuseArtifactPath) {
                runner()
            }
            uploadArtifacts(artifacts)
        }
    }
}

pipeline {
    agent {
        kubernetes createKubernetesPodConfig(AGENT_IMAGE, "package", "amd64")
    }
    options {
        // Check the valid options at: https://www.jenkins.io/doc/book/pipeline/syntax/
        // Some steps, such as the results analysis stage, do not need to check out source code
        skipDefaultCheckout()
        // to better analyze the time for each step/test
        timestamps()
        timeout(time: 24, unit: 'HOURS')
    }
    environment {
        // Workspace normally is: /home/jenkins/agent/workspace/LLM/L0_MergeRequest@tmp/
        HF_HOME="${env.WORKSPACE_TMP}/.cache/huggingface"
        CCACHE_DIR="${CCACHE_DIR}"
        PIP_INDEX_URL="https://urm.nvidia.com/artifactory/api/pypi/pypi-remote/simple"
        // Force datasets into offline mode to prevent CI jobs from downloading HF datasets and causing test failures
        HF_DATASETS_OFFLINE=1
    }
    stages {
        stage("BuildJob") {
            steps {
                launchStages(this, params.targetArch, params.enableFailFast)
            }
        }
    } // stage
} // pipeline