@Library(['bloom-jenkins-shared-lib@main', 'trtllm-jenkins-shared-lib@main']) _

import groovy.transform.Field

// LLM repository configuration
withCredentials([string(credentialsId: 'default-llm-repo', variable: 'DEFAULT_LLM_REPO')]) {
    LLM_REPO = env.gitlabSourceRepoHttpUrl ? env.gitlabSourceRepoHttpUrl : "${DEFAULT_LLM_REPO}"
}
LLM_ROOT = "llm"

ARTIFACT_PATH = env.artifactPath ? env.artifactPath : "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${BUILD_NUMBER}"
UPLOAD_PATH = env.uploadPath ? env.uploadPath : "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${BUILD_NUMBER}"

X86_64_TRIPLE = "x86_64-linux-gnu"
AARCH64_TRIPLE = "aarch64-linux-gnu"

LLM_DOCKER_IMAGE = env.dockerImage

// Always use the x86_64 image for the agent
AGENT_IMAGE = env.dockerImage.replace("aarch64", "x86_64")

POD_TIMEOUT_SECONDS_BUILD = env.podTimeoutSeconds ? env.podTimeoutSeconds : "43200"

// Literals for easier access.
@Field
def WHEEL_EXTRA_ARGS = "extraArgs"

@Field
def TARNAME = "tarName"

@Field
def WHEEL_ARCHS = "wheelArchs"

@Field
def BUILD_JOBS_FOR_CONFIG = "buildJobsForConfig"

@Field
def CONFIG_LINUX_X86_64_VANILLA = "linux_x86_64_Vanilla"

@Field
def CONFIG_LINUX_X86_64_SINGLE_DEVICE = "linux_x86_64_SingleDevice"

@Field
def CONFIG_LINUX_X86_64_LLVM = "linux_x86_64_LLVM"

@Field
def CONFIG_LINUX_AARCH64 = "linux_aarch64"

@Field
def CONFIG_LINUX_AARCH64_LLVM = "linux_aarch64_LLVM"

@Field
def CONFIG_LINUX_X86_64_PYBIND = "linux_x86_64_Pybind"

@Field
def CONFIG_LINUX_AARCH64_PYBIND = "linux_aarch64_Pybind"

@Field
def BUILD_CONFIGS = [
    // The Vanilla TARNAME is used for packaging in runLLMPackage.
    // cmake-vars cannot be empty, so pass the (default) multi-device configuration.
    (CONFIG_LINUX_X86_64_VANILLA) : [
        (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake --micro_benchmarks",
        (TARNAME) : "TensorRT-LLM.tar.gz",
        (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;103-real;120-real",
    ],
    (CONFIG_LINUX_X86_64_PYBIND) : [
        (WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake --micro_benchmarks",
        (TARNAME) : "pybind-TensorRT-LLM.tar.gz",
        (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;103-real;120-real",
    ],
    (CONFIG_LINUX_X86_64_SINGLE_DEVICE) : [
        (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars ENABLE_MULTI_DEVICE=0 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars ENABLE_UCX=0 --micro_benchmarks",
        (TARNAME) : "single-device-TensorRT-LLM.tar.gz",
        (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;103-real;120-real",
    ],
    (CONFIG_LINUX_X86_64_LLVM) : [
        (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --micro_benchmarks -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CUDA_HOST_COMPILER=clang -DCMAKE_LINKER_TYPE=LLD",
        (TARNAME) : "llvm-TensorRT-LLM.tar.gz",
        (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;103-real;120-real",
    ],
    (CONFIG_LINUX_AARCH64): [
        (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake",
        (TARNAME) : "TensorRT-LLM-GH200.tar.gz",
        (WHEEL_ARCHS): "90-real;100-real;103-real;120-real",
        (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fixing the build OOM issue on SBSA
    ],
    (CONFIG_LINUX_AARCH64_PYBIND): [
        (WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake",
        (TARNAME) : "pybind-TensorRT-LLM-GH200.tar.gz",
        (WHEEL_ARCHS): "90-real;100-real;103-real;120-real",
        (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fixing the build OOM issue on SBSA
    ],
    (CONFIG_LINUX_AARCH64_LLVM) : [
        (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CUDA_HOST_COMPILER=clang -DCMAKE_LINKER_TYPE=LLD",
        (TARNAME) : "llvm-TensorRT-LLM-GH200.tar.gz",
        (WHEEL_ARCHS): "90-real;100-real;103-real;120-real",
        (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fixing the build OOM issue on SBSA
    ],
]
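
// Illustrative sketch (not executed by this pipeline): a BUILD_CONFIGS entry
// is looked up by its config-name key, and the field literals above index
// into the per-config map, e.g.
//   def flags = BUILD_CONFIGS[CONFIG_LINUX_X86_64_VANILLA]
//   def tarName = flags[TARNAME]                            // "TensorRT-LLM.tar.gz"
//   def jobs = flags[BUILD_JOBS_FOR_CONFIG] ?: BUILD_JOBS   // per-config override, else global default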

@Field
def GITHUB_PR_API_URL = "github_pr_api_url"
@Field
def CACHED_CHANGED_FILE_LIST = "cached_changed_file_list"
@Field
def ACTION_INFO = "action_info"
def globalVars = [
    (GITHUB_PR_API_URL): null,
    (CACHED_CHANGED_FILE_LIST): null,
    (ACTION_INFO): null,
]

// TODO: Move common variables to a unified location

BUILD_CORES_REQUEST = "8"
BUILD_CORES_LIMIT = "8"
BUILD_MEMORY_REQUEST = "48Gi"
BUILD_MEMORY_LIMIT = "96Gi"
BUILD_JOBS = "8"

TESTER_CORES = "12"
TESTER_MEMORY = "96Gi"

CCACHE_DIR = "/mnt/sw-tensorrt-pvc/scratch.trt_ccache/llm_ccache"
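
// getShortenedJobName maps well-known job names to short aliases and
// truncates anything else to 7 characters, so the result fits in a
// Kubernetes node label. Illustrative example (hypothetical job path):
//   getShortenedJobName("LLM/L0_MergeRequest")  // => "llm-l0-mr"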
String getShortenedJobName(String path)
{
    static final nameMapping = [
        "L0_MergeRequest": "l0-mr",
        "L0_Custom": "l0-cus",
        "L0_PostMerge": "l0-pm",
        "L0_PostMergeDocker": "l0-pmd",
        "L1_Custom": "l1-cus",
        "L1_Nightly": "l1-nt",
        "L1_Stable": "l1-stb",
    ]
    def parts = path.split('/')
    // Apply nameMapping to the last part (jobName)
    def jobName = parts[-1]
    boolean replaced = false
    nameMapping.each { key, value ->
        if (jobName.contains(key)) {
            jobName = jobName.replace(key, value)
            replaced = true
        }
    }
    if (!replaced) {
        jobName = jobName.length() > 7 ? jobName.substring(0, 7) : jobName
    }
    // Replace the last part with the transformed jobName
    parts[-1] = jobName
    // Rejoin the parts with '-' and convert to lowercase
    return parts.join('-').toLowerCase()
}
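
// createKubernetesPodConfig renders the pod definition that trtllm_utils
// launches for a stage: a "build" pod gets the full CPU/memory requests and
// the ccache volume mount, while a "package" pod is a small 2-CPU helper.
// The arch argument selects the node architecture and the matching JNLP
// sidecar image.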
def createKubernetesPodConfig(image, type, arch = "amd64")
{
    def targetCloud = "kubernetes-cpu"
    def selectors = """
                  nvidia.com/node_type: builder
                  kubernetes.io/os: linux
                  kubernetes.io/arch: ${arch}"""
    def containerConfig = ""
    def nodeLabelPrefix = ""
    def jobName = getShortenedJobName(env.JOB_NAME)
    def buildID = env.BUILD_ID

    def archSuffix = arch == "arm64" ? "arm" : "amd"
    def jnlpImage = "urm.nvidia.com/sw-ipp-blossom-sre-docker-local/lambda/custom_jnlp_images_${archSuffix}_linux:jdk17"

    switch(type)
    {
    case "build":
        containerConfig = """
                  - name: trt-llm
                    image: ${image}
                    command: ['sleep', ${POD_TIMEOUT_SECONDS_BUILD}]
                    volumeMounts:
                    - name: sw-tensorrt-pvc
                      mountPath: "/mnt/sw-tensorrt-pvc"
                      readOnly: false
                    tty: true
                    resources:
                      requests:
                        cpu: ${BUILD_CORES_REQUEST}
                        memory: ${BUILD_MEMORY_REQUEST}
                        ephemeral-storage: 200Gi
                      limits:
                        cpu: ${BUILD_CORES_LIMIT}
                        memory: ${BUILD_MEMORY_LIMIT}
                        ephemeral-storage: 200Gi
                    imagePullPolicy: Always"""
        nodeLabelPrefix = "cpu"
        break
    case "package":
        containerConfig = """
                  - name: trt-llm
                    image: ${image}
                    command: ['cat']
                    tty: true
                    resources:
                      requests:
                        cpu: '2'
                        memory: 10Gi
                        ephemeral-storage: 25Gi
                      limits:
                        cpu: '2'
                        memory: 10Gi
                        ephemeral-storage: 25Gi
                    imagePullPolicy: Always"""
        nodeLabelPrefix = "cpu"
        break
    }
    def nodeLabel = trtllm_utils.appendRandomPostfix("${nodeLabelPrefix}---tensorrt-${jobName}-${buildID}")
    def pvcVolume = """
                - name: sw-tensorrt-pvc
                  persistentVolumeClaim:
                    claimName: sw-tensorrt-pvc
    """
    if (arch == "arm64") {
        // PVC mount isn't supported on the aarch64 platform. Use NFS as a workaround.
        pvcVolume = """
                - name: sw-tensorrt-pvc
                  nfs:
                    server: 10.117.145.13
                    path: /vol/scratch1/scratch.svc_tensorrt_blossom
        """
    }
    def podConfig = [
        cloud: targetCloud,
        namespace: "sw-tensorrt",
        label: nodeLabel,
        yaml: """
            apiVersion: v1
            kind: Pod
            spec:
                qosClass: Guaranteed
                affinity:
                    nodeAffinity:
                        requiredDuringSchedulingIgnoredDuringExecution:
                            nodeSelectorTerms:
                            - matchExpressions:
                              - key: "tensorrt/taints"
                                operator: DoesNotExist
                              - key: "tensorrt/affinity"
                                operator: NotIn
                                values:
                                - "core"
                nodeSelector: ${selectors}
                containers:
                  ${containerConfig}
                    env:
                    - name: HOST_NODE_NAME
                      valueFrom:
                        fieldRef:
                          fieldPath: spec.nodeName
                  - name: jnlp
                    image: ${jnlpImage}
                    args: ['\$(JENKINS_SECRET)', '\$(JENKINS_NAME)']
                    resources:
                      requests:
                        cpu: '2'
                        memory: 10Gi
                        ephemeral-storage: 25Gi
                      limits:
                        cpu: '2'
                        memory: 10Gi
                        ephemeral-storage: 25Gi
                    qosClass: Guaranteed
                volumes:
                ${pvcVolume}
        """.stripIndent(),
    ]

    return podConfig
}

def echoNodeAndGpuInfo(pipeline, stageName)
{
    String hostNodeName = sh(script: 'echo $HOST_NODE_NAME', returnStdout: true)
    String gpuUuids = pipeline.sh(script: "nvidia-smi -q | grep \"GPU UUID\" | awk '{print \$4}' | tr '\n' ',' || true", returnStdout: true)
    pipeline.echo "HOST_NODE_NAME = ${hostNodeName} ; GPU_UUIDS = ${gpuUuids} ; STAGE_NAME = ${stageName}"
}
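
// downloadArtifacts tries to pull every artifact in the map from
// reuseArtifactPath on Artifactory. If any download fails, it returns null so
// that the caller falls back to a full rebuild; otherwise it returns a new
// map pointing at the reused (already downloaded) artifact paths.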
def downloadArtifacts(stageName, reuseArtifactPath, artifacts, serverId = 'Artifactory')
{
    def reused = true
    stage(stageName) {
        for (downit in artifacts) {
            def uploadpath = downit.key
            try {
                rtDownload(
                    failNoOp: true,
                    serverId: serverId,
                    spec: """{
                        "files": [
                            {
                                "pattern": "${reuseArtifactPath}/${uploadpath}"
                            }
                        ]
                    }""",
                )
            } catch (Exception e) {
                echo "Failed downloading ${reuseArtifactPath}/${uploadpath}; a rebuild is needed."
                reused = false
                catchError(buildResult: 'SUCCESS', stageResult: 'UNSTABLE') { throw e }
            }
        }

        if (!reused) {
            return null
        }

        // Drop the leading repository segment, since downloaded files land in
        // the workspace without it.
        reuseArtifactPath = reuseArtifactPath.substring(reuseArtifactPath.indexOf('/') + 1)
        def newArtifacts = [:]
        for (reuseit in artifacts) {
            def uploadpath = reuseit.key
            newArtifacts[reuseit.key] = "${reuseArtifactPath}/${uploadpath}"
        }

        return newArtifacts
    }
}
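
// uploadArtifacts mirrors downloadArtifacts: each map entry is uploaded with
// an Artifactory file spec, retried via trtllm_utils.llmRetry on transient
// failures. The map key is the remote path under the prefix and the value is
// the local file path.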
def uploadArtifacts(artifacts, prefix = UPLOAD_PATH, retryTimes = 2, serverId = 'Artifactory')
{
    for (it in artifacts) {
        def uploadpath = it.key
        def filepath = it.value
        def spec = """{
            "files": [
                {
                    "pattern": "${filepath}",
                    "target": "${prefix}/${uploadpath}"
                }
            ]
        }"""
        echo "Uploading ${filepath} as ${uploadpath}. Spec: ${spec}"
        trtllm_utils.llmRetry(retryTimes, "uploadArtifacts", {
            rtUpload (
                serverId: serverId,
                spec: spec,
            )
        })
    }
}
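
// buildOrCache first tries to reuse previously built artifacts from
// reuseArtifactPath; only when that fails does it launch a build pod, run the
// provided runner closure, and upload the freshly built artifacts.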
def buildOrCache(pipeline, key, reuseArtifactPath, artifacts, image, k8s_cpu, runner)
{
    if (reuseArtifactPath) {
        stage(key) {
            def newArtifacts = downloadArtifacts("[${key}] Reuse", reuseArtifactPath, artifacts)
            if (newArtifacts != null) {
                uploadArtifacts(newArtifacts)
            } else {
                reuseArtifactPath = null
            }
        }
    }
    if (reuseArtifactPath) {
        return
    }

    trtllm_utils.launchKubernetesPod(pipeline, createKubernetesPodConfig(image, "build", k8s_cpu), "trt-llm", {
        stage(key) {
            stage("[${key}] Run") {
                echoNodeAndGpuInfo(pipeline, key)
                runner()
            }
            stage("Upload") {
                rtServer (
                    id: 'Artifactory',
                    url: 'https://urm.nvidia.com/artifactory',
                    credentialsId: 'urm-artifactory-creds',
                    // If Jenkins is configured to use an HTTP proxy, you can bypass the proxy when using this Artifactory server:
                    bypassProxy: true,
                    // Configure the connection timeout (in seconds).
                    // The default value (if not configured) is 300 seconds:
                    timeout: 300
                )
                uploadArtifacts(artifacts)
            }
        }
    })
}
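
// prepareLLMBuild resolves a config name from BUILD_CONFIGS into the pair
// consumed by buildOrCache: the artifact map (tarball name -> local path)
// and a runner closure that performs the actual build.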
def prepareLLMBuild(pipeline, config)
{
    def buildFlags = BUILD_CONFIGS[config]
    def tarName = buildFlags[TARNAME]

    def is_linux_x86_64 = config.contains("linux_x86_64")
    def artifacts = ["${tarName}": tarName]
    def runner = {
        runLLMBuild(pipeline, buildFlags, tarName, is_linux_x86_64)
    }

    return [artifacts, runner]
}

def runLLMBuild(pipeline, buildFlags, tarName, is_linux_x86_64)
{
    // Step 1: cloning tekit source code
    sh "pwd && ls -alh"
    sh "env | sort"
    sh "ccache -sv"
    sh "rm -rf **/*.xml *.tar.gz"

    trtllm_utils.checkoutSource(LLM_REPO, env.gitlabCommit, LLM_ROOT, true, true)
    if (env.alternativeTRT) {
        sh "cd ${LLM_ROOT} && sed -i 's#tensorrt~=.*\$#tensorrt#g' requirements.txt && cat requirements.txt"
    }

    sh "mkdir TensorRT-LLM"
    sh "cp -r ${LLM_ROOT}/ TensorRT-LLM/src/"

    // Step 2: building wheels in container
    // Random sleep to avoid resource contention
    sleep(10 * Math.random())
    sh "curl ifconfig.me || true"
    sh "nproc && free -g && hostname"
    sh "cat ${CCACHE_DIR}/ccache.conf"

    sh "env | sort"
    sh "ldconfig --print-cache || true"
    sh "ls -lh /"
    sh "id || true"
    sh "whoami || true"
    echo "Building TensorRT-LLM Python package ..."
    sh "git config --global --add safe.directory \"*\""
    def pipArgs = "--no-cache-dir"
    if (is_linux_x86_64) {
        pipArgs = ""
    }

    // Install the Python dev requirements.
    trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${LLM_ROOT} && pip3 install -r requirements-dev.txt ${pipArgs}")

    if (env.alternativeTRT) {
        trtllm_utils.replaceWithAlternativeTRT(env.alternativeTRT, "cp312")
    }

    def buildJobs = buildFlags[BUILD_JOBS_FOR_CONFIG] ?: BUILD_JOBS

    withCredentials([usernamePassword(credentialsId: "urm-artifactory-creds", usernameVariable: 'CONAN_LOGIN_USERNAME', passwordVariable: 'CONAN_PASSWORD')]) {
        sh "cd ${LLM_ROOT} && python3 scripts/build_wheel.py --use_ccache -G Ninja -j ${buildJobs} -a '${buildFlags[WHEEL_ARCHS]}' ${buildFlags[WHEEL_EXTRA_ARGS]} --benchmarks"
    }
    if (is_linux_x86_64) {
        sh "cd ${LLM_ROOT} && python3 scripts/build_cpp_examples.py"
    }

    // Build tritonserver artifacts
    def llmPath = sh(script: "realpath ${LLM_ROOT}", returnStdout: true).trim()
    // TODO: Remove after the cmake version is upgraded to 3.31.8
    // Get the triton tag from docker/Dockerfile.multi
    def tritonShortTag = "r25.10"
    sh "cd ${LLM_ROOT}/triton_backend/inflight_batcher_llm && mkdir build && cd build && cmake .. -DTRTLLM_DIR=${llmPath} -DTRITON_COMMON_REPO_TAG=${tritonShortTag} -DTRITON_CORE_REPO_TAG=${tritonShortTag} -DTRITON_THIRD_PARTY_REPO_TAG=${tritonShortTag} -DTRITON_BACKEND_REPO_TAG=${tritonShortTag} -DUSE_CXX11_ABI=ON && make -j${buildJobs} install"

    // Step 3: packaging wheels into the tarfile
    sh "cp ${LLM_ROOT}/build/tensorrt_llm-*.whl TensorRT-LLM/"

    // Step 4: packaging tritonserver artifacts into the tarfile
    sh "mkdir -p TensorRT-LLM/triton_backend/inflight_batcher_llm/"
    sh "cp ${LLM_ROOT}/triton_backend/inflight_batcher_llm/build/libtriton_tensorrtllm.so TensorRT-LLM/triton_backend/inflight_batcher_llm/"
    sh "cp ${LLM_ROOT}/triton_backend/inflight_batcher_llm/build/trtllmExecutorWorker TensorRT-LLM/triton_backend/inflight_batcher_llm/"

    // Step 5: packaging benchmarks and required cpp dependencies into the tarfile
    sh "mkdir -p TensorRT-LLM/benchmarks/cpp"
    sh "cp ${LLM_ROOT}/cpp/build/benchmarks/bertBenchmark TensorRT-LLM/benchmarks/cpp"
    sh "cp ${LLM_ROOT}/cpp/build/benchmarks/gptManagerBenchmark TensorRT-LLM/benchmarks/cpp"
    sh "cp ${LLM_ROOT}/cpp/build/benchmarks/disaggServerBenchmark TensorRT-LLM/benchmarks/cpp"
    sh "cp ${LLM_ROOT}/cpp/build/tensorrt_llm/libtensorrt_llm.so TensorRT-LLM/benchmarks/cpp"
    sh "cp ${LLM_ROOT}/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so TensorRT-LLM/benchmarks/cpp"

    if (is_linux_x86_64) {
        // Use pigz for parallel compression of the larger x86_64 tarball.
        sh "rm -rf ${tarName}"
        sh "pigz --version || true"
        sh "bash -c 'tar --use-compress-program=\"pigz -k\" -cf ${tarName} TensorRT-LLM/'"
    } else {
        sh "tar -czvf ${tarName} TensorRT-LLM/"
    }
}
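
// buildWheelInContainer is the lighter-weight variant used by the extra build
// flavors (e.g. the Debug build below): it clones the source and invokes
// scripts/build_wheel.py directly in the current pod, without packaging a
// tarball.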
def buildWheelInContainer(pipeline, libraries = [], triple = X86_64_TRIPLE, clean = false, pre_cxx11abi = false, cpver = "312", extra_args = "")
{
    // Random sleep to avoid resource contention
    sleep(10 * Math.random())
    sh "curl ifconfig.me || true"
    sh "nproc && free -g && hostname"
    sh "ccache -sv"
    sh "cat ${CCACHE_DIR}/ccache.conf"

    // Step 1: cloning tekit source code
    trtllm_utils.checkoutSource(LLM_REPO, env.gitlabCommit, LLM_ROOT, true, true)
    if (env.alternativeTRT) {
        trtllm_utils.replaceWithAlternativeTRT(env.alternativeTRT, cpver)
        sh "cd ${LLM_ROOT} && sed -i 's#tensorrt~=.*\$#tensorrt#g' requirements.txt && cat requirements.txt"
    }
    // Step 2: building libs in container
    sh "bash -c 'pip3 show tensorrt || true'"

    if (extra_args == "") {
        if (triple == AARCH64_TRIPLE) {
            extra_args = "-a '90-real;100-real;103-real;120-real'"
        } else {
            extra_args = "-a '80-real;86-real;89-real;90-real;100-real;103-real;120-real'"
        }
    }
    if (pre_cxx11abi) {
        extra_args = extra_args + " -l -D 'USE_CXX11_ABI=0'"
    } else {
        if (libraries.size() != 0) {
            extra_args = extra_args + " -l -D 'USE_CXX11_ABI=1'"
        }
    }
    if (clean) {
        extra_args = extra_args + " --clean"
    }
    sh "bash -c 'git config --global --add safe.directory \"*\"'"
    // Because different architectures involve different macros, a comprehensive test is conducted here.
    withCredentials([usernamePassword(credentialsId: "urm-artifactory-creds", usernameVariable: 'CONAN_LOGIN_USERNAME', passwordVariable: 'CONAN_PASSWORD')]) {
        trtllm_utils.llmExecStepWithRetry(pipeline, script: "bash -c \"cd ${LLM_ROOT} && python3 scripts/build_wheel.py --use_ccache -G Ninja -j ${BUILD_JOBS} -D 'WARNING_IS_ERROR=ON' ${extra_args}\"")
    }
}
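
// launchStages drives the whole build: it resolves the per-arch build matrix
// from BUILD_CONFIGS and runs each flavor in parallel via buildOrCache. For
// x86_64 it additionally schedules a SingleDevice flavor and a Debug wheel
// build.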
def launchStages(pipeline, cpu_arch, enableFailFast, globalVars)
{
    stage("Show Environment") {
        sh "env | sort"
        echo "dockerImage: ${env.dockerImage}"
        echo "gitlabSourceRepoHttpUrl: ${env.gitlabSourceRepoHttpUrl}"
        echo "gitlabCommit: ${env.gitlabCommit}"
        echo "alternativeTRT: ${env.alternativeTRT}"
        echo "Using GitLab repo: ${LLM_REPO}. Commit: ${env.gitlabCommit}"

        echo "env.globalVars is: ${env.globalVars}"
        globalVars = trtllm_utils.updateMapWithJson(pipeline, globalVars, env.globalVars, "globalVars")
        globalVars[ACTION_INFO] = trtllm_utils.setupPipelineDescription(pipeline, globalVars[ACTION_INFO])
    }

    def wheelDockerImage = env.wheelDockerImagePy310
    if (!wheelDockerImage && cpu_arch == AARCH64_TRIPLE) {
        wheelDockerImage = env.dockerImage
    }

    buildConfigs = [
        "Build TRT-LLM": [LLM_DOCKER_IMAGE] + prepareLLMBuild(
            pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64 : CONFIG_LINUX_X86_64_VANILLA),
        "Build TRT-LLM LLVM": [LLM_DOCKER_IMAGE] + prepareLLMBuild(
            pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64_LLVM : CONFIG_LINUX_X86_64_LLVM),
        "Build TRT-LLM Pybind": [LLM_DOCKER_IMAGE] + prepareLLMBuild(
            pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64_PYBIND : CONFIG_LINUX_X86_64_PYBIND),
    ]

    if (cpu_arch == X86_64_TRIPLE) {
        buildConfigs += [
            "Build TRT-LLM SingleDevice": [LLM_DOCKER_IMAGE] + prepareLLMBuild(
                pipeline, CONFIG_LINUX_X86_64_SINGLE_DEVICE),
        ]
    }

    rtServer (
        id: 'Artifactory',
        url: 'https://urm.nvidia.com/artifactory',
        credentialsId: 'urm-artifactory-creds',
        // If Jenkins is configured to use an HTTP proxy, you can bypass the proxy when using this Artifactory server:
        bypassProxy: true,
        // Configure the connection timeout (in seconds).
        // The default value (if not configured) is 300 seconds:
        timeout: 300
    )
    def reuseArtifactPath = env.reuseArtifactPath

    def k8s_cpu = "amd64"
    if (cpu_arch == AARCH64_TRIPLE) {
        k8s_cpu = "arm64"
    }

    parallelJobs = buildConfigs.collectEntries{ key, values -> [key, {
        script {
            buildOrCache(pipeline, key, reuseArtifactPath, values[1], values[0], k8s_cpu, values[2])
        }
    }]}
    parallelJobs.failFast = enableFailFast

    if (cpu_arch == X86_64_TRIPLE && !reuseArtifactPath) {
        def key = "Build With Build Type Debug"
        parallelJobs += [
            (key): {
                script {
                    trtllm_utils.launchKubernetesPod(pipeline, createKubernetesPodConfig(LLM_DOCKER_IMAGE, "build", k8s_cpu), "trt-llm", {
                        stage(key) {
                            stage("[${key}] Run") {
                                echoNodeAndGpuInfo(pipeline, key)
                                buildWheelInContainer(pipeline, [], X86_64_TRIPLE, false, false, "cp312", "-a '90-real' -b Debug --benchmarks --micro_benchmarks")
                            }
                        }
                    })
                }
            }]
    }

    stage("Build") {
        pipeline.parallel parallelJobs
    } // Build stage
}
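
// Top-level declarative pipeline: it runs on a small x86_64 "package" pod and
// delegates all real work to launchStages, parameterized by the target arch.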
pipeline {
    agent {
        kubernetes createKubernetesPodConfig(AGENT_IMAGE, "package", "amd64")
    }
    options {
        // Check the valid options at: https://www.jenkins.io/doc/book/pipeline/syntax/
        // Some steps, such as the results analysis stage, do not need to check out source code.
        skipDefaultCheckout()
        // To better analyze the time spent in each step/test.
        timestamps()
        timeout(time: 24, unit: 'HOURS')
    }
    environment {
        // The workspace is normally /home/jenkins/agent/workspace/LLM/L0_MergeRequest@tmp/
        HF_HOME = "${env.WORKSPACE_TMP}/.cache/huggingface"
        CCACHE_DIR = "${CCACHE_DIR}"
        GITHUB_MIRROR = "https://urm.nvidia.com/artifactory/github-go-remote"
        PIP_INDEX_URL = "https://urm.nvidia.com/artifactory/api/pypi/pypi-remote/simple"
        // Force datasets into offline mode, to prevent CI jobs from downloading HF datasets and causing test failures.
        HF_DATASETS_OFFLINE = 1
    }
    stages {
        stage("Build Job") {
            steps {
                launchStages(this, params.targetArch, params.enableFailFast, globalVars)
            }
        }
    } // stages
} // pipeline