Mirror of https://github.com/NVIDIA/TensorRT-LLM.git
[None][infra] Enable single-gpu CI on spark (#9304)

Signed-off-by: qqiao <qqiao@nvidia.com>
Signed-off-by: Emma Qiao <qqiao@nvidia.com>
Signed-off-by: Jenny Liu <JennyLiu-nv+JennyLiu@users.noreply.github.com>
Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>

Parent: cce7247815
Commit: fb05cd769a
@@ -100,7 +100,7 @@ MODEL_CACHE_DIR="/scratch.trt_llm_data/llm-models"
 REQUIRED_OPEN_DRIVER_TYPES = ["b100-ts2", "rtx-5080", "rtx-5090", "rtx-pro-6000", "rtx-pro-6000d"]
 
 // GPU types that don't support dynamic driver flashing
-REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200"]
+REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200", "gb10x"]
 
 // ENABLE_NGC_DEVEL_IMAGE_TEST is currently disabled in the Jenkins BuildDockerImageSanityTest job config
 ENABLE_NGC_DEVEL_IMAGE_TEST = params.enableNgcDevelImageTest ?: false
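Adding "gb10x" to REQUIRED_NO_DRIVER_TYPES means the Spark nodes skip dynamic driver flashing. The check later in the pipeline is a substring match (REQUIRED_NO_DRIVER_TYPES.any { type.contains(it) }), so related labels such as "gb10x-single" are covered as well. A minimal Python sketch of that membership rule, illustrative rather than the pipeline code:

REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200", "gb10x"]

def skips_driver_flash(node_type: str) -> bool:
    # Mirrors the Groovy check: any list entry contained in the node type matches.
    return any(entry in node_type for entry in REQUIRED_NO_DRIVER_TYPES)

assert skips_driver_flash("gb10x")
assert skips_driver_flash("gb10x-single")   # substring match also covers this label
assert not skips_driver_flash("rtx-5090")   # not in the no-driver list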
@@ -672,7 +672,9 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
 
     if (cluster.host.contains("dlcluster")) {
         dockerArgs += " " + sh(script: 'echo " -e NVIDIA_IMEX_CHANNELS=${NVIDIA_IMEX_CHANNELS:-0}"', returnStdout: true).trim()
-        dockerArgs += " --device=/dev/gdrdrv:/dev/gdrdrv"
+        if (fileExists('/dev/gdrdrv')) {
+            dockerArgs += " --device=/dev/gdrdrv:/dev/gdrdrv"
+        }
     }
 }
 
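This change makes the GDRCopy device mount conditional: the pipeline only passes --device=/dev/gdrdrv when the node actually exposes that device, so docker run no longer fails on hosts where the gdrdrv module is not loaded. The same guard outside Jenkins, as a small illustrative sketch:

import os

docker_args = ["-e", "NVIDIA_IMEX_CHANNELS=" + os.environ.get("NVIDIA_IMEX_CHANNELS", "0")]
# Only request the GDRCopy device when the host provides it.
if os.path.exists("/dev/gdrdrv"):
    docker_args += ["--device=/dev/gdrdrv:/dev/gdrdrv"]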
@@ -1562,7 +1564,7 @@ EOF_TIMEOUT_XML
 
 def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMode = false)
 {
-    def targetCould = "kubernetes-cpu"
+    def targetCloud = "kubernetes-cpu"
     def selectors = """
                     nvidia.com/node_type: builder
                     kubernetes.io/arch: ${arch}
@@ -1571,6 +1573,8 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
     def nodeLabelPrefix = ""
     def jobName = getShortenedJobName(env.JOB_NAME)
     def buildID = env.BUILD_ID
+    def tolerations = ""
+    def extraDeviceEnv = ""
 
     def archSuffix = arch == "arm64" ? "arm" : "amd"
     def jnlpImage = "urm.nvidia.com/sw-ipp-blossom-sre-docker-local/lambda/custom_jnlp_images_${archSuffix}_linux:jdk17"
@@ -1653,14 +1657,40 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
         def gpuType = KubernetesManager.selectGPU(type)
         nodeLabelPrefix = type
 
-        targetCould = "kubernetes"
+        targetCloud = "kubernetes"
+        // DGX Spark requires a special setting for accessing the device.
+        // It has 128GB unified memory as per spec. Use half of the memory at the CPU side.
+        if (type == "gb10x") {
+            targetCloud = "nvks-sparks-cloud"
+            memorySize = "64Gi"
+            tolerations = """
+              tolerations:
+              - key: "node_for_blossom_trt"
+                operator: "Exists"
+                effect: "NoSchedule"
+            """
+            extraDeviceEnv = """
+                - name: NVIDIA_VISIBLE_DEVICES
+                  value: "all"
+                - name: NVIDIA_DRIVER_CAPABILITIES
+                  value: "compute,utility"
+            """
+        }
 
         // The following GPU types doesn't support dynamic driver flashing.
         if (REQUIRED_NO_DRIVER_TYPES.any { type.contains(it) }) {
-            selectors = """
+            if (type == "gb10x") {
+                selectors = """
                     kubernetes.io/arch: ${arch}
                     kubernetes.io/os: linux
+                    nvidia.com/gpu.machine: NVIDIA_DGX_Spark
+                    nvidia.com/tenant: blossom_trt"""
+            } else {
+                selectors = """
+                    kubernetes.io/arch: ${arch}
+                    kubernetes.io/os: linux
                     nvidia.com/gpu_type: ${gpuType}"""
+            }
         } else if (perfMode && !hasMultipleGPUs) {
             // Use single GPU machine with "tensorrt/test_type: perf" for stable perf testing.
             // H100 / A100 single GPU machine has this unique label in TensorRT Blossom pool.
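For type == "gb10x" the pod config therefore combines a dedicated cloud (nvks-sparks-cloud), a 64Gi CPU-side memory request, DGX Spark node selectors, a toleration for the reserved Spark nodes, and explicit NVIDIA device environment variables. Collected as plain data below; this is a sketch of what the generated pod spec amounts to, not the Jenkins template itself:

gb10x_scheduling = {
    "cloud": "nvks-sparks-cloud",
    "memory": "64Gi",  # half of the 128GB unified memory, kept for the CPU side
    "nodeSelector": {
        "kubernetes.io/arch": "arm64",
        "kubernetes.io/os": "linux",
        "nvidia.com/gpu.machine": "NVIDIA_DGX_Spark",
        "nvidia.com/tenant": "blossom_trt",
    },
    "tolerations": [
        {"key": "node_for_blossom_trt", "operator": "Exists", "effect": "NoSchedule"},
    ],
    "env": [
        {"name": "NVIDIA_VISIBLE_DEVICES", "value": "all"},
        {"name": "NVIDIA_DRIVER_CAPABILITIES", "value": "compute,utility"},
    ],
}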
@@ -1744,7 +1774,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
     }
 
     def podConfig = [
-        cloud: targetCould,
+        cloud: targetCloud,
         namespace: "sw-tensorrt",
         label: nodeLabel,
         yaml: """
@@ -1771,6 +1801,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
                   valueFrom:
                     fieldRef:
                       fieldPath: spec.nodeName
+                ${extraDeviceEnv}
                 - name: jnlp
                   image: ${jnlpImage}
                   args: ['\$(JENKINS_SECRET)', '\$(JENKINS_NAME)']
@@ -1790,6 +1821,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
                     medium: Memory
                 ${llmModelVolume}
                 ${pvcVolume}
+                ${tolerations}
        """.stripIndent(),
    ]
 
@@ -3202,9 +3234,11 @@ def launchTestJobs(pipeline, testFilter)
    parallelJobs += parallelSlurmJobs

    // Try to match what are being tested on x86 H100_PCIe.
    // The total machine time is scaled proportionally according to the number of each GPU.
+    // SBSA machines from the Blossom machine pool
    SBSATestConfigs = [
        "GH200-TensorRT-Post-Merge-1": ["gh200", "l0_gh200", 1, 1],
+        // DGX Spark is also named as GB10 Grace Blackwell Superchip.
+        "GB10-PyTorch-1": ["gb10x", "l0_gb10", 1, 1],
    ]
    fullSet += SBSATestConfigs.keySet()

@@ -3212,6 +3246,7 @@ def launchTestJobs(pipeline, testFilter)
        "GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 2, 4],
        "GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4],
        "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
+        "GB10-PyTorch-Post-Merge-1": ["gb10x-single", "l0_gb10", 1, 1],
        // Perf sanity post merge test
        "GB200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 1, 4],
        // Disable GB300 stages due to nodes will be offline temporarily.
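A hedged reading of the stage tuples above, inferred from the surrounding entries rather than spelled out in this diff: node label, test-db list name, shard index, shard count, and an optional GPU count for multi-GPU stages. As a typed sketch:

from typing import NamedTuple, Optional

class StageConfig(NamedTuple):
    node_label: str     # e.g. "gb10x" or "gb10x-single"
    test_list: str      # test-db YAML name, e.g. "l0_gb10"
    shard_index: int
    shard_count: int
    gpu_count: Optional[int] = None  # only present for multi-GPU stages

stages = {
    "GB10-PyTorch-1": StageConfig("gb10x", "l0_gb10", 1, 1),
    "GB10-PyTorch-Post-Merge-1": StageConfig("gb10x-single", "l0_gb10", 1, 1),
    "GB200-4_GPUs-PyTorch-1": StageConfig("gb200-x4-oci", "l0_gb200_multi_gpus", 1, 2, 4),
}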
@@ -16,6 +16,7 @@
 
 import datetime
 import gc
+import logging
 import os
 import platform
 import re
@@ -55,6 +56,9 @@ try:
 except ImportError:
     trt_environment = None
 
+# Logger
+logger = logging.getLogger(__name__)
+
 # TODO: turn off this when the nightly storage issue is resolved.
 DEBUG_CI_STORAGE = os.environ.get("DEBUG_CI_STORAGE", False)
 GITLAB_API_USER = os.environ.get("GITLAB_API_USER")
@@ -2681,60 +2685,139 @@ IS_UNDER_CI_ENV = "JENKINS_HOME" in os.environ
 gpu_warning_threshold = 1024 * 1024 * 1024
 
 
+def get_gpu_memory_wo_pynvml():
+    import psutil
+
+    logger.warning(
+        f"\nWarning: pynvml not available, using fallback commands for memory monitoring"
+    )
+
+    gpu_memory = {}
+    system_total_mb = 0
+    system_used_mb = 0
+    try:
+        mem_output = check_output("free -m | awk '/^Mem:/ {print $3, $2}'",
+                                  shell=True)
+        parts = mem_output.strip().split()
+        system_used_mb = int(parts[0])
+        system_total_mb = int(parts[1])
+    except Exception:
+        pass
+
+    # Parse nvidia-smi pmon to get GPU memory usage
+    try:
+        gpu_output = check_output("nvidia-smi pmon -s m -c 1", shell=True)
+        lines = gpu_output.strip().split('\n')
+
+        for line in lines:
+            parts = line.split()
+            try:
+                gpu_idx = int(parts[0])
+
+                # Initialize GPU entry if not exists
+                if gpu_idx not in gpu_memory:
+                    gpu_memory[gpu_idx] = {
+                        "total_used": 0,
+                        "total": system_total_mb,
+                        "process": {}
+                    }
+
+                # Skip if no active process (pid is '-')
+                if parts[1] == '-':
+                    continue
+
+                pid = int(parts[1])
+                mem_mb = int(parts[3])
+                gpu_memory[gpu_idx]["total_used"] += mem_mb
+
+                # Get process info (same as pynvml version)
+                try:
+                    p = psutil.Process(pid)
+                    host_memory_in_mbs = p.memory_full_info(
+                    ).uss // 1024 // 1024
+                    gpu_memory[gpu_idx]["process"][pid] = (
+                        mem_mb,
+                        host_memory_in_mbs,
+                        p.cmdline(),
+                    )
+                except Exception:
+                    pass
+            except (ValueError, IndexError):
+                continue
+    except Exception as gpu_err:
+        logging.warning(f"nvidia-smi pmon error: {gpu_err}")
+
+    # Create default entry for GPU 0 if no GPUs detected
+    if not gpu_memory:
+        gpu_memory[0] = {
+            "total_used": system_used_mb,
+            "total": system_total_mb,
+            "process": {}
+        }
+    return gpu_memory
+
+
 def collect_status(item: pytest.Item):
     if not IS_UNDER_CI_ENV:
         return
 
     import psutil
-    import pynvml
-
-    pynvml.nvmlInit()
-
-    handles = {
-        idx: pynvml.nvmlDeviceGetHandleByIndex(idx)
-        for idx in range(pynvml.nvmlDeviceGetCount())
-    }
-
-    deadline = time.perf_counter() + 60  # 1 min
-    observed_used = 0
-    global gpu_warning_threshold
-
-    while time.perf_counter() < deadline:
-        observed_used = max(
-            pynvml.nvmlDeviceGetMemoryInfo(device).used
-            for device in handles.values())
-        if observed_used <= gpu_warning_threshold:
-            break
-        time.sleep(1)
-    else:
-        gpu_warning_threshold = max(observed_used, gpu_warning_threshold)
-        warnings.warn(
-            f"Test {item.name} does not free up GPU memory correctly!")
 
     gpu_memory = {}
-    for idx, device in handles.items():
-        total_used = pynvml.nvmlDeviceGetMemoryInfo(device).used // 1024 // 1024
-        total = pynvml.nvmlDeviceGetMemoryInfo(device).total // 1024 // 1024
-        detail = pynvml.nvmlDeviceGetComputeRunningProcesses(device)
-        process = {}
-
-        for entry in detail:
-            try:
-                p = psutil.Process(entry.pid)
-                host_memory_in_mbs = p.memory_full_info().uss // 1024 // 1024
-                process[entry.pid] = (
-                    entry.usedGpuMemory // 1024 // 1024,
-                    host_memory_in_mbs,
-                    p.cmdline(),
-                )
-            except Exception:
-                pass
-
-        gpu_memory[idx] = {
-            "total_used": total_used,
-            "total": total,
-            "process": process
-        }
+    try:
+        import pynvml
+        pynvml.nvmlInit()
+
+        handles = {
+            idx: pynvml.nvmlDeviceGetHandleByIndex(idx)
+            for idx in range(pynvml.nvmlDeviceGetCount())
+        }
+
+        deadline = time.perf_counter() + 60  # 1 min
+        observed_used = 0
+        global gpu_warning_threshold
+
+        while time.perf_counter() < deadline:
+            observed_used = max(
+                pynvml.nvmlDeviceGetMemoryInfo(device).used
+                for device in handles.values())
+            if observed_used <= gpu_warning_threshold:
+                break
+            time.sleep(1)
+        else:
+            gpu_warning_threshold = max(observed_used, gpu_warning_threshold)
+            warnings.warn(
+                f"Test {item.name} does not free up GPU memory correctly!")
+
+        for idx, device in handles.items():
+            total_used = pynvml.nvmlDeviceGetMemoryInfo(
+                device).used // 1024 // 1024
+            total = pynvml.nvmlDeviceGetMemoryInfo(device).total // 1024 // 1024
+            detail = pynvml.nvmlDeviceGetComputeRunningProcesses(device)
+            process = {}
+
+            for entry in detail:
+                try:
+                    p = psutil.Process(entry.pid)
+                    host_memory_in_mbs = p.memory_full_info(
+                    ).uss // 1024 // 1024
+                    process[entry.pid] = (
+                        entry.usedGpuMemory // 1024 // 1024,
+                        host_memory_in_mbs,
+                        p.cmdline(),
+                    )
+                except Exception:
+                    pass
+
+            gpu_memory[idx] = {
+                "total_used": total_used,
+                "total": total,
+                "process": process
+            }
+    except Exception:
+        gpu_memory = get_gpu_memory_wo_pynvml()
 
     print("\nCurrent memory status:")
     print(gpu_memory)
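Both paths above (pynvml and the nvidia-smi pmon / free fallback) produce the same per-GPU structure, so the final print(gpu_memory) stays path-agnostic. Shape of the result with made-up numbers, for illustration only:

gpu_memory = {
    0: {
        "total_used": 2048,   # MiB currently used on GPU 0
        "total": 65536,       # MiB total (system memory in the fallback path)
        "process": {
            1234: (2048, 512, ["python", "-m", "pytest"]),  # (GPU MiB, host USS MiB, cmdline)
        },
    },
}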
@@ -191,7 +191,13 @@ def construct_gpu_properties(mako_opts, device_index=0):
     assert gpu_name != "", "device_product_name is empty after removing substring 'NVIDIA' and leading/trailing whitespaces."
 
     compute_capability = get_compute_capability(device_index)
-    gpu_memory = pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024**2)
+    try:
+        gpu_memory = pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024**
+                                                                     2)
+    except pynvml.NVMLError_NotSupported as e:
+        logger.warning("Unable to get GPU memory info: {}".format(e))
+        # Fallback to 8 GiB, expressed in MiB to match the nvml path above.
+        gpu_memory = 8 * 1024
     # Gather GPU information
     mako_opt_dict["gpu"] = gpu_name
     mako_opt_dict["gpu_memory"] = gpu_memory
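On GB10's unified memory, NVML may not report a dedicated framebuffer size, hence the NVMLError_NotSupported fallback above. The same pattern in isolation, as a hedged sketch (memory_mib_or_default is an illustrative name, not a repo helper):

import pynvml

def memory_mib_or_default(handle, default_mib=8 * 1024):
    try:
        return pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024 ** 2)
    except pynvml.NVMLError_NotSupported:
        # Devices without a dedicated framebuffer can raise NotSupported;
        # fall back to a conservative 8 GiB expressed in MiB.
        return default_mib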
tests/integration/test_lists/test-db/l0_gb10.yml (new file, 42 lines)
@@ -0,0 +1,42 @@
+version: 0.0.1
+# DGX Spark is also named as GB10 Grace Blackwell Superchip.
+l0_gb10:
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*gb10*'
+      linux_distribution_name: ubuntu*
+      cpu: aarch64
+    terms:
+      stage: post_merge
+      backend: pytorch
+  tests:
+  # ------------- PyTorch tests ---------------
+  - unittest/_torch/attention/test_attention_mla.py
+  - test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
+  - test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[latency]
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*gb10*'
+      linux_distribution_name: ubuntu*
+      cpu: aarch64
+    terms:
+      stage: pre_merge
+      backend: pytorch
+  tests:
+  # ------------- PyTorch tests ---------------
+  # Below cases which are commented out due to they failed on gb10
+  # - unittest/_torch/modeling -k "modeling_mllama"
+  - unittest/_torch/modeling -k "modeling_out_of_tree"
+  # - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[CUTLASS-dtype0]
+  # - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[CUTLASS-dtype1]
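The two condition blocks gate the same single-GPU GB10 machines into post-merge and pre-merge stages. A minimal sketch of how such a condition can be evaluated against a machine profile (assumed logic for illustration; the real test-db resolver lives elsewhere in the test infrastructure):

from fnmatch import fnmatch

def matches(condition, profile):
    # ranges: numeric bounds such as system_gpu_count gte/lte
    for key, bounds in condition.get("ranges", {}).items():
        value = profile[key]
        if value < bounds.get("gte", value) or value > bounds.get("lte", value):
            return False
    # wildcards: glob patterns, scalar or list
    for key, patterns in condition.get("wildcards", {}).items():
        patterns = patterns if isinstance(patterns, list) else [patterns]
        if not any(fnmatch(str(profile[key]), p) for p in patterns):
            return False
    # terms: exact matches such as stage and backend
    return all(profile[key] == v for key, v in condition.get("terms", {}).items())

condition = {
    "ranges": {"system_gpu_count": {"gte": 1, "lte": 1}},
    "wildcards": {"gpu": ["*gb10*"], "linux_distribution_name": "ubuntu*", "cpu": "aarch64"},
    "terms": {"stage": "pre_merge", "backend": "pytorch"},
}
profile = {"system_gpu_count": 1, "gpu": "gb10", "linux_distribution_name": "ubuntu24.04",
           "cpu": "aarch64", "stage": "pre_merge", "backend": "pytorch"}
print(matches(condition, profile))  # True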