TensorRT-LLM/tests/integration/defs/perf/gpu_clock_lock.py
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Controls GPU clock settings(Not implemented yet) and monitors GPU status using pynvml.
The monitoring part creates a Python thread (not truly multiprocess) that polls the GPU and CPU states.
"""
# Std
import datetime
import os
import platform
import socket
import threading
import time
import psutil  # type: ignore
# Nvidia
import pynvml  # type: ignore
from defs.trt_test_alternative import print_error, print_info, print_warning
from .misc import clean_device_product_name
class InvalidGPUMonitoringResultError(RuntimeError):
"""GPU monitoring result is invalid, probably caused by clock frequency drops due to thermal issue."""
class GPUClockLockFailFastError(RuntimeError):
"""GPU clock locking has failed."""
class GPUState:
def __init__(self, gpu_id, gpu_clock, mem_clock, timestamp, graphics_clk,
gpu_util, mem_util, encoder_util, decoder_util, gpu_temp,
mem_temp, fan_speed, perf_state, power_draw, process_num):
self.gpu_id = gpu_id
self.gpu_clock__MHz = gpu_clock
self.memory_clock__MHz = mem_clock
self.timestamp = timestamp
self.graphics_clock__MHz = graphics_clk
self.gpu_utilization__pct = gpu_util
self.memory_utilization__pct = mem_util
self.encoder_utilization__pct = encoder_util
self.decoder_utilization__pct = decoder_util
self.gpu_temperature__C = gpu_temp
self.memory_temperature__C = mem_temp
self.fan_speed__pct = fan_speed
self.perf_state = perf_state
self.power_draw__W = power_draw
self.process_num = process_num
class GPUClockLock:
def __init__(self, gpu_id, interval_ms):
"""
Sets up clock values and tears down every run. At the end of the session call teardown to complete session and
reset GPU clocks.
Args:
gpu_id (str): GPU identifier, either comma-separated UUIDs or comma-separated indices in string.
interval_ms (float): Interval duration between monitoring samples.
"""
# Initialize pynvml
self._nvml_initialized = False
self._gpu_handles = []
# Input params.
self._gpu_id = gpu_id
self._gpu_id_list = [int(id) for id in gpu_id.split(",")]
self._mobile_disable_clock_locking = False
# Create GPU handles, one per GPU.
try:
pynvml.nvmlInit()
self._nvml_initialized = True
self._gpu_handles = [
pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
for gpu_id in self._gpu_id_list
]
print_info(f"Created GPU handles: {self._gpu_handles}")
except pynvml.NVMLError as e:
print_warning(f"Failed to initialize NVML: {e}")
        if not self._gpu_handles:
            print_warning(
                "Unable to create GPU handles. GPU monitoring will be disabled."
            )
        else:
            print_info("GPU handles created successfully!")
# Setup device properties.
self._setup_properties()
# Fields for monitoring thread.
self._interval_ms = interval_ms
self._is_monitoring = False
self._state_data = []
# Fields for clock locking
self._original_clocks = {}
self._clocks_locked = False
def get_os_properties(self):
return self._os_properties
def get_cpu_properties(self):
return self._cpu_properties
def get_gpu_properties(self):
return self._gpu_properties
def get_gpu_id(self):
return self._gpu_id
def get_driver_version(self):
return self._nvidia_driver_version
def get_device_count(self):
return self._nvidia_device_count
def get_ip_address(self):
return self._ip_address
def get_target_gpu_clocks(self):
"""
Get the target GPU clocks (sm_clk and mem_clk) for the first GPU in the list.
"""
if self._gpu_handles and len(self._gpu_handles) > 0:
try:
# Get maximum supported clocks for the first GPU
handle = self._gpu_handles[0]
max_sm_clk = pynvml.nvmlDeviceGetMaxClockInfo(
handle, pynvml.NVML_CLOCK_SM)
max_mem_clk = pynvml.nvmlDeviceGetMaxClockInfo(
handle, pynvml.NVML_CLOCK_MEM)
return (max_sm_clk, max_mem_clk)
except pynvml.NVMLError as e:
print_warning(f"Failed to get max clock info: {e}")
return None
return None
def _lock_gpu_clocks(self):
"""
Lock GPU clocks to maximum supported frequencies for consistent performance.
Implements fail-fast semantics: if any GPU fails to lock, all operations
are rolled back and an exception is raised.
"""
if self._mobile_disable_clock_locking:
print_info("Clock locking disabled for mobile/Jetson devices")
return
if not self._gpu_handles:
print_warning("No GPU handles available for clock locking")
return
target_clocks = self.get_target_gpu_clocks()
if not target_clocks:
print_warning("Could not determine target GPU clocks")
raise GPUClockLockFailFastError(
"Could not determine target GPU clocks")
target_sm_clk, target_mem_clk = target_clocks
# Phase 1: Retrieve original clocks for all GPUs (fail-fast if any fails)
original_clocks_backup = {}
for gpu_idx, handle in enumerate(self._gpu_handles):
try:
original_sm_clk = pynvml.nvmlDeviceGetApplicationsClock(
handle, pynvml.NVML_CLOCK_SM)
original_mem_clk = pynvml.nvmlDeviceGetApplicationsClock(
handle, pynvml.NVML_CLOCK_MEM)
original_clocks_backup[gpu_idx] = (original_sm_clk,
original_mem_clk)
print_info(
f"GPU {gpu_idx}: Retrieved original clocks SM={original_sm_clk}MHz, MEM={original_mem_clk}MHz"
)
except pynvml.NVMLError as e:
print_error(
f"Failed to retrieve original clocks for GPU {gpu_idx}: {e}"
)
raise GPUClockLockFailFastError(
f"Failed to retrieve original clocks for GPU {gpu_idx}: {e}"
)
# Phase 2: Apply clock locks to all GPUs (fail-fast if any fails)
locked_gpus = []
try:
for gpu_idx, handle in enumerate(self._gpu_handles):
try:
pynvml.nvmlDeviceSetApplicationsClocks(
handle, target_mem_clk, target_sm_clk)
locked_gpus.append(gpu_idx)
print_info(
f"GPU {gpu_idx}: Locked clocks to SM={target_sm_clk}MHz, MEM={target_mem_clk}MHz"
)
except pynvml.NVMLError as e:
print_error(f"Failed to lock clocks for GPU {gpu_idx}: {e}")
# Rollback any GPUs that were successfully locked
self._rollback_locked_gpus(locked_gpus,
original_clocks_backup)
raise GPUClockLockFailFastError(
f"Failed to lock clocks for GPU {gpu_idx}: {e}")
# Phase 3: Only mark as locked if all GPUs succeeded
self._original_clocks = original_clocks_backup
self._clocks_locked = True
print_info(
f"Successfully locked clocks on {len(locked_gpus)} GPU(s)")
except Exception:
# Ensure we don't leave any GPUs in a locked state
if locked_gpus:
self._rollback_locked_gpus(locked_gpus, original_clocks_backup)
raise
def _rollback_locked_gpus(self, locked_gpu_indices, original_clocks_backup):
"""
Rollback clock locks for specific GPUs to their original values.
Args:
locked_gpu_indices: List of GPU indices that were successfully locked
original_clocks_backup: Dictionary of original clock values for each GPU
"""
for gpu_idx in locked_gpu_indices:
if gpu_idx < len(
self._gpu_handles) and gpu_idx in original_clocks_backup:
try:
handle = self._gpu_handles[gpu_idx]
original_sm_clk, original_mem_clk = original_clocks_backup[
gpu_idx]
pynvml.nvmlDeviceSetApplicationsClocks(
handle, original_mem_clk, original_sm_clk)
print_info(
f"GPU {gpu_idx}: Rolled back clocks to SM={original_sm_clk}MHz, MEM={original_mem_clk}MHz"
)
except pynvml.NVMLError as e:
print_warning(
f"Failed to rollback clocks for GPU {gpu_idx}: {e}")
def _unlock_gpu_clocks(self):
"""
Restore GPU clocks to their original values.
"""
if not self._clocks_locked or not self._gpu_handles:
return
for gpu_idx, handle in enumerate(self._gpu_handles):
try:
if gpu_idx in self._original_clocks:
original_sm_clk, original_mem_clk = self._original_clocks[
gpu_idx]
pynvml.nvmlDeviceSetApplicationsClocks(
handle, original_mem_clk, original_sm_clk)
print_info(
f"GPU {gpu_idx}: Restored clocks to SM={original_sm_clk}MHz, MEM={original_mem_clk}MHz"
)
else:
# Reset to default clocks if we don't have original values
pynvml.nvmlDeviceResetApplicationsClocks(handle)
print_info(f"GPU {gpu_idx}: Reset clocks to default")
except pynvml.NVMLError as e:
print_warning(
f"Failed to restore clocks for GPU {gpu_idx}: {e}")
self._clocks_locked = False
self._original_clocks = {}
def __enter__(self):
"""
Do all the steps needed at the start of a test case:
- Lock gpu clock to target.
- Start monitoring.
"""
print_info("gpu clock lock enter!!!")
if not self._nvml_initialized:
pynvml.nvmlInit()
self._nvml_initialized = True
self._gpu_handles = [
pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
for gpu_id in self._gpu_id_list
]
print_info(f"Reinitialized GPU handles: {self._gpu_handles}")
# Lock GPU clocks for consistent performance
self._lock_gpu_clocks()
self.start_monitor()
return self
def __exit__(self, *args):
"""
Do all the steps needed at the end of a test case:
- Stop monitoring.
- Set gpu clock back to original state.
- Validate gpu monitoring result.
"""
self.stop_monitor()
# Restore original GPU clocks
self._unlock_gpu_clocks()
self.validate_gpu_monitoring_data()
print_info("gpu clock lock exit!!!")
def start_monitor(self):
"""Start GPU monitoring."""
        if not self._gpu_handles:
print_warning(
"Unable to start GPU monitoring. GPU handles are not initialized."
)
return
if self._is_monitoring:
raise RuntimeError(
"GPU monitoring is already in progress. Monitoring cannot be started!"
)
# Delete state_data
self._state_data = []
self._is_monitoring = True
# Initialize thread
self._thread = threading.Thread(
target=self._monitoring_thread,
name="LLM Test - GPUMonitor",
kwargs={"interval_ms": self._interval_ms})
self._thread.daemon = True
self._thread.start()
def stop_monitor(self):
"""Stop GPU monitoring."""
        if not self._gpu_handles:
return
if not self._is_monitoring:
raise RuntimeError(
"GPU monitoring has not been started. Monitoring cannot be stopped!"
)
self._is_monitoring = False
self._thread.join()
def get_state_data(self):
"""
Get all the gpu monitoring data since monitoring started.
Seems like from our empirical data get_state_data() can return None.
This might have something to do with thread failure if something were to happen to GPU monitoring thread.
"""
return self._state_data
def validate_gpu_monitoring_data(self, deviation_perc=0.07, num_entries=3):
"""
Check that all the current monitoring data is within the given deviation_perc for the given number of
consecutive entries in a row.
The "num_entries" argument specifies the number of consecutive entries the monitoring data needs to be invalid
before considering the entire dataset as invalid
"""
if self._mobile_disable_clock_locking:
print_info("Skipped gpu monitoring validation for mobile board")
return
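        # A minimal sketch of the check described in the docstring, assuming the deviation is measured on
        # the sampled SM clock against the locked target clock (the exact criterion is an assumption, not
        # taken from this module):
        #
        #     target_clocks = self.get_target_gpu_clocks()
        #     if not target_clocks or not self._state_data:
        #         return
        #     target_sm_clk = target_clocks[0]
        #     consecutive = 0
        #     for state in self._state_data:
        #         off_target = abs(state.gpu_clock__MHz -
        #                          target_sm_clk) > deviation_perc * target_sm_clk
        #         consecutive = consecutive + 1 if off_target else 0
        #         if consecutive >= num_entries:
        #             raise InvalidGPUMonitoringResultError(
        #                 f"GPU clock deviated more than {deviation_perc:.0%} from "
        #                 f"{target_sm_clk}MHz for {num_entries} consecutive samples.")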
def teardown(self):
"""
Call when the session finishes. Reset GPU clocks back to its original state.
"""
# Revert clocks back to normal if all tests have finished.
self._unlock_gpu_clocks()
if self._nvml_initialized:
pynvml.nvmlShutdown()
self._nvml_initialized = False
print_info("NVML shutdown.")
def _gpu_poll_state(self):
if not self._nvml_initialized:
print_warning("NVML is not initialized. Skipping GPU polling.")
return
for gpu_idx, gpu_handle in enumerate(self._gpu_handles):
try:
sm_clk = pynvml.nvmlDeviceGetClockInfo(gpu_handle,
pynvml.NVML_CLOCK_SM)
mem_clk = pynvml.nvmlDeviceGetClockInfo(gpu_handle,
pynvml.NVML_CLOCK_MEM)
graphics_clk = pynvml.nvmlDeviceGetClockInfo(
gpu_handle, pynvml.NVML_CLOCK_GRAPHICS)
                util_rates = pynvml.nvmlDeviceGetUtilizationRates(gpu_handle)
                gpu_util = util_rates.gpu
                mem_util = util_rates.memory
gpu_temp = pynvml.nvmlDeviceGetTemperature(
gpu_handle, pynvml.NVML_TEMPERATURE_GPU)
perf_state = pynvml.nvmlDeviceGetPerformanceState(gpu_handle)
power_draw = pynvml.nvmlDeviceGetPowerUsage(
gpu_handle) / 1000.0 # Convert from milliwatts to watts
process_num = len(
pynvml.nvmlDeviceGetComputeRunningProcesses(gpu_handle))
encoder_util = pynvml.nvmlDeviceGetEncoderUtilization(
gpu_handle)[0] # Get encoder utilization percentage
decoder_util = pynvml.nvmlDeviceGetDecoderUtilization(
gpu_handle)[0] # Get decoder utilization percentage
gpu_state = GPUState(
gpu_id=self._gpu_id_list[gpu_idx],
gpu_clock=sm_clk,
mem_clock=mem_clk,
timestamp=datetime.datetime.now(),
graphics_clk=graphics_clk,
gpu_util=gpu_util,
mem_util=mem_util,
gpu_temp=gpu_temp,
                    # pynvml does not expose memory temperature data.
                    mem_temp=None,
                    # pynvml.nvmlDeviceGetFanSpeed() raises a "not supported" error on these devices.
                    fan_speed=None,
perf_state=perf_state,
power_draw=power_draw,
process_num=process_num,
encoder_util=encoder_util,
decoder_util=decoder_util)
self._state_data.append(gpu_state)
except pynvml.NVMLError as e:
print_warning(f"Error polling GPU state for GPU {gpu_idx}: {e}")
def _monitoring_thread(self, interval_ms):
"""Actual thread that runs to monitor similar to perf_runner.monitor"""
interval = interval_ms / 1000
# Get the state of the object.
while self._is_monitoring:
self._gpu_poll_state()
# Sleep the thread
time.sleep(interval)
# Final time for interpolation.
self._gpu_poll_state()
time.sleep(interval)
self._gpu_poll_state()
def _setup_properties(self):
"""Set up OS/CPU/GPU properties """
try:
self._os_properties = {
"os_name": os.name,
"platform": platform.system(),
"platform_version": platform.version()
}
except Exception as e:
self._os_properties = None
print_warning("Unable to fetch os properties. Reason: {}".format(e))
try:
self._cpu_properties = {
"cpu_count":
os.cpu_count(),
"cpu_freq":
psutil.cpu_freq()._asdict() if psutil.cpu_freq() else None
}
except Exception as e:
self._cpu_properties = None
print_warning(
"Unable to fetch cpu properties. Reason: {}".format(e))
try:
self._ip_address = socket.gethostbyname(socket.gethostname())
except Exception as e:
self._ip_address = None
print_warning("Unable to fetch os IP address. Reason: {}".format(e))
        if self._gpu_handles:
            self._nvidia_driver_version = pynvml.nvmlSystemGetDriverVersion()
            self._nvidia_device_count = pynvml.nvmlDeviceGetCount()
            self._gpu_properties = {
                "device_product_name":
                pynvml.nvmlDeviceGetName(self._gpu_handles[0]),
                "pci_device_id":
                pynvml.nvmlDeviceGetPciInfo(self._gpu_handles[0]).pciDeviceId
            }
            # Clean up the device product name because product names have changed across driver updates.
            device_product_name = clean_device_product_name(
                self._gpu_properties["device_product_name"])
            self._gpu_properties["device_product_name"] = device_product_name
            # Clock locking is disabled on mobile/Jetson boards.
            if "jetson" in device_product_name or "p3710" in device_product_name:
                self._mobile_disable_clock_locking = True
else:
self._nvidia_driver_version = None
self._nvidia_device_count = None
self._gpu_properties = None
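# ---------------------------------------------------------------------------
# Illustrative usage sketch. This is an assumption of how test_perf.py drives
# GPUClockLock, not code taken from that file: lock the clocks and monitor GPU
# state around a workload, then tear the session down.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    clock_lock = GPUClockLock(gpu_id="0", interval_ms=1000.0)
    try:
        # Clocks are locked and the monitoring thread samples GPU state while
        # the context manager is active.
        with clock_lock:
            time.sleep(5.0)  # placeholder for the workload being measured
        for state in clock_lock.get_state_data() or []:
            print_info(f"GPU {state.gpu_id}: SM={state.gpu_clock__MHz}MHz, "
                       f"util={state.gpu_utilization__pct}%, "
                       f"temp={state.gpu_temperature__C}C")
    finally:
        # Restore the original clocks (no-op if already restored) and shut down NVML.
        clock_lock.teardown()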