TensorRT-LLM/tests/integration/defs/perf/gpu_clock_lock.py
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Controls GPU clock settings(Not implemented yet) and monitors GPU status using pynvml.
The monitoring part creates a Python thread (not truly multiprocess) that polls the GPU and CPU states.
"""
# Std
import datetime
import os
import platform
import socket
import threading
import time
import psutil  # type: ignore
# Nvidia
import pynvml  # type: ignore
from defs.trt_test_alternative import print_error, print_info, print_warning
from .misc import clean_device_product_name
class InvalidGPUMonitoringResultError(RuntimeError):
"""GPU monitoring result is invalid, probably caused by clock frequency drops due to thermal issue."""
class GPUClockLockFailFastError(RuntimeError):
"""GPU clock locking has failed."""
class GPUState:
def __init__(self, gpu_id, gpu_clock, mem_clock, timestamp, graphics_clk,
gpu_util, mem_util, encoder_util, decoder_util, gpu_temp,
mem_temp, fan_speed, perf_state, power_draw, process_num):
self.gpu_id = gpu_id
self.gpu_clock__MHz = gpu_clock
self.memory_clock__MHz = mem_clock
self.timestamp = timestamp
self.graphics_clock__MHz = graphics_clk
self.gpu_utilization__pct = gpu_util
self.memory_utilization__pct = mem_util
self.encoder_utilization__pct = encoder_util
self.decoder_utilization__pct = decoder_util
self.gpu_temperature__C = gpu_temp
self.memory_temperature__C = mem_temp
self.fan_speed__pct = fan_speed
self.perf_state = perf_state
self.power_draw__W = power_draw
self.process_num = process_num
class GPUClockLock:
def __init__(self, gpu_id, interval_ms):
"""
Sets up clock values and tears down every run. At the end of the session call teardown to complete session and
reset GPU clocks.
Args:
gpu_id (str): GPU identifier, either comma-separated UUIDs or comma-separated indices in string.
interval_ms (float): Interval duration between monitoring samples.
"""
# Initialize pynvml
self._nvml_initialized = False
self._gpu_handles = []
# Input params.
self._gpu_id = gpu_id
self._gpu_id_list = [int(id) for id in gpu_id.split(",")]
self._mobile_disable_clock_locking = False
# Create GPU handles, one per GPU.
try:
pynvml.nvmlInit()
self._nvml_initialized = True
self._gpu_handles = [
pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
for gpu_id in self._gpu_id_list
]
print_info(f"Created GPU handles: {self._gpu_handles}")
except pynvml.NVMLError as e:
print_warning(f"Failed to initialize NVML: {e}")
        if not self._gpu_handles:
            print_warning(
                "Unable to create GPU handles. GPU monitoring will be disabled."
            )
        else:
            print_info("GPU handles created successfully!")
# Setup device properties.
self._setup_properties()
# Fields for monitoring thread.
self._interval_ms = interval_ms
self._is_monitoring = False
self._state_data = []
# Fields for clock locking
self._original_clocks = {}
self._clocks_locked = False
def get_os_properties(self):
return self._os_properties
def get_cpu_properties(self):
return self._cpu_properties
def get_gpu_properties(self):
return self._gpu_properties
def get_gpu_id(self):
return self._gpu_id
def get_driver_version(self):
return self._nvidia_driver_version
def get_device_count(self):
return self._nvidia_device_count
def get_ip_address(self):
return self._ip_address
def get_target_gpu_clocks(self):
"""
Get the target GPU clocks (sm_clk and mem_clk) for the first GPU in the list.
"""
if self._gpu_handles and len(self._gpu_handles) > 0:
try:
# Get maximum supported clocks for the first GPU
handle = self._gpu_handles[0]
max_sm_clk = pynvml.nvmlDeviceGetMaxClockInfo(
handle, pynvml.NVML_CLOCK_SM)
max_mem_clk = pynvml.nvmlDeviceGetMaxClockInfo(
handle, pynvml.NVML_CLOCK_MEM)
return (max_sm_clk, max_mem_clk)
except pynvml.NVMLError as e:
print_warning(f"Failed to get max clock info: {e}")
return None
return None
def _lock_gpu_clocks(self):
"""
Lock GPU clocks to maximum supported frequencies for consistent performance.
Implements fail-fast semantics: if any GPU fails to lock, all operations
are rolled back and an exception is raised.
"""
if self._mobile_disable_clock_locking:
print_info("Clock locking disabled for mobile/Jetson devices")
return
if not self._gpu_handles:
print_warning("No GPU handles available for clock locking")
return
target_clocks = self.get_target_gpu_clocks()
if not target_clocks:
print_warning("Could not determine target GPU clocks")
raise GPUClockLockFailFastError(
"Could not determine target GPU clocks")
target_sm_clk, target_mem_clk = target_clocks
# Phase 1: Retrieve original clocks for all GPUs (fail-fast if any fails)
original_clocks_backup = {}
for gpu_idx, handle in enumerate(self._gpu_handles):
try:
original_sm_clk = pynvml.nvmlDeviceGetApplicationsClock(
handle, pynvml.NVML_CLOCK_SM)
original_mem_clk = pynvml.nvmlDeviceGetApplicationsClock(
handle, pynvml.NVML_CLOCK_MEM)
original_clocks_backup[gpu_idx] = (original_sm_clk,
original_mem_clk)
print_info(
f"GPU {gpu_idx}: Retrieved original clocks SM={original_sm_clk}MHz, MEM={original_mem_clk}MHz"
)
except pynvml.NVMLError as e:
print_error(
f"Failed to retrieve original clocks for GPU {gpu_idx}: {e}"
)
raise GPUClockLockFailFastError(
f"Failed to retrieve original clocks for GPU {gpu_idx}: {e}"
)
# Phase 2: Apply clock locks to all GPUs (fail-fast if any fails)
locked_gpus = []
try:
for gpu_idx, handle in enumerate(self._gpu_handles):
try:
pynvml.nvmlDeviceSetApplicationsClocks(
handle, target_mem_clk, target_sm_clk)
locked_gpus.append(gpu_idx)
print_info(
f"GPU {gpu_idx}: Locked clocks to SM={target_sm_clk}MHz, MEM={target_mem_clk}MHz"
)
except pynvml.NVMLError as e:
print_error(f"Failed to lock clocks for GPU {gpu_idx}: {e}")
# Rollback any GPUs that were successfully locked
self._rollback_locked_gpus(locked_gpus,
original_clocks_backup)
raise GPUClockLockFailFastError(
f"Failed to lock clocks for GPU {gpu_idx}: {e}")
# Phase 3: Only mark as locked if all GPUs succeeded
self._original_clocks = original_clocks_backup
self._clocks_locked = True
print_info(
f"Successfully locked clocks on {len(locked_gpus)} GPU(s)")
except Exception:
# Ensure we don't leave any GPUs in a locked state
if locked_gpus:
self._rollback_locked_gpus(locked_gpus, original_clocks_backup)
raise
def _rollback_locked_gpus(self, locked_gpu_indices, original_clocks_backup):
"""
Rollback clock locks for specific GPUs to their original values.
Args:
locked_gpu_indices: List of GPU indices that were successfully locked
original_clocks_backup: Dictionary of original clock values for each GPU
"""
for gpu_idx in locked_gpu_indices:
if gpu_idx < len(
self._gpu_handles) and gpu_idx in original_clocks_backup:
try:
handle = self._gpu_handles[gpu_idx]
original_sm_clk, original_mem_clk = original_clocks_backup[
gpu_idx]
pynvml.nvmlDeviceSetApplicationsClocks(
handle, original_mem_clk, original_sm_clk)
print_info(
f"GPU {gpu_idx}: Rolled back clocks to SM={original_sm_clk}MHz, MEM={original_mem_clk}MHz"
)
except pynvml.NVMLError as e:
print_warning(
f"Failed to rollback clocks for GPU {gpu_idx}: {e}")
def _unlock_gpu_clocks(self):
"""
Restore GPU clocks to their original values.
"""
if not self._clocks_locked or not self._gpu_handles:
return
for gpu_idx, handle in enumerate(self._gpu_handles):
try:
if gpu_idx in self._original_clocks:
original_sm_clk, original_mem_clk = self._original_clocks[
gpu_idx]
pynvml.nvmlDeviceSetApplicationsClocks(
handle, original_mem_clk, original_sm_clk)
print_info(
f"GPU {gpu_idx}: Restored clocks to SM={original_sm_clk}MHz, MEM={original_mem_clk}MHz"
)
else:
# Reset to default clocks if we don't have original values
pynvml.nvmlDeviceResetApplicationsClocks(handle)
print_info(f"GPU {gpu_idx}: Reset clocks to default")
except pynvml.NVMLError as e:
print_warning(
f"Failed to restore clocks for GPU {gpu_idx}: {e}")
self._clocks_locked = False
self._original_clocks = {}
def __enter__(self):
"""
Do all the steps needed at the start of a test case:
- Lock gpu clock to target.
- Start monitoring.
"""
print_info("gpu clock lock enter!!!")
if not self._nvml_initialized:
pynvml.nvmlInit()
self._nvml_initialized = True
self._gpu_handles = [
pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
for gpu_id in self._gpu_id_list
]
print_info(f"Reinitialized GPU handles: {self._gpu_handles}")
# Lock GPU clocks for consistent performance
self._lock_gpu_clocks()
self.start_monitor()
return self
def __exit__(self, *args):
"""
Do all the steps needed at the end of a test case:
- Stop monitoring.
- Set gpu clock back to original state.
- Validate gpu monitoring result.
"""
self.stop_monitor()
# Restore original GPU clocks
self._unlock_gpu_clocks()
self.validate_gpu_monitoring_data()
print_info("gpu clock lock exit!!!")
def start_monitor(self):
"""Start GPU monitoring."""
        if not self._gpu_handles:
print_warning(
"Unable to start GPU monitoring. GPU handles are not initialized."
)
return
if self._is_monitoring:
raise RuntimeError(
"GPU monitoring is already in progress. Monitoring cannot be started!"
)
# Delete state_data
self._state_data = []
self._is_monitoring = True
# Initialize thread
self._thread = threading.Thread(
target=self._monitoring_thread,
name="LLM Test - GPUMonitor",
kwargs={"interval_ms": self._interval_ms})
self._thread.daemon = True
self._thread.start()
def stop_monitor(self):
"""Stop GPU monitoring."""
        if not self._gpu_handles:
return
if not self._is_monitoring:
raise RuntimeError(
"GPU monitoring has not been started. Monitoring cannot be stopped!"
)
self._is_monitoring = False
self._thread.join()
def get_state_data(self):
"""
Get all the gpu monitoring data since monitoring started.
Seems like from our empirical data get_state_data() can return None.
This might have something to do with thread failure if something were to happen to GPU monitoring thread.
"""
return self._state_data
def validate_gpu_monitoring_data(self, deviation_perc=0.07, num_entries=3):
"""
Check that all the current monitoring data is within the given deviation_perc for the given number of
consecutive entries in a row.
The "num_entries" argument specifies the number of consecutive entries the monitoring data needs to be invalid
before considering the entire dataset as invalid
"""
if self._mobile_disable_clock_locking:
print_info("Skipped gpu monitoring validation for mobile board")
return
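        # A minimal sketch of the check described in the docstring, assuming the deviation is measured on
        # the sampled SM clock against the locked target clock (the exact criterion is an assumption, not
        # taken from this module):
        #
        #     target_clocks = self.get_target_gpu_clocks()
        #     if not target_clocks or not self._state_data:
        #         return
        #     target_sm_clk = target_clocks[0]
        #     consecutive = 0
        #     for state in self._state_data:
        #         off_target = abs(state.gpu_clock__MHz -
        #                          target_sm_clk) > deviation_perc * target_sm_clk
        #         consecutive = consecutive + 1 if off_target else 0
        #         if consecutive >= num_entries:
        #             raise InvalidGPUMonitoringResultError(
        #                 f"GPU clock deviated more than {deviation_perc:.0%} from "
        #                 f"{target_sm_clk}MHz for {num_entries} consecutive samples.")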
def teardown(self):
"""
Call when the session finishes. Reset GPU clocks back to its original state.
"""
# Revert clocks back to normal if all tests have finished.
self._unlock_gpu_clocks()
if self._nvml_initialized:
pynvml.nvmlShutdown()
self._nvml_initialized = False
print_info("NVML shutdown.")
def _gpu_poll_state(self):
if not self._nvml_initialized:
print_warning("NVML is not initialized. Skipping GPU polling.")
return
for gpu_idx, gpu_handle in enumerate(self._gpu_handles):
try:
sm_clk = pynvml.nvmlDeviceGetClockInfo(gpu_handle,
pynvml.NVML_CLOCK_SM)
mem_clk = pynvml.nvmlDeviceGetClockInfo(gpu_handle,
pynvml.NVML_CLOCK_MEM)
graphics_clk = pynvml.nvmlDeviceGetClockInfo(
gpu_handle, pynvml.NVML_CLOCK_GRAPHICS)
                util_rates = pynvml.nvmlDeviceGetUtilizationRates(gpu_handle)
                gpu_util = util_rates.gpu
                mem_util = util_rates.memory
gpu_temp = pynvml.nvmlDeviceGetTemperature(
gpu_handle, pynvml.NVML_TEMPERATURE_GPU)
perf_state = pynvml.nvmlDeviceGetPerformanceState(gpu_handle)
power_draw = pynvml.nvmlDeviceGetPowerUsage(
gpu_handle) / 1000.0 # Convert from milliwatts to watts
process_num = len(
pynvml.nvmlDeviceGetComputeRunningProcesses(gpu_handle))
encoder_util = pynvml.nvmlDeviceGetEncoderUtilization(
gpu_handle)[0] # Get encoder utilization percentage
decoder_util = pynvml.nvmlDeviceGetDecoderUtilization(
gpu_handle)[0] # Get decoder utilization percentage
gpu_state = GPUState(
gpu_id=self._gpu_id_list[gpu_idx],
gpu_clock=sm_clk,
mem_clock=mem_clk,
timestamp=datetime.datetime.now(),
graphics_clk=graphics_clk,
gpu_util=gpu_util,
mem_util=mem_util,
gpu_temp=gpu_temp,
                    # pynvml does not expose memory temperature data.
                    mem_temp=None,
                    # pynvml.nvmlDeviceGetFanSpeed() raises a "not supported" error on these devices.
                    fan_speed=None,
perf_state=perf_state,
power_draw=power_draw,
process_num=process_num,
encoder_util=encoder_util,
decoder_util=decoder_util)
self._state_data.append(gpu_state)
except pynvml.NVMLError as e:
print_warning(f"Error polling GPU state for GPU {gpu_idx}: {e}")
def _monitoring_thread(self, interval_ms):
"""Actual thread that runs to monitor similar to perf_runner.monitor"""
interval = interval_ms / 1000
# Get the state of the object.
while self._is_monitoring:
self._gpu_poll_state()
# Sleep the thread
time.sleep(interval)
# Final time for interpolation.
self._gpu_poll_state()
time.sleep(interval)
self._gpu_poll_state()
def _setup_properties(self):
"""Set up OS/CPU/GPU properties """
try:
self._os_properties = {
"os_name": os.name,
"platform": platform.system(),
"platform_version": platform.version()
}
except Exception as e:
self._os_properties = None
print_warning("Unable to fetch os properties. Reason: {}".format(e))
try:
self._cpu_properties = {
"cpu_count":
os.cpu_count(),
"cpu_freq":
psutil.cpu_freq()._asdict() if psutil.cpu_freq() else None
}
except Exception as e:
self._cpu_properties = None
print_warning(
"Unable to fetch cpu properties. Reason: {}".format(e))
try:
self._ip_address = socket.gethostbyname(socket.gethostname())
except Exception as e:
self._ip_address = None
print_warning("Unable to fetch os IP address. Reason: {}".format(e))
        if self._gpu_handles:
            self._nvidia_driver_version = pynvml.nvmlSystemGetDriverVersion()
            self._nvidia_device_count = pynvml.nvmlDeviceGetCount()
            self._gpu_properties = {
                "device_product_name":
                pynvml.nvmlDeviceGetName(self._gpu_handles[0]),
                "pci_device_id":
                pynvml.nvmlDeviceGetPciInfo(self._gpu_handles[0]).pciDeviceId
            }
            # Clean up the device product name because product names have changed across driver updates.
            device_product_name = clean_device_product_name(
                self._gpu_properties["device_product_name"])
            self._gpu_properties["device_product_name"] = device_product_name
            # Clock locking is disabled on mobile/Jetson boards.
            if "jetson" in device_product_name or "p3710" in device_product_name:
                self._mobile_disable_clock_locking = True
else:
self._nvidia_driver_version = None
self._nvidia_device_count = None
self._gpu_properties = None
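# ---------------------------------------------------------------------------
# Illustrative usage sketch. This is an assumption of how test_perf.py drives
# GPUClockLock, not code taken from that file: lock the clocks and monitor GPU
# state around a workload, then tear the session down.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    clock_lock = GPUClockLock(gpu_id="0", interval_ms=1000.0)
    try:
        # Clocks are locked and the monitoring thread samples GPU state while
        # the context manager is active.
        with clock_lock:
            time.sleep(5.0)  # placeholder for the workload being measured
        for state in clock_lock.get_state_data() or []:
            print_info(f"GPU {state.gpu_id}: SM={state.gpu_clock__MHz}MHz, "
                       f"util={state.gpu_utilization__pct}%, "
                       f"temp={state.gpu_temperature__C}C")
    finally:
        # Restore the original clocks (no-op if already restored) and shut down NVML.
        clock_lock.teardown()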