# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import argparse import json import logging import os import platform import re import socket import sys import psutil import pynvml from cuda import cuda # Logger logger = logging.getLogger(__name__) # Globals chip_name_mapping = {} def load_chip_name_mapping(file_path): global chip_name_mapping try: with open(file_path, "r") as fp: chip_name_mapping = json.load(fp) except Exception as e: logger.warning(f"Failed to load chip name mapping: {e}") def get_host_memory_info(): # Get memory information memory = psutil.virtual_memory() # Convert to MiB total_memory_mib = memory.total / (1024 * 1024) available_memory_mib = memory.available / (1024 * 1024) return round(total_memory_mib, 2), round(available_memory_mib, 2) def ASSERT_DRV(err): if isinstance(err, cuda.CUresult): if err != cuda.CUresult.CUDA_SUCCESS: raise RuntimeError('Cuda Error: {}'.format(err)) elif isinstance(err, nvrtc.nvrtcResult): if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: raise RuntimeError('Nvrtc Error: {}'.format(err)) else: raise RuntimeError('Unknown error type: {}'.format(err)) def get_compute_capability(device_id=0): # Init err, = cuda.cuInit(0) ASSERT_DRV(err) # Device err, cuDevice = cuda.cuDeviceGet(0) ASSERT_DRV(err) # Get target architecture err, sm_major = cuda.cuDeviceGetAttribute( cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice) ASSERT_DRV(err) err, sm_minor = cuda.cuDeviceGetAttribute( cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice) ASSERT_DRV(err) return ".".join((str(sm_major), str(sm_minor))) def is_aarch64(): """Function to detect if running on ARM""" return platform.processor().startswith("aarch64") def is_linux(): """Function to detect if running on Linux""" try: return os.uname()[0].lower() == "linux" except: return False def is_power(): """Function to detect if running on IBM Power""" return platform.processor() == "ppc64le" def get_linux_distribution(): try: import distro return (distro.id(), distro.version(), distro.codename()) except: logger.warning( "Unable to use distro module, defaulting operating system to ('na', 'na', 'na')" ) return ("na", "na", "na") def is_windows(): """ Function used to detect if either running on native or WSL """ return is_native_windows() or is_wsl() def try_set(write_dict, key, callable_func): try: write_dict[key] = callable_func() except Exception as e: logger.warning("Unable to set mako variable {}: {}".format(key, e)) return write_dict def is_wsl(): try: with open("/proc/version", "r") as f: for line in f: if re.search("microsoft", line, re.I): return True return False except: return False def is_native_windows(): """Function used to detect if running native windows""" return sys.platform == "win32" def parse_key_value(string): try: key, value = string.split('=', 1) return key.strip(), value.strip() except ValueError: raise argparse.ArgumentTypeError("Invalid KEY=VALUE format") def construct_gpu_properties(mako_opts, device_index=0): # Initialize NVML pynvml.nvmlInit() mako_opt_dict = dict() try: # Get handle for the specified GPU device num_gpus = pynvml.nvmlDeviceGetCount() handle = pynvml.nvmlDeviceGetHandleByIndex(device_index) # Get GPU name full_name = pynvml.nvmlDeviceGetName(handle) if "Graphics Device" in full_name: # Use chip name to get the GPU name pci_info = pynvml.nvmlDeviceGetPciInfo(handle) if pci_info is None: logger.warning( "Failed to get PCI information. PCI parameters and chip name won't be reported" ) else: pci_device_id = hex(pci_info.pciDeviceId) if pci_device_id is not None and chip_name_mapping: entity = str(pci_device_id).lower() if entity in chip_name_mapping: chip = str(chip_name_mapping[entity]).lower() else: err_msg = r"Could not find a chip name associated with this device id - {}. Chip name won't be reported".format( entity) logger.warning(err_msg) full_name = "{} Bring-up Board".format(chip.upper()) gpu_name = full_name.replace("NVIDIA", "").strip().upper() assert gpu_name != "", "device_product_name is empty after removing substring 'NVIDIA' and leading/trailing whitespaces." compute_capability = get_compute_capability(device_index) gpu_memory = pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024**2) # Gather GPU information mako_opt_dict["gpu"] = gpu_name mako_opt_dict["gpu_memory"] = gpu_memory # Get memory info for this OS host_mem_total_mib, host_mem_available_mib = get_host_memory_info() mako_opt_dict["host_mem_available_mib"] = host_mem_available_mib mako_opt_dict["host_mem_total_mib"] = host_mem_total_mib mako_opt_dict = try_set(mako_opt_dict, "hostname", socket.gethostname) mako_opt_dict = try_set(mako_opt_dict, "is_aarch64", is_aarch64) mako_opt_dict = try_set(mako_opt_dict, "is_linux", is_linux) mako_opt_dict = try_set(mako_opt_dict, "is_native_windows", is_native_windows) mako_opt_dict = try_set(mako_opt_dict, "is_power", is_power) mako_opt_dict = try_set(mako_opt_dict, "is_wsl", is_wsl) mako_opt_dict["system_gpu_count"] = num_gpus # Set linux specific information linux_distribution_info = get_linux_distribution() mako_opt_dict["linux_distribution_name"] = linux_distribution_info[0] mako_opt_dict["linux_version"] = linux_distribution_info[1] mako_opt_dict["linux_codename"] = linux_distribution_info[2] if compute_capability is not None: mako_opt_dict = try_set(mako_opt_dict, "compute_capability", lambda: float(compute_capability)) mako_opt_dict = try_set( mako_opt_dict, "supports_tf32", lambda: compute_capability and float(compute_capability) >= 8.0) mako_opt_dict = try_set( mako_opt_dict, "supports_int8", lambda: compute_capability and float(compute_capability) >= 6.1) mako_opt_dict = try_set( mako_opt_dict, "supports_fp8", lambda: compute_capability and float(compute_capability) >= 8.9) try: if is_native_windows(): sysname, _, _, _, machine, _ = platform.uname() else: sysname, _, _, _, machine = os.uname() mako_opt_dict["cpu"] = machine mako_opt_dict["sysname"] = sysname except Exception as e: logger.warning( "Unable to set cpu and sysname mako info: {}".format(e)) #Set is_android, cuda_version, cudnn_version, and cublas_version to None #Set is_remote to False temporarily mako_opt_dict["cuda_version"] = None mako_opt_dict["cublas_version"] = None mako_opt_dict["cudnn_version"] = None mako_opt_dict["is_android"] = None mako_opt_dict["is_remote"] = False if mako_opts: mako_opt_dict.update(dict(mako_opts)) print("Mako options:") for key, value in mako_opt_dict.items(): print(f"{key}={value}") finally: # Shutdown NVML pynvml.nvmlShutdown() def main(): parser = argparse.ArgumentParser( description="Get arguments when generating system/gpu info") parser.add_argument("--device", type=int, default=0, help="Index of device.") parser.add_argument( "-M", "--mako-opt", metavar="KEY=VALUE", type=parse_key_value, action="append", help= "Specify a KEY=VALUE pair. Each KEY is assigned to a Mako context variable with value VALUE when " "pre-processing the test list provided by the --filter-file " "argument. Multiple --mako-opts arguments may be specified on the command line." ) parser.add_argument("--chip-mapping-file", type=str, required=True, help="Path to the GPU chip mapping JSON file.") args = parser.parse_args() load_chip_name_mapping(args.chip_mapping_file) construct_gpu_properties(args.mako_opt, args.device) if __name__ == '__main__': main()