diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py
index 2ff55caab..f2bf16d9f 100644
--- a/codecarbon/core/gpu.py
+++ b/codecarbon/core/gpu.py
@@ -1,12 +1,74 @@
+import subprocess
+from collections import namedtuple
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Union
 
-import pynvml
-
 from codecarbon.core.units import Energy, Power, Time
 from codecarbon.external.logger import logger
 
 
+def is_rocm_system():
+    """Returns True if the system has a rocm-smi interface."""
+    try:
+        # Check if rocm-smi is available
+        subprocess.check_output(["rocm-smi", "--help"])
+        return True
+    except (subprocess.CalledProcessError, OSError):
+        return False
+
+
+def is_nvidia_system():
+    """Returns True if the system has an nvidia-smi interface."""
+    try:
+        # Check if nvidia-smi is available
+        subprocess.check_output(["nvidia-smi", "--help"])
+        return True
+    except Exception:
+        return False
+
+
+try:
+    import pynvml
+
+    pynvml.nvmlInit()
+    PYNVML_AVAILABLE = True
+except ImportError:
+    if is_nvidia_system():
+        logger.warning(
+            "Nvidia GPU detected but pynvml is not available. "
+            "Please install pynvml to get GPU metrics."
+        )
+    PYNVML_AVAILABLE = False
+except Exception:
+    if is_nvidia_system():
+        logger.warning(
+            "Nvidia GPU detected but pynvml initialization failed. "
+            "Please ensure NVIDIA drivers are properly installed."
+        )
+    PYNVML_AVAILABLE = False
+
+try:
+    import amdsmi
+
+    AMDSMI_AVAILABLE = True
+except ImportError:
+    if is_rocm_system():
+        logger.warning(
+            "AMD GPU detected but amdsmi is not available. "
+            "Please install amdsmi to get GPU metrics."
+        )
+    AMDSMI_AVAILABLE = False
+except AttributeError as e:
+    # In some environments, amdsmi is present but not properly configured, which raises an AttributeError at import time
+    logger.warning(
+        "AMD GPU detected but amdsmi is not properly configured. "
+        "Please ensure amdsmi is correctly installed to get GPU metrics. "
+        "Tips: check consistency between the Python amdsmi package and ROCm versions, and ensure AMD drivers are up to date."
+ f" Error: {e}" + ) + AMDSMI_AVAILABLE = False + + @dataclass class GPUDevice: """ @@ -27,10 +89,10 @@ class GPUDevice: handle: any gpu_index: int - # Energy consumed in kWh - energy_delta: Energy = field(default_factory=lambda: Energy(0)) # Power based on reading power: Power = field(default_factory=lambda: Power(0)) + # Energy consumed in kWh + energy_delta: Energy = field(default_factory=lambda: Energy(0)) # Last energy reading in kWh last_energy: Energy = field(default_factory=lambda: Energy(0)) @@ -108,7 +170,10 @@ def _to_utf8(self, str_or_bytes) -> Any: return str_or_bytes - def _get_total_energy_consumption(self) -> int: + +@dataclass +class NvidiaGPUDevice(GPUDevice): + def _get_total_energy_consumption(self): """Returns total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g732ab899b5bd18ac4bfb93c02de4900a """ @@ -166,6 +231,7 @@ def _get_power_limit(self) -> Union[int, None]: try: return pynvml.nvmlDeviceGetEnforcedPowerLimit(self.handle) except Exception: + logger.warning("Failed to retrieve gpu power limit", exc_info=True) return None def _get_gpu_utilization(self): @@ -185,39 +251,154 @@ def _get_compute_processes(self) -> List: device with the memory used https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g46ceaea624d5c96e098e03c453419d68 """ - try: - processes = pynvml.nvmlDeviceGetComputeRunningProcesses(self.handle) - - return [{"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes] - except pynvml.NVMLError: - return [] + processes = pynvml.nvmlDeviceGetComputeRunningProcesses(self.handle) + return [{"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes] def _get_graphics_processes(self) -> List: """Returns the list of processes ids having a graphics context on the device with the memory used https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7eacf7fa7ba4f4485d166736bf31195e """ + processes = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.handle) + return [{"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes] + + +class AMDGPUDevice(GPUDevice): + def _get_total_energy_consumption(self): + """Returns energy in millijoules. 
+        Each energy status unit is equivalent to around 15.3 microjoules."""
+        energy_count = amdsmi.amdsmi_get_energy_count(self.handle)
+        # energy_count contains 'power' and 'counter_resolution'
+        # Result is in uJ (microjoules), convert to mJ
+        energy = energy_count["power"] * energy_count["counter_resolution"] / 1000
+        return energy
+
+    def _get_gpu_name(self):
+        """Returns the name of the GPU device"""
         try:
-            processes = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.handle)
+            asic_info = amdsmi.amdsmi_get_gpu_asic_info(self.handle)
+            name = asic_info.get("market_name", "Unknown GPU")
+        except Exception:
+            name = "Unknown GPU"
+        return self._to_utf8(name)
 
-            return [{"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes]
-        except pynvml.NVMLError:
+    def _get_uuid(self):
+        """Returns the globally unique GPU device UUID"""
+        uuid = amdsmi.amdsmi_get_gpu_device_uuid(self.handle)
+        return self._to_utf8(uuid)
+
+    def _get_memory_info(self):
+        """Returns memory info in bytes"""
+        memory_info = amdsmi.amdsmi_get_gpu_vram_usage(self.handle)
+        AMDMemory = namedtuple("AMDMemory", ["total", "used", "free"])
+        # vram_total and vram_used are already in MB
+        total_mb = memory_info["vram_total"]
+        used_mb = memory_info["vram_used"]
+        return AMDMemory(
+            total=total_mb * 1024 * 1024,
+            used=used_mb * 1024 * 1024,
+            free=(total_mb - used_mb) * 1024 * 1024,
+        )
+
+    def _get_temperature(self):
+        """Returns GPU temperature in degrees Celsius"""
+        # amdsmi_get_temp_metric returns temperature in millidegrees Celsius
+        temp_milli_celsius = amdsmi.amdsmi_get_temp_metric(
+            self.handle,
+            sensor_type=amdsmi.AmdSmiTemperatureType.EDGE,
+            metric=amdsmi.AmdSmiTemperatureMetric.CURRENT,
+        )
+        # Convert from millidegrees to degrees
+        return temp_milli_celsius // 1000
+
+    def _get_power_usage(self):
+        """Returns power usage in milliwatts"""
+        # amdsmi_get_power_info returns power in watts, convert to milliwatts
+        power_info = amdsmi.amdsmi_get_power_info(self.handle)
+        return int(power_info["average_socket_power"] * 1000)
+
+    def _get_power_limit(self):
+        """Returns max power usage in milliwatts"""
+        # Get power cap info which contains power_cap in uW (microwatts)
+        try:
+            power_cap_info = amdsmi.amdsmi_get_power_cap_info(self.handle)
+            # power_cap is in uW, convert to mW
+            return int(power_cap_info["power_cap"] / 1000)
+        except Exception:
+            logger.warning("Failed to retrieve gpu power cap", exc_info=True)
+            return None
+
+    def _get_gpu_utilization(self):
+        """Returns the % of utilization of the kernels during the last sample"""
+        activity = amdsmi.amdsmi_get_gpu_activity(self.handle)
+        return activity["gfx_activity"]
+
+    def _get_compute_mode(self):
+        """Returns the compute mode of the GPU"""
+        return None
+
+    def _get_compute_processes(self):
+        """Returns the list of processes ids having a compute context on the device with the memory used"""
+        try:
+            processes = amdsmi.amdsmi_get_gpu_process_list(self.handle)
+            return [{"pid": p["pid"], "used_memory": p["mem"]} for p in processes]
+        except Exception:
+            # logger.warning("Failed to retrieve gpu compute processes", exc_info=True)
+            return []
+
+    def _get_graphics_processes(self):
+        """Returns the list of processes ids having a graphics context on the device with the memory used"""
+        try:
+            processes = amdsmi.amdsmi_get_gpu_process_list(self.handle)
+            return [
+                {"pid": p["pid"], "used_memory": p["mem"]}
+                for p in processes
+                if p["engine_usage"].get("gfx", 0) > 0
+            ]
+        except Exception:
+            # logger.warning("Failed to retrieve gpu graphics processes", exc_info=True)
             return []
 
 
 class AllGPUDevices:
+    device_count: int
+    devices: List[GPUDevice]
+
     def __init__(self) -> None:
-        if is_gpu_details_available():
+        gpu_details_available = is_gpu_details_available()
+        if gpu_details_available:
             logger.debug("GPU available. Starting setup")
-            self.device_count = pynvml.nvmlDeviceGetCount()
         else:
             logger.error("There is no GPU available")
-            self.device_count = 0
         self.devices = []
-        for i in range(self.device_count):
-            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
-            gpu_device = GPUDevice(handle=handle, gpu_index=i)
-            self.devices.append(gpu_device)
+
+        if PYNVML_AVAILABLE:
+            logger.debug("PyNVML available. Starting setup")
+            pynvml.nvmlInit()
+            nvidia_devices_count = pynvml.nvmlDeviceGetCount()
+            for i in range(nvidia_devices_count):
+                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+                nvidia_gpu_device = NvidiaGPUDevice(handle=handle, gpu_index=i)
+                self.devices.append(nvidia_gpu_device)
+
+        if AMDSMI_AVAILABLE:
+            logger.debug("AMDSMI available. Starting setup")
+            try:
+                amdsmi.amdsmi_init()
+                amd_devices_handles = amdsmi.amdsmi_get_processor_handles()
+                if len(amd_devices_handles) == 0:
+                    logger.warning(
+                        "No AMD GPUs found on machine with amdsmi_get_processor_handles()!"
+                    )
+                else:
+                    for i, handle in enumerate(amd_devices_handles):
+                        logger.debug(
+                            f"Found AMD GPU device with handle {handle} and index {i}: {amdsmi.amdsmi_get_gpu_device_uuid(handle)}"
+                        )
+                        amd_gpu_device = AMDGPUDevice(handle=handle, gpu_index=i)
+                        self.devices.append(amd_gpu_device)
+            except amdsmi.AmdSmiException as e:
+                logger.warning(f"Failed to initialize AMDSMI: {e}", exc_info=True)
+        self.device_count = len(self.devices)
 
     def get_gpu_static_info(self) -> List:
         """Get all GPUs static information.
@@ -239,7 +420,7 @@
                 devices_static_info.append(gpu_device.get_static_details())
             return devices_static_info
 
-        except pynvml.NVMLError:
+        except Exception:
             logger.warning("Failed to retrieve gpu static info", exc_info=True)
             return []
 
@@ -267,11 +448,11 @@
         try:
             devices_info = []
             for i in range(self.device_count):
-                gpu_device: GPUDevice = self.devices[i]
+                gpu_device = self.devices[i]
                 devices_info.append(gpu_device.get_gpu_details())
             return devices_info
 
-        except pynvml.NVMLError:
+        except Exception:
             logger.warning("Failed to retrieve gpu information", exc_info=True)
             return []
 
@@ -290,20 +471,15 @@ def get_delta(self, last_duration: Time) -> List:
         try:
             devices_info = []
             for i in range(self.device_count):
-                gpu_device: GPUDevice = self.devices[i]
+                gpu_device = self.devices[i]
                 devices_info.append(gpu_device.delta(last_duration))
             return devices_info
 
-        except pynvml.NVMLError:
+        except Exception:
             logger.warning("Failed to retrieve gpu information", exc_info=True)
             return []
 
 
 def is_gpu_details_available() -> bool:
     """Returns True if the GPU details are available."""
-    try:
-        pynvml.nvmlInit()
-        return True
-
-    except pynvml.NVMLError:
-        return False
+    return PYNVML_AVAILABLE or AMDSMI_AVAILABLE
diff --git a/codecarbon/core/resource_tracker.py b/codecarbon/core/resource_tracker.py
index 120faf4ef..cbf61de18 100644
--- a/codecarbon/core/resource_tracker.py
+++ b/codecarbon/core/resource_tracker.py
@@ -209,14 +209,20 @@ def set_CPU_tracking(self):
 
     def set_GPU_tracking(self):
         logger.info("[setup] GPU Tracking...")
-        if self.tracker._gpu_ids:
+        if isinstance(self.tracker._gpu_ids, str):
             self.tracker._gpu_ids = parse_gpu_ids(self.tracker._gpu_ids)
-        if self.tracker._gpu_ids:
self.tracker._conf["gpu_ids"] = self.tracker._gpu_ids - self.tracker._conf["gpu_count"] = len(self.tracker._gpu_ids) - - if gpu.is_gpu_details_available(): - logger.info("Tracking Nvidia GPU via pynvml") + self.tracker._conf["gpu_ids"] = self.tracker._gpu_ids + self.tracker._conf["gpu_count"] = len(self.tracker._gpu_ids) + + is_nvidia = gpu.is_nvidia_system() + is_rocm = gpu.is_rocm_system() + if is_nvidia or is_rocm: + if is_nvidia: + logger.info("Tracking Nvidia GPUs via PyNVML") + self.gpu_tracker = "pynvml" + else: + logger.info("Tracking AMD GPUs via AMDSMI") + self.gpu_tracker = "amdsmi" gpu_devices = GPU.from_utils(self.tracker._gpu_ids) self.tracker._hardware.append(gpu_devices) gpu_names = [n["name"] for n in gpu_devices.devices.get_gpu_static_info()] @@ -224,11 +230,9 @@ def set_GPU_tracking(self): self.tracker._conf["gpu_model"] = "".join( [f"{i} x {name}" for name, i in gpu_names_dict.items()] ) - if self.tracker._conf.get("gpu_count") is None: - self.tracker._conf["gpu_count"] = len( - gpu_devices.devices.get_gpu_static_info() - ) - self.gpu_tracker = "pynvml" + self.tracker._conf["gpu_count"] = len( + gpu_devices.devices.get_gpu_static_info() + ) else: logger.info("No GPU found.") diff --git a/codecarbon/emissions_tracker.py b/codecarbon/emissions_tracker.py index fb6e0d392..3fdd4f1b8 100644 --- a/codecarbon/emissions_tracker.py +++ b/codecarbon/emissions_tracker.py @@ -369,7 +369,6 @@ def __init__( self._tasks: Dict[str, Task] = {} self._active_task: Optional[str] = None self._active_task_emissions_at_start: Optional[EmissionsData] = None - # Tracking mode detection self._hardware = [] resource_tracker = ResourceTracker(self)