240 changes: 208 additions & 32 deletions codecarbon/core/gpu.py
@@ -1,12 +1,74 @@
import subprocess
from collections import namedtuple
from dataclasses import dataclass, field
from typing import Any, Dict, List, Union

from codecarbon.core.units import Energy, Power, Time
from codecarbon.external.logger import logger


def is_rocm_system():
    """Returns True if the system has a rocm-smi interface."""
try:
# Check if rocm-smi is available
subprocess.check_output(["rocm-smi", "--help"])
return True
except (subprocess.CalledProcessError, OSError):
return False


def is_nvidia_system():
"""Returns True if the system has an nvidia-smi interface."""
try:
# Check if nvidia-smi is available
subprocess.check_output(["nvidia-smi", "--help"])
return True
except Exception:
return False


try:
import pynvml

pynvml.nvmlInit()
PYNVML_AVAILABLE = True
except ImportError:
if is_nvidia_system():
logger.warning(
"Nvidia GPU detected but pynvml is not available. "
"Please install pynvml to get GPU metrics."
)
PYNVML_AVAILABLE = False
except Exception:
if is_nvidia_system():
logger.warning(
"Nvidia GPU detected but pynvml initialization failed. "
"Please ensure NVIDIA drivers are properly installed."
)
PYNVML_AVAILABLE = False

try:
import amdsmi

AMDSMI_AVAILABLE = True
except ImportError:
if is_rocm_system():
logger.warning(
"AMD GPU detected but amdsmi is not available. "
"Please install amdsmi to get GPU metrics."
)
AMDSMI_AVAILABLE = False
except AttributeError as e:
    # In some environments, amdsmi may be present but not properly configured,
    # leading to an AttributeError at import time.
    logger.warning(
        "AMD GPU detected but amdsmi is not properly configured. "
        "Please ensure amdsmi is correctly installed to get GPU metrics. "
        "Tips: check consistency between the Python amdsmi package and the ROCm version, "
        "and ensure AMD drivers are up to date."
        f" Error: {e}"
    )
AMDSMI_AVAILABLE = False


@dataclass
class GPUDevice:
"""
@@ -27,10 +89,10 @@ class GPUDevice:

    handle: Any
gpu_index: int
# Energy consumed in kWh
energy_delta: Energy = field(default_factory=lambda: Energy(0))
# Power based on reading
power: Power = field(default_factory=lambda: Power(0))
# Last energy reading in kWh
last_energy: Energy = field(default_factory=lambda: Energy(0))

@@ -108,7 +170,10 @@ def _to_utf8(self, str_or_bytes) -> Any:

return str_or_bytes



@dataclass
class NvidiaGPUDevice(GPUDevice):
    def _get_total_energy_consumption(self) -> int:
"""Returns total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded
https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g732ab899b5bd18ac4bfb93c02de4900a
"""
@@ -166,6 +231,7 @@ def _get_power_limit(self) -> Union[int, None]:
try:
return pynvml.nvmlDeviceGetEnforcedPowerLimit(self.handle)
except Exception:
logger.warning("Failed to retrieve gpu power limit", exc_info=True)
return None

def _get_gpu_utilization(self):
@@ -185,39 +251,154 @@ def _get_compute_processes(self) -> List:
device with the memory used
https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g46ceaea624d5c96e098e03c453419d68
"""
processes = pynvml.nvmlDeviceGetComputeRunningProcesses(self.handle)
return [{"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes]

def _get_graphics_processes(self) -> List:
"""Returns the list of processes ids having a graphics context on the
device with the memory used
https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7eacf7fa7ba4f4485d166736bf31195e
"""
processes = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.handle)
return [{"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes]


class AMDGPUDevice(GPUDevice):
    def _get_total_energy_consumption(self):
        """Returns total energy consumption for this GPU in millijoules (mJ).
        The amdsmi energy counter resolution is roughly 15.3 microjoules per tick.
        """
energy_count = amdsmi.amdsmi_get_energy_count(self.handle)
# energy_count contains 'power' and 'counter_resolution'
# Result is in uJ (microjoules), convert to mJ
energy = energy_count["power"] * energy_count["counter_resolution"] / 1000
return energy
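        # Illustrative sanity check of the uJ -> mJ conversion above (made-up
        # numbers, not from a real device): with power=1_000_000 counter ticks and
        # counter_resolution=15.3 uJ per tick, the accumulated energy is
        #   1_000_000 * 15.3 = 15_300_000 uJ = 15_300 mJ = 15.3 J,
        # i.e. about 4.25e-6 kWh once converted (1 kWh = 3.6e9 mJ).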

    def _get_gpu_name(self):
        """Returns the name of the GPU device"""
        try:
            asic_info = amdsmi.amdsmi_get_gpu_asic_info(self.handle)
            name = asic_info.get("market_name", "Unknown GPU")
        except Exception:
            name = "Unknown GPU"
        return self._to_utf8(name)

    def _get_uuid(self):
        """Returns the globally unique GPU device UUID"""
        uuid = amdsmi.amdsmi_get_gpu_device_uuid(self.handle)
        return self._to_utf8(uuid)

def _get_memory_info(self):
"""Returns memory info in bytes"""
memory_info = amdsmi.amdsmi_get_gpu_vram_usage(self.handle)
AMDMemory = namedtuple("AMDMemory", ["total", "used", "free"])
# vram_total and vram_used are already in MB
total_mb = memory_info["vram_total"]
used_mb = memory_info["vram_used"]
return AMDMemory(
total=total_mb * 1024 * 1024,
used=used_mb * 1024 * 1024,
free=(total_mb - used_mb) * 1024 * 1024,
)

    def _get_temperature(self):
        """Returns the GPU edge temperature in degrees Celsius."""
# amdsmi_get_temp_metric returns temperature in millidegrees Celsius
temp_milli_celsius = amdsmi.amdsmi_get_temp_metric(
self.handle,
sensor_type=amdsmi.AmdSmiTemperatureType.EDGE,
metric=amdsmi.AmdSmiTemperatureMetric.CURRENT,
)
# Convert from millidegrees to degrees
return temp_milli_celsius // 1000

def _get_power_usage(self):
"""Returns power usage in milliwatts"""
# amdsmi_get_power_info returns power in watts, convert to milliwatts
power_info = amdsmi.amdsmi_get_power_info(self.handle)
return int(power_info["average_socket_power"] * 1000)

def _get_power_limit(self):
"""Returns max power usage in milliwatts"""
# Get power cap info which contains power_cap in uW (microwatts)
try:
power_cap_info = amdsmi.amdsmi_get_power_cap_info(self.handle)
# power_cap is in uW, convert to mW
return int(power_cap_info["power_cap"] / 1000)
except Exception:
logger.warning("Failed to retrieve gpu power cap", exc_info=True)
return None
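        # Worked unit check (hypothetical cap, not a measured value): a 300 W power
        # cap is reported by amdsmi as 300_000_000 uW, and 300_000_000 / 1000 gives
        # the 300_000 mW expected by callers, consistent with the milliwatts
        # returned by _get_power_usage() above.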

def _get_gpu_utilization(self):
"""Returns the % of utilization of the kernels during the last sample"""
activity = amdsmi.amdsmi_get_gpu_activity(self.handle)
return activity["gfx_activity"]

def _get_compute_mode(self):
"""Returns the compute mode of the GPU"""
return None

def _get_compute_processes(self):
"""Returns the list of processes ids having a compute context on the device with the memory used"""
try:
processes = amdsmi.amdsmi_get_gpu_process_list(self.handle)
return [{"pid": p["pid"], "used_memory": p["mem"]} for p in processes]
except Exception:
# logger.warning("Failed to retrieve gpu compute processes", exc_info=True)
return []

def _get_graphics_processes(self):
"""Returns the list of processes ids having a graphics context on the device with the memory used"""
try:
processes = amdsmi.amdsmi_get_gpu_process_list(self.handle)
return [
{"pid": p["pid"], "used_memory": p["mem"]}
for p in processes
if p["engine_usage"].get("gfx", 0) > 0
]
except Exception:
# logger.warning("Failed to retrieve gpu graphics processes", exc_info=True)
return []


class AllGPUDevices:
device_count: int
devices: List[GPUDevice]

def __init__(self) -> None:
        gpu_details_available = is_gpu_details_available()
        if gpu_details_available:
            logger.debug("GPU available. Starting setup")
        else:
            logger.error("There is no GPU available")
        self.devices = []

if PYNVML_AVAILABLE:
logger.debug("PyNVML available. Starting setup")
pynvml.nvmlInit()
nvidia_devices_count = pynvml.nvmlDeviceGetCount()
for i in range(nvidia_devices_count):
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
nvidia_gpu_device = NvidiaGPUDevice(handle=handle, gpu_index=i)
self.devices.append(nvidia_gpu_device)

if AMDSMI_AVAILABLE:
logger.debug("AMDSMI available. Starting setup")
try:
amdsmi.amdsmi_init()
amd_devices_handles = amdsmi.amdsmi_get_processor_handles()
if len(amd_devices_handles) == 0:
                    logger.warning(
                        "amdsmi_get_processor_handles() found no AMD GPUs on this machine."
                    )
else:
for i, handle in enumerate(amd_devices_handles):
                        logger.debug(
                            f"Found AMD GPU device with handle {handle} and index {i}: {amdsmi.amdsmi_get_gpu_device_uuid(handle)}"
                        )
amd_gpu_device = AMDGPUDevice(handle=handle, gpu_index=i)
self.devices.append(amd_gpu_device)
except amdsmi.AmdSmiException as e:
logger.warning(f"Failed to initialize AMDSMI: {e}", exc_info=True)
self.device_count = len(self.devices)

def get_gpu_static_info(self) -> List:
"""Get all GPUs static information.
@@ -239,7 +420,7 @@ def get_gpu_static_info(self) -> List:
devices_static_info.append(gpu_device.get_static_details())
return devices_static_info

except Exception:
logger.warning("Failed to retrieve gpu static info", exc_info=True)
return []

@@ -267,11 +448,11 @@ def get_gpu_details(self) -> List:
try:
devices_info = []
for i in range(self.device_count):
gpu_device = self.devices[i]
devices_info.append(gpu_device.get_gpu_details())
return devices_info

except Exception:
logger.warning("Failed to retrieve gpu information", exc_info=True)
return []

@@ -290,20 +471,15 @@ def get_delta(self, last_duration: Time) -> List:
try:
devices_info = []
for i in range(self.device_count):
gpu_device = self.devices[i]
devices_info.append(gpu_device.delta(last_duration))
return devices_info

except Exception:
logger.warning("Failed to retrieve gpu information", exc_info=True)
return []


def is_gpu_details_available() -> bool:
"""Returns True if the GPU details are available."""
try:
pynvml.nvmlInit()
return True

except pynvml.NVMLError:
return False
return PYNVML_AVAILABLE or AMDSMI_AVAILABLE
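
A minimal usage sketch of the refactored classes (illustrative only, not part of the diff; it assumes the import-time probing above has set PYNVML_AVAILABLE / AMDSMI_AVAILABLE, and that get_gpu_static_info() and get_gpu_details() each return one entry per device):

from codecarbon.core.gpu import AllGPUDevices, is_gpu_details_available

if is_gpu_details_available():
    gpus = AllGPUDevices()  # builds NvidiaGPUDevice and/or AMDGPUDevice entries
    print(f"{gpus.device_count} GPU(s) detected")
    for static in gpus.get_gpu_static_info():  # per-device static details
        print(static)
    for details in gpus.get_gpu_details():  # live power / energy / memory readings
        print(details)
else:
    print("Neither pynvml nor amdsmi reported a usable GPU")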
28 changes: 16 additions & 12 deletions codecarbon/core/resource_tracker.py
@@ -209,26 +209,30 @@ def set_CPU_tracking(self):

def set_GPU_tracking(self):
logger.info("[setup] GPU Tracking...")
if self.tracker._gpu_ids:
if isinstance(self.tracker._gpu_ids, str):
self.tracker._gpu_ids = parse_gpu_ids(self.tracker._gpu_ids)
if self.tracker._gpu_ids:
self.tracker._conf["gpu_ids"] = self.tracker._gpu_ids
self.tracker._conf["gpu_count"] = len(self.tracker._gpu_ids)

is_nvidia = gpu.is_nvidia_system()
is_rocm = gpu.is_rocm_system()
if is_nvidia or is_rocm:
if is_nvidia:
logger.info("Tracking Nvidia GPUs via PyNVML")
self.gpu_tracker = "pynvml"
else:
logger.info("Tracking AMD GPUs via AMDSMI")
self.gpu_tracker = "amdsmi"
gpu_devices = GPU.from_utils(self.tracker._gpu_ids)
self.tracker._hardware.append(gpu_devices)
gpu_names = [n["name"] for n in gpu_devices.devices.get_gpu_static_info()]
gpu_names_dict = Counter(gpu_names)
self.tracker._conf["gpu_model"] = "".join(
[f"{i} x {name}" for name, i in gpu_names_dict.items()]
)
if self.tracker._conf.get("gpu_count") is None:
self.tracker._conf["gpu_count"] = len(
gpu_devices.devices.get_gpu_static_info()
)
else:
logger.info("No GPU found.")

1 change: 0 additions & 1 deletion codecarbon/emissions_tracker.py
@@ -369,7 +369,6 @@ def __init__(
self._tasks: Dict[str, Task] = {}
self._active_task: Optional[str] = None
self._active_task_emissions_at_start: Optional[EmissionsData] = None

# Tracking mode detection
self._hardware = []
resource_tracker = ResourceTracker(self)