240 changes: 208 additions & 32 deletions codecarbon/core/gpu.py
@@ -1,12 +1,74 @@
import subprocess
from collections import namedtuple
from dataclasses import dataclass, field
from typing import Any, Dict, List, Union

from codecarbon.core.units import Energy, Power, Time
from codecarbon.external.logger import logger


def is_rocm_system():
    """Returns True if the system has a rocm-smi interface."""
try:
# Check if rocm-smi is available
subprocess.check_output(["rocm-smi", "--help"])
return True
except (subprocess.CalledProcessError, OSError):
return False


def is_nvidia_system():
"""Returns True if the system has an nvidia-smi interface."""
try:
# Check if nvidia-smi is available
subprocess.check_output(["nvidia-smi", "--help"])
return True
except Exception:
return False


try:
import pynvml

pynvml.nvmlInit()
PYNVML_AVAILABLE = True
except ImportError:
if is_nvidia_system():
logger.warning(
"Nvidia GPU detected but pynvml is not available. "
"Please install pynvml to get GPU metrics."
)
PYNVML_AVAILABLE = False
except Exception:
if is_nvidia_system():
logger.warning(
"Nvidia GPU detected but pynvml initialization failed. "
"Please ensure NVIDIA drivers are properly installed."
)
PYNVML_AVAILABLE = False

try:
import amdsmi

AMDSMI_AVAILABLE = True
except ImportError:
if is_rocm_system():
logger.warning(
"AMD GPU detected but amdsmi is not available. "
"Please install amdsmi to get GPU metrics."
)
AMDSMI_AVAILABLE = False
except AttributeError as e:
    # In some environments, amdsmi may be present but not properly configured,
    # leading to an AttributeError at import time.
    logger.warning(
        "AMD GPU detected but amdsmi is not properly configured. "
        "Please ensure amdsmi is correctly installed to get GPU metrics. "
        "Tips: check consistency between the Python amdsmi package and the ROCm version, "
        "and ensure AMD drivers are up to date."
        f" Error: {e}"
    )
AMDSMI_AVAILABLE = False


@dataclass
class GPUDevice:
"""
@@ -27,10 +89,10 @@ class GPUDevice:

    handle: Any
gpu_index: int
# Energy consumed in kWh
energy_delta: Energy = field(default_factory=lambda: Energy(0))
# Power based on reading
power: Power = field(default_factory=lambda: Power(0))
# Last energy reading in kWh
last_energy: Energy = field(default_factory=lambda: Energy(0))

@@ -108,7 +170,10 @@ def _to_utf8(self, str_or_bytes) -> Any:

return str_or_bytes



@dataclass
class NvidiaGPUDevice(GPUDevice):
    def _get_total_energy_consumption(self) -> int:
"""Returns total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded
https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g732ab899b5bd18ac4bfb93c02de4900a
"""
@@ -166,6 +231,7 @@ def _get_power_limit(self) -> Union[int, None]:
try:
return pynvml.nvmlDeviceGetEnforcedPowerLimit(self.handle)
except Exception:
logger.warning("Failed to retrieve gpu power limit", exc_info=True)
return None

def _get_gpu_utilization(self):
@@ -185,39 +251,154 @@ def _get_compute_processes(self) -> List:
device with the memory used
https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g46ceaea624d5c96e098e03c453419d68
"""
processes = pynvml.nvmlDeviceGetComputeRunningProcesses(self.handle)
return [{"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes]

def _get_graphics_processes(self) -> List:
"""Returns the list of processes ids having a graphics context on the
device with the memory used
https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7eacf7fa7ba4f4485d166736bf31195e
"""
processes = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.handle)
return [{"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes]


class AMDGPUDevice(GPUDevice):
    def _get_total_energy_consumption(self):
        """Returns total energy consumption for this GPU in millijoules (mJ).
        The amdsmi energy counter resolution is roughly 15.3 microjoules per tick.
        """
energy_count = amdsmi.amdsmi_get_energy_count(self.handle)
# energy_count contains 'power' and 'counter_resolution'
# Result is in uJ (microjoules), convert to mJ
energy = energy_count["power"] * energy_count["counter_resolution"] / 1000
return energy
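        # Illustrative sanity check of the uJ -> mJ conversion above (made-up
        # numbers, not from a real device): with power=1_000_000 counter ticks and
        # counter_resolution=15.3 uJ per tick, the accumulated energy is
        #   1_000_000 * 15.3 = 15_300_000 uJ = 15_300 mJ = 15.3 J,
        # i.e. about 4.25e-6 kWh once converted (1 kWh = 3.6e9 mJ).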

    def _get_gpu_name(self):
        """Returns the name of the GPU device"""
        try:
            asic_info = amdsmi.amdsmi_get_gpu_asic_info(self.handle)
            name = asic_info.get("market_name", "Unknown GPU")
        except Exception:
            name = "Unknown GPU"
        return self._to_utf8(name)

    def _get_uuid(self):
        """Returns the globally unique GPU device UUID"""
        uuid = amdsmi.amdsmi_get_gpu_device_uuid(self.handle)
        return self._to_utf8(uuid)

def _get_memory_info(self):
"""Returns memory info in bytes"""
memory_info = amdsmi.amdsmi_get_gpu_vram_usage(self.handle)
AMDMemory = namedtuple("AMDMemory", ["total", "used", "free"])
# vram_total and vram_used are already in MB
total_mb = memory_info["vram_total"]
used_mb = memory_info["vram_used"]
return AMDMemory(
total=total_mb * 1024 * 1024,
used=used_mb * 1024 * 1024,
free=(total_mb - used_mb) * 1024 * 1024,
)

    def _get_temperature(self):
        """Returns the GPU edge temperature in degrees Celsius."""
# amdsmi_get_temp_metric returns temperature in millidegrees Celsius
temp_milli_celsius = amdsmi.amdsmi_get_temp_metric(
self.handle,
sensor_type=amdsmi.AmdSmiTemperatureType.EDGE,
metric=amdsmi.AmdSmiTemperatureMetric.CURRENT,
)
# Convert from millidegrees to degrees
return temp_milli_celsius // 1000

def _get_power_usage(self):
"""Returns power usage in milliwatts"""
# amdsmi_get_power_info returns power in watts, convert to milliwatts
power_info = amdsmi.amdsmi_get_power_info(self.handle)
return int(power_info["average_socket_power"] * 1000)

def _get_power_limit(self):
"""Returns max power usage in milliwatts"""
# Get power cap info which contains power_cap in uW (microwatts)
try:
power_cap_info = amdsmi.amdsmi_get_power_cap_info(self.handle)
# power_cap is in uW, convert to mW
return int(power_cap_info["power_cap"] / 1000)
except Exception:
logger.warning("Failed to retrieve gpu power cap", exc_info=True)
return None
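        # Worked unit check (hypothetical cap, not a measured value): a 300 W power
        # cap is reported by amdsmi as 300_000_000 uW, and 300_000_000 / 1000 gives
        # the 300_000 mW expected by callers, consistent with the milliwatts
        # returned by _get_power_usage() above.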

def _get_gpu_utilization(self):
"""Returns the % of utilization of the kernels during the last sample"""
activity = amdsmi.amdsmi_get_gpu_activity(self.handle)
return activity["gfx_activity"]

def _get_compute_mode(self):
"""Returns the compute mode of the GPU"""
return None

def _get_compute_processes(self):
"""Returns the list of processes ids having a compute context on the device with the memory used"""
try:
processes = amdsmi.amdsmi_get_gpu_process_list(self.handle)
return [{"pid": p["pid"], "used_memory": p["mem"]} for p in processes]
except Exception:
# logger.warning("Failed to retrieve gpu compute processes", exc_info=True)
return []

def _get_graphics_processes(self):
"""Returns the list of processes ids having a graphics context on the device with the memory used"""
try:
processes = amdsmi.amdsmi_get_gpu_process_list(self.handle)
return [
{"pid": p["pid"], "used_memory": p["mem"]}
for p in processes
if p["engine_usage"].get("gfx", 0) > 0
]
except Exception:
# logger.warning("Failed to retrieve gpu graphics processes", exc_info=True)
return []


class AllGPUDevices:
device_count: int
devices: List[GPUDevice]

def __init__(self) -> None:
        gpu_details_available = is_gpu_details_available()
        if gpu_details_available:
            logger.debug("GPU available. Starting setup")
        else:
            logger.error("There is no GPU available")
        self.devices = []

if PYNVML_AVAILABLE:
logger.debug("PyNVML available. Starting setup")
pynvml.nvmlInit()
nvidia_devices_count = pynvml.nvmlDeviceGetCount()
for i in range(nvidia_devices_count):
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
nvidia_gpu_device = NvidiaGPUDevice(handle=handle, gpu_index=i)
self.devices.append(nvidia_gpu_device)

if AMDSMI_AVAILABLE:
logger.debug("AMDSMI available. Starting setup")
try:
amdsmi.amdsmi_init()
amd_devices_handles = amdsmi.amdsmi_get_processor_handles()
if len(amd_devices_handles) == 0:
                    logger.warning(
                        "amdsmi_get_processor_handles() found no AMD GPUs on this machine."
                    )
else:
for i, handle in enumerate(amd_devices_handles):
                        logger.debug(
                            f"Found AMD GPU device with handle {handle} and index {i}: {amdsmi.amdsmi_get_gpu_device_uuid(handle)}"
                        )
amd_gpu_device = AMDGPUDevice(handle=handle, gpu_index=i)
self.devices.append(amd_gpu_device)
except amdsmi.AmdSmiException as e:
logger.warning(f"Failed to initialize AMDSMI: {e}", exc_info=True)
self.device_count = len(self.devices)

def get_gpu_static_info(self) -> List:
"""Get all GPUs static information.
@@ -239,7 +420,7 @@ def get_gpu_static_info(self) -> List:
devices_static_info.append(gpu_device.get_static_details())
return devices_static_info

except Exception:
logger.warning("Failed to retrieve gpu static info", exc_info=True)
return []

@@ -267,11 +448,11 @@ def get_gpu_details(self) -> List:
try:
devices_info = []
for i in range(self.device_count):
gpu_device = self.devices[i]
devices_info.append(gpu_device.get_gpu_details())
return devices_info

except Exception:
logger.warning("Failed to retrieve gpu information", exc_info=True)
return []

@@ -290,20 +471,15 @@ def get_delta(self, last_duration: Time) -> List:
try:
devices_info = []
for i in range(self.device_count):
gpu_device = self.devices[i]
devices_info.append(gpu_device.delta(last_duration))
return devices_info

except Exception:
logger.warning("Failed to retrieve gpu information", exc_info=True)
return []


def is_gpu_details_available() -> bool:
"""Returns True if the GPU details are available."""
try:
pynvml.nvmlInit()
return True

except pynvml.NVMLError:
return False
return PYNVML_AVAILABLE or AMDSMI_AVAILABLE
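
A minimal usage sketch of the refactored classes (illustrative only, not part of the diff; it assumes the import-time probing above has set PYNVML_AVAILABLE / AMDSMI_AVAILABLE, and that get_gpu_static_info() and get_gpu_details() each return one entry per device):

from codecarbon.core.gpu import AllGPUDevices, is_gpu_details_available

if is_gpu_details_available():
    gpus = AllGPUDevices()  # builds NvidiaGPUDevice and/or AMDGPUDevice entries
    print(f"{gpus.device_count} GPU(s) detected")
    for static in gpus.get_gpu_static_info():  # per-device static details
        print(static)
    for details in gpus.get_gpu_details():  # live power / energy / memory readings
        print(details)
else:
    print("Neither pynvml nor amdsmi reported a usable GPU")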
28 changes: 16 additions & 12 deletions codecarbon/core/resource_tracker.py
@@ -209,26 +209,30 @@ def set_CPU_tracking(self):

def set_GPU_tracking(self):
logger.info("[setup] GPU Tracking...")
if self.tracker._gpu_ids:
if isinstance(self.tracker._gpu_ids, str):
self.tracker._gpu_ids = parse_gpu_ids(self.tracker._gpu_ids)
if self.tracker._gpu_ids:
self.tracker._conf["gpu_ids"] = self.tracker._gpu_ids
self.tracker._conf["gpu_count"] = len(self.tracker._gpu_ids)

is_nvidia = gpu.is_nvidia_system()
is_rocm = gpu.is_rocm_system()
if is_nvidia or is_rocm:
if is_nvidia:
logger.info("Tracking Nvidia GPUs via PyNVML")
self.gpu_tracker = "pynvml"
else:
logger.info("Tracking AMD GPUs via AMDSMI")
self.gpu_tracker = "amdsmi"
gpu_devices = GPU.from_utils(self.tracker._gpu_ids)
self.tracker._hardware.append(gpu_devices)
gpu_names = [n["name"] for n in gpu_devices.devices.get_gpu_static_info()]
gpu_names_dict = Counter(gpu_names)
self.tracker._conf["gpu_model"] = "".join(
[f"{i} x {name}" for name, i in gpu_names_dict.items()]
)
if self.tracker._conf.get("gpu_count") is None:
self.tracker._conf["gpu_count"] = len(
gpu_devices.devices.get_gpu_static_info()
)
else:
logger.info("No GPU found.")

1 change: 0 additions & 1 deletion codecarbon/emissions_tracker.py
@@ -369,7 +369,6 @@ def __init__(
self._tasks: Dict[str, Task] = {}
self._active_task: Optional[str] = None
self._active_task_emissions_at_start: Optional[EmissionsData] = None

# Tracking mode detection
self._hardware = []
resource_tracker = ResourceTracker(self)