Skip to content

Commit b7ffe80

Browse files
tkatilapfl
authored and committed
discover: improve module_id read and make it optional
The previously used /sys/devices/virtual/accel/accel*/device path did not work with the 6.8 kernel (24.04), so we now use /sys/class/accel/accel*/device instead. This change also makes the module_id optional: if we cannot read it, we just ignore it in the metrics. Signed-off-by: Tuomas Katila <[email protected]>
1 parent 30245e3 commit b7ffe80

File tree

1 file changed

+21
-8
lines changed

1 file changed

+21
-8
lines changed

cmd/discover/network.go

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ const (
3434
driverPath = "bus/pci/drivers/habanalabs/"
3535
pciDevicePattern = "????:??:??.?"
3636
netDevicePattern = "net/*"
37-
accelDevicePath = "devices/virtual/accel"
37+
accelDevicePath = "class/accel"
3838
accelDevicePattern = "accel[0-9]*"
3939
accelDeviceDir = "device"
4040
accelModuleIdFile = "module_id"
@@ -90,27 +90,40 @@ func sysfsDriverPath() string {
9090
return filepath.Join(getSysfsRoot(), driverPath)
9191
}
9292

93+
func sysfsClassAccelPath() string {
94+
return filepath.Join(getSysfsRoot(), accelDevicePath)
95+
}
96+
9397
func getModuleIds() (map[string]string, error) {
9498
moduleIds := make(map[string]string)
9599

96-
pattern := filepath.Join(getSysfsRoot(), accelDevicePath, accelDevicePattern, accelDeviceDir)
100+
pattern := filepath.Join(sysfsClassAccelPath(), accelDevicePattern)
97101
paths, err := filepath.Glob(pattern)
98102
if err != nil {
99103
return nil, err
100104
}
101105

102106
for _, path := range paths {
103-
id, err := os.ReadFile(filepath.Join(path, accelModuleIdFile))
107+
id, err := os.ReadFile(filepath.Join(path, accelDeviceDir, accelModuleIdFile))
104108
if err != nil {
105-
return nil, err
109+
klog.Warningf("no module ID found in %s", path)
110+
111+
continue
106112
}
107113

108-
pciaddr, err := os.ReadFile(filepath.Join(path, accelPCIDeviceFile))
114+
pciaddr, err := os.ReadFile(filepath.Join(path, accelDeviceDir, accelPCIDeviceFile))
109115
if err != nil {
110-
return nil, err
116+
klog.Warningf("no PCI address file in %s", path)
117+
118+
continue
111119
}
112120

113-
moduleIds[strings.TrimSpace(string(pciaddr))] = strings.TrimSpace(string(id))
121+
trimPCI := strings.TrimSpace(string(pciaddr))
122+
trimID := strings.TrimSpace(string(id))
123+
124+
klog.V(3).Infof("PCI address %s has module ID: %s", trimPCI, trimID)
125+
126+
moduleIds[trimPCI] = trimID
114127
}
115128

116129
return moduleIds, nil
@@ -157,7 +170,7 @@ func getNetworkConfigs(interfaces []string) ([]string, map[string]*networkConfig
157170
pciaddr := filepath.Base(p)
158171
id, exists := moduleIds[pciaddr]
159172
if !exists {
160-
return nil, nil, fmt.Errorf("PCI device '%s' does not have a module id", pciaddr)
173+
klog.Warningf("PCI device '%s' does not have a module id", pciaddr)
161174
}
162175

163176
devicesymlinktarget, err := filepath.EvalSymlinks(p)

0 commit comments

Comments
 (0)