From 697870ff0c6f014cf6b32d4505221947a2355b00 Mon Sep 17 00:00:00 2001 From: Tuomas Katila Date: Tue, 18 Nov 2025 11:57:52 +0200 Subject: [PATCH 1/4] gpu: remove unused constants Signed-off-by: Tuomas Katila --- cmd/gpu_plugin/gpu_plugin.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/cmd/gpu_plugin/gpu_plugin.go b/cmd/gpu_plugin/gpu_plugin.go index 619fde38b..42caec066 100644 --- a/cmd/gpu_plugin/gpu_plugin.go +++ b/cmd/gpu_plugin/gpu_plugin.go @@ -43,8 +43,6 @@ const ( devfsDriDirectory = "/dev/dri" wslDxgPath = "/dev/dxg" wslLibPath = "/usr/lib/wsl" - nfdFeatureDir = "/etc/kubernetes/node-feature-discovery/features.d" - resourceFilename = "intel-gpu-resources.txt" gpuDeviceRE = `^card[0-9]+$` controlDeviceRE = `^controlD[0-9]+$` pciAddressRE = "^[0-9a-f]{4}:[0-9a-f]{2}:[0-9a-f]{2}\\.[0-9a-f]{1}$" From 592f86d9bcefdaeb8476c9530827012e450cc728 Mon Sep 17 00:00:00 2001 From: Tuomas Katila Date: Tue, 18 Nov 2025 11:59:32 +0200 Subject: [PATCH 2/4] gpu: add support for additional temperature limits Use the existing "temp-limit" as the global limit, and introduce GPU and memory thresholds. 
Signed-off-by: Tuomas Katila --- cmd/gpu_plugin/gpu_plugin.go | 14 ++++++++++---- cmd/gpu_plugin/gpu_plugin_test.go | 29 ++++++++++++++++++++++++----- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/cmd/gpu_plugin/gpu_plugin.go b/cmd/gpu_plugin/gpu_plugin.go index 42caec066..65aa8da9d 100644 --- a/cmd/gpu_plugin/gpu_plugin.go +++ b/cmd/gpu_plugin/gpu_plugin.go @@ -70,7 +70,9 @@ type cliOptions struct { allowIDs string denyIDs string sharedDevNum int - temperatureLimit int + globalTempLimit int + memoryTempLimit int + gpuTempLimit int enableMonitoring bool wslScan bool healthManagement bool @@ -402,13 +404,15 @@ func (dp *devicePlugin) healthStatusForCard(cardPath string) string { return health } - limit := float64(dp.options.temperatureLimit) + globalTempLimit := float64(dp.options.globalTempLimit) + memoryTempLimit := float64(dp.options.memoryTempLimit) + gpuTempLimit := float64(dp.options.gpuTempLimit) // Temperatures for different areas klog.V(4).Infof("Temperatures: Memory=%.1fC, GPU=%.1fC, Global=%.1fC", deviceTemps.Memory, deviceTemps.GPU, deviceTemps.Global) - if deviceTemps.GPU > limit || deviceTemps.Global > limit || deviceTemps.Memory > limit { + if deviceTemps.GPU > gpuTempLimit || deviceTemps.Global > globalTempLimit || deviceTemps.Memory > memoryTempLimit { health = pluginapi.Unhealthy } @@ -784,7 +788,9 @@ func main() { flag.BoolVar(&opts.healthManagement, "health-management", false, "enable GPU health management") flag.BoolVar(&opts.wslScan, "wsl", false, "scan for / use WSL devices") flag.IntVar(&opts.sharedDevNum, "shared-dev-num", 1, "number of containers sharing the same GPU device") - flag.IntVar(&opts.temperatureLimit, "temp-limit", 100, "temperature limit at which device is marked unhealthy") + flag.IntVar(&opts.globalTempLimit, "temp-limit", 100, "Global temperature limit at which device is marked unhealthy") + flag.IntVar(&opts.gpuTempLimit, "gpu-temp-limit", 100, "GPU temperature limit at which device is marked unhealthy") + 
flag.IntVar(&opts.memoryTempLimit, "memory-temp-limit", 100, "Memory temperature limit at which device is marked unhealthy") flag.StringVar(&opts.preferredAllocationPolicy, "allocation-policy", "none", "modes of allocating GPU devices: balanced, packed and none") flag.StringVar(&opts.allowIDs, "allow-ids", "", "comma-separated list of device IDs to allow (e.g. 0x49c5,0x49c6)") flag.StringVar(&opts.denyIDs, "deny-ids", "", "comma-separated list of device IDs to deny (e.g. 0x49c5,0x49c6)") diff --git a/cmd/gpu_plugin/gpu_plugin_test.go b/cmd/gpu_plugin/gpu_plugin_test.go index beb9f7262..86fd48e65 100644 --- a/cmd/gpu_plugin/gpu_plugin_test.go +++ b/cmd/gpu_plugin/gpu_plugin_test.go @@ -58,10 +58,11 @@ func (n *mockNotifier) Notify(newDeviceTree dpapi.DeviceTree) { } type mockL0Service struct { - indices []uint32 - memSize uint64 - healthy bool - fail bool + indices []uint32 + memSize uint64 + healthy bool + failTemp bool + fail bool } func (m *mockL0Service) Run(keep bool) { @@ -83,7 +84,7 @@ func (m *mockL0Service) GetDeviceHealth(bdfAddress string) (levelzeroservice.Dev return levelzeroservice.DeviceHealth{Memory: m.healthy, Bus: m.healthy, SoC: m.healthy}, nil } func (m *mockL0Service) GetDeviceTemperature(bdfAddress string) (levelzeroservice.DeviceTemperature, error) { - if m.fail { + if m.fail || m.failTemp { return levelzeroservice.DeviceTemperature{}, errors.Errorf("error, error") } @@ -608,6 +609,24 @@ func TestScanWithHealth(t *testing.T) { healthy: true, }, }, + { + name: "one device with failure on temp reading", + pciAddresses: map[string]string{"0000:00:00.0": "card0"}, + sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64"}, + sysfsfiles: map[string][]byte{ + "card0/device/vendor": []byte("0x8086"), + }, + devfsdirs: []string{ + "card0", + "by-path/pci-0000:00:00.0-card", + "by-path/pci-0000:00:00.0-render", + }, + expectedI915Devs: 1, + l0mock: &mockL0Service{ + healthy: true, + failTemp: true, + }, + }, { name: "one unhealthy 
device with proper symlink", pciAddresses: map[string]string{"0000:00:00.0": "card0"}, From 43a11ad37222a0d8ee660de1277017be0bbbc963 Mon Sep 17 00:00:00 2001 From: Tuomas Katila Date: Wed, 19 Nov 2025 12:14:09 +0200 Subject: [PATCH 3/4] gpu: levelzero: update compute-runtime components Fix uninitialized variable that caused random behaviour. Signed-off-by: Tuomas Katila --- build/docker/intel-gpu-levelzero.Dockerfile | 14 +++++++------- .../templates/intel-gpu-levelzero.Dockerfile.in | 14 +++++++------- cmd/gpu_levelzero/zes.c | 15 +++++++++++---- 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/build/docker/intel-gpu-levelzero.Dockerfile b/build/docker/intel-gpu-levelzero.Dockerfile index cf04f45f9..3951ef0a3 100644 --- a/build/docker/intel-gpu-levelzero.Dockerfile +++ b/build/docker/intel-gpu-levelzero.Dockerfile @@ -44,13 +44,13 @@ RUN if [ $ROCKYLINUX -eq 0 ]; then \ LATEST_GO=$(curl --no-progress-meter https://go.dev/dl/?mode=json | jq ".[] | select(.version | startswith(\"go${CGO_VERSION}\")).version" | tr -d "\"") && \ wget -q https://go.dev/dl/$LATEST_GO.linux-amd64.tar.gz -O - | tar -xz -C /usr/local && \ cd /runtime && \ - wget -q https://github.com/intel/compute-runtime/releases/download/25.09.32961.7/intel-level-zero-gpu_1.6.32961.7_amd64.deb && \ - wget -q https://github.com/intel/compute-runtime/releases/download/25.09.32961.7/intel-opencl-icd_25.09.32961.7_amd64.deb && \ - wget -q https://github.com/intel/compute-runtime/releases/download/25.09.32961.7/libigdgmm12_22.6.0_amd64.deb && \ - wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.20.2/level-zero-devel_1.20.2+u22.04_amd64.deb && \ - wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.20.2/level-zero_1.20.2+u22.04_amd64.deb && \ - wget -q https://github.com/intel/intel-graphics-compiler/releases/download/v2.8.3/intel-igc-core-2_2.8.3+18762_amd64.deb && \ - wget -q 
https://github.com/intel/intel-graphics-compiler/releases/download/v2.8.3/intel-igc-opencl-2_2.8.3+18762_amd64.deb && \ + wget -q https://github.com/intel/intel-graphics-compiler/releases/download/v2.20.3/intel-igc-core-2_2.20.3+19972_amd64.deb && \ + wget -q https://github.com/intel/intel-graphics-compiler/releases/download/v2.20.3/intel-igc-opencl-2_2.20.3+19972_amd64.deb && \ + wget -q https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/intel-opencl-icd_25.40.35563.4-0_amd64.deb && \ + wget -q https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/libigdgmm12_22.8.2_amd64.deb && \ + wget -q https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/libze-intel-gpu1_25.40.35563.4-0_amd64.deb && \ + wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero_1.24.3+u22.04_amd64.deb && \ + wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero-devel_1.24.3+u22.04_amd64.deb && \ dpkg -i *.deb && \ rm -rf /var/lib/apt/lists/\*; \ else \ diff --git a/build/docker/templates/intel-gpu-levelzero.Dockerfile.in b/build/docker/templates/intel-gpu-levelzero.Dockerfile.in index d109c619c..fd77fa8fa 100644 --- a/build/docker/templates/intel-gpu-levelzero.Dockerfile.in +++ b/build/docker/templates/intel-gpu-levelzero.Dockerfile.in @@ -37,13 +37,13 @@ RUN if [ $ROCKYLINUX -eq 0 ]; then \N LATEST_GO=$(curl --no-progress-meter https://go.dev/dl/?mode=json | jq ".[] | select(.version | startswith(\"go${CGO_VERSION}\")).version" | tr -d "\"") && \N wget -q https://go.dev/dl/$LATEST_GO.linux-amd64.tar.gz -O - | tar -xz -C /usr/local && \N cd /runtime && \N - wget -q https://github.com/intel/compute-runtime/releases/download/25.09.32961.7/intel-level-zero-gpu_1.6.32961.7_amd64.deb && \N - wget -q https://github.com/intel/compute-runtime/releases/download/25.09.32961.7/intel-opencl-icd_25.09.32961.7_amd64.deb && \N - wget -q 
https://github.com/intel/compute-runtime/releases/download/25.09.32961.7/libigdgmm12_22.6.0_amd64.deb && \N - wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.20.2/level-zero-devel_1.20.2+u22.04_amd64.deb && \N - wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.20.2/level-zero_1.20.2+u22.04_amd64.deb && \N - wget -q https://github.com/intel/intel-graphics-compiler/releases/download/v2.8.3/intel-igc-core-2_2.8.3+18762_amd64.deb && \N - wget -q https://github.com/intel/intel-graphics-compiler/releases/download/v2.8.3/intel-igc-opencl-2_2.8.3+18762_amd64.deb && \N + wget -q https://github.com/intel/intel-graphics-compiler/releases/download/v2.20.3/intel-igc-core-2_2.20.3+19972_amd64.deb && \N + wget -q https://github.com/intel/intel-graphics-compiler/releases/download/v2.20.3/intel-igc-opencl-2_2.20.3+19972_amd64.deb && \N + wget -q https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/intel-opencl-icd_25.40.35563.4-0_amd64.deb && \N + wget -q https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/libigdgmm12_22.8.2_amd64.deb && \N + wget -q https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/libze-intel-gpu1_25.40.35563.4-0_amd64.deb && \N + wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero_1.24.3+u22.04_amd64.deb && \N + wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero-devel_1.24.3+u22.04_amd64.deb && \N dpkg -i *.deb && \N rm -rf /var/lib/apt/lists/\*; \N else \N diff --git a/cmd/gpu_levelzero/zes.c b/cmd/gpu_levelzero/zes.c index a4593deb8..5243e0321 100644 --- a/cmd/gpu_levelzero/zes.c +++ b/cmd/gpu_levelzero/zes.c @@ -137,8 +137,12 @@ static ze_result_t enumerate_zes_devices(void) for (uint32_t i = 0; i < count; ++i) { zes_device_handle_t dev_h = zes_handles[i]; - zes_pci_properties_t pci_props; + zes_pci_properties_t pci_props = { + .pNext = NULL, + }; + if (zesDevicePciGetProperties(dev_h, 
&pci_props) != ZE_RESULT_SUCCESS) { + print_log(LOG_WARNING, "Failed to get PCI properties for device %d: %X\n", i, res); continue; } @@ -332,8 +336,9 @@ bool zes_device_bus_is_healthy(char* bdf_address, uint32_t* error) return true; } - zes_pci_state_t pci_state; - memset(&pci_state, 0, sizeof(pci_state)); + zes_pci_state_t pci_state = { + .pNext = NULL, + }; ze_result_t res = zesDevicePciGetState(handle, &pci_state); if (res == ZE_RESULT_SUCCESS) { @@ -409,7 +414,9 @@ double zes_device_temp_max(char* bdf_address, char* sensor, uint32_t* error) } for (uint32_t i = 0; i < count; ++i) { - zes_temp_properties_t props; + zes_temp_properties_t props = { + .pNext = NULL, + }; res = zesTemperatureGetProperties(tempHandles[i], &props); if (res != ZE_RESULT_SUCCESS) { From f27095642429984fe2905d4d783a895fc9fc675c Mon Sep 17 00:00:00 2001 From: Tuomas Katila Date: Wed, 19 Nov 2025 12:36:15 +0200 Subject: [PATCH 4/4] build: levelzero: re-download compute-runtime and others By re-downloading the components, we save on the overall container size. While the build time increases slightly, the container size drops by around 100M (520->420). 
Signed-off-by: Tuomas Katila --- build/docker/intel-gpu-levelzero.Dockerfile | 17 ++++++++++++++--- .../templates/intel-gpu-levelzero.Dockerfile.in | 17 ++++++++++++++--- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/build/docker/intel-gpu-levelzero.Dockerfile b/build/docker/intel-gpu-levelzero.Dockerfile index 3951ef0a3..3737d7787 100644 --- a/build/docker/intel-gpu-levelzero.Dockerfile +++ b/build/docker/intel-gpu-levelzero.Dockerfile @@ -52,6 +52,7 @@ RUN if [ $ROCKYLINUX -eq 0 ]; then \ wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero_1.24.3+u22.04_amd64.deb && \ wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero-devel_1.24.3+u22.04_amd64.deb && \ dpkg -i *.deb && \ + rm -f *.deb && \ rm -rf /var/lib/apt/lists/\*; \ else \ source /etc/os-release && dnf install -y gcc jq wget 'dnf-command(config-manager)' && \ @@ -83,9 +84,19 @@ ARG CMD ARG ROCKYLINUX COPY --from=builder /runtime /runtime RUN if [ $ROCKYLINUX -eq 0 ]; then \ - apt-get update && apt-get install --no-install-recommends -y ocl-icd-libopencl1 && \ - rm /runtime/level-zero-devel_*.deb && \ - cd /runtime && dpkg -i *.deb && rm -rf /runtime && \ + apt-get update && apt-get install --no-install-recommends -y ocl-icd-libopencl1 wget ca-certificates && \ + cd /runtime && \ + wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.20.3/intel-igc-core-2_2.20.3+19972_amd64.deb && \ + wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.20.3/intel-igc-opencl-2_2.20.3+19972_amd64.deb && \ + wget https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/intel-opencl-icd_25.40.35563.4-0_amd64.deb && \ + wget https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/libigdgmm12_22.8.2_amd64.deb && \ + wget https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/libze-intel-gpu1_25.40.35563.4-0_amd64.deb && \ + wget 
https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero_1.24.3+u22.04_amd64.deb && \ + dpkg -i *.deb && \ + apt-get -y remove wget ca-certificates && \ + apt-get -y autoremove && \ + rm -f *.deb && \ + rm -rf /var/lib/apt/lists/\* && \ rm "/lib/x86_64-linux-gnu/libze_validation"* && rm "/lib/x86_64-linux-gnu/libze_tracing_layer"*; \ else \ cp -a /runtime//*.so* /usr/lib64/ && cp -a /runtime/OpenCL /etc/ && cp -a /runtime/licenses/* /usr/share/licenses/; \ diff --git a/build/docker/templates/intel-gpu-levelzero.Dockerfile.in b/build/docker/templates/intel-gpu-levelzero.Dockerfile.in index fd77fa8fa..c72ee067b 100644 --- a/build/docker/templates/intel-gpu-levelzero.Dockerfile.in +++ b/build/docker/templates/intel-gpu-levelzero.Dockerfile.in @@ -45,6 +45,7 @@ RUN if [ $ROCKYLINUX -eq 0 ]; then \N wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero_1.24.3+u22.04_amd64.deb && \N wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero-devel_1.24.3+u22.04_amd64.deb && \N dpkg -i *.deb && \N + rm -f *.deb && \N rm -rf /var/lib/apt/lists/\*; \N else \N source /etc/os-release && dnf install -y gcc jq wget 'dnf-command(config-manager)' && \N @@ -80,9 +81,19 @@ ARG ROCKYLINUX COPY --from=builder /runtime /runtime RUN if [ $ROCKYLINUX -eq 0 ]; then \N - apt-get update && apt-get install --no-install-recommends -y ocl-icd-libopencl1 && \N - rm /runtime/level-zero-devel_*.deb && \N - cd /runtime && dpkg -i *.deb && rm -rf /runtime && \N + apt-get update && apt-get install --no-install-recommends -y ocl-icd-libopencl1 wget ca-certificates && \N + cd /runtime && \N + wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.20.3/intel-igc-core-2_2.20.3+19972_amd64.deb && \N + wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.20.3/intel-igc-opencl-2_2.20.3+19972_amd64.deb && \N + wget 
https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/intel-opencl-icd_25.40.35563.4-0_amd64.deb && \N + wget https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/libigdgmm12_22.8.2_amd64.deb && \N + wget https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/libze-intel-gpu1_25.40.35563.4-0_amd64.deb && \N + wget https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero_1.24.3+u22.04_amd64.deb && \N + dpkg -i *.deb && \N + apt-get -y remove wget ca-certificates && \N + apt-get -y autoremove && \N + rm -f *.deb && \N + rm -rf /var/lib/apt/lists/\* && \N rm "/lib/x86_64-linux-gnu/libze_validation"* && rm "/lib/x86_64-linux-gnu/libze_tracing_layer"*; \N else \N cp -a /runtime//*.so* /usr/lib64/ && cp -a /runtime/OpenCL /etc/ && cp -a /runtime/licenses/* /usr/share/licenses/; \N