Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ metadata:
capabilities: Seamless Upgrades
categories: AI/Machine Learning,Monitoring
containerImage: registry.test.pensando.io:5000/amd-gpu-operator:dev
createdAt: "2026-04-06T08:31:30Z"
createdAt: "2026-04-22T01:09:34Z"
description: |-
Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
Expand Down Expand Up @@ -1310,6 +1310,12 @@ spec:
verbs:
- get
- update
- apiGroups:
- resource.k8s.io
resources:
- deviceclasses
verbs:
- create
- apiGroups:
- argoproj.io
resources:
Expand Down
6 changes: 6 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -197,3 +197,9 @@ rules:
verbs:
- get
- update
- apiGroups:
- resource.k8s.io
resources:
- deviceclasses
verbs:
- create
59 changes: 58 additions & 1 deletion internal/controllers/device_config_reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ import (
k8serrors "k8s.io/apimachinery/pkg/api/errors"
meta "k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/rest"
"k8s.io/client-go/util/retry"
Expand Down Expand Up @@ -79,8 +81,15 @@ const (
DeviceConfigReconcilerName = "DriverAndPluginReconciler"
deviceConfigFinalizer = "amd.node.kubernetes.io/deviceconfig-finalizer"
testRunnerNodeLabelPrefix = "testrunner.amd.com"
deviceClassName = "gpu.amd.com"
)

// draDeviceClassGVK is the GroupVersionKind stamped onto the unstructured
// DeviceClass object built by handleDeviceClass (resource.k8s.io/v1, DeviceClass).
var draDeviceClassGVK = schema.GroupVersion{Group: "resource.k8s.io", Version: "v1"}.WithKind("DeviceClass")

// DeviceConfigReconciler reconciles a DeviceConfig object
type DeviceConfigReconciler struct {
client.Client
Expand Down Expand Up @@ -108,7 +117,7 @@ func NewDeviceConfigReconciler(
kmmWatchEnabled bool) *DeviceConfigReconciler {
upgradeMgrHandler := newUpgradeMgrHandler(client, k8sConfig, isOpenShift)
remediationMgrHandler := newRemediationMgrHandler(client, apiReader, k8sConfig, isOpenShift)
helper := newDeviceConfigReconcilerHelper(client, kmmHandler, dpHandler, nlHandler, upgradeMgrHandler, remediationMgrHandler, metricsHandler, testrunnerHandler, configmanagerHandler, workerMgr, kmmWatchEnabled)
helper := newDeviceConfigReconcilerHelper(client, kmmHandler, dpHandler, nlHandler, upgradeMgrHandler, remediationMgrHandler, metricsHandler, testrunnerHandler, configmanagerHandler, workerMgr, isOpenShift, kmmWatchEnabled)
podEventHandler := watchers.NewPodEventHandler(client, workerMgr)
nodeEventHandler := watchers.NewNodeEventHandler(client, workerMgr)
daemonsetEventHandler := watchers.NewDaemonsetEventHandler(client)
Expand Down Expand Up @@ -203,6 +212,7 @@ func (r *DeviceConfigReconciler) init(ctx context.Context) {
//+kubebuilder:rbac:groups=core,resources=pods/eviction,verbs=delete;get;list;create
//+kubebuilder:rbac:groups=apiextensions.k8s.io,resources=customresourcedefinitions,verbs=get;list;watch;delete
//+kubebuilder:rbac:groups=monitoring.coreos.com,resources=servicemonitors,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=resource.k8s.io,resources=deviceclasses,verbs=create

func (r *DeviceConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
res := ctrl.Result{}
Expand Down Expand Up @@ -299,6 +309,11 @@ func (r *DeviceConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request
return res, fmt.Errorf("failed to handle device-plugin for DeviceConfig %s: %v", req.NamespacedName, err)
}

logger.Info("start DeviceClass reconciliation")
if err = r.helper.handleDeviceClass(ctx, devConfig); err != nil {
return res, fmt.Errorf("failed to handle DeviceClass for DeviceConfig %s: %v", req.NamespacedName, err)
}

logger.Info("start dra-driver reconciliation")
if err = r.helper.handleDRADriver(ctx, devConfig, nodes); err != nil {
return res, fmt.Errorf("failed to handle dra-driver for DeviceConfig %s: %v", req.NamespacedName, err)
Expand Down Expand Up @@ -374,6 +389,7 @@ type deviceConfigReconcilerHelperAPI interface {
setFinalizer(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig) error
handleKMMModule(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, nodes *v1.NodeList) error
handleDevicePlugin(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, nodes *v1.NodeList) error
handleDeviceClass(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig) error
handleDRADriver(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, nodes *v1.NodeList) error
handleKMMVersionLabel(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, nodes *v1.NodeList) error
handleBuildConfigMap(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, nodes *v1.NodeList) error
Expand All @@ -392,6 +408,7 @@ type deviceConfigReconcilerHelperAPI interface {
type deviceConfigReconcilerHelper struct {
client client.Client
kmmWatchEnabled bool
isOpenShift bool
kmmHandler kmmmodule.KMMModuleAPI
devicePluginHandler plugin.DevicePluginAPI
nlHandler nodelabeller.NodeLabeller
Expand All @@ -418,12 +435,14 @@ func newDeviceConfigReconcilerHelper(client client.Client,
testrunnerHandler testrunner.TestRunner,
configmanagerHandler configmanager.ConfigManager,
workerMgr workermgr.WorkerMgrAPI,
isOpenShift bool,
kmmWatchEnabled bool) deviceConfigReconcilerHelperAPI {
conditionUpdater := conditions.NewDeviceConfigConditionMgr()
validator := validator.NewValidator()
return &deviceConfigReconcilerHelper{
client: client,
kmmWatchEnabled: kmmWatchEnabled,
isOpenShift: isOpenShift,
kmmHandler: kmmHandler,
devicePluginHandler: dpHandler,
nlHandler: nlHandler,
Expand Down Expand Up @@ -1234,6 +1253,44 @@ func (dcrh *deviceConfigReconcilerHelper) handleDRADriver(ctx context.Context, d
return nil
}

// handleDeviceClass creates the DeviceClass named deviceClassName when the
// operator runs on OpenShift and the DeviceConfig has its DRA driver enabled.
//
// It is a no-op (returns nil) when either of those conditions is false. The
// object is built as an unstructured resource.k8s.io DeviceClass carrying two
// app.kubernetes.io labels and a single CEL selector that compares
// device.driver against deviceClassName. An AlreadyExists response from the
// API server is treated as success; an existing DeviceClass is never updated.
// Any other Create error is returned with the DeviceClass name for context.
func (dcrh *deviceConfigReconcilerHelper) handleDeviceClass(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig) error {
	// Only act on OpenShift clusters with the DRA driver turned on.
	if !dcrh.isOpenShift || !devConfig.Spec.DRADriver.IsEnabled() {
		return nil
	}

	// Selector entry: a CEL expression comparing device.driver to deviceClassName.
	celSelector := map[string]interface{}{
		"cel": map[string]interface{}{
			"expression": "device.driver == '" + deviceClassName + "'",
		},
	}

	deviceClass := &unstructured.Unstructured{}
	deviceClass.SetGroupVersionKind(draDeviceClassGVK)
	deviceClass.SetName(deviceClassName)
	deviceClass.SetLabels(map[string]string{
		"app.kubernetes.io/component": "amd-gpu",
		"app.kubernetes.io/part-of":   "amd-gpu",
	})
	deviceClass.Object["spec"] = map[string]interface{}{
		"selectors": []interface{}{celSelector},
	}

	err := dcrh.client.Create(ctx, deviceClass)
	switch {
	case err == nil:
		log.FromContext(ctx).Info("Created DeviceClass", "name", deviceClassName)
	case k8serrors.IsAlreadyExists(err):
		// The DeviceClass is already present; nothing further to do.
	default:
		return fmt.Errorf("failed to create DeviceClass %s: %v", deviceClassName, err)
	}
	return nil
}

func (dcrh *deviceConfigReconcilerHelper) handleKMMVersionLabel(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, nodes *v1.NodeList) error {
// label corresponding node with given kmod version
// so that KMM could manage the upgrade by watching the node's version label change
Expand Down
94 changes: 86 additions & 8 deletions internal/controllers/device_config_reconciler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ var _ = Describe("getLabelsPerModules", func() {
BeforeEach(func() {
ctrl := gomock.NewController(GinkgoT())
kubeClient = mock_client.NewMockClient(ctrl)
dcrh = newDeviceConfigReconcilerHelper(kubeClient, nil, nil, nil, nil, nil, nil, nil, nil, nil, true)
dcrh = newDeviceConfigReconcilerHelper(kubeClient, nil, nil, nil, nil, nil, nil, nil, nil, nil, false, true)
})

ctx := context.Background()
Expand Down Expand Up @@ -241,7 +241,7 @@ var _ = Describe("deviceConfigReconcilerHelper with KMM watch disabled", func()
BeforeEach(func() {
ctrl := gomock.NewController(GinkgoT())
kubeClient = mock_client.NewMockClient(ctrl)
dcrh = newDeviceConfigReconcilerHelper(kubeClient, nil, nil, nil, nil, nil, nil, nil, nil, nil, true)
dcrh = newDeviceConfigReconcilerHelper(kubeClient, nil, nil, nil, nil, nil, nil, nil, nil, nil, false, true)
})
ctx := context.Background()
nn := types.NamespacedName{
Expand Down Expand Up @@ -282,7 +282,7 @@ var _ = Describe("setFinalizer", func() {
BeforeEach(func() {
ctrl := gomock.NewController(GinkgoT())
kubeClient = mock_client.NewMockClient(ctrl)
dcrh = newDeviceConfigReconcilerHelper(kubeClient, nil, nil, nil, nil, nil, nil, nil, nil, nil, true)
dcrh = newDeviceConfigReconcilerHelper(kubeClient, nil, nil, nil, nil, nil, nil, nil, nil, nil, false, true)
})

ctx := context.Background()
Expand Down Expand Up @@ -318,7 +318,7 @@ var _ = Describe("finalizeDeviceConfig", func() {
BeforeEach(func() {
ctrl := gomock.NewController(GinkgoT())
kubeClient = mock_client.NewMockClient(ctrl)
dcrh = newDeviceConfigReconcilerHelper(kubeClient, nil, nil, nil, nil, nil, nil, nil, nil, nil, true)
dcrh = newDeviceConfigReconcilerHelper(kubeClient, nil, nil, nil, nil, nil, nil, nil, nil, nil, false, true)
})

ctx := context.Background()
Expand Down Expand Up @@ -539,7 +539,7 @@ var _ = Describe("handleKMMModule", func() {
ctrl := gomock.NewController(GinkgoT())
kubeClient = mock_client.NewMockClient(ctrl)
kmmHelper = kmmmodule.NewMockKMMModuleAPI(ctrl)
dcrh = newDeviceConfigReconcilerHelper(kubeClient, kmmHelper, nil, nil, nil, nil, nil, nil, nil, nil, true)
dcrh = newDeviceConfigReconcilerHelper(kubeClient, kmmHelper, nil, nil, nil, nil, nil, nil, nil, nil, false, true)
})

ctx := context.Background()
Expand Down Expand Up @@ -609,7 +609,7 @@ var _ = Describe("handleBuildConfigMap", func() {
ctrl := gomock.NewController(GinkgoT())
kubeClient = mock_client.NewMockClient(ctrl)
kmmHelper = kmmmodule.NewMockKMMModuleAPI(ctrl)
dcrh = newDeviceConfigReconcilerHelper(kubeClient, kmmHelper, nil, nil, nil, nil, nil, nil, nil, nil, true)
dcrh = newDeviceConfigReconcilerHelper(kubeClient, kmmHelper, nil, nil, nil, nil, nil, nil, nil, nil, false, true)
})

ctx := context.Background()
Expand Down Expand Up @@ -676,7 +676,7 @@ var _ = Describe("handleNodeLabeller", func() {
ctrl := gomock.NewController(GinkgoT())
kubeClient = mock_client.NewMockClient(ctrl)
nodeLabellerHelper = nodelabeller.NewMockNodeLabeller(ctrl)
dcrh = newDeviceConfigReconcilerHelper(kubeClient, nil, nil, nodeLabellerHelper, nil, nil, nil, nil, nil, nil, true)
dcrh = newDeviceConfigReconcilerHelper(kubeClient, nil, nil, nodeLabellerHelper, nil, nil, nil, nil, nil, nil, false, true)
})

ctx := context.Background()
Expand Down Expand Up @@ -762,7 +762,7 @@ var _ = Describe("buildNodeAssignments", func() {
BeforeEach(func() {
ctrl := gomock.NewController(GinkgoT())
kubeClient := mock_client.NewMockClient(ctrl)
dcrh = newDeviceConfigReconcilerHelper(kubeClient, nil, nil, nil, nil, nil, nil, nil, nil, nil, true)
dcrh = newDeviceConfigReconcilerHelper(kubeClient, nil, nil, nil, nil, nil, nil, nil, nil, nil, false, true)
})

It("skips non-ready DeviceConfigs", func() {
Expand Down Expand Up @@ -821,3 +821,81 @@ var _ = Describe("buildNodeAssignments", func() {
Expect(err).ToNot(HaveOccurred())
})
})

// Specs for deviceConfigReconcilerHelper.handleDeviceClass: the two skip
// conditions (not OpenShift, DRA driver disabled) and the three outcomes of
// the client Create call (success, AlreadyExists, other error).
var _ = Describe("handleDeviceClass", func() {
	ctx := context.Background()

	// deviceConfigWithDRA returns a DeviceConfig whose DRADriver.Enable points at the given flag.
	deviceConfigWithDRA := func(enabled bool) *amdv1alpha1.DeviceConfig {
		return &amdv1alpha1.DeviceConfig{
			ObjectMeta: metav1.ObjectMeta{Name: devConfigName, Namespace: devConfigNamespace},
			Spec: amdv1alpha1.DeviceConfigSpec{
				DRADriver: amdv1alpha1.DRADriverSpec{Enable: &enabled},
			},
		}
	}

	draEnabledConfig := deviceConfigWithDRA(true)
	draDisabledConfig := deviceConfigWithDRA(false)

	// newHelper wires a fresh mock client into a helper with the requested
	// isOpenShift setting; it must be called from inside a spec.
	newHelper := func(isOpenShift bool) (*mock_client.MockClient, deviceConfigReconcilerHelperAPI) {
		mockCtrl := gomock.NewController(GinkgoT())
		mockClient := mock_client.NewMockClient(mockCtrl)
		helper := newDeviceConfigReconcilerHelper(mockClient, nil, nil, nil, nil, nil, nil, nil, nil, nil, isOpenShift, true)
		return mockClient, helper
	}

	It("should skip when not on OpenShift", func() {
		_, helper := newHelper(false)

		Expect(helper.handleDeviceClass(ctx, draEnabledConfig)).ToNot(HaveOccurred())
	})

	It("should skip when DRA driver is not enabled", func() {
		_, helper := newHelper(true)

		Expect(helper.handleDeviceClass(ctx, draDisabledConfig)).ToNot(HaveOccurred())
	})

	It("should create DeviceClass when it does not exist", func() {
		mockClient, helper := newHelper(true)
		mockClient.EXPECT().Create(ctx, gomock.Any()).Return(nil)

		Expect(helper.handleDeviceClass(ctx, draEnabledConfig)).ToNot(HaveOccurred())
	})

	It("should succeed when DeviceClass already exists", func() {
		mockClient, helper := newHelper(true)
		alreadyExists := k8serrors.NewAlreadyExists(
			schema.GroupResource{Group: "resource.k8s.io", Resource: "deviceclasses"},
			"gpu.amd.com",
		)
		mockClient.EXPECT().Create(ctx, gomock.Any()).Return(alreadyExists)

		Expect(helper.handleDeviceClass(ctx, draEnabledConfig)).ToNot(HaveOccurred())
	})

	It("should return error when Create fails", func() {
		mockClient, helper := newHelper(true)
		mockClient.EXPECT().Create(ctx, gomock.Any()).Return(fmt.Errorf("server error"))

		Expect(helper.handleDeviceClass(ctx, draEnabledConfig)).To(HaveOccurred())
	})
})
14 changes: 14 additions & 0 deletions internal/controllers/mock_device_config_reconciler.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.