diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml index a627341a4..26c7a3307 100644 --- a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml +++ b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml @@ -36,7 +36,7 @@ metadata: capabilities: Seamless Upgrades categories: AI/Machine Learning,Monitoring containerImage: docker.io/rocm/amd-gpu-operator:dev - createdAt: "2026-04-02T12:26:30Z" + createdAt: "2026-04-07T12:28:11Z" description: |- Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/) @@ -1242,6 +1242,18 @@ spec: - get - update - watch + - apiGroups: + - apps + resources: + - deployments + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - kmm.sigs.x-k8s.io resources: diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 2d0b992aa..0da4e9253 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -131,6 +131,18 @@ rules: - get - update - watch +- apiGroups: + - apps + resources: + - deployments + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - kmm.sigs.x-k8s.io resources: diff --git a/docs/autoremediation/auto-remediation.md b/docs/autoremediation/auto-remediation.md index 31a9516f9..c14758c54 100644 --- a/docs/autoremediation/auto-remediation.md +++ b/docs/autoremediation/auto-remediation.md @@ -262,7 +262,14 @@ Each entry in the ConfigMap maps a unique error code (AFID) to its remediation w The following example demonstrates a complete error mapping configuration: ```yaml -- nodeCondition: AMDGPUXgmi +apiVersion: v1 +kind: ConfigMap +metadata: + name: auto-remediation-custom-config + namespace: kube-amd-gpu +data: + workflow: | + - nodeCondition: AMDGPUXgmi workflowTemplate: default-template validationTestsProfile: framework: AGFHC diff --git a/hack/k8s-patch/template-patch/config-manager-rbac.yaml b/hack/k8s-patch/template-patch/config-manager-rbac.yaml index 4f14ce281..82ad0c429 100644 --- a/hack/k8s-patch/template-patch/config-manager-rbac.yaml +++ b/hack/k8s-patch/template-patch/config-manager-rbac.yaml @@ -30,7 +30,6 @@ rules: - "apps" resources: - daemonsets - - deployments verbs: - get - list diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock index 9fc17b6e4..991dff6b3 100644 --- a/helm-charts-k8s/Chart.lock +++ b/helm-charts-k8s/Chart.lock @@ -9,4 +9,4 @@ dependencies: repository: file://./charts/remediation-crds version: v1.0.0 digest: sha256:0806f6b6d7aa21be77bf1c91e720ae3238338a16f107df450a53b02ef940db1b -generated: "2026-04-02T12:26:25.920315689Z" +generated: "2026-04-07T12:28:07.188885215Z" diff --git a/helm-charts-k8s/templates/config-manager-rbac.yaml b/helm-charts-k8s/templates/config-manager-rbac.yaml index 4f14ce281..82ad0c429 100644 --- a/helm-charts-k8s/templates/config-manager-rbac.yaml +++ b/helm-charts-k8s/templates/config-manager-rbac.yaml @@ -30,7 +30,6 @@ rules: - "apps" resources: - daemonsets - - deployments verbs: - get - list diff --git a/helm-charts-k8s/templates/manager-rbac.yaml b/helm-charts-k8s/templates/manager-rbac.yaml index 4a9547ec4..774022851 100644 --- a/helm-charts-k8s/templates/manager-rbac.yaml +++ b/helm-charts-k8s/templates/manager-rbac.yaml @@ -134,6 +134,18 @@ rules: - get - update - watch +- apiGroups: + - apps + resources: + - deployments + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - kmm.sigs.x-k8s.io resources: diff --git a/internal/controllers/device_config_reconciler.go b/internal/controllers/device_config_reconciler.go index d72c58354..9fad69c13 100644 --- a/internal/controllers/device_config_reconciler.go +++ b/internal/controllers/device_config_reconciler.go @@ -194,6 +194,7 @@ func (r *DeviceConfigReconciler) init(ctx context.Context) { //+kubebuilder:rbac:groups=apps,resources=daemonsets,verbs=create;delete;get;list;patch;watch //+kubebuilder:rbac:groups=apps,resources=daemonsets/status,verbs=create;delete;get;list;patch;watch //+kubebuilder:rbac:groups=apps,resources=daemonsets/finalizers,verbs=create;get;update;watch +//+kubebuilder:rbac:groups=apps,resources=deployments,verbs=create;delete;get;list;patch;watch;update //+kubebuilder:rbac:groups=core,resources=services,verbs=create;delete;get;list;patch;watch //+kubebuilder:rbac:groups=core,resources=services/finalizers,verbs=create;get;update;watch //+kubebuilder:rbac:groups=core,resources=pods,verbs=delete;get;list;watch;create diff --git a/internal/controllers/remediation_handler.go b/internal/controllers/remediation_handler.go index 55b7d8ba4..0fc892494 100644 --- a/internal/controllers/remediation_handler.go +++ b/internal/controllers/remediation_handler.go @@ -618,6 +618,15 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context ObjectMeta: metav1.ObjectMeta{ Name: "event-notify-template", Namespace: devConfig.Namespace, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: devConfig.APIVersion, + Kind: devConfig.Kind, + Name: devConfig.Name, + UID: devConfig.UID, + Controller: ptr.To(true), + }, + }, }, Spec: workflowv1alpha1.WorkflowSpec{ Entrypoint: "notify",