diff --git a/hack/k8s-patch/metadata-patch/values.yaml b/hack/k8s-patch/metadata-patch/values.yaml index 195901c8..ce550d24 100644 --- a/hack/k8s-patch/metadata-patch/values.yaml +++ b/hack/k8s-patch/metadata-patch/values.yaml @@ -300,6 +300,8 @@ deviceConfig: nodeDrainPolicy: {} # -- Enable/disable automatic workflow start on node issues autoStartWorkflow: true + # -- Container image used to create the remediation ConfigMap. This image contains the default remediation ConfigMap manifest file (configmap.yaml). + configMapImage: "" # AMD GPU operator controller related configs controllerManager: manager: diff --git a/helm-charts-k8s/README.md b/helm-charts-k8s/README.md index 09885c1b..8553d92f 100644 --- a/helm-charts-k8s/README.md +++ b/helm-charts-k8s/README.md @@ -123,8 +123,6 @@ For bugs and feature requests, please file an issue on our [GitHub Issues](https The AMD GPU Operator is licensed under the [Apache License 2.0](LICENSE). -## gpu-operator-charts - ![Version: v0.0.1](https://img.shields.io/badge/Version-v0.0.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: dev](https://img.shields.io/badge/AppVersion-dev-informational?style=flat-square) AMD GPU Operator simplifies the deployment and management of AMD Instinct GPU accelerators within Kubernetes clusters. @@ -253,6 +251,7 @@ Kubernetes: `>= 1.29.0-0` | deviceConfig.spec.metricsExporter.upgradePolicy.upgradeStrategy | string | `"RollingUpdate"` | the type of daemonset upgrade, RollingUpdate or OnDelete | | deviceConfig.spec.remediationWorkflow.autoStartWorkflow | bool | `true` | Enable/disable automatic workflow start on node issues | | deviceConfig.spec.remediationWorkflow.config | object | `{}` | Configuration for remediation workflow | +| deviceConfig.spec.remediationWorkflow.configMapImage | string | `""` | Container image used to create the remediation ConfigMap. 
This image contains the default remediation ConfigMap manifest file (configmap.yaml). | | deviceConfig.spec.remediationWorkflow.enable | bool | `false` | enable/disable remediation workflow controller | | deviceConfig.spec.remediationWorkflow.maxParallelWorkflows | int | `0` | Set maximum number of remediation workflows that can run in parallel. Default is 0 which means no limit | | deviceConfig.spec.remediationWorkflow.nodeDrainPolicy | object | `{}` | Policy for draining nodes during remediation | diff --git a/helm-charts-k8s/values.yaml b/helm-charts-k8s/values.yaml index 195901c8..ce550d24 100644 --- a/helm-charts-k8s/values.yaml +++ b/helm-charts-k8s/values.yaml @@ -300,6 +300,8 @@ deviceConfig: nodeDrainPolicy: {} # -- Enable/disable automatic workflow start on node issues autoStartWorkflow: true + # -- Container image used to create the remediation ConfigMap. This image contains the default remediation ConfigMap manifest file (configmap.yaml). + configMapImage: "" # AMD GPU operator controller related configs controllerManager: manager: diff --git a/tests/e2e/Makefile b/tests/e2e/Makefile index 3dea053a..0f5ff001 100644 --- a/tests/e2e/Makefile +++ b/tests/e2e/Makefile @@ -18,6 +18,7 @@ E2E_TEST_RUNNER_IMAGE ?= rocm/test-runner:v1.4.0 E2E_KUBEVIRT_DEVICE_PLUGIN_IMAGE ?= rocm/k8s-device-plugin:latest E2E_KUBEVIRT_NODE_LABELLER_IMAGE ?= rocm/k8s-device-plugin:labeller-latest E2E_UTILS_CONTAINER_IMAGE ?= docker.io/rocm/gpu-operator-utils:v1.4.0 +E2E_ANR_CONFIGMAP_IMAGE ?= docker.io/rocm/amd-gpu-operator-remediation-config-utils:latest E2E_NODE_DIAG_IMAGE ?= busybox:1.36 E2E_DRA_DRIVER_IMAGE ?= rocm/k8s-gpu-dra-driver:latest @@ -38,6 +39,7 @@ export E2E_AGFHC_TEST_RUNNER_IMAGE export E2E_KUBEVIRT_DEVICE_PLUGIN_IMAGE export E2E_KUBEVIRT_NODE_LABELLER_IMAGE export E2E_UTILS_CONTAINER_IMAGE +export E2E_ANR_CONFIGMAP_IMAGE export E2E_NODE_DIAG_IMAGE export E2E_DRA_DRIVER_IMAGE diff --git a/tests/helm-e2e/helm_e2e_test.go b/tests/helm-e2e/helm_e2e_test.go index 8aed527a..162159d3 100644 
--- a/tests/helm-e2e/helm_e2e_test.go +++ b/tests/helm-e2e/helm_e2e_test.go @@ -1002,6 +1002,7 @@ deviceConfig: TtlForFailedWorkflows: "36h", TesterImage: "test.io/test/remediation-workflow-tester:v1.3.0", AutoStartWorkflow: &boolTrue, + ConfigMapImage: "", }, }, verifyFunc: s.verifyRemediationWorkflow,