From 719aba4b68e98e98a5b1822b48a790ec6a211853 Mon Sep 17 00:00:00 2001
From: ci-penbot-01 <a1.penbot@amd.com>
Date: Thu, 23 Apr 2026 10:49:17 +0530
Subject: [PATCH] handle reboot step failure in workflow (#1363) (#1386)

* handle reboot step failure in workflow

* fix node selector and affinity rules

* update documentation

* use boot id to detect successful reboot

* ignore tests folder from docs lint

(cherry picked from commit 1c647b6169de2f320a4b5bf164a49f3450b44bbf)

Co-authored-by: Uday Bhaskar <udayb@amd.com>
(cherry picked from commit 2225463b9b2cdd61376c06deddd0a6bdde052a7b)
---
 .markdownlint-cli2.jsonc                      |   2 +-
 docs/autoremediation/auto-remediation.md      |  16 +--
 .../controllers/remediation/scripts/reboot.sh |  16 +++
 .../remediation/scripts/waitfornodeready.sh   |  53 +++++++++
 internal/controllers/remediation_handler.go   | 112 +++++++++++++++++-
 internal/utils_container/Dockerfile           |   2 +-
 6 files changed, 188 insertions(+), 13 deletions(-)
 create mode 100644 internal/controllers/remediation/scripts/reboot.sh
 create mode 100644 internal/controllers/remediation/scripts/waitfornodeready.sh

diff --git a/.markdownlint-cli2.jsonc b/.markdownlint-cli2.jsonc
index f4155855..d56a46a9 100644
--- a/.markdownlint-cli2.jsonc
+++ b/.markdownlint-cli2.jsonc
@@ -11,6 +11,6 @@
     "**/.claude/**",
     "**/tests/pytests/**",
     "**/knowledge/**",
-    "**/tests/test-plan/**"
+    "**/tests/**"
   ]
 }
diff --git a/docs/autoremediation/auto-remediation.md b/docs/autoremediation/auto-remediation.md
index b0ca6657..2fc16a06 100644
--- a/docs/autoremediation/auto-remediation.md
+++ b/docs/autoremediation/auto-remediation.md
@@ -312,19 +312,21 @@ The `default-template` workflow performs the following remediation steps:
 
 5. **Suspend Workflow** - Pause workflow execution pending manual intervention or automatic resumption based on configured policies.
 
-6. **Reboot Node** - Perform node reboot to clear transient errors and reinitialize GPU hardware.
+6. **Reboot Node** - Issue a reboot command on the affected node to clear transient errors and reinitialize GPU hardware. This step exits gracefully after triggering the reboot, ensuring the workflow pod is not disrupted by the node shutdown.
 
-7. **Validate GPUs** - Execute AGFHC/RVS validation tests to confirm GPU health after reboot.
+7. **Wait for Node Ready** - Monitor the rebooted node until it comes back online and reports a Ready condition in the Kubernetes cluster before proceeding to validation.
 
-8. **Verify Condition** - Confirm that the triggering node condition has been resolved (status changed to False).
+8. **Validate GPUs** - Execute AGFHC/RVS validation tests to confirm GPU health after reboot.
 
-9. **Remove Taint** - Remove the node taint to restore GPU availability for workload scheduling.
+9. **Verify Condition** - Confirm that the triggering node condition has been resolved (status changed to False).
 
-10. **Remove Labels** - Removes all custom labels that were applied to the node in Step 1, restoring the node to its original label state.
+10. **Remove Taint** - Remove the node taint to restore GPU availability for workload scheduling.
+
+11. **Remove Labels** - Removes all custom labels that were applied to the node in Step 1, restoring the node to its original label state.
 
 Each workflow step is executed as a separate Kubernetes pod. For advanced use cases, users can create custom workflow templates using the Argo CRDs available on the cluster and reference them in the ConfigMap.
 
-While most workflow steps are self-explanatory, Steps 4, 5, and 7 require additional clarification.
+While most workflow steps are self-explanatory, Steps 4, 5, and 8 require additional clarification.
 
 ### Workflow Step 4: Physical Intervention Check
 
@@ -365,7 +367,7 @@ To resume a suspended workflow, apply the label `operator.amd.com/gpu-force-resu
 
 To abort the workflow entirely, apply the label `operator.amd.com/gpu-abort-workflow=true` to the node. This keeps the node in a tainted state for manual remediation. This option is useful when automatic remediation is no longer desired and the workflow should be deleted while paused.
 
-### Workflow Step 7: GPU Validation Testing
+### Workflow Step 8: GPU Validation Testing
 
 This step executes comprehensive GPU health validation tests using the test runner:
 
diff --git a/internal/controllers/remediation/scripts/reboot.sh b/internal/controllers/remediation/scripts/reboot.sh
new file mode 100644
index 00000000..a68f1fcf
--- /dev/null
+++ b/internal/controllers/remediation/scripts/reboot.sh
@@ -0,0 +1,16 @@
+set -e
+NODE_NAME='{{inputs.parameters.node_name}}'
+BOOT_ID_FILE=/tmp/boot_id
+
+BOOT_ID=$(kubectl get node "$NODE_NAME" -o jsonpath='{.status.nodeInfo.bootID}' 2>/dev/null || true)
+if [ -n "$BOOT_ID" ]; then
+  printf '%s' "$BOOT_ID" > "$BOOT_ID_FILE"
+  echo "Captured pre-reboot bootID for node $NODE_NAME: $BOOT_ID"
+else
+  # Fall back to an empty bootID; downstream wait step will degrade to a Ready-only check.
+  printf '' > "$BOOT_ID_FILE"
+  echo "Warning: could not capture bootID for node $NODE_NAME; downstream wait will fall back to Ready-only check." >&2
+fi
+
+echo "Triggering host reboot via nsenter (shutdown -r +1)..."
+exec /nsenter --mount --pid --target=1 -- /sbin/shutdown -r +1
diff --git a/internal/controllers/remediation/scripts/waitfornodeready.sh b/internal/controllers/remediation/scripts/waitfornodeready.sh
new file mode 100644
index 00000000..04f2923c
--- /dev/null
+++ b/internal/controllers/remediation/scripts/waitfornodeready.sh
@@ -0,0 +1,53 @@
+set -e
+NODE_NAME='{{inputs.parameters.node_name}}'
+OLD_BOOT_ID='{{inputs.parameters.old_boot_id}}'
+TIMEOUT_MINUTES=15
+POLL_INTERVAL=30
+STABLE_THRESHOLD=4
+
+if [ -n "$OLD_BOOT_ID" ]; then
+  echo "Waiting for node $NODE_NAME to reboot (old bootID: $OLD_BOOT_ID) and remain Ready for at least 2 minutes (timeout: ${TIMEOUT_MINUTES} minutes)..."
+else
+  echo "Old bootID not provided; waiting for node $NODE_NAME to remain Ready for at least 2 minutes (timeout: ${TIMEOUT_MINUTES} minutes)..."
+fi
+
+ELAPSED=0
+STABLE_COUNT=0
+MAX_SECONDS=$((TIMEOUT_MINUTES * 60))
+
+while [ "$ELAPSED" -lt "$MAX_SECONDS" ]; do
+  READY=$(kubectl get node "$NODE_NAME" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "Unknown")
+  CURRENT_BOOT_ID=$(kubectl get node "$NODE_NAME" -o jsonpath='{.status.nodeInfo.bootID}' 2>/dev/null || echo "")
+  echo "[$(date)] Node Ready: $READY, current bootID: $CURRENT_BOOT_ID"
+
+  REBOOT_CONFIRMED=true
+  if [ -n "$OLD_BOOT_ID" ]; then
+    if [ -z "$CURRENT_BOOT_ID" ] || [ "$CURRENT_BOOT_ID" = "$OLD_BOOT_ID" ]; then
+      REBOOT_CONFIRMED=false
+    fi
+  fi
+
+  if [ "$READY" = "True" ] && [ "$REBOOT_CONFIRMED" = "true" ]; then
+    STABLE_COUNT=$((STABLE_COUNT + 1))
+    echo "Node is Ready (and rebooted) for $STABLE_COUNT consecutive check(s)"
+    if [ "$STABLE_COUNT" -ge "$STABLE_THRESHOLD" ]; then
+      echo "Node $NODE_NAME confirmed rebooted (new bootID: $CURRENT_BOOT_ID) and Ready. Proceeding..."
+      exit 0
+    fi
+  else
+    if [ "$STABLE_COUNT" -gt 0 ]; then
+      echo "Node became not Ready, resetting stability counter."
+    fi
+    STABLE_COUNT=0
+    if [ "$REBOOT_CONFIRMED" = "false" ]; then
+      echo "Node has not rebooted yet (bootID unchanged). Retrying in ${POLL_INTERVAL}s..."
+    else
+      echo "Node is not ready yet. Retrying in ${POLL_INTERVAL}s..."
+    fi
+  fi
+  sleep "$POLL_INTERVAL"
+  ELAPSED=$((ELAPSED + POLL_INTERVAL))
+done
+
+echo "Timeout: Node $NODE_NAME did not reboot and remain Ready within ${TIMEOUT_MINUTES} minutes."
+exit 1
diff --git a/internal/controllers/remediation_handler.go b/internal/controllers/remediation_handler.go
index 55b7d8ba..19489d14 100644
--- a/internal/controllers/remediation_handler.go
+++ b/internal/controllers/remediation_handler.go
@@ -599,8 +599,11 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context
 	utilityContainer := h.getWorkflowUtilityImage(devConfig)
 	utilityContainer.Command = []string{"sh"}
 
+	// Reboot is implemented as a ScriptTemplate so we can capture the node's pre-reboot
+	// bootID as an output parameter for downstream steps (e.g. waitfornodeready) to
+	// confirm that the host actually rebooted.
 	rebootContainer := h.getWorkflowUtilityImage(devConfig)
-	rebootContainer.Command = []string{"/nsenter", "--all", "--target=1", "--", "/sbin/reboot", "-f"}
+	rebootContainer.Command = []string{"sh"}
 	rebootContainer.SecurityContext = &v1.SecurityContext{Privileged: ptr.To(true)}
 
 	notifySrc, err := h.getWorkflowTaskScriptSource("notify.sh")
@@ -666,6 +669,14 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context
 	if err != nil {
 		return nil, err
 	}
+	waitForNodeReadySrc, err := h.getWorkflowTaskScriptSource("waitfornodeready.sh")
+	if err != nil {
+		return nil, err
+	}
+	rebootSrc, err := h.getWorkflowTaskScriptSource("reboot.sh")
+	if err != nil {
+		return nil, err
+	}
 	untaintSrc, err := h.getWorkflowTaskScriptSource("untaint.sh")
 	if err != nil {
 		return nil, err
@@ -711,6 +722,19 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context
 						},
 						{Steps: []workflowv1alpha1.WorkflowStep{{Name: "suspend", Template: "suspend"}}},
 						{Steps: []workflowv1alpha1.WorkflowStep{{Name: "reboot", Template: "reboot", ContinueOn: &workflowv1alpha1.ContinueOn{Failed: true}, When: "{{workflow.parameters.skipRebootStep}} == false"}}},
+						{Steps: []workflowv1alpha1.WorkflowStep{{
+							Name:     "waitfornodeready",
+							Template: "waitfornodeready",
+							When:     "{{workflow.parameters.skipRebootStep}} == false",
+							Arguments: workflowv1alpha1.Arguments{
+								Parameters: []workflowv1alpha1.Parameter{
+									{Name: "node_name", Value: workflowv1alpha1.AnyStringPtr("{{workflow.parameters.node_name}}")},
+									// Pass the bootID captured by the reboot step so we can verify
+									// the host actually rebooted (not just transiently NotReady).
+									{Name: "old_boot_id", Value: workflowv1alpha1.AnyStringPtr("{{steps.reboot.outputs.parameters.boot_id}}")},
+								},
+							},
+						}}},
 						{Steps: []workflowv1alpha1.WorkflowStep{{Name: "test", Template: "test", ContinueOn: &workflowv1alpha1.ContinueOn{Failed: true}}}},
 						{Steps: []workflowv1alpha1.WorkflowStep{
 							{
@@ -801,7 +825,32 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context
 				{
 					Name:      "reboot",
 					Metadata:  instanceIDMeta,
-					Container: &rebootContainer,
+					Inputs: workflowv1alpha1.Inputs{
+						Parameters: []workflowv1alpha1.Parameter{
+							{
+								Name:  "node_name",
+								Value: workflowv1alpha1.AnyStringPtr("{{workflow.parameters.node_name}}"),
+							},
+						},
+					},
+					Outputs: workflowv1alpha1.Outputs{
+						Parameters: []workflowv1alpha1.Parameter{
+							{
+								Name: "boot_id",
+								ValueFrom: &workflowv1alpha1.ValueFrom{
+									Path: "/tmp/boot_id",
+									// If the pod is killed by the reboot before the file is
+									// finalized, fall back to an empty value; the wait step
+									// gracefully degrades to a Ready-only check in that case.
+									Default: workflowv1alpha1.AnyStringPtr(""),
+								},
+							},
+						},
+					},
+					Script: &workflowv1alpha1.ScriptTemplate{
+						Source:    rebootSrc,
+						Container: rebootContainer,
+					},
 					PodSpecPatch: `
 hostPID: true
 hostNetwork: true
@@ -811,6 +860,28 @@ containers:
   tty: true
 `,
 				},
+				{
+					Name:     "waitfornodeready",
+					Metadata: instanceIDMeta,
+					Inputs: workflowv1alpha1.Inputs{
+						Parameters: []workflowv1alpha1.Parameter{
+							{
+								Name:  "node_name",
+								Value: workflowv1alpha1.AnyStringPtr("{{workflow.parameters.node_name}}"),
+							},
+							{
+								// Pre-reboot bootID supplied by the reboot step. Empty when the
+								// reboot step was skipped or its bootID capture failed.
+								Name:    "old_boot_id",
+								Default: workflowv1alpha1.AnyStringPtr(""),
+							},
+						},
+					},
+					Script: &workflowv1alpha1.ScriptTemplate{
+						Source:    waitForNodeReadySrc,
+						Container: utilityContainer,
+					},
+				},
 				{
 					Name:     "test",
 					Metadata: instanceIDMeta,
@@ -1068,10 +1139,43 @@ func (h *remediationMgrHelper) populateWorkflow(ctx context.Context, wfTemplate
 		SecondsAfterCompletion: &ttlSeconds,
 	}
 
+	wf.Spec.NodeSelector = make(map[string]string)
 	for i := range wf.Spec.Templates {
-		if wf.Spec.Templates[i].NodeSelector == nil {
-			wf.Spec.Templates[i].NodeSelector = map[string]string{}
+		if wf.Spec.Templates[i].Name == "waitfornodeready" {
+			wf.Spec.Templates[i].NodeSelector = make(map[string]string)
+			wf.Spec.Templates[i].Affinity = &v1.Affinity{
+				NodeAffinity: &v1.NodeAffinity{
+					RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
+						NodeSelectorTerms: []v1.NodeSelectorTerm{
+							{
+								MatchExpressions: []v1.NodeSelectorRequirement{
+									{
+										Key:      "kubernetes.io/hostname",
+										Operator: v1.NodeSelectorOpNotIn,
+										Values:   []string{nodeName},
+									},
+								},
+							},
+						},
+					},
+				},
+		}
+			wf.Spec.Templates[i].Tolerations = append(wf.Spec.Templates[i].Tolerations,
+				v1.Toleration{
+					Key:      "node-role.kubernetes.io/control-plane",
+					Operator: v1.TolerationOpExists,
+					Effect:   v1.TaintEffectNoSchedule,
+				},
+				v1.Toleration{
+					Key:      "node-role.kubernetes.io/master",
+					Operator: v1.TolerationOpExists,
+					Effect:   v1.TaintEffectNoSchedule,
+				},
+			)
+			wf.Spec.Templates[i].PodSpecPatch = `{"nodeSelector":null}`
+			continue
 		}
+		wf.Spec.Templates[i].NodeSelector = make(map[string]string)
 		wf.Spec.Templates[i].NodeSelector["kubernetes.io/hostname"] = nodeName
 	}
 	// apply tolerations based on node taints
diff --git a/internal/utils_container/Dockerfile b/internal/utils_container/Dockerfile
index db4b5f5f..1849f6b2 100644
--- a/internal/utils_container/Dockerfile
+++ b/internal/utils_container/Dockerfile
@@ -9,7 +9,7 @@ LABEL summary="AMD GPU Operator Utility Image"
 LABEL description="The AMD GPU Operator utils image is a utility image used by the AMD GPU Operator. The operator controller employs this image as the container's base layer to automate tasks on target worker nodes."
 
 # Install nsenter from util-linux package
-RUN microdnf install -y util-linux pciutils kmod tar jq && \
+RUN microdnf install -y util-linux pciutils kmod tar jq systemd && \
     cp /usr/bin/nsenter /nsenter && \
     microdnf clean all