2 changes: 1 addition & 1 deletion .markdownlint-cli2.jsonc
@@ -11,6 +11,6 @@
"**/.claude/**",
"**/tests/pytests/**",
"**/knowledge/**",
"**/tests/test-plan/**"
"**/tests/**"
]
}
16 changes: 9 additions & 7 deletions docs/autoremediation/auto-remediation.md
@@ -312,19 +312,21 @@ The `default-template` workflow performs the following remediation steps:

5. **Suspend Workflow** - Pause workflow execution pending manual intervention or automatic resumption based on configured policies.

6. **Reboot Node** - Perform node reboot to clear transient errors and reinitialize GPU hardware.
6. **Reboot Node** - Issue a reboot command on the affected node to clear transient errors and reinitialize GPU hardware. This step exits gracefully after triggering the reboot, ensuring the workflow pod is not disrupted by the node shutdown.

7. **Validate GPUs** - Execute AGFHC/RVS validation tests to confirm GPU health after reboot.
7. **Wait for Node Ready** - Monitor the rebooted node until it comes back online and reports a Ready condition in the Kubernetes cluster before proceeding to validation.

8. **Verify Condition** - Confirm that the triggering node condition has been resolved (status changed to False).
8. **Validate GPUs** - Execute AGFHC/RVS validation tests to confirm GPU health after reboot.

9. **Remove Taint** - Remove the node taint to restore GPU availability for workload scheduling.
9. **Verify Condition** - Confirm that the triggering node condition has been resolved (status changed to False).

10. **Remove Labels** - Removes all custom labels that were applied to the node in Step 1, restoring the node to its original label state.
10. **Remove Taint** - Remove the node taint to restore GPU availability for workload scheduling.

11. **Remove Labels** - Removes all custom labels that were applied to the node in Step 1, restoring the node to its original label state.

Each workflow step is executed as a separate Kubernetes pod. For advanced use cases, users can create custom workflow templates using the Argo CRDs available on the cluster and reference them in the ConfigMap.
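Because each step runs in its own pod, progress can be followed with standard kubectl tooling. A minimal sketch, assuming kubectl access and that Argo labels each step pod with the owning workflow's name via `workflows.argoproj.io/workflow` (namespace and workflow name are placeholders):

```bash
# Watch the remediation workflow advance through its steps.
kubectl get workflow -n <namespace> <workflow-name> -w

# List the per-step pods created by the workflow.
kubectl get pods -n <namespace> -l workflows.argoproj.io/workflow=<workflow-name>
```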

While most workflow steps are self-explanatory, Steps 4, 5, and 7 require additional clarification.
While most workflow steps are self-explanatory, Steps 4, 5, and 8 require additional clarification.

### Workflow Step 4: Physical Intervention Check

@@ -365,7 +367,7 @@ To resume a suspended workflow, apply the label `operator.amd.com/gpu-force-resu

To abort the workflow entirely, apply the label `operator.amd.com/gpu-abort-workflow=true` to the node. This keeps the node in a tainted state for manual remediation. This option is useful when automatic remediation is no longer desired and the workflow should be deleted while paused.
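For reference, a minimal sketch of applying these labels with kubectl (the node name is a placeholder):

```bash
# Resume a suspended remediation workflow on the affected node.
kubectl label node <node-name> operator.amd.com/gpu-force-resume=true

# Or abort the workflow, leaving the node tainted for manual remediation.
kubectl label node <node-name> operator.amd.com/gpu-abort-workflow=true
```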

### Workflow Step 7: GPU Validation Testing
### Workflow Step 8: GPU Validation Testing

This step executes comprehensive GPU health validation tests using the test runner:

16 changes: 16 additions & 0 deletions internal/controllers/remediation/scripts/reboot.sh
@@ -0,0 +1,16 @@
set -e
NODE_NAME='{{inputs.parameters.node_name}}'
BOOT_ID_FILE=/tmp/boot_id

BOOT_ID=$(kubectl get node "$NODE_NAME" -o jsonpath='{.status.nodeInfo.bootID}' 2>/dev/null || true)
if [ -n "$BOOT_ID" ]; then
printf '%s' "$BOOT_ID" > "$BOOT_ID_FILE"
echo "Captured pre-reboot bootID for node $NODE_NAME: $BOOT_ID"
else
# Fall back to an empty bootID; downstream wait step will degrade to a Ready-only check.
printf '' > "$BOOT_ID_FILE"
echo "Warning: could not capture bootID for node $NODE_NAME; downstream wait will fall back to Ready-only check." >&2
fi

echo "Triggering host reboot via nsenter (shutdown -r +1)..."
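# Schedule the reboot one minute out (+1) so this script can exit cleanly and the
# workflow pod is not killed mid-step by the host shutdown.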
exec /nsenter --mount --pid --target=1 -- /sbin/shutdown -r +1
53 changes: 53 additions & 0 deletions internal/controllers/remediation/scripts/waitfornodeready.sh
@@ -0,0 +1,53 @@
set -e
NODE_NAME='{{inputs.parameters.node_name}}'
OLD_BOOT_ID='{{inputs.parameters.old_boot_id}}'
TIMEOUT_MINUTES=15
POLL_INTERVAL=30
STABLE_THRESHOLD=4

if [ -n "$OLD_BOOT_ID" ]; then
echo "Waiting for node $NODE_NAME to reboot (old bootID: $OLD_BOOT_ID) and remain Ready for at least 2 minutes (timeout: ${TIMEOUT_MINUTES} minutes)..."
else
echo "Old bootID not provided; waiting for node $NODE_NAME to remain Ready for at least 2 minutes (timeout: ${TIMEOUT_MINUTES} minutes)..."
fi

ELAPSED=0
STABLE_COUNT=0
MAX_SECONDS=$((TIMEOUT_MINUTES * 60))

while [ "$ELAPSED" -lt "$MAX_SECONDS" ]; do
READY=$(kubectl get node "$NODE_NAME" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "Unknown")
CURRENT_BOOT_ID=$(kubectl get node "$NODE_NAME" -o jsonpath='{.status.nodeInfo.bootID}' 2>/dev/null || echo "")
echo "[$(date)] Node Ready: $READY, current bootID: $CURRENT_BOOT_ID"

REBOOT_CONFIRMED=true
if [ -n "$OLD_BOOT_ID" ]; then
if [ -z "$CURRENT_BOOT_ID" ] || [ "$CURRENT_BOOT_ID" = "$OLD_BOOT_ID" ]; then
REBOOT_CONFIRMED=false
fi
fi

if [ "$READY" = "True" ] && [ "$REBOOT_CONFIRMED" = "true" ]; then
STABLE_COUNT=$((STABLE_COUNT + 1))
echo "Node is Ready (and rebooted) for $STABLE_COUNT consecutive check(s)"
if [ "$STABLE_COUNT" -ge "$STABLE_THRESHOLD" ]; then
echo "Node $NODE_NAME confirmed rebooted (new bootID: $CURRENT_BOOT_ID) and Ready. Proceeding..."
exit 0
fi
else
if [ "$STABLE_COUNT" -gt 0 ]; then
echo "Node became not Ready, resetting stability counter."
fi
STABLE_COUNT=0
if [ "$REBOOT_CONFIRMED" = "false" ]; then
echo "Node has not rebooted yet (bootID unchanged). Retrying in ${POLL_INTERVAL}s..."
else
echo "Node is not ready yet. Retrying in ${POLL_INTERVAL}s..."
fi
fi
sleep "$POLL_INTERVAL"
ELAPSED=$((ELAPSED + POLL_INTERVAL))
done

echo "Timeout: Node $NODE_NAME did not reboot and remain Ready within ${TIMEOUT_MINUTES} minutes."
exit 1
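When iterating on this script outside Argo, the template placeholders can be substituted by hand; a rough sketch, assuming a working kubectl context (the node name is a placeholder, and an empty old bootID degrades the check to Ready-only):

```bash
# Fill in the Argo template parameters manually and run the wait loop against a test node.
sed -e 's/{{inputs.parameters.node_name}}/<node-name>/' \
    -e 's/{{inputs.parameters.old_boot_id}}//' \
    internal/controllers/remediation/scripts/waitfornodeready.sh | sh
```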
112 changes: 108 additions & 4 deletions internal/controllers/remediation_handler.go
@@ -599,8 +599,11 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context
utilityContainer := h.getWorkflowUtilityImage(devConfig)
utilityContainer.Command = []string{"sh"}

// Reboot is implemented as a ScriptTemplate so we can capture the node's pre-reboot
// bootID as an output parameter for downstream steps (e.g. waitfornodeready) to
// confirm that the host actually rebooted.
rebootContainer := h.getWorkflowUtilityImage(devConfig)
rebootContainer.Command = []string{"/nsenter", "--all", "--target=1", "--", "/sbin/reboot", "-f"}
rebootContainer.Command = []string{"sh"}
rebootContainer.SecurityContext = &v1.SecurityContext{Privileged: ptr.To(true)}

notifySrc, err := h.getWorkflowTaskScriptSource("notify.sh")
@@ -666,6 +669,14 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context
if err != nil {
return nil, err
}
waitForNodeReadySrc, err := h.getWorkflowTaskScriptSource("waitfornodeready.sh")
if err != nil {
return nil, err
}
rebootSrc, err := h.getWorkflowTaskScriptSource("reboot.sh")
if err != nil {
return nil, err
}
untaintSrc, err := h.getWorkflowTaskScriptSource("untaint.sh")
if err != nil {
return nil, err
@@ -711,6 +722,19 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context
},
{Steps: []workflowv1alpha1.WorkflowStep{{Name: "suspend", Template: "suspend"}}},
{Steps: []workflowv1alpha1.WorkflowStep{{Name: "reboot", Template: "reboot", ContinueOn: &workflowv1alpha1.ContinueOn{Failed: true}, When: "{{workflow.parameters.skipRebootStep}} == false"}}},
{Steps: []workflowv1alpha1.WorkflowStep{{
Name: "waitfornodeready",
Template: "waitfornodeready",
When: "{{workflow.parameters.skipRebootStep}} == false",
Arguments: workflowv1alpha1.Arguments{
Parameters: []workflowv1alpha1.Parameter{
{Name: "node_name", Value: workflowv1alpha1.AnyStringPtr("{{workflow.parameters.node_name}}")},
// Pass the bootID captured by the reboot step so we can verify
// the host actually rebooted (not just transiently NotReady).
{Name: "old_boot_id", Value: workflowv1alpha1.AnyStringPtr("{{steps.reboot.outputs.parameters.boot_id}}")},
},
},
}}},
{Steps: []workflowv1alpha1.WorkflowStep{{Name: "test", Template: "test", ContinueOn: &workflowv1alpha1.ContinueOn{Failed: true}}}},
{Steps: []workflowv1alpha1.WorkflowStep{
{
@@ -801,7 +825,32 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context
{
Name: "reboot",
Metadata: instanceIDMeta,
Container: &rebootContainer,
Inputs: workflowv1alpha1.Inputs{
Parameters: []workflowv1alpha1.Parameter{
{
Name: "node_name",
Value: workflowv1alpha1.AnyStringPtr("{{workflow.parameters.node_name}}"),
},
},
},
Outputs: workflowv1alpha1.Outputs{
Parameters: []workflowv1alpha1.Parameter{
{
Name: "boot_id",
ValueFrom: &workflowv1alpha1.ValueFrom{
Path: "/tmp/boot_id",
// If the pod is killed by the reboot before the file is
// finalized, fall back to an empty value; the wait step
// gracefully degrades to a Ready-only check in that case.
Default: workflowv1alpha1.AnyStringPtr(""),
},
},
},
},
Script: &workflowv1alpha1.ScriptTemplate{
Source: rebootSrc,
Container: rebootContainer,
},
PodSpecPatch: `
hostPID: true
hostNetwork: true
@@ -811,6 +860,28 @@ containers:
tty: true
`,
},
{
Name: "waitfornodeready",
Metadata: instanceIDMeta,
Inputs: workflowv1alpha1.Inputs{
Parameters: []workflowv1alpha1.Parameter{
{
Name: "node_name",
Value: workflowv1alpha1.AnyStringPtr("{{workflow.parameters.node_name}}"),
},
{
// Pre-reboot bootID supplied by the reboot step. Empty when the
// reboot step was skipped or its bootID capture failed.
Name: "old_boot_id",
Default: workflowv1alpha1.AnyStringPtr(""),
},
},
},
Script: &workflowv1alpha1.ScriptTemplate{
Source: waitForNodeReadySrc,
Container: utilityContainer,
},
},
{
Name: "test",
Metadata: instanceIDMeta,
@@ -1068,10 +1139,43 @@ func (h *remediationMgrHelper) populateWorkflow(ctx context.Context, wfTemplate
SecondsAfterCompletion: &ttlSeconds,
}

wf.Spec.NodeSelector = make(map[string]string)
for i := range wf.Spec.Templates {
if wf.Spec.Templates[i].NodeSelector == nil {
wf.Spec.Templates[i].NodeSelector = map[string]string{}
if wf.Spec.Templates[i].Name == "waitfornodeready" {
wf.Spec.Templates[i].NodeSelector = make(map[string]string)
wf.Spec.Templates[i].Affinity = &v1.Affinity{
NodeAffinity: &v1.NodeAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
MatchExpressions: []v1.NodeSelectorRequirement{
{
Key: "kubernetes.io/hostname",
Operator: v1.NodeSelectorOpNotIn,
Values: []string{nodeName},
},
},
},
},
},
},
}
wf.Spec.Templates[i].Tolerations = append(wf.Spec.Templates[i].Tolerations,
v1.Toleration{
Key: "node-role.kubernetes.io/control-plane",
Operator: v1.TolerationOpExists,
Effect: v1.TaintEffectNoSchedule,
},
v1.Toleration{
Key: "node-role.kubernetes.io/master",
Operator: v1.TolerationOpExists,
Effect: v1.TaintEffectNoSchedule,
},
)
wf.Spec.Templates[i].PodSpecPatch = `{"nodeSelector":null}`
continue
}
wf.Spec.Templates[i].NodeSelector = make(map[string]string)
wf.Spec.Templates[i].NodeSelector["kubernetes.io/hostname"] = nodeName
}
// apply tolerations based on node taints
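During a test run, one way to confirm the scheduling behavior above is to check which node each step pod landed on; the `waitfornodeready` pod should be placed on any node other than the one being remediated. A sketch, with the same placeholder names as above:

```bash
# Show the node each remediation step pod was scheduled on.
kubectl get pods -n <namespace> -l workflows.argoproj.io/workflow=<workflow-name> -o wide
```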
2 changes: 1 addition & 1 deletion internal/utils_container/Dockerfile
@@ -9,7 +9,7 @@ LABEL summary="AMD GPU Operator Utility Image"
LABEL description="The AMD GPU Operator utils image is a utility image used by the AMD GPU Operator. The operator controller employs this image as the container's base layer to automate tasks on target worker nodes."

# Install nsenter from util-linux package
RUN microdnf install -y util-linux pciutils kmod tar jq && \
RUN microdnf install -y util-linux pciutils kmod tar jq systemd && \
cp /usr/bin/nsenter /nsenter && \
microdnf clean all
