
Commit 5699642

Merge pull request #412 from inteon/consolidate_revival_logic
Consolidate revival logic in case a pod is in an unknown, evicted, or unreachable state; limit the number of revivals to 3
2 parents 9ba98b9 + d4ccd14 commit 5699642
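
The change adds a new plank-level knob, max_revivals, which caps how often a pod that stopped unexpectedly (node terminated, evicted, or unreachable) is recreated before the ProwJob is marked as errored. As a minimal sketch of how an operator might tune it (the key name and default come from this diff; the chosen value is illustrative only):

plank:
  # Recreate unexpectedly stopped pods up to 5 times before failing the job.
  # Defaults to 3; a value of 0 disables revivals.
  max_revivals: 5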

7 files changed (+154 −52 lines changed)

config/prow/cluster/prowjob-crd/prowjob_customresourcedefinition.yaml

Lines changed: 6 additions & 0 deletions
@@ -9780,6 +9780,12 @@ spec:
                 plank. This field should always be the same as
                 the ProwJob.ObjectMeta.Name field.
               type: string
+            pod_revival_count:
+              description: |-
+                PodRevivalCount applies only to ProwJobs fulfilled by
+                plank. This field shows the amount of times the
+                Pod was recreated due to an unexpected stop.
+              type: integer
             prev_report_states:
               additionalProperties:
                 description: ProwJobState specifies whether the job is running

pkg/apis/prowjobs/v1/types.go

Lines changed: 4 additions & 0 deletions
@@ -1104,6 +1104,10 @@ type ProwJobStatus struct {
 	Description string `json:"description,omitempty"`
 	URL         string `json:"url,omitempty"`
 
+	// PodRevivalCount applies only to ProwJobs fulfilled by
+	// plank. This field shows the amount of times the
+	// Pod was recreated due to an unexpected stop.
+	PodRevivalCount int `json:"pod_revival_count,omitempty"`
 	// PodName applies only to ProwJobs fulfilled by
 	// plank. This field should always be the same as
 	// the ProwJob.ObjectMeta.Name field.
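
For illustration, the new field serializes as pod_revival_count in the ProwJob status. A hypothetical status for a job whose pod has already been revived twice might look like this (field names from this diff; values invented):

status:
  state: pending
  pod_name: boop-42
  pod_revival_count: 2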

pkg/config/config.go

Lines changed: 11 additions & 0 deletions
@@ -655,6 +655,12 @@ type Plank struct {
 	// stuck in an unscheduled state. Defaults to 5 minutes.
 	PodUnscheduledTimeout *metav1.Duration `json:"pod_unscheduled_timeout,omitempty"`
 
+	// MaxRevivals is the maximum number of times a prowjob will be retried in case of an
+	// unexpected stop of the job before being marked as failed. Generally a job is stopped
+	// unexpectedly due to the underlying Node being terminated, evicted or becoming unreachable.
+	// Defaults to 3. A value of 0 means no retries.
+	MaxRevivals *int `json:"max_revivals,omitempty"`
+
 	// DefaultDecorationConfigs holds the default decoration config for specific values.
 	//
 	// Each entry in the slice specifies Repo and Cluster regexp filter fields to
@@ -2503,6 +2509,11 @@ func parseProwConfig(c *Config) error {
 		c.Plank.PodUnscheduledTimeout = &metav1.Duration{Duration: 5 * time.Minute}
 	}
 
+	if c.Plank.MaxRevivals == nil {
+		maxRetries := 3
+		c.Plank.MaxRevivals = &maxRetries
+	}
+
 	if err := c.Gerrit.DefaultAndValidate(); err != nil {
 		return fmt.Errorf("validating gerrit config: %w", err)
 	}

pkg/config/config_test.go

Lines changed: 4 additions & 0 deletions
@@ -8426,6 +8426,7 @@ moonraker:
   client_timeout: 10m0s
 plank:
   max_goroutines: 20
+  max_revivals: 3
   pod_pending_timeout: 10m0s
   pod_running_timeout: 48h0m0s
   pod_unscheduled_timeout: 5m0s
@@ -8510,6 +8511,7 @@ moonraker:
   client_timeout: 10m0s
 plank:
   max_goroutines: 20
+  max_revivals: 3
   pod_pending_timeout: 10m0s
   pod_running_timeout: 48h0m0s
   pod_unscheduled_timeout: 5m0s
@@ -8587,6 +8589,7 @@ moonraker:
   client_timeout: 10m0s
 plank:
   max_goroutines: 20
+  max_revivals: 3
   pod_pending_timeout: 10m0s
   pod_running_timeout: 48h0m0s
   pod_unscheduled_timeout: 5m0s
@@ -8669,6 +8672,7 @@ moonraker:
   client_timeout: 10m0s
 plank:
   max_goroutines: 20
+  max_revivals: 3
   pod_pending_timeout: 10m0s
   pod_running_timeout: 48h0m0s
   pod_unscheduled_timeout: 5m0s

pkg/config/prow-config-documented.yaml

Lines changed: 5 additions & 0 deletions
@@ -1301,6 +1301,11 @@ plank:
   # JobURLPrefixDisableAppendStorageProvider disables that the storageProvider is
   # automatically appended to the JobURLPrefix.
   jobURLPrefixDisableAppendStorageProvider: true
+  # MaxRevivals is the maximum number of times a prowjob will be retried in case of an
+  # unexpected stop of the job before being marked as failed. Generally a job is stopped
+  # unexpectedly due to the underlying Node being terminated, evicted or becoming unreachable.
+  # Defaults to 3. A value of 0 means no retries.
+  max_revivals: 0
   # PodPendingTimeout defines how long the controller will wait to perform a garbage
   # collection on pending pods. Defaults to 10 minutes.
   pod_pending_timeout: 0s

pkg/plank/controller_test.go

Lines changed: 71 additions & 0 deletions
@@ -65,6 +65,8 @@ const (
 	podDeletionPreventionFinalizer = "keep-from-vanishing"
 )
 
+var maxRevivals = 3
+
 func newFakeConfigAgent(t *testing.T, maxConcurrency int, queueCapacities map[string]int) *fca {
 	presubmits := []config.Presubmit{
 		{
@@ -106,6 +108,7 @@ func newFakeConfigAgent(t *testing.T, maxConcurrency int, queueCapacities map[string]int) *fca {
 				PodPendingTimeout:     &metav1.Duration{Duration: podPendingTimeout},
 				PodRunningTimeout:     &metav1.Duration{Duration: podRunningTimeout},
 				PodUnscheduledTimeout: &metav1.Duration{Duration: podUnscheduledTimeout},
+				MaxRevivals:           &maxRevivals,
 			},
 		},
 		JobConfig: config.JobConfig{
@@ -1180,6 +1183,74 @@ func TestSyncPendingJob(t *testing.T) {
 			ExpectedNumPods:  1,
 			ExpectedURL:      "boop-42/error",
 		},
+		{
+			Name: "don't delete evicted pod w/ revivalCount == maxRevivals, complete PJ instead",
+			PJ: prowapi.ProwJob{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "boop-42",
+					Namespace: "prowjobs",
+				},
+				Spec: prowapi.ProwJobSpec{
+					PodSpec: &v1.PodSpec{Containers: []v1.Container{{Name: "test-name", Env: []v1.EnvVar{}}}},
+				},
+				Status: prowapi.ProwJobStatus{
+					PodRevivalCount: maxRevivals,
+					State:           prowapi.PendingState,
+					PodName:         "boop-42",
+				},
+			},
+			Pods: []v1.Pod{
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "boop-42",
+						Namespace: "pods",
+					},
+					Status: v1.PodStatus{
+						Phase:  v1.PodFailed,
+						Reason: Evicted,
+					},
+				},
+			},
+			ExpectedComplete: true,
+			ExpectedState:    prowapi.ErrorState,
+			ExpectedNumPods:  1,
+			ExpectedURL:      "boop-42/error",
+		},
+		{
+			// TODO: this test case tests the current behavior, but the behavior
+			// is non-ideal: the pod execution did not fail, instead the node on which
+			// the pod was running terminated
+			Name: "a terminated pod is handled as-if it failed",
+			PJ: prowapi.ProwJob{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "boop-42",
+					Namespace: "prowjobs",
+				},
+				Spec: prowapi.ProwJobSpec{
+					PodSpec: &v1.PodSpec{Containers: []v1.Container{{Name: "test-name", Env: []v1.EnvVar{}}}},
+				},
+				Status: prowapi.ProwJobStatus{
+					State:   prowapi.PendingState,
+					PodName: "boop-42",
+				},
+			},
+			Pods: []v1.Pod{
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "boop-42",
+						Namespace: "pods",
+					},
+					Status: v1.PodStatus{
+						Phase:  v1.PodFailed,
+						Reason: Terminated,
+					},
+				},
+			},
+			ExpectedComplete: true,
+			ExpectedState:    prowapi.FailureState,
+			ExpectedNumPods:  1,
+			ExpectedURL:      "boop-42/error",
+		},
 		{
 			Name: "running pod",
 			PJ: prowapi.ProwJob{

pkg/plank/reconciler.go

Lines changed: 53 additions & 52 deletions
@@ -62,7 +62,8 @@ const ControllerName = "plank"
 
 // PodStatus constants
 const (
-	Evicted = "Evicted"
+	Evicted    = "Evicted"
+	Terminated = "Terminated"
 )
 
 // NodeStatus constants
@@ -468,76 +469,51 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*reconcile.Result, error) {
 			pj.Status.PodName = pn
 			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod is missing, starting a new pod")
 		}
-	} else if pod.Status.Reason == Evicted {
-		// Pod was evicted.
-		if pj.Spec.ErrorOnEviction {
-			// ErrorOnEviction is enabled, complete the PJ and mark it as
-			// errored.
+	} else if podUnexpectedStopCause := getPodUnexpectedStopCause(pod); podUnexpectedStopCause != PodUnexpectedStopCauseNone {
+		switch {
+		case podUnexpectedStopCause == PodUnexpectedStopCauseEvicted && pj.Spec.ErrorOnEviction:
+			// ErrorOnEviction is enabled, complete the PJ and mark it as errored.
 			r.log.WithField("error-on-eviction", true).WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got evicted, fail job.")
 			pj.SetComplete()
 			pj.Status.State = prowv1.ErrorState
 			pj.Status.Description = "Job pod was evicted by the cluster."
-		} else {
-			// ErrorOnEviction is disabled. Delete the pod now and recreate it in
-			// the next resync.
-			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got evicted, deleting & next sync loop will restart pod")
+		case pj.Status.PodRevivalCount >= *r.config().Plank.MaxRevivals:
+			// MaxRevivals is reached, complete the PJ and mark it as errored.
+			r.log.WithField("unexpected-stop-cause", podUnexpectedStopCause).WithFields(pjutil.ProwJobFields(pj)).Info("Pod Node reached max retries, fail job.")
+			pj.SetComplete()
+			pj.Status.State = prowv1.ErrorState
+			pj.Status.Description = fmt.Sprintf("Job pod reached max revivals (%d) after being stopped unexpectedly (%s)", pj.Status.PodRevivalCount, podUnexpectedStopCause)
+		default:
+			// Update the revival count and delete the pod so it gets recreated in the next resync.
+			pj.Status.PodRevivalCount++
+			r.log.
+				WithField("unexpected-stop-cause", podUnexpectedStopCause).
+				WithFields(pjutil.ProwJobFields(pj)).
+				Info("Pod has stopped unexpectedly, deleting & next sync loop will restart pod")
+
 			client, ok := r.buildClients[pj.ClusterAlias()]
 			if !ok {
-				return nil, TerminalError(fmt.Errorf("evicted pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
+				return nil, TerminalError(fmt.Errorf("pod %s which was stopped unexpectedly (%s): unknown cluster alias %q", pod.Name, podUnexpectedStopCause, pj.ClusterAlias()))
 			}
-			if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
+			if finalizers := sets.New(pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
 				// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
 				oldPod := pod.DeepCopy()
 				pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
 				if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
 					return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
 				}
 			}
-			r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
-			return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
-		}
-	} else if pod.DeletionTimestamp != nil && pod.Status.Reason == NodeUnreachablePodReason {
-		// This can happen in any phase and means the node got evicted after it became unresponsive. Delete the finalizer so the pod
-		// vanishes and we will silently re-create it in the next iteration.
-		r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got lost, deleting & next sync loop will restart pod")
-		client, ok := r.buildClients[pj.ClusterAlias()]
-		if !ok {
-			return nil, TerminalError(fmt.Errorf("unknown pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
-		}
 
-		if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
-			// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
-			oldPod := pod.DeepCopy()
-			pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
-			if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
-				return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
-			}
-		}
-
-		return nil, nil
-	} else {
-		switch pod.Status.Phase {
-		case corev1.PodUnknown:
-			// Pod is in Unknown state. This can happen if there is a problem with
-			// the node. Delete the old pod, this will fire an event that triggers
-			// a new reconciliation in which we will re-create the pod.
-			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod is in unknown state, deleting & restarting pod")
-			client, ok := r.buildClients[pj.ClusterAlias()]
-			if !ok {
-				return nil, TerminalError(fmt.Errorf("unknown pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
+			// Pod is already deleted, so we don't need to delete it again.
+			if pod.DeletionTimestamp != nil {
+				return nil, nil
 			}
 
-			if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
-				// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
-				oldPod := pod.DeepCopy()
-				pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
-				if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
-					return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
-				}
-			}
 			r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
 			return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
-
+		}
+	} else {
+		switch pod.Status.Phase {
 		case corev1.PodSucceeded:
 			pj.SetComplete()
 			// There were bugs around this in the past so be paranoid and verify each container
@@ -679,6 +655,31 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*reconcile.Result, error) {
 	return nil, nil
 }
 
+type PodUnexpectedStopCause string
+
+const (
+	PodUnexpectedStopCauseNone        PodUnexpectedStopCause = ""
+	PodUnexpectedStopCauseUnknown     PodUnexpectedStopCause = "unknown"
+	PodUnexpectedStopCauseEvicted     PodUnexpectedStopCause = "evicted"
+	PodUnexpectedStopCauseUnreachable PodUnexpectedStopCause = "unreachable"
+)
+
+func getPodUnexpectedStopCause(pod *corev1.Pod) PodUnexpectedStopCause {
+	if pod.Status.Reason == Evicted {
+		return PodUnexpectedStopCauseEvicted
+	}
+
+	if pod.Status.Reason == NodeUnreachablePodReason && pod.DeletionTimestamp != nil {
+		return PodUnexpectedStopCauseUnreachable
+	}
+
+	if pod.Status.Phase == corev1.PodUnknown {
+		return PodUnexpectedStopCauseUnknown
+	}
+
+	return PodUnexpectedStopCauseNone
+}
+
 // syncTriggeredJob syncs jobs that do not yet have an associated test workload running
 func (r *reconciler) syncTriggeredJob(ctx context.Context, pj *prowv1.ProwJob) (*reconcile.Result, error) {
 	prevPJ := pj.DeepCopy()
