@@ -62,7 +62,8 @@ const ControllerName = "plank"
 
 // PodStatus constants
 const (
-	Evicted = "Evicted"
+	Evicted    = "Evicted"
+	Terminated = "Terminated"
 )
 
 // NodeStatus constants
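
Note: these PodStatus constants are plain strings matched against the free-form pod.Status.Reason field that the kubelet populates; for kubelet node-pressure evictions the pod ends up with Phase=Failed and Reason="Evicted". A minimal illustrative check, not taken from this commit (corev1 is k8s.io/api/core/v1):

// isEvicted reports whether the kubelet marked this pod as evicted.
// Illustrative sketch only, not part of this diff.
func isEvicted(pod *corev1.Pod) bool {
	return pod.Status.Phase == corev1.PodFailed && pod.Status.Reason == Evicted
}

The new Terminated constant follows the same pattern; its call sites are presumably elsewhere in the change, outside the hunks shown here.
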
@@ -468,76 +469,51 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*reconcile.Result, error) {
 			pj.Status.PodName = pn
 			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod is missing, starting a new pod")
 		}
-	} else if pod.Status.Reason == Evicted {
-		// Pod was evicted.
-		if pj.Spec.ErrorOnEviction {
-			// ErrorOnEviction is enabled, complete the PJ and mark it as
-			// errored.
+	} else if podUnexpectedStopCause := getPodUnexpectedStopCause(pod); podUnexpectedStopCause != PodUnexpectedStopCauseNone {
+		switch {
+		case podUnexpectedStopCause == PodUnexpectedStopCauseEvicted && pj.Spec.ErrorOnEviction:
+			// ErrorOnEviction is enabled, complete the PJ and mark it as errored.
 			r.log.WithField("error-on-eviction", true).WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got evicted, fail job.")
 			pj.SetComplete()
 			pj.Status.State = prowv1.ErrorState
 			pj.Status.Description = "Job pod was evicted by the cluster."
-		} else {
-			// ErrorOnEviction is disabled. Delete the pod now and recreate it in
-			// the next resync.
-			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got evicted, deleting & next sync loop will restart pod")
+		case pj.Status.PodRevivalCount >= *r.config().Plank.MaxRevivals:
+			// MaxRevivals is reached, complete the PJ and mark it as errored.
+			r.log.WithField("unexpected-stop-cause", podUnexpectedStopCause).WithFields(pjutil.ProwJobFields(pj)).Info("Pod reached max revivals, fail job.")
+			pj.SetComplete()
+			pj.Status.State = prowv1.ErrorState
+			pj.Status.Description = fmt.Sprintf("Job pod reached max revivals (%d) after being stopped unexpectedly (%s)", pj.Status.PodRevivalCount, podUnexpectedStopCause)
+		default:
+			// Update the revival count and delete the pod so it gets recreated in the next resync.
+			pj.Status.PodRevivalCount++
+			r.log.
+				WithField("unexpected-stop-cause", podUnexpectedStopCause).
+				WithFields(pjutil.ProwJobFields(pj)).
+				Info("Pod has stopped unexpectedly, deleting & next sync loop will restart pod")
+
 			client, ok := r.buildClients[pj.ClusterAlias()]
 			if !ok {
-				return nil, TerminalError(fmt.Errorf("evicted pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
+				return nil, TerminalError(fmt.Errorf("pod %s which was stopped unexpectedly (%s): unknown cluster alias %q", pod.Name, podUnexpectedStopCause, pj.ClusterAlias()))
 			}
-			if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
+			if finalizers := sets.New(pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
 				// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
 				oldPod := pod.DeepCopy()
 				pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
 				if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
 					return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
 				}
 			}
-			r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
-			return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
-		}
-	} else if pod.DeletionTimestamp != nil && pod.Status.Reason == NodeUnreachablePodReason {
-		// This can happen in any phase and means the node got evicted after it became unresponsive. Delete the finalizer so the pod
-		// vanishes and we will silently re-create it in the next iteration.
-		r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got lost, deleting & next sync loop will restart pod")
-		client, ok := r.buildClients[pj.ClusterAlias()]
-		if !ok {
-			return nil, TerminalError(fmt.Errorf("unknown pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
-		}
 
-		if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
-			// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
-			oldPod := pod.DeepCopy()
-			pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
-			if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
-				return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
-			}
-		}
-
-		return nil, nil
-	} else {
-		switch pod.Status.Phase {
-		case corev1.PodUnknown:
-			// Pod is in Unknown state. This can happen if there is a problem with
-			// the node. Delete the old pod, this will fire an event that triggers
-			// a new reconciliation in which we will re-create the pod.
-			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod is in unknown state, deleting & restarting pod")
-			client, ok := r.buildClients[pj.ClusterAlias()]
-			if !ok {
-				return nil, TerminalError(fmt.Errorf("unknown pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
+			// Pod is already deleted, so we don't need to delete it again.
+			if pod.DeletionTimestamp != nil {
+				return nil, nil
 			}
 
-			if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
-				// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
-				oldPod := pod.DeepCopy()
-				pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
-				if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
-					return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
-				}
-			}
 			r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
 			return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
-
+		}
+	} else {
+		switch pod.Status.Phase {
 		case corev1.PodSucceeded:
 			pj.SetComplete()
 			// There were bugs around this in the past so be paranoid and verify each container
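
The net effect of this rewrite: the three previously separate recovery paths (eviction, node lost, unknown phase) are folded into one branch that increments pj.Status.PodRevivalCount and recreates the pod until the configured Plank.MaxRevivals ceiling is reached, after which the job errors out. The old ErrorOnEviction fast-fail is preserved as the first switch case. MaxRevivals is dereferenced as *r.config().Plank.MaxRevivals, so it is a pointer field in the config, presumably given a default during config loading. A standalone sketch of the gating decision, with an illustrative helper name that does not appear in the diff (the real code inlines this as switch cases):

// shouldRevive reports whether a pod that stopped unexpectedly should be
// recreated, given how many times the job's pod was already revived.
// Illustrative sketch only, assuming prowv1 is the prow API package
// imported by this file.
func shouldRevive(pj *prowv1.ProwJob, maxRevivals int) bool {
	return pj.Status.PodRevivalCount < maxRevivals
}
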
@@ -679,6 +655,31 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*reconcile.Result, error) {
 	return nil, nil
 }
 
+type PodUnexpectedStopCause string
+
+const (
+	PodUnexpectedStopCauseNone        PodUnexpectedStopCause = ""
+	PodUnexpectedStopCauseUnknown     PodUnexpectedStopCause = "unknown"
+	PodUnexpectedStopCauseEvicted     PodUnexpectedStopCause = "evicted"
+	PodUnexpectedStopCauseUnreachable PodUnexpectedStopCause = "unreachable"
+)
+
+func getPodUnexpectedStopCause(pod *corev1.Pod) PodUnexpectedStopCause {
+	if pod.Status.Reason == Evicted {
+		return PodUnexpectedStopCauseEvicted
+	}
+
+	if pod.Status.Reason == NodeUnreachablePodReason && pod.DeletionTimestamp != nil {
+		return PodUnexpectedStopCauseUnreachable
+	}
+
+	if pod.Status.Phase == corev1.PodUnknown {
+		return PodUnexpectedStopCauseUnknown
+	}
+
+	return PodUnexpectedStopCauseNone
+}
+
 // syncTriggeredJob syncs jobs that do not yet have an associated test workload running
 func (r *reconciler) syncTriggeredJob(ctx context.Context, pj *prowv1.ProwJob) (*reconcile.Result, error) {
 	prevPJ := pj.DeepCopy()
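
A table-driven sketch of how the new helper could be unit-tested (illustrative, not part of this commit; it assumes the package-level Evicted and NodeUnreachablePodReason constants shown above). The check order matters: an evicted pod is classified as evicted even if it also carries a deletion timestamp, and the "unknown" cause is only reached for pods whose phase is PodUnknown without a more specific reason.

import (
	"testing"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func TestGetPodUnexpectedStopCause(t *testing.T) {
	now := metav1.Now()
	for _, tc := range []struct {
		name string
		pod  *corev1.Pod
		want PodUnexpectedStopCause
	}{
		{"evicted", &corev1.Pod{Status: corev1.PodStatus{Reason: Evicted}}, PodUnexpectedStopCauseEvicted},
		{"node unreachable, pod terminating", &corev1.Pod{
			ObjectMeta: metav1.ObjectMeta{DeletionTimestamp: &now},
			Status:     corev1.PodStatus{Reason: NodeUnreachablePodReason},
		}, PodUnexpectedStopCauseUnreachable},
		{"unknown phase", &corev1.Pod{Status: corev1.PodStatus{Phase: corev1.PodUnknown}}, PodUnexpectedStopCauseUnknown},
		{"healthy running pod", &corev1.Pod{Status: corev1.PodStatus{Phase: corev1.PodRunning}}, PodUnexpectedStopCauseNone},
	} {
		if got := getPodUnexpectedStopCause(tc.pod); got != tc.want {
			t.Errorf("%s: got %q, want %q", tc.name, got, tc.want)
		}
	}
}
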