@@ -60,6 +60,8 @@ import (
 
 const ControllerName = "plank"
 
+const MaxPodRetries = 3
+
 // PodStatus constants
 const (
 	Evicted = "Evicted"
@@ -462,104 +464,57 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
 			pj.Status.PodName = pn
 			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod is missing, starting a new pod")
 		}
-	} else if pod.Status.Reason == Evicted {
-		// Pod was evicted.
-		if pj.Spec.ErrorOnEviction {
-			// ErrorOnEviction is enabled, complete the PJ and mark it as
-			// errored.
+	} else if transientFailure := getTransientFailure(pod); transientFailure != PodTransientFailureNone {
+		switch {
+		case transientFailure == PodTransientFailureEvicted && pj.Spec.ErrorOnEviction:
+			// ErrorOnEviction is enabled, complete the PJ and mark it as errored.
 			r.log.WithField("error-on-eviction", true).WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got evicted, fail job.")
 			pj.SetComplete()
 			pj.Status.State = prowv1.ErrorState
 			pj.Status.Description = "Job pod was evicted by the cluster."
-		} else {
-			// ErrorOnEviction is disabled. Delete the pod now and recreate it in
-			// the next resync.
-			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got evicted, deleting & next sync loop will restart pod")
-			client, ok := r.buildClients[pj.ClusterAlias()]
-			if !ok {
-				return nil, TerminalError(fmt.Errorf("evicted pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
-			}
-			if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
-				// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
-				oldPod := pod.DeepCopy()
-				pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
-				if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
-					return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
-				}
-			}
-			r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
-			return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
-		}
-	} else if isPodTerminated(pod) {
-		// Pod was terminated.
-		if pj.Spec.ErrorOnTermination {
-			// ErrorOnTermination is enabled, complete the PJ and mark it as
-			// errored.
+		case transientFailure == PodTransientFailureTerminated && pj.Spec.ErrorOnTermination:
+			// ErrorOnTermination is enabled, complete the PJ and mark it as errored.
 			r.log.WithField("error-on-termination", true).WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got terminated, fail job.")
 			pj.SetComplete()
 			pj.Status.State = prowv1.ErrorState
 			pj.Status.Description = "Job pod's node was terminated."
-		} else {
-			// ErrorOnTermination is disabled. Delete the pod now and recreate it in
-			// the next resync.
-			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got terminated, deleting & next sync loop will restart pod")
+		case pj.Status.RetryCount >= MaxPodRetries:
+			// MaxPodRetries is reached, complete the PJ and mark it as errored.
+			r.log.WithField("transient-failure", transientFailure).WithFields(pjutil.ProwJobFields(pj)).Info("Pod Node reached max retries, fail job.")
+			pj.SetComplete()
+			pj.Status.State = prowv1.ErrorState
+			pj.Status.Description = fmt.Sprintf("Job pod reached max retries (%d) for transient failure %s", MaxPodRetries, transientFailure)
+		default:
+			// Update the retry count and delete the pod so it gets recreated in the next resync.
+			pj.Status.RetryCount++
+			r.log.
+				WithField("transientFailure", transientFailure).
+				WithFields(pjutil.ProwJobFields(pj)).
+				Info("Pod has transient failure, deleting & next sync loop will restart pod")
+
 			client, ok := r.buildClients[pj.ClusterAlias()]
 			if !ok {
-				return nil, TerminalError(fmt.Errorf("terminated pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
+				return nil, TerminalError(fmt.Errorf("pod %s with transient failure %s: unknown cluster alias %q", pod.Name, transientFailure, pj.ClusterAlias()))
 			}
-			if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
+			if finalizers := sets.New(pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
 				// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
 				oldPod := pod.DeepCopy()
 				pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
 				if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
 					return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
 				}
 			}
-			r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
-			return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
-		}
-	} else if pod.DeletionTimestamp != nil && pod.Status.Reason == NodeUnreachablePodReason {
-		// This can happen in any phase and means the node got evicted after it became unresponsive. Delete the finalizer so the pod
-		// vanishes and we will silently re-create it in the next iteration.
-		r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got lost, deleting & next sync loop will restart pod")
-		client, ok := r.buildClients[pj.ClusterAlias()]
-		if !ok {
-			return nil, TerminalError(fmt.Errorf("unknown pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
-		}
-
-		if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
-			// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
-			oldPod := pod.DeepCopy()
-			pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
-			if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
-				return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
-			}
-		}
 
-		return nil, nil
-	} else {
-		switch pod.Status.Phase {
-		case corev1.PodUnknown:
-			// Pod is in Unknown state. This can happen if there is a problem with
-			// the node. Delete the old pod, this will fire an event that triggers
-			// a new reconciliation in which we will re-create the pod.
-			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod is in unknown state, deleting & restarting pod")
-			client, ok := r.buildClients[pj.ClusterAlias()]
-			if !ok {
-				return nil, TerminalError(fmt.Errorf("unknown pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
+			// Pod is already deleted, so we don't need to delete it again.
+			if pod.DeletionTimestamp != nil {
+				return nil, nil
 			}
 
-			if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
-				// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
-				oldPod := pod.DeepCopy()
-				pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
-				if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
-					return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
-				}
-			}
 			r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
 			return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
-
+		}
+	} else {
+		switch pod.Status.Phase {
 		case corev1.PodSucceeded:
 			pj.SetComplete()
 			// There were bugs around this in the past so be paranoid and verify each container
@@ -701,31 +656,53 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
 	return nil, nil
 }
 
-func isPodTerminated(pod *corev1.Pod) bool {
+type PodTransientFailure string
+
+const (
+	PodTransientFailureNone        PodTransientFailure = ""
+	PodTransientFailureUnknown     PodTransientFailure = "unknown"
+	PodTransientFailureEvicted     PodTransientFailure = "evicted"
+	PodTransientFailureTerminated  PodTransientFailure = "terminated"
+	PodTransientFailureUnreachable PodTransientFailure = "unreachable"
+)
+
+func getTransientFailure(pod *corev1.Pod) PodTransientFailure {
+	if pod.Status.Reason == Evicted {
+		return PodTransientFailureEvicted
+	}
+
 	// If there was a Graceful node shutdown, the Pod's status will have a
 	// reason set to "Terminated":
 	// https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown
 	if pod.Status.Reason == Terminated {
-		return true
+		return PodTransientFailureTerminated
 	}
 
 	for _, condition := range pod.Status.Conditions {
 		// If the node does no longer exist and the pod gets garbage collected,
 		// this condition will be set:
 		// https://kubernetes.io/docs/concepts/workloads/pods/disruptions/#pod-disruption-conditions
 		if condition.Reason == "DeletionByPodGC" {
-			return true
+			return PodTransientFailureTerminated
 		}
 
 		// On GCP, before a new spot instance is started, the old pods are garbage
 		// collected (if they have not been already by the Kubernetes PodGC):
 		// https://github.com/kubernetes/cloud-provider-gcp/blob/25e5dcc715781316bc5e39f8b17c0d5b313453f7/cmd/gcp-controller-manager/node_csr_approver.go#L1035-L1058
 		if condition.Reason == "DeletionByGCPControllerManager" {
-			return true
+			return PodTransientFailureTerminated
 		}
 	}
 
-	return false
+	if pod.Status.Reason == NodeUnreachablePodReason && pod.DeletionTimestamp != nil {
+		return PodTransientFailureUnreachable
+	}
+
+	if pod.Status.Phase == corev1.PodUnknown {
+		return PodTransientFailureUnknown
+	}
+
+	return PodTransientFailureNone
+}
 
 // syncTriggeredJob syncs jobs that do not yet have an associated test workload running
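
For illustration only, and not part of this commit: a minimal table-driven sketch of how the new transient-failure classification could be exercised. It assumes the test sits next to the reconciler in the plank package, so that getTransientFailure, the Evicted/Terminated reason constants, and NodeUnreachablePodReason are in scope; the test name and cases are hypothetical.

package plank

import (
	"testing"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Hypothetical test sketch: each case builds a Pod in one of the states the
// diff above classifies and checks the resulting PodTransientFailure value.
func TestGetTransientFailure(t *testing.T) {
	deleted := metav1.Now()
	cases := []struct {
		name string
		pod  corev1.Pod
		want PodTransientFailure
	}{
		{name: "evicted pod", pod: corev1.Pod{Status: corev1.PodStatus{Reason: Evicted}}, want: PodTransientFailureEvicted},
		{name: "graceful node shutdown", pod: corev1.Pod{Status: corev1.PodStatus{Reason: Terminated}}, want: PodTransientFailureTerminated},
		{
			name: "node unreachable and pod deleting",
			pod: corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{DeletionTimestamp: &deleted},
				Status:     corev1.PodStatus{Reason: NodeUnreachablePodReason},
			},
			want: PodTransientFailureUnreachable,
		},
		{name: "unknown phase", pod: corev1.Pod{Status: corev1.PodStatus{Phase: corev1.PodUnknown}}, want: PodTransientFailureUnknown},
		{name: "healthy running pod", pod: corev1.Pod{Status: corev1.PodStatus{Phase: corev1.PodRunning}}, want: PodTransientFailureNone},
	}
	for _, tc := range cases {
		if got := getTransientFailure(&tc.pod); got != tc.want {
			t.Errorf("%s: got %q, want %q", tc.name, got, tc.want)
		}
	}
}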