Skip to content

Commit b2536c8

Browse files
committed
Revive prowjob in case of a node being terminated.
Signed-off-by: Tim Ramlot <[email protected]>
1 parent 9b3f5fa commit b2536c8

File tree

9 files changed

+171
-19
lines changed

9 files changed

+171
-19
lines changed

config/prow/cluster/prowjob-crd/prowjob_customresourcedefinition.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1584,6 +1584,13 @@ spec:
15841584
If this field is unspecified or false, a new pod will be created to replace
15851585
the evicted one.
15861586
type: boolean
1587+
error_on_termination:
1588+
description: |-
1589+
ErrorOnTermination indicates that the ProwJob should be completed and given
1590+
the ErrorState status if the pod that is executing the job is terminated.
1591+
If this field is unspecified or false, a new pod will be created to replace
1592+
the terminated one.
1593+
type: boolean
15871594
extra_refs:
15881595
description: |-
15891596
ExtraRefs are auxiliary repositories that

pkg/apis/prowjobs/v1/types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,11 @@ type ProwJobSpec struct {
182182
// If this field is unspecified or false, a new pod will be created to replace
183183
// the evicted one.
184184
ErrorOnEviction bool `json:"error_on_eviction,omitempty"`
185+
// ErrorOnTermination indicates that the ProwJob should be completed and given
186+
// the ErrorState status if the pod that is executing the job is terminated.
187+
// If this field is unspecified or false, a new pod will be created to replace
188+
// the terminated one.
189+
ErrorOnTermination bool `json:"error_on_termination,omitempty"`
185190

186191
// PodSpec provides the basis for running the test under
187192
// a Kubernetes agent

pkg/config/config.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -661,6 +661,13 @@ type Plank struct {
661661
// Defaults to 3. A value of 0 means no retries.
662662
MaxRevivals *int `json:"max_revivals,omitempty"`
663663

664+
// TerminationConditionReasons is a set of pod status condition reasons on which the
665+
// controller will match to determine if the pod's node is being terminated. If a node is being
666+
// terminated the controller will restart the prowjob, unless the ErrorOnTermination option
667+
// is set on the prowjob or the MaxRevivals option is reached.
668+
// Defaults to ["DeletionByPodGC", "DeletionByGCPControllerManager"].
669+
TerminationConditionReasons []string `json:"termination_condition_reasons,omitempty"`
670+
664671
// DefaultDecorationConfigs holds the default decoration config for specific values.
665672
//
666673
// Each entry in the slice specifies Repo and Cluster regexp filter fields to
@@ -2514,6 +2521,19 @@ func parseProwConfig(c *Config) error {
25142521
c.Plank.MaxRevivals = &maxRetries
25152522
}
25162523

2524+
if c.Plank.TerminationConditionReasons == nil {
2525+
c.Plank.TerminationConditionReasons = []string{
2526+
// If the node no longer exists and the pod gets garbage collected,
2527+
// this condition will be set:
2528+
// https://kubernetes.io/docs/concepts/workloads/pods/disruptions/#pod-disruption-conditions
2529+
"DeletionByPodGC",
2530+
// On GCP, before a new spot instance is started, the old pods are garbage
2531+
// collected (if they have not been already by the Kubernetes PodGC):
2532+
// https://github.com/kubernetes/cloud-provider-gcp/blob/25e5dcc715781316bc5e39f8b17c0d5b313453f7/cmd/gcp-controller-manager/node_csr_approver.go#L1035-L1058
2533+
"DeletionByGCPControllerManager",
2534+
}
2535+
}
2536+
25172537
if err := c.Gerrit.DefaultAndValidate(); err != nil {
25182538
return fmt.Errorf("validating gerrit config: %w", err)
25192539
}
@@ -2913,6 +2933,8 @@ func validateAgent(v JobBase, podNamespace string) error {
29132933
return fmt.Errorf("decoration requires agent: %s (found %q)", k, agent)
29142934
case v.ErrorOnEviction && agent != k:
29152935
return fmt.Errorf("error_on_eviction only applies to agent: %s (found %q)", k, agent)
2936+
case v.ErrorOnTermination && agent != k:
2937+
return fmt.Errorf("error_on_termination only applies to agent: %s (found %q)", k, agent)
29162938
case v.Namespace == nil || *v.Namespace == "":
29172939
return fmt.Errorf("failed to default namespace")
29182940
case *v.Namespace != podNamespace && agent != p:

pkg/config/config_test.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1428,6 +1428,13 @@ func TestValidateAgent(t *testing.T) {
14281428
},
14291429
pass: true,
14301430
},
1431+
{
1432+
name: "error_on_termination allowed for kubernetes agent",
1433+
base: func(j *JobBase) {
1434+
j.ErrorOnTermination = true
1435+
},
1436+
pass: true,
1437+
},
14311438
}
14321439

14331440
for _, tc := range cases {
@@ -8430,6 +8437,9 @@ plank:
84308437
pod_pending_timeout: 10m0s
84318438
pod_running_timeout: 48h0m0s
84328439
pod_unscheduled_timeout: 5m0s
8440+
termination_condition_reasons:
8441+
- DeletionByPodGC
8442+
- DeletionByGCPControllerManager
84338443
pod_namespace: default
84348444
prowjob_namespace: default
84358445
push_gateway:
@@ -8515,6 +8525,9 @@ plank:
85158525
pod_pending_timeout: 10m0s
85168526
pod_running_timeout: 48h0m0s
85178527
pod_unscheduled_timeout: 5m0s
8528+
termination_condition_reasons:
8529+
- DeletionByPodGC
8530+
- DeletionByGCPControllerManager
85188531
pod_namespace: default
85198532
prowjob_namespace: default
85208533
push_gateway:
@@ -8593,6 +8606,9 @@ plank:
85938606
pod_pending_timeout: 10m0s
85948607
pod_running_timeout: 48h0m0s
85958608
pod_unscheduled_timeout: 5m0s
8609+
termination_condition_reasons:
8610+
- DeletionByPodGC
8611+
- DeletionByGCPControllerManager
85968612
pod_namespace: default
85978613
prowjob_namespace: default
85988614
push_gateway:
@@ -8676,6 +8692,9 @@ plank:
86768692
pod_pending_timeout: 10m0s
86778693
pod_running_timeout: 48h0m0s
86788694
pod_unscheduled_timeout: 5m0s
8695+
termination_condition_reasons:
8696+
- DeletionByPodGC
8697+
- DeletionByGCPControllerManager
86798698
pod_namespace: default
86808699
prowjob_namespace: default
86818700
push_gateway:

pkg/config/jobs.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,11 @@ type JobBase struct {
109109
// If this field is unspecified or false, a new pod will be created to replace
110110
// the evicted one.
111111
ErrorOnEviction bool `json:"error_on_eviction,omitempty"`
112+
// ErrorOnTermination indicates that the ProwJob should be completed and given
113+
// the ErrorState status if the pod that is executing the job is terminated.
114+
// If this field is unspecified or false, a new pod will be created to replace
115+
// the terminated one.
116+
ErrorOnTermination bool `json:"error_on_termination,omitempty"`
112117
// SourcePath contains the path where this job is defined
113118
SourcePath string `json:"-"`
114119
// Spec is the Kubernetes pod spec used if Agent is kubernetes.

pkg/config/prow-config-documented.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1321,6 +1321,13 @@ plank:
13211321
# Use `org/repo`, `org` or `*` as a key.
13221322
report_templates:
13231323
"": ""
1324+
# TerminationConditionReasons is a set of pod status condition reasons on which the
1325+
# controller will match to determine if the pod's node is being terminated. If a node is being
1326+
# terminated the controller will restart the prowjob, unless the ErrorOnTermination option
1327+
# is set on the prowjob or the MaxRevivals option is reached.
1328+
# Defaults to ["DeletionByPodGC", "DeletionByGCPControllerManager"].
1329+
termination_condition_reasons:
1330+
- ""
13241331
# PodNamespace is the namespace in the cluster that prow
13251332
# components will use for looking up Pods owned by ProwJobs.
13261333
# The namespace needs to exist and will not be created by prow.

pkg/pjutil/pjutil.go

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -228,12 +228,13 @@ func specFromJobBase(jb config.JobBase) prowapi.ProwJobSpec {
228228
namespace = *jb.Namespace
229229
}
230230
return prowapi.ProwJobSpec{
231-
Job: jb.Name,
232-
Agent: prowapi.ProwJobAgent(jb.Agent),
233-
Cluster: jb.Cluster,
234-
Namespace: namespace,
235-
MaxConcurrency: jb.MaxConcurrency,
236-
ErrorOnEviction: jb.ErrorOnEviction,
231+
Job: jb.Name,
232+
Agent: prowapi.ProwJobAgent(jb.Agent),
233+
Cluster: jb.Cluster,
234+
Namespace: namespace,
235+
MaxConcurrency: jb.MaxConcurrency,
236+
ErrorOnEviction: jb.ErrorOnEviction,
237+
ErrorOnTermination: jb.ErrorOnTermination,
237238

238239
ExtraRefs: DecorateExtraRefs(jb.ExtraRefs, jb),
239240
DecorationConfig: jb.DecorationConfig,

pkg/plank/controller_test.go

Lines changed: 76 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,10 @@ const (
6565
podDeletionPreventionFinalizer = "keep-from-vanishing"
6666
)
6767

68-
var maxRevivals = 3
68+
var (
69+
maxRevivals = 3
70+
terminationConditionReasons = []string{"DeletionByPodGC", "DeletionByGCPControllerManager"}
71+
)
6972

7073
func newFakeConfigAgent(t *testing.T, maxConcurrency int, queueCapacities map[string]int) *fca {
7174
presubmits := []config.Presubmit{
@@ -104,11 +107,12 @@ func newFakeConfigAgent(t *testing.T, maxConcurrency int, queueCapacities map[st
104107
MaxConcurrency: maxConcurrency,
105108
MaxGoroutines: 20,
106109
},
107-
JobQueueCapacities: queueCapacities,
108-
PodPendingTimeout: &metav1.Duration{Duration: podPendingTimeout},
109-
PodRunningTimeout: &metav1.Duration{Duration: podRunningTimeout},
110-
PodUnscheduledTimeout: &metav1.Duration{Duration: podUnscheduledTimeout},
111-
MaxRevivals: &maxRevivals,
110+
JobQueueCapacities: queueCapacities,
111+
PodPendingTimeout: &metav1.Duration{Duration: podPendingTimeout},
112+
PodRunningTimeout: &metav1.Duration{Duration: podRunningTimeout},
113+
PodUnscheduledTimeout: &metav1.Duration{Duration: podUnscheduledTimeout},
114+
MaxRevivals: &maxRevivals,
115+
TerminationConditionReasons: terminationConditionReasons,
112116
},
113117
},
114118
JobConfig: config.JobConfig{
@@ -1217,10 +1221,7 @@ func TestSyncPendingJob(t *testing.T) {
12171221
ExpectedURL: "boop-42/error",
12181222
},
12191223
{
1220-
// TODO: this test case tests the current behavior, but the behavior
1221-
// is non-ideal: the pod execution did not fail, instead the node on which
1222-
// the pod was running terminated
1223-
Name: "a terminated pod is handled as-if it failed",
1224+
Name: "delete terminated pod",
12241225
PJ: prowapi.ProwJob{
12251226
ObjectMeta: metav1.ObjectMeta{
12261227
Name: "boop-42",
@@ -1246,8 +1247,72 @@ func TestSyncPendingJob(t *testing.T) {
12461247
},
12471248
},
12481249
},
1250+
ExpectedComplete: false,
1251+
ExpectedState: prowapi.PendingState,
1252+
ExpectedNumPods: 0,
1253+
},
1254+
{
1255+
Name: "delete terminated pod and remove its k8sreporter finalizer",
1256+
PJ: prowapi.ProwJob{
1257+
ObjectMeta: metav1.ObjectMeta{
1258+
Name: "boop-42",
1259+
Namespace: "prowjobs",
1260+
},
1261+
Spec: prowapi.ProwJobSpec{
1262+
PodSpec: &v1.PodSpec{Containers: []v1.Container{{Name: "test-name", Env: []v1.EnvVar{}}}},
1263+
},
1264+
Status: prowapi.ProwJobStatus{
1265+
State: prowapi.PendingState,
1266+
PodName: "boop-42",
1267+
},
1268+
},
1269+
Pods: []v1.Pod{
1270+
{
1271+
ObjectMeta: metav1.ObjectMeta{
1272+
Name: "boop-42",
1273+
Namespace: "pods",
1274+
Finalizers: []string{"prow.x-k8s.io/gcsk8sreporter"},
1275+
},
1276+
Status: v1.PodStatus{
1277+
Phase: v1.PodFailed,
1278+
Reason: Terminated,
1279+
},
1280+
},
1281+
},
1282+
ExpectedComplete: false,
1283+
ExpectedState: prowapi.PendingState,
1284+
ExpectedNumPods: 0,
1285+
},
1286+
{
1287+
Name: "don't delete terminated pod w/ error_on_termination, complete PJ instead",
1288+
PJ: prowapi.ProwJob{
1289+
ObjectMeta: metav1.ObjectMeta{
1290+
Name: "boop-42",
1291+
Namespace: "prowjobs",
1292+
},
1293+
Spec: prowapi.ProwJobSpec{
1294+
ErrorOnTermination: true,
1295+
PodSpec: &v1.PodSpec{Containers: []v1.Container{{Name: "test-name", Env: []v1.EnvVar{}}}},
1296+
},
1297+
Status: prowapi.ProwJobStatus{
1298+
State: prowapi.PendingState,
1299+
PodName: "boop-42",
1300+
},
1301+
},
1302+
Pods: []v1.Pod{
1303+
{
1304+
ObjectMeta: metav1.ObjectMeta{
1305+
Name: "boop-42",
1306+
Namespace: "pods",
1307+
},
1308+
Status: v1.PodStatus{
1309+
Phase: v1.PodFailed,
1310+
Reason: Terminated,
1311+
},
1312+
},
1313+
},
12491314
ExpectedComplete: true,
1250-
ExpectedState: prowapi.FailureState,
1315+
ExpectedState: prowapi.ErrorState,
12511316
ExpectedNumPods: 1,
12521317
ExpectedURL: "boop-42/error",
12531318
},

pkg/plank/reconciler.go

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"encoding/json"
2222
"errors"
2323
"fmt"
24+
"slices"
2425
"strings"
2526
"sync"
2627
"time"
@@ -469,14 +470,20 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
469470
pj.Status.PodName = pn
470471
r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod is missing, starting a new pod")
471472
}
472-
} else if podUnexpectedStopCause := getPodUnexpectedStopCause(pod); podUnexpectedStopCause != PodUnexpectedStopCauseNone {
473+
} else if podUnexpectedStopCause := getPodUnexpectedStopCause(pod, r.config().Plank.TerminationConditionReasons); podUnexpectedStopCause != PodUnexpectedStopCauseNone {
473474
switch {
474475
case podUnexpectedStopCause == PodUnexpectedStopCauseEvicted && pj.Spec.ErrorOnEviction:
475476
// ErrorOnEviction is enabled, complete the PJ and mark it as errored.
476477
r.log.WithField("error-on-eviction", true).WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got evicted, fail job.")
477478
pj.SetComplete()
478479
pj.Status.State = prowv1.ErrorState
479480
pj.Status.Description = "Job pod was evicted by the cluster."
481+
case podUnexpectedStopCause == PodUnexpectedStopCauseTerminated && pj.Spec.ErrorOnTermination:
482+
// ErrorOnTermination is enabled, complete the PJ and mark it as errored.
483+
r.log.WithField("error-on-termination", true).WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got terminated, fail job.")
484+
pj.SetComplete()
485+
pj.Status.State = prowv1.ErrorState
486+
pj.Status.Description = "Job pod's node was terminated."
480487
case pj.Status.PodRevivalCount >= *r.config().Plank.MaxRevivals:
481488
// MaxRevivals is reached, complete the PJ and mark it as errored.
482489
r.log.WithField("unexpected-stop-cause", podUnexpectedStopCause).WithFields(pjutil.ProwJobFields(pj)).Info("Pod Node reached max retries, fail job.")
@@ -661,14 +668,28 @@ const (
661668
PodUnexpectedStopCauseNone PodUnexpectedStopCause = ""
662669
PodUnexpectedStopCauseUnknown PodUnexpectedStopCause = "unknown"
663670
PodUnexpectedStopCauseEvicted PodUnexpectedStopCause = "evicted"
671+
PodUnexpectedStopCauseTerminated PodUnexpectedStopCause = "terminated"
664672
PodUnexpectedStopCauseUnreachable PodUnexpectedStopCause = "unreachable"
665673
)
666674

667-
func getPodUnexpectedStopCause(pod *corev1.Pod) PodUnexpectedStopCause {
675+
func getPodUnexpectedStopCause(pod *corev1.Pod, terminationConditionReasons []string) PodUnexpectedStopCause {
668676
if pod.Status.Reason == Evicted {
669677
return PodUnexpectedStopCauseEvicted
670678
}
671679

680+
// If there was a Graceful node shutdown, the Pod's status will have a
681+
// reason set to "Terminated":
682+
// https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown
683+
if pod.Status.Reason == Terminated {
684+
return PodUnexpectedStopCauseTerminated
685+
}
686+
687+
for _, condition := range pod.Status.Conditions {
688+
if slices.Contains(terminationConditionReasons, condition.Reason) {
689+
return PodUnexpectedStopCauseTerminated
690+
}
691+
}
692+
672693
if pod.Status.Reason == NodeUnreachablePodReason && pod.DeletionTimestamp != nil {
673694
return PodUnexpectedStopCauseUnreachable
674695
}

0 commit comments

Comments
 (0)