Commit 0ef7e5e

Correct the failure duration for unavailable checks (#2178)
1 parent ab5148d · commit 0ef7e5e

File tree: 2 files changed (+21, -10 lines)

e2e/fixtures/fixtures.go

Lines changed: 16 additions & 5 deletions
@@ -77,7 +77,8 @@ func CheckInvariant(
 	quit := make(chan struct{})
 	waitGroup.Add(1)
 	var failureStartTime time.Time
-	var failureDuration time.Duration
+	var currentFailureDuration time.Duration
+	var longestFailureDuration time.Duration
 
 	go func() {
 		defer waitGroup.Done()
@@ -92,14 +93,24 @@ func CheckInvariant(
 				last = err
 			}
 
-			failureDuration = time.Since(failureStartTime)
-			if failureDuration >= threshold {
+			currentFailureDuration = time.Since(failureStartTime)
+			if currentFailureDuration >= threshold {
 				log.Printf(
 					"invariant %s failed after: %v",
 					invariantName,
-					failureDuration.String(),
+					currentFailureDuration.String(),
 				)
 				testFailed = true
+
+				// If the current failure duration is longer than the longest failure duration
+				// update the longest failure duration. The longest failure duration is used to report
+				// the failure duration in cases where the cluster was unavailable longer than the
+				// threshold. If we are not setting this value, we could see cases where the cluster was
+				// unavailable longer than the threshold but the cluster recovered and therefore the
+				// error message is reporting the incorrect failure duration.
+				if currentFailureDuration > longestFailureDuration {
+					longestFailureDuration = currentFailureDuration
+				}
 			}
 			continue
 		}
@@ -118,7 +129,7 @@ func CheckInvariant(
 	close(quit)
 	waitGroup.Wait()
 	if testFailed {
-		return fmt.Errorf("invariant %s failed for %s", invariantName, failureDuration.String())
+		return fmt.Errorf("invariant %s failed for %s", invariantName, longestFailureDuration.String())
 	}
 	return nil
 })
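
What changed here: the loop previously reported failureDuration, which restarts with every new failure window, so if the cluster was unavailable for longer than the threshold but then recovered, the final error could report the length of a later, shorter window. The commit remembers the longest window seen (longestFailureDuration) and reports that instead. Below is a minimal, self-contained sketch of that pattern, assuming a hypothetical checkAvailable probe, threshold, and polling interval; the reset of failureStartTime on recovery is also an assumption about the surrounding (unshown) loop, so this is not the repository's actual fixture code.

// invariant_sketch.go: a minimal sketch of the "remember the longest failure
// window" pattern from the diff above. checkAvailable, threshold, and the
// polling/reset logic are hypothetical stand-ins, not the repository's code.
package main

import (
	"errors"
	"fmt"
	"math/rand"
	"time"
)

// checkAvailable is a placeholder invariant check that fails intermittently.
func checkAvailable() error {
	if rand.Intn(3) == 0 {
		return errors.New("cluster unavailable")
	}
	return nil
}

func main() {
	const threshold = 20 * time.Millisecond
	var failureStartTime time.Time
	var currentFailureDuration time.Duration
	var longestFailureDuration time.Duration
	testFailed := false

	deadline := time.Now().Add(500 * time.Millisecond)
	for time.Now().Before(deadline) {
		if err := checkAvailable(); err != nil {
			// Start a new failure window on the first error after a healthy period.
			if failureStartTime.IsZero() {
				failureStartTime = time.Now()
			}
			currentFailureDuration = time.Since(failureStartTime)
			if currentFailureDuration >= threshold {
				testFailed = true
				// Keep the worst window seen so far: the current window resets
				// once the check recovers, but the longest one must survive so
				// the final error reports the real outage length.
				if currentFailureDuration > longestFailureDuration {
					longestFailureDuration = currentFailureDuration
				}
			}
			time.Sleep(5 * time.Millisecond)
			continue
		}
		// The check recovered: close the current failure window.
		failureStartTime = time.Time{}
		time.Sleep(5 * time.Millisecond)
	}

	if testFailed {
		fmt.Printf("invariant failed for %s\n", longestFailureDuration)
		return
	}
	fmt.Println("invariant held")
}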

e2e/test_operator_ha_upgrades/operator_ha_upgrade_test.go

Lines changed: 5 additions & 5 deletions
@@ -391,7 +391,7 @@ var _ = Describe("Operator HA Upgrades", Label("e2e", "pr"), func() {
 
 		// Keep deleting pods until all clusters are running with the new version.
 		clusters := fdbCluster.GetAllClusters()
-		Eventually(func() bool {
+		Eventually(func(g Gomega) bool {
 			coordinatorMap := map[k8sTypes.UID]corev1.Pod{}
 
 			// Are all clusters running at "targetVersion"?
@@ -405,7 +405,7 @@ var _ = Describe("Operator HA Upgrades", Label("e2e", "pr"), func() {
 
 				if dbCluster.Status.RunningVersion == targetVersion {
 					log.Println(
-						"Cluster ",
+						"Cluster",
 						cluster.Name(),
 						"is running at version ",
 						targetVersion,
@@ -424,17 +424,17 @@ var _ = Describe("Operator HA Upgrades", Label("e2e", "pr"), func() {
 			randomCluster := factory.RandomPickOneCluster(clusters)
 			// Make sure we are not deleting coordinator Pods
 			var randomPod *corev1.Pod
-			Eventually(func() bool {
+			g.Eventually(func() bool {
 				randomPod = factory.ChooseRandomPod(randomCluster.GetPods())
 				_, ok := coordinatorMap[randomPod.UID]
 				if ok {
-					log.Println("Skipping pod: ", randomPod.Name, "as it is a coordinator")
+					log.Println("Skipping pod:", randomPod.Name, "as it is a coordinator")
 				}
 
 				return ok
 			}).WithTimeout(2 * time.Minute).WithPolling(1 * time.Second).Should(BeFalse())
 
-			log.Println("Deleting pod: ", randomPod.Name)
+			log.Println("Deleting pod:", randomPod.Name)
 			factory.DeletePod(randomPod)
 			return false
 		}).WithTimeout(30 * time.Minute).WithPolling(2 * time.Minute).Should(BeTrue())
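
The test change has two parts: log-message spacing cleanup, and switching the inner poll to g.Eventually on the Gomega instance that the outer Eventually now passes into its function (func(g Gomega) bool). Assertions made through that passed-in g fail only the current attempt of the outer poll, which then retries, instead of registering an immediate failure against the suite-level Gomega. A minimal sketch of that nested-polling pattern follows; the isUpgraded helper, timeouts, and suite wiring are hypothetical and only illustrate the shape of the change, not the operator's actual test.

// nested_eventually_sketch_test.go: a small Ginkgo/Gomega sketch of the
// nested-polling pattern used in the diff above. isUpgraded and the timeouts
// are hypothetical stand-ins for illustration only.
package sketch_test

import (
	"testing"
	"time"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
)

func TestSketch(t *testing.T) {
	RegisterFailHandler(Fail)
	RunSpecs(t, "nested Eventually sketch")
}

var attempts int

// isUpgraded stands in for "are all clusters on the target version yet?".
func isUpgraded() bool {
	attempts++
	return attempts > 3
}

var _ = Describe("nested polling", func() {
	It("retries the inner poll on every outer attempt", func() {
		Eventually(func(g Gomega) bool {
			// The inner poll is bound to g: if it times out, only this outer
			// attempt fails and the outer Eventually keeps retrying, instead
			// of failing the whole spec immediately.
			g.Eventually(isUpgraded).
				WithTimeout(2 * time.Second).
				WithPolling(100 * time.Millisecond).
				Should(BeTrue())

			return true
		}).WithTimeout(30 * time.Second).WithPolling(time.Second).Should(BeTrue())
	})
})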
