Commit 0ef7e5e

Correct the failure duration for unavailable checks (#2178)
1 parent ab5148d · commit 0ef7e5e

File tree: 2 files changed (+21, -10 lines)

e2e/fixtures/fixtures.go

Lines changed: 16 additions & 5 deletions
@@ -77,7 +77,8 @@ func CheckInvariant(
 	quit := make(chan struct{})
 	waitGroup.Add(1)
 	var failureStartTime time.Time
-	var failureDuration time.Duration
+	var currentFailureDuration time.Duration
+	var longestFailureDuration time.Duration
 
 	go func() {
 		defer waitGroup.Done()
@@ -92,14 +93,24 @@ func CheckInvariant(
 				last = err
 			}
 
-			failureDuration = time.Since(failureStartTime)
-			if failureDuration >= threshold {
+			currentFailureDuration = time.Since(failureStartTime)
+			if currentFailureDuration >= threshold {
 				log.Printf(
 					"invariant %s failed after: %v",
 					invariantName,
-					failureDuration.String(),
+					currentFailureDuration.String(),
 				)
 				testFailed = true
+
+				// If the current failure duration is longer than the longest failure duration
+				// update the longest failure duration. The longest failure duration is used to report
+				// the failure duration in cases where the cluster was unavailable longer than the
+				// threshold. If we are not setting this value, we could see cases where the cluster was
+				// unavailable longer than the threshold but the cluster recovered and therefore the
+				// error message is reporting the incorrect failure duration.
+				if currentFailureDuration > longestFailureDuration {
+					longestFailureDuration = currentFailureDuration
+				}
 			}
 			continue
 		}
@@ -118,7 +129,7 @@ func CheckInvariant(
 	close(quit)
 	waitGroup.Wait()
 	if testFailed {
-		return fmt.Errorf("invariant %s failed for %s", invariantName, failureDuration.String())
+		return fmt.Errorf("invariant %s failed for %s", invariantName, longestFailureDuration.String())
 	}
 	return nil
 })
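
What changed here: the loop previously reported failureDuration, which restarts with every new failure window, so if the cluster was unavailable for longer than the threshold but then recovered, the final error could report the length of a later, shorter window. The commit remembers the longest window seen (longestFailureDuration) and reports that instead. Below is a minimal, self-contained sketch of that pattern, assuming a hypothetical checkAvailable probe, threshold, and polling interval; the reset of failureStartTime on recovery is also an assumption about the surrounding (unshown) loop, so this is not the repository's actual fixture code.

// invariant_sketch.go: a minimal sketch of the "remember the longest failure
// window" pattern from the diff above. checkAvailable, threshold, and the
// polling/reset logic are hypothetical stand-ins, not the repository's code.
package main

import (
	"errors"
	"fmt"
	"math/rand"
	"time"
)

// checkAvailable is a placeholder invariant check that fails intermittently.
func checkAvailable() error {
	if rand.Intn(3) == 0 {
		return errors.New("cluster unavailable")
	}
	return nil
}

func main() {
	const threshold = 20 * time.Millisecond
	var failureStartTime time.Time
	var currentFailureDuration time.Duration
	var longestFailureDuration time.Duration
	testFailed := false

	deadline := time.Now().Add(500 * time.Millisecond)
	for time.Now().Before(deadline) {
		if err := checkAvailable(); err != nil {
			// Start a new failure window on the first error after a healthy period.
			if failureStartTime.IsZero() {
				failureStartTime = time.Now()
			}
			currentFailureDuration = time.Since(failureStartTime)
			if currentFailureDuration >= threshold {
				testFailed = true
				// Keep the worst window seen so far: the current window resets
				// once the check recovers, but the longest one must survive so
				// the final error reports the real outage length.
				if currentFailureDuration > longestFailureDuration {
					longestFailureDuration = currentFailureDuration
				}
			}
			time.Sleep(5 * time.Millisecond)
			continue
		}
		// The check recovered: close the current failure window.
		failureStartTime = time.Time{}
		time.Sleep(5 * time.Millisecond)
	}

	if testFailed {
		fmt.Printf("invariant failed for %s\n", longestFailureDuration)
		return
	}
	fmt.Println("invariant held")
}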

e2e/test_operator_ha_upgrades/operator_ha_upgrade_test.go

Lines changed: 5 additions & 5 deletions
@@ -391,7 +391,7 @@ var _ = Describe("Operator HA Upgrades", Label("e2e", "pr"), func() {
 
 		// Keep deleting pods until all clusters are running with the new version.
 		clusters := fdbCluster.GetAllClusters()
-		Eventually(func() bool {
+		Eventually(func(g Gomega) bool {
 			coordinatorMap := map[k8sTypes.UID]corev1.Pod{}
 
 			// Are all clusters running at "targetVersion"?
@@ -405,7 +405,7 @@ var _ = Describe("Operator HA Upgrades", Label("e2e", "pr"), func() {
 
 				if dbCluster.Status.RunningVersion == targetVersion {
 					log.Println(
-						"Cluster ",
+						"Cluster",
 						cluster.Name(),
 						"is running at version ",
 						targetVersion,
@@ -424,17 +424,17 @@ var _ = Describe("Operator HA Upgrades", Label("e2e", "pr"), func() {
 			randomCluster := factory.RandomPickOneCluster(clusters)
 			// Make sure we are not deleting coordinator Pods
 			var randomPod *corev1.Pod
-			Eventually(func() bool {
+			g.Eventually(func() bool {
 				randomPod = factory.ChooseRandomPod(randomCluster.GetPods())
 				_, ok := coordinatorMap[randomPod.UID]
 				if ok {
-					log.Println("Skipping pod: ", randomPod.Name, "as it is a coordinator")
+					log.Println("Skipping pod:", randomPod.Name, "as it is a coordinator")
 				}
 
 				return ok
 			}).WithTimeout(2 * time.Minute).WithPolling(1 * time.Second).Should(BeFalse())
 
-			log.Println("Deleting pod: ", randomPod.Name)
+			log.Println("Deleting pod:", randomPod.Name)
 			factory.DeletePod(randomPod)
 			return false
 		}).WithTimeout(30 * time.Minute).WithPolling(2 * time.Minute).Should(BeTrue())
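
The test change has two parts: log-message spacing cleanup, and switching the inner poll to g.Eventually on the Gomega instance that the outer Eventually now passes into its function (func(g Gomega) bool). Assertions made through that passed-in g fail only the current attempt of the outer poll, which then retries, instead of registering an immediate failure against the suite-level Gomega. A minimal sketch of that nested-polling pattern follows; the isUpgraded helper, timeouts, and suite wiring are hypothetical and only illustrate the shape of the change, not the operator's actual test.

// nested_eventually_sketch_test.go: a small Ginkgo/Gomega sketch of the
// nested-polling pattern used in the diff above. isUpgraded and the timeouts
// are hypothetical stand-ins for illustration only.
package sketch_test

import (
	"testing"
	"time"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
)

func TestSketch(t *testing.T) {
	RegisterFailHandler(Fail)
	RunSpecs(t, "nested Eventually sketch")
}

var attempts int

// isUpgraded stands in for "are all clusters on the target version yet?".
func isUpgraded() bool {
	attempts++
	return attempts > 3
}

var _ = Describe("nested polling", func() {
	It("retries the inner poll on every outer attempt", func() {
		Eventually(func(g Gomega) bool {
			// The inner poll is bound to g: if it times out, only this outer
			// attempt fails and the outer Eventually keeps retrying, instead
			// of failing the whole spec immediately.
			g.Eventually(isUpgraded).
				WithTimeout(2 * time.Second).
				WithPolling(100 * time.Millisecond).
				Should(BeTrue())

			return true
		}).WithTimeout(30 * time.Second).WithPolling(time.Second).Should(BeTrue())
	})
})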
