Skip to content

Commit 2d9f9e4

Browse files
committed
Only use seconds since last recovery without the uptime seconds of the individual processes
1 parent 735f892 commit 2d9f9e4

File tree

5 files changed

+49
-54
lines changed

5 files changed

+49
-54
lines changed

controllers/change_coordinators.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,6 @@ func (c changeCoordinators) reconcile(
110110
status,
111111
r.MinimumUptimeForCoordinatorChangeWithMissingProcess,
112112
r.MinimumUptimeForCoordinatorChangeWithUndesiredProcess,
113-
r.EnableRecoveryState,
114113
)
115114
if err != nil {
116115
logger.Info("Deferring coordinator change due to safety check", "error", err.Error())

controllers/change_coordinators_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,7 @@ var _ = Describe("Change coordinators", func() {
243243
Expect(requeue.curError).To(HaveOccurred())
244244
Expect(
245245
requeue.curError.Error(),
246-
).To(Equal("cannot change coordinators: cluster is not up for long enough, cluster minimum uptime is 10.00 seconds but 300.00 seconds required for safe coordinator change"))
246+
).To(Equal("cannot: change coordinators: cluster is not up for long enough, clusters last recovery was 10.00 seconds ago, wait until the last recovery was 300 seconds ago"))
247247
Expect(cluster.Status.ConnectionString).To(Equal(originalConnectionString))
248248
})
249249
})
@@ -319,7 +319,7 @@ var _ = Describe("Change coordinators", func() {
319319
Expect(requeue.curError).To(HaveOccurred())
320320
Expect(
321321
requeue.curError.Error(),
322-
).To(Equal("cannot change coordinators: cluster has 1 missing coordinators, cluster minimum uptime is 10.00 seconds but 600.00 seconds required for safe coordinator change"))
322+
).To(Equal("cannot: change coordinators: cluster has 1 missing coordinators, clusters last recovery was 10.00 seconds ago, wait until the last recovery was 600 seconds ago"))
323323
Expect(cluster.Status.ConnectionString).To(Equal(originalConnectionString))
324324
})
325325
})

e2e/fixtures/fdb_operator_client.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -505,6 +505,8 @@ spec:
505505
- --cluster-label-key-for-node-trigger=foundationdb.org/fdb-cluster-name
506506
- --enable-node-index
507507
- --replace-on-security-context-change
508+
- --minimum-uptime-for-coordinator-change-with-undesired-process=20s
509+
- --minimum-uptime-for-coordinator-change-with-missing-process=10s
508510
`
509511
)
510512

pkg/fdbstatus/status_checks.go

Lines changed: 41 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ func GetCoordinatorsFromStatus(status *fdbv1beta2.FoundationDBStatus) map[string
314314
return coordinators
315315
}
316316

317-
// GetMinimumUptimeAndAddressMap returns address map of the processes included the the foundationdb status. The minimum
317+
// GetMinimumUptimeAndAddressMap returns address map of the processes included the foundationdb status. The minimum
318318
// uptime will be either secondsSinceLastRecovered if the recovery state is supported and enabled otherwise we will
319319
// take the minimum uptime of all processes.
320320
func GetMinimumUptimeAndAddressMap(
@@ -631,24 +631,11 @@ func canSafelyExcludeOrIncludeProcesses(
631631
return err
632632
}
633633

634-
version, err := fdbv1beta2.ParseFdbVersion(cluster.GetRunningVersion())
634+
err = checkLastClusterRecovery(cluster, status, minRecoverySeconds, action)
635635
if err != nil {
636636
return err
637637
}
638638

639-
if version.SupportsRecoveryState() {
640-
// We want to make sure that the cluster is recovered for some time. This should protect the cluster from
641-
// getting into a bad state as a result of frequent inclusions/exclusions.
642-
if status.Cluster.RecoveryState.SecondsSinceLastRecovered < minRecoverySeconds {
643-
return fmt.Errorf(
644-
"cannot: %s, clusters last recovery was %0.2f seconds ago, wait until the last recovery was %0.0f seconds ago",
645-
action,
646-
status.Cluster.RecoveryState.SecondsSinceLastRecovered,
647-
minRecoverySeconds,
648-
)
649-
}
650-
}
651-
652639
// In the case of inclusions we also want to make sure we only change the list of excluded server if the cluster is
653640
// in a good shape, otherwise the CC might crash: https://github.com/apple/foundationdb/blob/release-7.1/fdbserver/ClusterRecovery.actor.cpp#L575-L579
654641
if inclusion && !recoveryStateAllowsInclusion(status) {
@@ -846,32 +833,19 @@ func ClusterIsConfigured(
846833
// with more information why it's not safe to change coordinators. This function differentiates between missing (down)
847834
// processes and processes that are only excluded or undesired, applying different minimum uptime requirements for each case.
848835
func CanSafelyChangeCoordinators(
849-
logger logr.Logger,
836+
_ logr.Logger,
850837
cluster *fdbv1beta2.FoundationDBCluster,
851838
status *fdbv1beta2.FoundationDBStatus,
852839
minimumUptimeForMissing time.Duration,
853840
minimumUptimeForExcluded time.Duration,
854-
recoveryStateEnabled bool,
855841
) error {
856-
// TODO double check setting here + true
857-
currentMinimumUptime, _, err := GetMinimumUptimeAndAddressMap(
858-
logger,
859-
cluster,
860-
status,
861-
recoveryStateEnabled,
862-
)
863-
if err != nil {
864-
return fmt.Errorf("failed to get minimum uptime: %w", err)
865-
}
866-
867842
// Analyze current coordinators using the coordinator information from status
868843
// This gives us the definitive list of coordinators regardless of whether processes are running
869844
missingCoordinators := 0
870845

871846
// Create a map of process addresses to process group IDs for faster lookup
872847
coordinators := map[string]bool{}
873848
for _, coordinator := range status.Client.Coordinators.Coordinators {
874-
// TODO validate if logic will work with DNS names.
875849
coordinators[coordinator.Address.String()] = false
876850
}
877851

@@ -894,33 +868,59 @@ func CanSafelyChangeCoordinators(
894868
var requiredUptime float64
895869
var reason string
896870

897-
logger.V(1).
898-
Info("Checking if it is safe to change coordinators", "missingCoordinators", missingCoordinators, "currentMinimumUptime", currentMinimumUptime)
899871
if missingCoordinators > 0 {
900-
// Missing coordinators indicate processes that are down, use lower threshold
872+
// Missing coordinators indicate processes that are down, and we should be using a lower threshold to recover
873+
// from the missing coordinator faster.
901874
requiredUptime = minimumUptimeForMissing.Seconds()
902875
reason = fmt.Sprintf("cluster has %d missing coordinators", missingCoordinators)
903876
} else {
904877
requiredUptime = minimumUptimeForExcluded.Seconds()
905878
reason = "cluster is not up for long enough"
906879

907-
// Perform the default safet checks in case of "normal" coordinator changes or if processes are exclude. If
880+
// Perform the default safety checks in case of "normal" coordinator changes or if processes are excluded. If
908881
// the cluster has missing coordinators, we should bypass those checks to ensure we recruit the new coordinators
909882
// in a timely manner.
910-
err = DefaultSafetyChecks(status, 10, "change coordinators")
883+
err := DefaultSafetyChecks(status, 10, "change coordinators")
911884
if err != nil {
912885
return err
913886
}
914887
}
915888

916889
// Check that the cluster has been stable for the required time
917-
if currentMinimumUptime < requiredUptime {
918-
return fmt.Errorf(
919-
"cannot change coordinators: %s, cluster minimum uptime is %.2f seconds but %.2f seconds required for safe coordinator change",
920-
reason,
921-
currentMinimumUptime,
922-
requiredUptime,
923-
)
890+
return checkLastClusterRecovery(
891+
cluster,
892+
status,
893+
requiredUptime,
894+
fmt.Sprintf("change coordinators: %s", reason),
895+
)
896+
}
897+
898+
// checkLastClusterRecovery checks if the cluster is up long enough to perform the requested change. The uptime of the
899+
// cluster is calculated based on the status.Cluster.RecoveryState.SecondsSinceLastRecovered value. For clusters before
900+
// 7.1.22 this check will be ignored.
901+
// Will return an error with additional details if the cluster is not up long enough.
902+
func checkLastClusterRecovery(
903+
cluster *fdbv1beta2.FoundationDBCluster,
904+
status *fdbv1beta2.FoundationDBStatus,
905+
minRecoverySeconds float64,
906+
action string,
907+
) error {
908+
version, err := fdbv1beta2.ParseFdbVersion(cluster.GetRunningVersion())
909+
if err != nil {
910+
return err
911+
}
912+
913+
if version.SupportsRecoveryState() {
914+
// We want to make sure that the cluster is recovered for some time. This should protect the cluster from
915+
// getting into a bad state as a result of frequent inclusions/exclusions.
916+
if status.Cluster.RecoveryState.SecondsSinceLastRecovered < minRecoverySeconds {
917+
return fmt.Errorf(
918+
"cannot: %s, clusters last recovery was %0.2f seconds ago, wait until the last recovery was %0.0f seconds ago",
919+
action,
920+
status.Cluster.RecoveryState.SecondsSinceLastRecovered,
921+
minRecoverySeconds,
922+
)
923+
}
924924
}
925925

926926
return nil

pkg/fdbstatus/status_checks_test.go

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2627,7 +2627,6 @@ var _ = Describe("CanSafelyChangeCoordinators", func() {
26272627
status,
26282628
minimumUptimeForMissing,
26292629
minimumUptimeForUndesired,
2630-
true,
26312630
)
26322631
Expect(err).To(HaveOccurred())
26332632
Expect(err.Error()).To(ContainSubstring(expectedError))
@@ -2734,7 +2733,6 @@ var _ = Describe("CanSafelyChangeCoordinators", func() {
27342733
baseStatus,
27352734
minimumUptimeForMissing,
27362735
minimumUptimeForUndesired,
2737-
true,
27382736
)
27392737
Expect(err).NotTo(HaveOccurred())
27402738
})
@@ -2761,13 +2759,11 @@ var _ = Describe("CanSafelyChangeCoordinators", func() {
27612759
baseStatus,
27622760
minimumUptimeForMissing,
27632761
minimumUptimeForUndesired,
2764-
true,
27652762
)
27662763
Expect(err).To(HaveOccurred())
27672764
Expect(
27682765
err.Error(),
2769-
).To(Equal("cannot change coordinators: cluster is not up for long enough, cluster minimum uptime is 500.00 seconds but 600.00 seconds required for safe coordinator change"))
2770-
Expect(err.Error()).To(ContainSubstring("600.00 seconds required"))
2766+
).To(Equal("cannot: change coordinators: cluster is not up for long enough, clusters last recovery was 500.00 seconds ago, wait until the last recovery was 600 seconds ago"))
27712767
})
27722768

27732769
It("should succeed when uptime meets requirements for excluded coordinators", func() {
@@ -2792,7 +2788,6 @@ var _ = Describe("CanSafelyChangeCoordinators", func() {
27922788
baseStatus,
27932789
minimumUptimeForMissing,
27942790
minimumUptimeForUndesired,
2795-
true,
27962791
)
27972792
Expect(err).NotTo(HaveOccurred())
27982793
})
@@ -2809,7 +2804,6 @@ var _ = Describe("CanSafelyChangeCoordinators", func() {
28092804
baseStatus,
28102805
minimumUptimeForMissing,
28112806
minimumUptimeForUndesired,
2812-
true,
28132807
)
28142808
Expect(err).NotTo(HaveOccurred())
28152809
})
@@ -2826,11 +2820,12 @@ var _ = Describe("CanSafelyChangeCoordinators", func() {
28262820
baseStatus,
28272821
minimumUptimeForMissing,
28282822
minimumUptimeForUndesired,
2829-
true,
28302823
)
28312824
Expect(err).To(HaveOccurred())
28322825
Expect(err.Error()).To(ContainSubstring("missing coordinators"))
2833-
Expect(err.Error()).To(ContainSubstring("300.00 seconds required"))
2826+
Expect(
2827+
err.Error(),
2828+
).To(ContainSubstring("wait until the last recovery was 300 seconds ago"))
28342829
})
28352830
})
28362831

@@ -2913,7 +2908,6 @@ var _ = Describe("CanSafelyChangeCoordinators", func() {
29132908
status,
29142909
minimumUptimeForMissing,
29152910
minimumUptimeForUndesired,
2916-
true,
29172911
)
29182912
Expect(err).NotTo(HaveOccurred())
29192913
},

0 commit comments

Comments
 (0)