@@ -30,6 +30,8 @@ import (
3030 "log"
3131 "time"
3232
33+ fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/api/v1beta2"
34+
3335 "github.com/FoundationDB/fdb-kubernetes-operator/e2e/fixtures"
3436 . "github.com/onsi/ginkgo/v2"
3537 . "github.com/onsi/gomega"
@@ -95,6 +97,9 @@ var _ = Describe("Operator Plugin", Label("e2e", "pr"), func() {
9597 remoteSatellite := fdbCluster .GetRemoteSatellite ()
9698 remoteSatellite .SetSkipReconciliation (true )
9799
100+ remote := fdbCluster .GetRemote ()
101+ remote .SetSkipReconciliation (true )
102+
98103 var wg errgroup.Group
99104 log .Println ("Delete Pods in primary" )
100105 wg .Go (func () error {
@@ -127,7 +132,6 @@ var _ = Describe("Operator Plugin", Label("e2e", "pr"), func() {
127132 // Wait a short amount of time to let the cluster see that the primary and primary satellite are down.
128133 time .Sleep (5 * time .Second )
129134
130- remote := fdbCluster .GetRemote ()
131135 // Ensure the cluster is unavailable.
132136 Eventually (func () bool {
133137 return remote .GetStatus ().Client .DatabaseStatus .Available
@@ -157,4 +161,104 @@ var _ = Describe("Operator Plugin", Label("e2e", "pr"), func() {
157161 }).WithTimeout (2 * time .Minute ).WithPolling (1 * time .Second ).Should (BeTrue ())
158162 })
159163 })
164+
165+ // TODO(johscheuer): Enable once https://github.com/FoundationDB/fdb-kubernetes-operator/issues/2153 is fixed. Until then this container is declared with PWhen instead of When.
166+ PWhen ("all Pods in the primary and satellites are down with" , func () {
167+ BeforeEach (func () {
168+ runningVersion := fdbCluster .GetPrimary ().GetCluster ().GetRunningVersion ()
169+ parsedVersion , err := fdbv1beta2 .ParseFdbVersion (runningVersion )
170+ Expect (err ).NotTo (HaveOccurred ())
171+
172+ if ! parsedVersion .SupportsDNSInClusterFile () {
173+ Skip (fmt .Sprintf ("Current FDB version: \" %s\" doesn't support DNS names in the cluster file" , runningVersion ))
174+ }
175+ })
176+
177+ When ("DNS names in the cluster file are supported" , func () {
178+ BeforeEach (func () {
179+ var errGroup errgroup.Group
180+ // Enable DNS names in the cluster file on every cluster returned by GetAllClusters, concurrently; any returned error fails the setup.
181+ for _ , cluster := range fdbCluster .GetAllClusters () {
182+ target := cluster
183+ errGroup .Go (func () error {
184+ return target .SetUseDNSInClusterFile (true )
185+ })
186+ }
187+ Expect (errGroup .Wait ()).NotTo (HaveOccurred ())
188+
189+ // This is a destructive test: the cluster will stop working for some period, so reconciliation is skipped for the primary, both satellites and the remote before any Pod is deleted.
190+ primary := fdbCluster .GetPrimary ()
191+ primary .SetSkipReconciliation (true )
192+
193+ primarySatellite := fdbCluster .GetPrimarySatellite ()
194+ primarySatellite .SetSkipReconciliation (true )
195+
196+ remoteSatellite := fdbCluster .GetRemoteSatellite ()
197+ remoteSatellite .SetSkipReconciliation (true )
198+
199+ remote := fdbCluster .GetRemote ()
200+ remote .SetSkipReconciliation (true )
201+
202+ var wg errgroup.Group
203+ log .Println ("Delete Pods in primary" )
204+ wg .Go (func () error {
205+ for _ , pod := range primary .GetPods ().Items {
206+ factory .DeletePod (& pod )
207+ }
208+
209+ return nil
210+ })
211+
212+ log .Println ("Delete Pods in primary satellite" )
213+ wg .Go (func () error {
214+ for _ , pod := range primarySatellite .GetPods ().Items {
215+ factory .DeletePod (& pod )
216+ }
217+
218+ return nil
219+ })
220+
221+ log .Println ("Delete Pods in remote satellite" )
222+ wg .Go (func () error {
223+ for _ , pod := range remoteSatellite .GetPods ().Items {
224+ factory .DeletePod (& pod )
225+ }
226+
227+ return nil
228+ })
229+
230+ Expect (wg .Wait ()).NotTo (HaveOccurred ())
231+ // Wait a short amount of time to let the cluster see that the primary and both satellites are down.
232+ time .Sleep (5 * time .Second )
233+
234+ // Ensure the cluster is unavailable: poll the status seen through the remote every second for up to 2 minutes.
235+ Eventually (func () bool {
236+ return remote .GetStatus ().Client .DatabaseStatus .Available
237+ }).WithTimeout (2 * time .Minute ).WithPolling (1 * time .Second ).Should (BeFalse ())
238+ })
239+
240+ AfterEach (func () {
241+ log .Println ("Recreate cluster" )
242+ // Delete the broken cluster.
243+ fdbCluster .Delete ()
244+ // Recreate the cluster with the same config and options to make sure the next tests can proceed.
245+ fdbCluster = factory .CreateFdbHaCluster (clusterConfig , clusterOptions ... )
246+ })
247+
248+ It ("should recover the coordinators" , func () {
249+ remote := fdbCluster .GetRemote ()
250+ // Pick one operator Pod from the remote's namespace and execute the recover-multi-region-cluster plugin command on it.
251+ operatorPod := factory .RandomPickOnePod (factory .GetOperatorPods (remote .Namespace ()).Items )
252+ log .Println ("operatorPod:" , operatorPod .Name )
253+ stdout , stderr , err := factory .ExecuteCmdOnPod (context .Background (), & operatorPod , "manager" , fmt .Sprintf ("kubectl-fdb -n %s recover-multi-region-cluster --version-check=false --wait=false %s" , remote .Namespace (), remote .Name ()), false )
254+ log .Println ("stdout:" , stdout , "stderr:" , stderr )
255+ Expect (err ).NotTo (HaveOccurred ())
256+
257+ // Ensure the cluster is available again: poll the status seen through the remote every second for up to 2 minutes.
258+ Eventually (func () bool {
259+ return remote .GetStatus ().Client .DatabaseStatus .Available
260+ }).WithTimeout (2 * time .Minute ).WithPolling (1 * time .Second ).Should (BeTrue ())
261+ })
262+ })
263+ })
160264})
0 commit comments