@@ -68,16 +68,20 @@ type FoundationDBClusterList struct {
6868 Items []FoundationDBCluster `json:"items"`
6969}
7070
71- var conditionsThatNeedReplacement = []ProcessGroupConditionType {
72- MissingProcesses ,
73- PodFailing ,
74- MissingPod ,
75- MissingPVC ,
76- MissingService ,
77- PodPending ,
78- NodeTaintReplacing ,
79- ProcessIsMarkedAsExcluded ,
80- ProcessHasIOError ,
71+ // defaultConditionsThatNeedReplacement is the default set of conditions that make a process group a candidate for replacement.
72+ // TODO (johscheuer): Expose this as a setting in the FoundationDBCluster automation options, so users can define which conditions trigger the replacement logic.
73+ var defaultConditionsThatNeedReplacement = map [ProcessGroupConditionType ]None {
74+ MissingProcesses : {},
75+ PodFailing : {},
76+ MissingPod : {},
77+ MissingPVC : {},
78+ MissingService : {},
79+ PodPending : {},
80+ NodeTaintReplacing : {},
81+ ProcessIsMarkedAsExcluded : {},
82+ ProcessHasIOError : {},
83+ SidecarUnreachable : {},
84+ ProcessHasHighRunLoopBusy : {},
8185}
8286
8387const (
@@ -547,12 +551,13 @@ func (processGroupStatus *ProcessGroupStatus) GetPvcName(cluster *FoundationDBCl
547551 return fmt .Sprintf ("%s-data" , processGroupStatus .GetPodName (cluster ))
548552}
549553
550- // NeedsReplacement checks if the ProcessGroupStatus has conditions that require a replacement of the failed Process Group.
554+ // NeedsReplacementWithConditions checks if the ProcessGroupStatus has conditions that require a replacement of the failed Process Group.
551555// The method will return the failure condition and the timestamp. If no failure is detected an empty condition and a 0
552- // will be returned.
553- func (processGroupStatus * ProcessGroupStatus ) NeedsReplacement (
556+ // will be returned. The conditions that should trigger a replacement can be passed to this method.
557+ func (processGroupStatus * ProcessGroupStatus ) NeedsReplacementWithConditions (
554558 failureTime int ,
555559 taintReplacementTime int ,
560+ conditionsThatNeedReplacement map [ProcessGroupConditionType ]None ,
556561) (ProcessGroupConditionType , int64 ) {
557562 var earliestFailureTime int64 = math .MaxInt64
558563 var earliestTaintReplacementTime int64 = math .MaxInt64
@@ -563,30 +568,38 @@ func (processGroupStatus *ProcessGroupStatus) NeedsReplacement(
563568 }
564569
565570 var failureCondition ProcessGroupConditionType
566- for _ , conditionType := range conditionsThatNeedReplacement {
567- conditionTimePtr := processGroupStatus .GetConditionTime (conditionType )
568- if conditionTimePtr == nil {
571+
572+ // Iterate over all the conditions that the process group has. Under normal circumstances the process group
573+ // should have no or a minimal set of conditions. If any of the conditions is part of the conditionsThatNeedReplacement
574+ // check how long the condition is present and check if the process group should be replaced.
575+ var hasConditionThatRequiresReplacement bool
576+ for _ , condition := range processGroupStatus .ProcessGroupConditions {
577+ _ , ok := conditionsThatNeedReplacement [condition .ProcessGroupConditionType ]
578+ if ! ok {
569579 continue
570580 }
571581
572- conditionTime := * conditionTimePtr
573- if conditionType == NodeTaintReplacing {
574- if earliestTaintReplacementTime > conditionTime {
575- earliestTaintReplacementTime = conditionTime
582+ hasConditionThatRequiresReplacement = true
583+ if condition . ProcessGroupConditionType == NodeTaintReplacing {
584+ if earliestTaintReplacementTime > condition . Timestamp {
585+ earliestTaintReplacementTime = condition . Timestamp
576586 }
577587
578- failureCondition = conditionType
588+ failureCondition = condition . ProcessGroupConditionType
579589 continue
580590 }
581591
582- if earliestFailureTime > conditionTime {
583- earliestFailureTime = conditionTime
584- failureCondition = conditionType
592+ if earliestFailureTime > condition . Timestamp {
593+ earliestFailureTime = condition . Timestamp
594+ failureCondition = condition . ProcessGroupConditionType
585595 }
586596 }
587597
588- failureWindowStart := time .Now ().Add (- 1 * time .Duration (failureTime ) * time .Second ).Unix ()
589- if earliestFailureTime < failureWindowStart {
598+ if ! hasConditionThatRequiresReplacement {
599+ return "" , 0
600+ }
601+
602+ if earliestFailureTime < time .Now ().Add (- 1 * time .Duration (failureTime )* time .Second ).Unix () {
590603 return failureCondition , earliestFailureTime
591604 }
592605
@@ -601,6 +614,21 @@ func (processGroupStatus *ProcessGroupStatus) NeedsReplacement(
601614 return "" , 0
602615}
603616
617+ // NeedsReplacement checks if the ProcessGroupStatus has conditions that require a replacement of the failed Process Group,
618+ // using defaultConditionsThatNeedReplacement. It returns the failure condition and the timestamp, or an empty condition and a 0 if no failure is detected.
619+ //
620+ // Deprecated: Use NeedsReplacementWithConditions.
621+ func (processGroupStatus * ProcessGroupStatus ) NeedsReplacement (
622+ failureTime int ,
623+ taintReplacementTime int ,
624+ ) (ProcessGroupConditionType , int64 ) {
625+ return processGroupStatus .NeedsReplacementWithConditions (
626+ failureTime ,
627+ taintReplacementTime ,
628+ defaultConditionsThatNeedReplacement ,
629+ )
630+ }
631+
604632// AddAddresses adds the new address to the ProcessGroupStatus and removes duplicates and old addresses
605633// if the process group is not marked as removal.
606634func (processGroupStatus * ProcessGroupStatus ) AddAddresses (
@@ -1071,6 +1099,9 @@ const (
10711099 // This condition can occur during the migration of the image type, the change of the image configuration
10721100 // for the sidecar or during version incompatible upgrades until the sidecar is updated to the new desired version.
10731101 IncorrectSidecarImage ProcessGroupConditionType = "IncorrectSidecarImage"
1102+ // ProcessHasHighRunLoopBusy represents a process group that has a high run loop busy value. A high run loop busy
1103+ // value can be caused by infrastructure issues or by overloaded processes.
1104+ ProcessHasHighRunLoopBusy ProcessGroupConditionType = "ProcessHasHighRunLoopBusy"
10741105)
10751106
10761107// AllProcessGroupConditionTypes returns all ProcessGroupConditionType
@@ -1093,6 +1124,7 @@ func AllProcessGroupConditionTypes() []ProcessGroupConditionType {
10931124 ProcessIsMarkedAsExcluded ,
10941125 ProcessHasIOError ,
10951126 IncorrectSidecarImage ,
1127+ ProcessHasHighRunLoopBusy ,
10961128 }
10971129}
10981130
@@ -1137,6 +1169,8 @@ func GetProcessGroupConditionType(
11371169 return ProcessHasIOError , nil
11381170 case "IncorrectSidecarImage" :
11391171 return IncorrectSidecarImage , nil
1172+ case "ProcessHasHighRunLoopBusy" :
1173+ return ProcessHasHighRunLoopBusy , nil
11401174 }
11411175
11421176 return "" , fmt .Errorf ("unknown process group condition type: %s" , processGroupConditionType )
@@ -1759,7 +1793,14 @@ func (cluster *FoundationDBCluster) CheckReconciliation(log logr.Logger) (bool,
17591793 0 ,
17601794 len (processGroup .ProcessGroupConditions ),
17611795 )
1796+
17621797 for _ , condition := range processGroup .ProcessGroupConditions {
1798+ // The ProcessHasHighRunLoopBusy is currently only informational and shouldn't block the reconciliation, so skip it instead of adding it to the reported conditions.
1799+ if condition .ProcessGroupConditionType == ProcessHasHighRunLoopBusy {
1800+ logger .V (1 ).Info ("Detected process with high run loop busy value" , "processGroupID" , processGroup .ProcessGroupID )
1801+ continue
1802+ }
1803+
17631804 // If there is at least one process with an incorrect command line, that means the operator has to restart
17641805 // processes.
17651806 if condition .ProcessGroupConditionType == IncorrectCommandLine &&
@@ -1780,18 +1821,20 @@ func (cluster *FoundationDBCluster) CheckReconciliation(log logr.Logger) (bool,
17801821 conditions = append (conditions , condition .ProcessGroupConditionType )
17811822 }
17821823
1783- logger .Info (
1784- "Has unhealthy process group" ,
1785- "processGroupID" ,
1786- processGroup .ProcessGroupID ,
1787- "state" ,
1788- "HasUnhealthyProcess" ,
1789- "conditions" ,
1790- conditions ,
1791- )
1792- cluster .Status .Generations .HasUnhealthyProcess = cluster .Generation
1793- reconciled = false
1794- continue
1824+ if len (conditions ) > 0 {
1825+ logger .Info (
1826+ "Has unhealthy process group" ,
1827+ "processGroupID" ,
1828+ processGroup .ProcessGroupID ,
1829+ "state" ,
1830+ "HasUnhealthyProcess" ,
1831+ "conditions" ,
1832+ conditions ,
1833+ )
1834+ cluster .Status .Generations .HasUnhealthyProcess = cluster .Generation
1835+ reconciled = false
1836+ continue
1837+ }
17951838 }
17961839
17971840 cluster .Status .ReconciledProcessGroups ++
@@ -3609,3 +3652,8 @@ func (cluster *FoundationDBCluster) GetDatabaseInteractionMode() DatabaseInterac
36093652
36103653 return * cluster .Spec .AutomationOptions .DatabaseInteractionMode
36113654}
3655+
3656+ // GetConditionsThatNeedReplacement returns the conditions that should trigger a replacement. The package-level default map is returned uncopied, so treat it as read-only.
3657+ func (cluster * FoundationDBCluster ) GetConditionsThatNeedReplacement () map [ProcessGroupConditionType ]None {
3658+ return defaultConditionsThatNeedReplacement
3659+ }
0 commit comments