@@ -17,29 +17,193 @@ limitations under the License.
1717package workapplier
1818
1919import (
20- "time "
20+ "fmt "
2121
22+ "k8s.io/apimachinery/pkg/api/meta"
23+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2224 "k8s.io/klog/v2"
2325
2426 fleetv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1"
2527 "github.com/kubefleet-dev/kubefleet/pkg/metrics"
26- "github.com/kubefleet-dev/kubefleet/pkg/utils"
28+ "github.com/kubefleet-dev/kubefleet/pkg/utils/controller "
2729)
2830
29- func trackWorkApplyLatencyMetric (work * fleetv1beta1.Work ) {
30- // Calculate the work apply latency.
31- lastUpdateTime , ok := work .GetAnnotations ()[utils .LastWorkUpdateTimeAnnotationKey ]
32- if ok {
33- workUpdateTime , parseErr := time .Parse (time .RFC3339 , lastUpdateTime )
34- if parseErr != nil {
35- klog .ErrorS (parseErr , "Failed to parse the last update timestamp on the work" , "work" , klog .KObj (work ))
36- return
37- }
31+ // Note (chenyu1):
32+ // the metrics for the work applier are added the assumptions that:
33+ // a) the applier should provide low-cardinality metrics only to keep the memory footprint low,
34+ // as there might be a high number (up to 10K) of works/manifests to process in a member cluster; and
35+ // b) keep the overhead of metrics collection low, esp. considering that the applier runs in
36+ // parallel (5 concurrent reconciliations by default) and is on the critical path of Fleet workflows.
37+ //
38+ // TO-DO (chenyu1): evaluate if there is any need for high-cardinality options and/or stateful
39+ // metric collection for better observability.
40+
41+ // Some of the label values for the work/manifest processing request counter metric.
42+ const (
43+ workOrManifestStatusSkipped = "Skipped"
44+ workOrManifestStatusUnknown = "Unknown"
45+
46+ manifestDriftOrDiffDetectionStatusFound = "Found"
47+ manifestDriftOrDiffDetectionStatusNotFound = "NotFound"
48+ )
49+
50+ func trackWorkAndManifestProcessingRequestMetrics (work * fleetv1beta1.Work ) {
51+ // Increment the work processing request counter.
52+
53+ var workApplyStatus string
54+ var shouldSkip bool
55+ workAppliedCond := meta .FindStatusCondition (work .Status .Conditions , fleetv1beta1 .WorkConditionTypeApplied )
56+ switch {
57+ case workAppliedCond == nil :
58+ workApplyStatus = workOrManifestStatusSkipped
59+ case workAppliedCond .ObservedGeneration != work .Generation :
60+ // Normally this should never occur; as the method is called right after the status
61+ // is refreshed.
62+ _ = controller .NewUnexpectedBehaviorError (fmt .Errorf ("failed to track work metrics: applied condition is stale" ))
63+ shouldSkip = true
64+ case workAppliedCond .Status == metav1 .ConditionUnknown :
65+ // At this point Fleet does not set the Applied condition to the Unknown status; this
66+ // branch is added just for completeness reasons.
67+ workApplyStatus = workOrManifestStatusUnknown
68+ case len (workAppliedCond .Reason ) == 0 :
69+ // Normally this should never occur; this branch is added just for completeness reasons.
70+ _ = controller .NewUnexpectedBehaviorError (fmt .Errorf ("failed to track work metrics: applied condition has no reason (work=%v)" , klog .KObj (work )))
71+ shouldSkip = true
72+ default :
73+ workApplyStatus = workAppliedCond .Reason
74+ }
3875
39- latency := time .Since (workUpdateTime )
40- metrics .WorkApplyTime .WithLabelValues (work .GetName ()).Observe (latency .Seconds ())
41- klog .V (2 ).InfoS ("Work has been applied" , "work" , klog .KObj (work ), "latency" , latency .Milliseconds ())
76+ var workAvailabilityStatus string
77+ workAvailableCond := meta .FindStatusCondition (work .Status .Conditions , fleetv1beta1 .WorkConditionTypeAvailable )
78+ switch {
79+ case workAvailableCond == nil :
80+ workAvailabilityStatus = workOrManifestStatusSkipped
81+ case workAvailableCond .ObservedGeneration != work .Generation :
82+ // Normally this should never occur; as the method is called right after the status
83+ // is refreshed.
84+ _ = controller .NewUnexpectedBehaviorError (fmt .Errorf ("failed to track work metrics: available condition is stale" ))
85+ shouldSkip = true
86+ case workAvailableCond .Status == metav1 .ConditionUnknown :
87+ // At this point Fleet does not set the Available condition to the Unknown status; this
88+ // branch is added just for completeness reasons.
89+ workAvailabilityStatus = workOrManifestStatusUnknown
90+ case len (workAvailableCond .Reason ) == 0 :
91+ // Normally this should never occur; this branch is added just for completeness reasons.
92+ _ = controller .NewUnexpectedBehaviorError (fmt .Errorf ("failed to track work metrics: available condition has no reason (work=%v)" , klog .KObj (work )))
93+ default :
94+ workAvailabilityStatus = workAvailableCond .Reason
4295 }
4396
44- klog .V (2 ).InfoS ("No last update timestamp found on the Work object" , "work" , klog .KObj (work ))
97+ var workDiffReportedStatus string
98+ workDiffReportedCond := meta .FindStatusCondition (work .Status .Conditions , fleetv1beta1 .WorkConditionTypeDiffReported )
99+ switch {
100+ case workDiffReportedCond == nil :
101+ workDiffReportedStatus = workOrManifestStatusSkipped
102+ case workDiffReportedCond .ObservedGeneration != work .Generation :
103+ // Normally this should never occur; as the method is called right after the status
104+ // is refreshed.
105+ _ = controller .NewUnexpectedBehaviorError (fmt .Errorf ("failed to track work metrics: diff reported condition is stale" ))
106+ shouldSkip = true
107+ case workDiffReportedCond .Status == metav1 .ConditionUnknown :
108+ // At this point Fleet does not set the DiffReported condition to the Unknown status; this
109+ // branch is added just for completeness reasons.
110+ workDiffReportedStatus = workOrManifestStatusUnknown
111+ case len (workDiffReportedCond .Reason ) == 0 :
112+ // Normally this should never occur; this branch is added just for completeness reasons.
113+ _ = controller .NewUnexpectedBehaviorError (fmt .Errorf ("failed to track work metrics: diff reported condition has no reason (work=%v)" , klog .KObj (work )))
114+ default :
115+ workDiffReportedStatus = workDiffReportedCond .Reason
116+ }
117+
118+ // Do a sanity check; if any of the work conditions is stale; do not emit any metric data point.
119+ if shouldSkip {
120+ // An error has been logged earlier.
121+ return
122+ }
123+
124+ metrics .FleetWorkProcessingRequestsTotal .WithLabelValues (
125+ workApplyStatus ,
126+ workAvailabilityStatus ,
127+ workDiffReportedStatus ,
128+ ).Inc ()
129+
130+ // Increment the manifest processing request counter.
131+ for idx := range work .Status .ManifestConditions {
132+ manifestCond := & work .Status .ManifestConditions [idx ]
133+
134+ // No generation checks are added here, as the observed generation reported
135+ // on manifest conditions is the generation of the corresponding object in the
136+ // member cluster, not the generation of the work object. Plus, as this
137+ // method is called right after the status is refreshed, all conditions are
138+ // expected to be up-to-date.
139+
140+ var manifestApplyStatus string
141+ manifestAppliedCond := meta .FindStatusCondition (manifestCond .Conditions , fleetv1beta1 .WorkConditionTypeApplied )
142+ switch {
143+ case manifestAppliedCond == nil :
144+ manifestApplyStatus = workOrManifestStatusSkipped
145+ case manifestAppliedCond .Status == metav1 .ConditionUnknown :
146+ // At this point Fleet does not set the Applied condition to the Unknown status; this
147+ // branch is added just for completeness reasons.
148+ manifestApplyStatus = workOrManifestStatusUnknown
149+ case len (manifestAppliedCond .Reason ) == 0 :
150+ // Normally this should never occur; this branch is added just for completeness reasons.
151+ _ = controller .NewUnexpectedBehaviorError (fmt .Errorf ("failed to track manifest metrics: applied condition has no reason (manifest=%v, work=%v)" , manifestCond .Identifier , klog .KObj (work )))
152+ continue
153+ default :
154+ manifestApplyStatus = manifestAppliedCond .Reason
155+ }
156+
157+ var manifestAvailabilityStatus string
158+ manifestAvailableCond := meta .FindStatusCondition (manifestCond .Conditions , fleetv1beta1 .WorkConditionTypeAvailable )
159+ switch {
160+ case manifestAvailableCond == nil :
161+ manifestAvailabilityStatus = workOrManifestStatusSkipped
162+ case manifestAvailableCond .Status == metav1 .ConditionUnknown :
163+ // At this point Fleet does not set the Available condition to the Unknown status; this
164+ // branch is added just for completeness reasons.
165+ manifestAvailabilityStatus = workOrManifestStatusUnknown
166+ case len (manifestAvailableCond .Reason ) == 0 :
167+ // Normally this should never occur; this branch is added just for completeness reasons.
168+ _ = controller .NewUnexpectedBehaviorError (fmt .Errorf ("failed to track manifest metrics: available condition has no reason (manifest=%v, work=%v)" , manifestCond .Identifier , klog .KObj (work )))
169+ continue
170+ default :
171+ manifestAvailabilityStatus = manifestAvailableCond .Reason
172+ }
173+
174+ var manifestDiffReportedStatus string
175+ manifestDiffReportedCond := meta .FindStatusCondition (manifestCond .Conditions , fleetv1beta1 .WorkConditionTypeDiffReported )
176+ switch {
177+ case manifestDiffReportedCond == nil :
178+ manifestDiffReportedStatus = workOrManifestStatusSkipped
179+ case manifestDiffReportedCond .Status == metav1 .ConditionUnknown :
180+ // At this point Fleet does not set the DiffReported condition to the Unknown status; this
181+ // branch is added just for completeness reasons.
182+ manifestDiffReportedStatus = workOrManifestStatusUnknown
183+ case len (manifestDiffReportedCond .Reason ) == 0 :
184+ // Normally this should never occur; this branch is added just for completeness reasons.
185+ _ = controller .NewUnexpectedBehaviorError (fmt .Errorf ("failed to track manifest metrics: diff reported condition has no reason (manifest=%v, work=%v)" , manifestCond .Identifier , klog .KObj (work )))
186+ continue
187+ default :
188+ manifestDiffReportedStatus = manifestDiffReportedCond .Reason
189+ }
190+
191+ manifestDriftDetectionStatus := manifestDriftOrDiffDetectionStatusNotFound
192+ if manifestCond .DriftDetails != nil {
193+ manifestDriftDetectionStatus = manifestDriftOrDiffDetectionStatusFound
194+ }
195+
196+ manifestDiffDetectionStatus := manifestDriftOrDiffDetectionStatusNotFound
197+ if manifestCond .DiffDetails != nil {
198+ manifestDiffDetectionStatus = manifestDriftOrDiffDetectionStatusFound
199+ }
200+
201+ metrics .FleetManifestProcessingRequestsTotal .WithLabelValues (
202+ manifestApplyStatus ,
203+ manifestAvailabilityStatus ,
204+ manifestDiffReportedStatus ,
205+ manifestDriftDetectionStatus ,
206+ manifestDiffDetectionStatus ,
207+ ).Inc ()
208+ }
45209}
0 commit comments