Skip to content

Commit afb85c4

Browse files
authored
feat: add new work applier metrics (#52)
* Added new work applier metrics Signed-off-by: michaelawyu <[email protected]> * Minor fixes Signed-off-by: michaelawyu <[email protected]> * Minor fixes Signed-off-by: michaelawyu <[email protected]> * Minor fixes Signed-off-by: michaelawyu <[email protected]> * Minor fixes Signed-off-by: michaelawyu <[email protected]> --------- Signed-off-by: michaelawyu <[email protected]>
1 parent 048cfc6 commit afb85c4

File tree

6 files changed

+784
-22
lines changed

6 files changed

+784
-22
lines changed

cmd/memberagent/main.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,12 @@ func init() {
103103
utilruntime.Must(placementv1beta1.AddToScheme(scheme))
104104
//+kubebuilder:scaffold:scheme
105105

106-
metrics.Registry.MustRegister(fleetmetrics.JoinResultMetrics, fleetmetrics.LeaveResultMetrics, fleetmetrics.WorkApplyTime)
106+
metrics.Registry.MustRegister(
107+
fleetmetrics.JoinResultMetrics,
108+
fleetmetrics.LeaveResultMetrics,
109+
fleetmetrics.FleetWorkProcessingRequestsTotal,
110+
fleetmetrics.FleetManifestProcessingRequestsTotal,
111+
)
107112
}
108113

109114
func main() {

pkg/controllers/workapplier/controller.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -463,8 +463,6 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
463463
// Track the availability information.
464464
r.trackInMemberClusterObjAvailability(ctx, bundles, workRef)
465465

466-
trackWorkApplyLatencyMetric(work)
467-
468466
// Refresh the status of the Work object.
469467
if err := r.refreshWorkStatus(ctx, work, bundles); err != nil {
470468
return ctrl.Result{}, err
@@ -475,6 +473,8 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
475473
return ctrl.Result{}, err
476474
}
477475

476+
trackWorkAndManifestProcessingRequestMetrics(work)
477+
478478
// If the Work object is not yet available, reconcile again.
479479
if !isWorkObjectAvailable(work) {
480480
klog.V(2).InfoS("Work object is not yet in an available state; requeue to monitor its availability", "work", workRef)

pkg/controllers/workapplier/controller_test.go

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,16 @@ import (
2626
"github.com/google/go-cmp/cmp/cmpopts"
2727
appsv1 "k8s.io/api/apps/v1"
2828
corev1 "k8s.io/api/core/v1"
29-
"k8s.io/utils/ptr"
30-
31-
fleetv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1"
32-
3329
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3430
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
3531
"k8s.io/apimachinery/pkg/runtime"
3632
"k8s.io/apimachinery/pkg/runtime/schema"
3733
"k8s.io/client-go/kubernetes/scheme"
34+
"k8s.io/utils/ptr"
35+
ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
36+
37+
fleetv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1"
38+
"github.com/kubefleet-dev/kubefleet/pkg/metrics"
3839
)
3940

4041
const (
@@ -188,6 +189,12 @@ func TestMain(m *testing.M) {
188189
// Initialize the variables.
189190
initializeVariables()
190191

192+
// Register the metrics.
193+
ctrlmetrics.Registry.MustRegister(
194+
metrics.FleetWorkProcessingRequestsTotal,
195+
metrics.FleetManifestProcessingRequestsTotal,
196+
)
197+
191198
os.Exit(m.Run())
192199
}
193200

pkg/controllers/workapplier/metrics.go

Lines changed: 179 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -17,29 +17,193 @@ limitations under the License.
1717
package workapplier
1818

1919
import (
20-
"time"
20+
"fmt"
2121

22+
"k8s.io/apimachinery/pkg/api/meta"
23+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2224
"k8s.io/klog/v2"
2325

2426
fleetv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1"
2527
"github.com/kubefleet-dev/kubefleet/pkg/metrics"
26-
"github.com/kubefleet-dev/kubefleet/pkg/utils"
28+
"github.com/kubefleet-dev/kubefleet/pkg/utils/controller"
2729
)
2830

29-
func trackWorkApplyLatencyMetric(work *fleetv1beta1.Work) {
30-
// Calculate the work apply latency.
31-
lastUpdateTime, ok := work.GetAnnotations()[utils.LastWorkUpdateTimeAnnotationKey]
32-
if ok {
33-
workUpdateTime, parseErr := time.Parse(time.RFC3339, lastUpdateTime)
34-
if parseErr != nil {
35-
klog.ErrorS(parseErr, "Failed to parse the last update timestamp on the work", "work", klog.KObj(work))
36-
return
37-
}
31+
// Note (chenyu1):
32+
// the metrics for the work applier are added the assumptions that:
33+
// a) the applier should provide low-cardinality metrics only to keep the memory footprint low,
34+
// as there might be a high number (up to 10K) of works/manifests to process in a member cluster; and
35+
// b) keep the overhead of metrics collection low, esp. considering that the applier runs in
36+
// parallel (5 concurrent reconciliations by default) and is on the critical path of Fleet workflows.
37+
//
38+
// TO-DO (chenyu1): evaluate if there is any need for high-cardinality options and/or stateful
39+
// metric collection for better observability.
40+
41+
// Some of the label values for the work/manifest processing request counter metric.
42+
const (
43+
workOrManifestStatusSkipped = "Skipped"
44+
workOrManifestStatusUnknown = "Unknown"
45+
46+
manifestDriftOrDiffDetectionStatusFound = "Found"
47+
manifestDriftOrDiffDetectionStatusNotFound = "NotFound"
48+
)
49+
50+
func trackWorkAndManifestProcessingRequestMetrics(work *fleetv1beta1.Work) {
51+
// Increment the work processing request counter.
52+
53+
var workApplyStatus string
54+
var shouldSkip bool
55+
workAppliedCond := meta.FindStatusCondition(work.Status.Conditions, fleetv1beta1.WorkConditionTypeApplied)
56+
switch {
57+
case workAppliedCond == nil:
58+
workApplyStatus = workOrManifestStatusSkipped
59+
case workAppliedCond.ObservedGeneration != work.Generation:
60+
// Normally this should never occur; as the method is called right after the status
61+
// is refreshed.
62+
_ = controller.NewUnexpectedBehaviorError(fmt.Errorf("failed to track work metrics: applied condition is stale"))
63+
shouldSkip = true
64+
case workAppliedCond.Status == metav1.ConditionUnknown:
65+
// At this point Fleet does not set the Applied condition to the Unknown status; this
66+
// branch is added just for completeness reasons.
67+
workApplyStatus = workOrManifestStatusUnknown
68+
case len(workAppliedCond.Reason) == 0:
69+
// Normally this should never occur; this branch is added just for completeness reasons.
70+
_ = controller.NewUnexpectedBehaviorError(fmt.Errorf("failed to track work metrics: applied condition has no reason (work=%v)", klog.KObj(work)))
71+
shouldSkip = true
72+
default:
73+
workApplyStatus = workAppliedCond.Reason
74+
}
3875

39-
latency := time.Since(workUpdateTime)
40-
metrics.WorkApplyTime.WithLabelValues(work.GetName()).Observe(latency.Seconds())
41-
klog.V(2).InfoS("Work has been applied", "work", klog.KObj(work), "latency", latency.Milliseconds())
76+
var workAvailabilityStatus string
77+
workAvailableCond := meta.FindStatusCondition(work.Status.Conditions, fleetv1beta1.WorkConditionTypeAvailable)
78+
switch {
79+
case workAvailableCond == nil:
80+
workAvailabilityStatus = workOrManifestStatusSkipped
81+
case workAvailableCond.ObservedGeneration != work.Generation:
82+
// Normally this should never occur; as the method is called right after the status
83+
// is refreshed.
84+
_ = controller.NewUnexpectedBehaviorError(fmt.Errorf("failed to track work metrics: available condition is stale"))
85+
shouldSkip = true
86+
case workAvailableCond.Status == metav1.ConditionUnknown:
87+
// At this point Fleet does not set the Available condition to the Unknown status; this
88+
// branch is added just for completeness reasons.
89+
workAvailabilityStatus = workOrManifestStatusUnknown
90+
case len(workAvailableCond.Reason) == 0:
91+
// Normally this should never occur; this branch is added just for completeness reasons.
92+
_ = controller.NewUnexpectedBehaviorError(fmt.Errorf("failed to track work metrics: available condition has no reason (work=%v)", klog.KObj(work)))
93+
default:
94+
workAvailabilityStatus = workAvailableCond.Reason
4295
}
4396

44-
klog.V(2).InfoS("No last update timestamp found on the Work object", "work", klog.KObj(work))
97+
var workDiffReportedStatus string
98+
workDiffReportedCond := meta.FindStatusCondition(work.Status.Conditions, fleetv1beta1.WorkConditionTypeDiffReported)
99+
switch {
100+
case workDiffReportedCond == nil:
101+
workDiffReportedStatus = workOrManifestStatusSkipped
102+
case workDiffReportedCond.ObservedGeneration != work.Generation:
103+
// Normally this should never occur; as the method is called right after the status
104+
// is refreshed.
105+
_ = controller.NewUnexpectedBehaviorError(fmt.Errorf("failed to track work metrics: diff reported condition is stale"))
106+
shouldSkip = true
107+
case workDiffReportedCond.Status == metav1.ConditionUnknown:
108+
// At this point Fleet does not set the DiffReported condition to the Unknown status; this
109+
// branch is added just for completeness reasons.
110+
workDiffReportedStatus = workOrManifestStatusUnknown
111+
case len(workDiffReportedCond.Reason) == 0:
112+
// Normally this should never occur; this branch is added just for completeness reasons.
113+
_ = controller.NewUnexpectedBehaviorError(fmt.Errorf("failed to track work metrics: diff reported condition has no reason (work=%v)", klog.KObj(work)))
114+
default:
115+
workDiffReportedStatus = workDiffReportedCond.Reason
116+
}
117+
118+
// Do a sanity check; if any of the work conditions is stale; do not emit any metric data point.
119+
if shouldSkip {
120+
// An error has been logged earlier.
121+
return
122+
}
123+
124+
metrics.FleetWorkProcessingRequestsTotal.WithLabelValues(
125+
workApplyStatus,
126+
workAvailabilityStatus,
127+
workDiffReportedStatus,
128+
).Inc()
129+
130+
// Increment the manifest processing request counter.
131+
for idx := range work.Status.ManifestConditions {
132+
manifestCond := &work.Status.ManifestConditions[idx]
133+
134+
// No generation checks are added here, as the observed generation reported
135+
// on manifest conditions is the generation of the corresponding object in the
136+
// member cluster, not the generation of the work object. Plus, as this
137+
// method is called right after the status is refreshed, all conditions are
138+
// expected to be up-to-date.
139+
140+
var manifestApplyStatus string
141+
manifestAppliedCond := meta.FindStatusCondition(manifestCond.Conditions, fleetv1beta1.WorkConditionTypeApplied)
142+
switch {
143+
case manifestAppliedCond == nil:
144+
manifestApplyStatus = workOrManifestStatusSkipped
145+
case manifestAppliedCond.Status == metav1.ConditionUnknown:
146+
// At this point Fleet does not set the Applied condition to the Unknown status; this
147+
// branch is added just for completeness reasons.
148+
manifestApplyStatus = workOrManifestStatusUnknown
149+
case len(manifestAppliedCond.Reason) == 0:
150+
// Normally this should never occur; this branch is added just for completeness reasons.
151+
_ = controller.NewUnexpectedBehaviorError(fmt.Errorf("failed to track manifest metrics: applied condition has no reason (manifest=%v, work=%v)", manifestCond.Identifier, klog.KObj(work)))
152+
continue
153+
default:
154+
manifestApplyStatus = manifestAppliedCond.Reason
155+
}
156+
157+
var manifestAvailabilityStatus string
158+
manifestAvailableCond := meta.FindStatusCondition(manifestCond.Conditions, fleetv1beta1.WorkConditionTypeAvailable)
159+
switch {
160+
case manifestAvailableCond == nil:
161+
manifestAvailabilityStatus = workOrManifestStatusSkipped
162+
case manifestAvailableCond.Status == metav1.ConditionUnknown:
163+
// At this point Fleet does not set the Available condition to the Unknown status; this
164+
// branch is added just for completeness reasons.
165+
manifestAvailabilityStatus = workOrManifestStatusUnknown
166+
case len(manifestAvailableCond.Reason) == 0:
167+
// Normally this should never occur; this branch is added just for completeness reasons.
168+
_ = controller.NewUnexpectedBehaviorError(fmt.Errorf("failed to track manifest metrics: available condition has no reason (manifest=%v, work=%v)", manifestCond.Identifier, klog.KObj(work)))
169+
continue
170+
default:
171+
manifestAvailabilityStatus = manifestAvailableCond.Reason
172+
}
173+
174+
var manifestDiffReportedStatus string
175+
manifestDiffReportedCond := meta.FindStatusCondition(manifestCond.Conditions, fleetv1beta1.WorkConditionTypeDiffReported)
176+
switch {
177+
case manifestDiffReportedCond == nil:
178+
manifestDiffReportedStatus = workOrManifestStatusSkipped
179+
case manifestDiffReportedCond.Status == metav1.ConditionUnknown:
180+
// At this point Fleet does not set the DiffReported condition to the Unknown status; this
181+
// branch is added just for completeness reasons.
182+
manifestDiffReportedStatus = workOrManifestStatusUnknown
183+
case len(manifestDiffReportedCond.Reason) == 0:
184+
// Normally this should never occur; this branch is added just for completeness reasons.
185+
_ = controller.NewUnexpectedBehaviorError(fmt.Errorf("failed to track manifest metrics: diff reported condition has no reason (manifest=%v, work=%v)", manifestCond.Identifier, klog.KObj(work)))
186+
continue
187+
default:
188+
manifestDiffReportedStatus = manifestDiffReportedCond.Reason
189+
}
190+
191+
manifestDriftDetectionStatus := manifestDriftOrDiffDetectionStatusNotFound
192+
if manifestCond.DriftDetails != nil {
193+
manifestDriftDetectionStatus = manifestDriftOrDiffDetectionStatusFound
194+
}
195+
196+
manifestDiffDetectionStatus := manifestDriftOrDiffDetectionStatusNotFound
197+
if manifestCond.DiffDetails != nil {
198+
manifestDiffDetectionStatus = manifestDriftOrDiffDetectionStatusFound
199+
}
200+
201+
metrics.FleetManifestProcessingRequestsTotal.WithLabelValues(
202+
manifestApplyStatus,
203+
manifestAvailabilityStatus,
204+
manifestDiffReportedStatus,
205+
manifestDriftDetectionStatus,
206+
manifestDiffDetectionStatus,
207+
).Inc()
208+
}
45209
}

0 commit comments

Comments
 (0)