Skip to content

Commit 0603223

Browse files
committed
Expose classification metrics from LowNodeUtilization plugin
Enhance the plugin interface introducing an additional Metrics interface to be easily implemented by plugins that intend to register and expose metrics. Expose two new metrics from the LowNodeUtilization plugin: - low_node_utilization_thresholds - low_node_utilization_classification These metrics allow users to properly monitor plugin behavior, which was previously only visible in descheduler logs. Signed-off-by: Simone Tiraboschi <[email protected]>
1 parent f8c8d9a commit 0603223

File tree

9 files changed

+386
-5
lines changed

9 files changed

+386
-5
lines changed

metrics/metrics.go

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@ limitations under the License.
1717
package metrics
1818

1919
import (
20+
"strings"
2021
"sync"
22+
"unicode"
2123

2224
"k8s.io/component-base/metrics"
2325
"k8s.io/component-base/metrics/legacyregistry"
@@ -29,6 +31,43 @@ const (
2931
DeschedulerSubsystem = "descheduler"
3032
)
3133

34+
// MetricsHandler provides a smart wrapper for metrics that handles export logic
35+
type MetricsHandler interface {
36+
// WithLabelValues is equivalent to metrics.WithLabelValues but respects ShouldExportMetrics
37+
WithLabelValues(lvs ...string) MetricsHandler
38+
// Set sets the gauge value (for GaugeVec metrics)
39+
Set(val float64)
40+
// Inc increments the counter (for CounterVec metrics)
41+
Inc()
42+
// Add adds a value to the counter (for CounterVec metrics)
43+
Add(val float64)
44+
// Observe observes a value for histogram (for HistogramVec metrics)
45+
Observe(val float64)
46+
}
47+
48+
// normalizePluginName converts a plugin name to a normalized form for use in metrics subsystems
49+
// Examples: "LowNodeUtilization" -> "low_node_utilization", "RemovePodsViolatingNodeTaints" -> "remove_pods_violating_node_taints"
50+
func normalizePluginName(pluginName string) string {
51+
var result strings.Builder
52+
for i, r := range pluginName {
53+
if unicode.IsUpper(r) && i > 0 {
54+
result.WriteRune('_')
55+
}
56+
result.WriteRune(unicode.ToLower(r))
57+
}
58+
return result.String()
59+
}
60+
61+
// getPluginSubsystem returns the subsystem name for a plugin, optionally including profile name
62+
func getPluginSubsystem(profileName, pluginName string) string {
63+
normalizedPluginName := normalizePluginName(pluginName)
64+
if profileName == "" {
65+
return DeschedulerSubsystem + "_" + normalizedPluginName
66+
}
67+
normalizedProfileName := normalizePluginName(profileName)
68+
return DeschedulerSubsystem + "_" + normalizedProfileName + "_" + normalizedPluginName
69+
}
70+
3271
var (
3372
PodsEvicted = metrics.NewCounterVec(
3473
&metrics.CounterOpts{
@@ -105,6 +144,139 @@ var (
105144

106145
var registerMetrics sync.Once
107146

147+
type PluginMetricsRegistry struct {
148+
pluginMetricsMap map[string]map[string]interface{} // plugin -> metric name -> metric object
149+
shouldExport bool
150+
mutex sync.RWMutex
151+
}
152+
153+
func NewPluginMetricsRegistry() *PluginMetricsRegistry {
154+
return &PluginMetricsRegistry{
155+
pluginMetricsMap: make(map[string]map[string]interface{}),
156+
shouldExport: true, // Default to true, can be updated later
157+
}
158+
}
159+
160+
func (r *PluginMetricsRegistry) SetShouldExport(shouldExport bool) {
161+
r.mutex.Lock()
162+
defer r.mutex.Unlock()
163+
r.shouldExport = shouldExport
164+
}
165+
166+
func (r *PluginMetricsRegistry) ShouldExport() bool {
167+
r.mutex.RLock()
168+
defer r.mutex.RUnlock()
169+
return r.shouldExport
170+
}
171+
172+
// RegisterMetricsWithNames registers metrics for a specific plugin with name mapping
173+
func (r *PluginMetricsRegistry) RegisterNamedPluginMetrics(pluginName string, namedMetrics map[string]metrics.Registerable) {
174+
r.mutex.Lock()
175+
defer r.mutex.Unlock()
176+
177+
// Initialize the plugin metrics map if needed
178+
if r.pluginMetricsMap[pluginName] == nil {
179+
r.pluginMetricsMap[pluginName] = make(map[string]interface{})
180+
}
181+
182+
for name, metric := range namedMetrics {
183+
// Store in the metrics map
184+
r.pluginMetricsMap[pluginName][name] = metric
185+
186+
// Register the metric
187+
legacyregistry.MustRegister(metric)
188+
}
189+
}
190+
191+
func (r *PluginMetricsRegistry) GetPluginSubsystem(profileName, pluginName string) string {
192+
return getPluginSubsystem(profileName, pluginName)
193+
}
194+
195+
func (r *PluginMetricsRegistry) GetPluginMetric(pluginName, metricName string) interface{} {
196+
r.mutex.RLock()
197+
defer r.mutex.RUnlock()
198+
199+
if pluginMap, exists := r.pluginMetricsMap[pluginName]; exists {
200+
if metric, exists := pluginMap[metricName]; exists {
201+
return metric
202+
}
203+
}
204+
return nil
205+
}
206+
207+
func (r *PluginMetricsRegistry) HandlePluginMetric(pluginName, metricName string) MetricsHandler {
208+
metric := r.GetPluginMetric(pluginName, metricName)
209+
return newMetricsHandler(metric, r.ShouldExport())
210+
}
211+
212+
type metricsHandler struct {
213+
metric interface{} // Could be *metrics.GaugeVec, *metrics.CounterVec, etc.
214+
shouldExport bool
215+
currentLabels []string
216+
}
217+
218+
func newMetricsHandler(metric interface{}, shouldExport bool) *metricsHandler {
219+
return &metricsHandler{
220+
metric: metric,
221+
shouldExport: shouldExport,
222+
}
223+
}
224+
225+
func (h *metricsHandler) WithLabelValues(lvs ...string) MetricsHandler {
226+
if !h.shouldExport || h.metric == nil {
227+
return h // Return no-op handler
228+
}
229+
230+
newHandler := &metricsHandler{
231+
metric: h.metric,
232+
shouldExport: h.shouldExport,
233+
currentLabels: lvs,
234+
}
235+
return newHandler
236+
}
237+
238+
func (h *metricsHandler) Set(val float64) {
239+
if !h.shouldExport || h.metric == nil {
240+
return
241+
}
242+
243+
if gaugeVec, ok := h.metric.(*metrics.GaugeVec); ok {
244+
gaugeVec.WithLabelValues(h.currentLabels...).Set(val)
245+
}
246+
}
247+
248+
func (h *metricsHandler) Inc() {
249+
if !h.shouldExport || h.metric == nil {
250+
return
251+
}
252+
253+
if counterVec, ok := h.metric.(*metrics.CounterVec); ok {
254+
counterVec.WithLabelValues(h.currentLabels...).Inc()
255+
}
256+
}
257+
258+
func (h *metricsHandler) Add(val float64) {
259+
if !h.shouldExport || h.metric == nil {
260+
return
261+
}
262+
263+
if counterVec, ok := h.metric.(*metrics.CounterVec); ok {
264+
counterVec.WithLabelValues(h.currentLabels...).Add(val)
265+
}
266+
}
267+
268+
func (h *metricsHandler) Observe(val float64) {
269+
if !h.shouldExport || h.metric == nil {
270+
return
271+
}
272+
273+
if histogramVec, ok := h.metric.(*metrics.HistogramVec); ok {
274+
histogramVec.WithLabelValues(h.currentLabels...).Observe(val)
275+
}
276+
}
277+
278+
var PluginRegistry = NewPluginMetricsRegistry()
279+
108280
// Register all metrics.
109281
func Register() {
110282
// Register the metrics.

metrics/metrics_test.go

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
/*
2+
Copyright 2021 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package metrics
18+
19+
import (
20+
"testing"
21+
)
22+
23+
func TestNormalizePluginName(t *testing.T) {
24+
tests := []struct {
25+
input string
26+
expected string
27+
}{
28+
{"LowNodeUtilization", "low_node_utilization"},
29+
{"RemovePodsViolatingNodeTaints", "remove_pods_violating_node_taints"},
30+
{"HighNodeUtilization", "high_node_utilization"},
31+
{"RemoveDuplicates", "remove_duplicates"},
32+
{"simple", "simple"},
33+
{"SimplePlugin", "simple_plugin"},
34+
{"ABC", "a_b_c"},
35+
}
36+
37+
for _, test := range tests {
38+
t.Run(test.input, func(t *testing.T) {
39+
result := normalizePluginName(test.input)
40+
if result != test.expected {
41+
t.Errorf("normalizePluginName(%q) = %q, expected %q", test.input, result, test.expected)
42+
}
43+
})
44+
}
45+
}
46+
47+
func TestPluginMetricsRegistry_GetPluginSubsystem(t *testing.T) {
48+
registry := NewPluginMetricsRegistry()
49+
50+
tests := []struct {
51+
name string
52+
profileName string
53+
pluginName string
54+
expected string
55+
}{
56+
{
57+
name: "plugin without profile",
58+
profileName: "",
59+
pluginName: "LowNodeUtilization",
60+
expected: "descheduler_low_node_utilization",
61+
},
62+
{
63+
name: "plugin with profile",
64+
profileName: "ProfileA",
65+
pluginName: "LowNodeUtilization",
66+
expected: "descheduler_profile_a_low_node_utilization",
67+
},
68+
{
69+
name: "plugin with different profile",
70+
profileName: "CustomProfile",
71+
pluginName: "LowNodeUtilization",
72+
expected: "descheduler_custom_profile_low_node_utilization",
73+
},
74+
}
75+
76+
for _, tt := range tests {
77+
t.Run(tt.name, func(t *testing.T) {
78+
result := registry.GetPluginSubsystem(tt.profileName, tt.pluginName)
79+
if result != tt.expected {
80+
t.Errorf("GetPluginSubsystem(%s, %s) = %q, expected %q", tt.profileName, tt.pluginName, result, tt.expected)
81+
}
82+
})
83+
}
84+
}

pkg/descheduler/descheduler.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,7 @@ func (d *descheduler) runProfiles(ctx context.Context, client clientset.Interfac
425425
frameworkprofile.WithGetPodsAssignedToNodeFnc(d.getPodsAssignedToNode),
426426
frameworkprofile.WithMetricsCollector(d.metricsCollector),
427427
frameworkprofile.WithPrometheusClient(d.prometheusClient),
428+
frameworkprofile.WithShouldExportMetrics(!d.rs.DisableMetrics),
428429
)
429430
if err != nil {
430431
klog.ErrorS(err, "unable to create a profile", "profile", profile.Name)

pkg/framework/fake/fake.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ type HandleImpl struct {
2323
PodEvictorImpl *evictions.PodEvictor
2424
MetricsCollectorImpl *metricscollector.MetricsCollector
2525
PrometheusClientImpl promapi.Client
26+
ShouldExportMetricsImpl bool
2627
}
2728

2829
var _ frameworktypes.Handle = &HandleImpl{}

pkg/framework/plugins/example/example.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,3 +184,6 @@ func (d *Example) Deschedule(ctx context.Context, nodes []*v1.Node) *fwtypes.Sta
184184
logger.Info("Example plugin finished descheduling")
185185
return nil
186186
}
187+
188+
// TODO: add an example metric
189+
// document how to register metrics in README.md

pkg/framework/plugins/nodeutilization/lownodeutilization.go

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,13 @@ package nodeutilization
1919
import (
2020
"context"
2121
"fmt"
22+
"strconv"
2223

2324
v1 "k8s.io/api/core/v1"
2425
"k8s.io/apimachinery/pkg/runtime"
2526
"k8s.io/klog/v2"
2627

28+
"sigs.k8s.io/descheduler/metrics"
2729
"sigs.k8s.io/descheduler/pkg/api"
2830
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
2931
nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node"
@@ -35,9 +37,10 @@ import (
3537

3638
const LowNodeUtilizationPluginName = "LowNodeUtilization"
3739

38-
// this lines makes sure that HighNodeUtilization implements the BalancePlugin
39-
// interface.
40+
// this lines makes sure that LowNodeUtilization implements the BalancePlugin
41+
// and Metrics interfaces.
4042
var _ frameworktypes.BalancePlugin = &LowNodeUtilization{}
43+
var _ frameworktypes.Metrics = &LowNodeUtilization{}
4144

4245
// LowNodeUtilization evicts pods from overutilized nodes to underutilized
4346
// nodes. Note that CPU/Memory requests are used to calculate nodes'
@@ -134,6 +137,16 @@ func (l *LowNodeUtilization) Name() string {
134137
return LowNodeUtilizationPluginName
135138
}
136139

140+
// HandleMetric returns a metrics handler for the specified metric name
141+
func (l *LowNodeUtilization) HandleMetric(metricName string) metrics.MetricsHandler {
142+
return metrics.PluginRegistry.HandlePluginMetric(LowNodeUtilizationPluginName, metricName)
143+
}
144+
145+
// RegisterMetrics registers the plugin's metrics
146+
func (l *LowNodeUtilization) RegisterMetrics(profileName string) {
147+
RegisterMetrics(metrics.PluginRegistry, profileName)
148+
}
149+
137150
// Balance holds the main logic of the plugin. It evicts pods from over
138151
// utilized nodes to under utilized nodes. The goal here is to evenly
139152
// distribute pods across nodes.
@@ -179,6 +192,14 @@ func (l *LowNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fra
179192
)
180193
}
181194

195+
for key, rt := range thresholds {
196+
for i, t := range rt {
197+
for resource, value := range t {
198+
l.HandleMetric("thresholds").WithLabelValues(key, strconv.Itoa(i), resource.String()).Set(float64(value))
199+
}
200+
}
201+
}
202+
182203
// classify nodes in under and over utilized. we will later try to move
183204
// pods from the overutilized nodes to the underutilized ones.
184205
nodeGroups := classifier.Classify(
@@ -255,6 +276,10 @@ func (l *LowNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fra
255276
logger.V(1).Info("Criteria for a node above target utilization", l.overCriteria...)
256277
logger.V(1).Info("Number of overutilized nodes", "totalNumber", len(highNodes))
257278

279+
// Export classification metrics
280+
l.HandleMetric("classification").WithLabelValues("underutilized").Set(float64(len(lowNodes)))
281+
l.HandleMetric("classification").WithLabelValues("overutilized").Set(float64(len(highNodes)))
282+
258283
if len(lowNodes) == 0 {
259284
logger.V(1).Info(
260285
"No node is underutilized, nothing to do here, you might tune your thresholds further",

0 commit comments

Comments
 (0)