Skip to content

Commit 40f17fe

Browse files
committed
Expose classification metrics from LowNodeUtilization plugin
Enhance the plugin interface introducing an additional Metrics interface to be easily implemented by plugins that intend to register and expose metrics. Expose two new metrics from the LowNodeUtilization plugin: - low_node_utilization_thresholds - low_node_utilization_classification These metrics allow users to properly monitor plugin behavior, which was previously only visible in descheduler logs. Signed-off-by: Simone Tiraboschi <[email protected]>
1 parent f8c8d9a commit 40f17fe

File tree

8 files changed

+371
-5
lines changed

8 files changed

+371
-5
lines changed

metrics/metrics.go

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@ limitations under the License.
1717
package metrics
1818

1919
import (
20+
"strings"
2021
"sync"
22+
"unicode"
2123

2224
"k8s.io/component-base/metrics"
2325
"k8s.io/component-base/metrics/legacyregistry"
@@ -29,6 +31,39 @@ const (
2931
DeschedulerSubsystem = "descheduler"
3032
)
3133

34+
// MetricsHandler provides a smart wrapper for metrics that handles export logic
35+
type MetricsHandler interface {
36+
// WithLabelValues is equivalent to metrics.WithLabelValues but respects ShouldExportMetrics
37+
WithLabelValues(lvs ...string) MetricsHandler
38+
// Set sets the gauge value (for GaugeVec metrics)
39+
Set(val float64)
40+
// Inc increments the counter (for CounterVec metrics)
41+
Inc()
42+
// Add adds a value to the counter (for CounterVec metrics)
43+
Add(val float64)
44+
// Observe observes a value for histogram (for HistogramVec metrics)
45+
Observe(val float64)
46+
}
47+
48+
// normalizePluginName converts a plugin name to a normalized form for use in metrics subsystems
49+
// Examples: "LowNodeUtilization" -> "low_node_utilization", "RemovePodsViolatingNodeTaints" -> "remove_pods_violating_node_taints"
50+
func normalizePluginName(pluginName string) string {
51+
var result strings.Builder
52+
for i, r := range pluginName {
53+
if unicode.IsUpper(r) && i > 0 {
54+
result.WriteRune('_')
55+
}
56+
result.WriteRune(unicode.ToLower(r))
57+
}
58+
return result.String()
59+
}
60+
61+
// getPluginSubsystem returns the subsystem name for a plugin
62+
func getPluginSubsystem(pluginName string) string {
63+
normalizedName := normalizePluginName(pluginName)
64+
return DeschedulerSubsystem + "_" + normalizedName
65+
}
66+
3267
var (
3368
PodsEvicted = metrics.NewCounterVec(
3469
&metrics.CounterOpts{
@@ -105,6 +140,139 @@ var (
105140

106141
var registerMetrics sync.Once
107142

143+
type PluginMetricsRegistry struct {
144+
pluginMetricsMap map[string]map[string]interface{} // plugin -> metric name -> metric object
145+
shouldExport bool
146+
mutex sync.RWMutex
147+
}
148+
149+
func NewPluginMetricsRegistry() *PluginMetricsRegistry {
150+
return &PluginMetricsRegistry{
151+
pluginMetricsMap: make(map[string]map[string]interface{}),
152+
shouldExport: true, // Default to true, can be updated later
153+
}
154+
}
155+
156+
func (r *PluginMetricsRegistry) SetShouldExport(shouldExport bool) {
157+
r.mutex.Lock()
158+
defer r.mutex.Unlock()
159+
r.shouldExport = shouldExport
160+
}
161+
162+
func (r *PluginMetricsRegistry) ShouldExport() bool {
163+
r.mutex.RLock()
164+
defer r.mutex.RUnlock()
165+
return r.shouldExport
166+
}
167+
168+
// RegisterMetricsWithNames registers metrics for a specific plugin with name mapping
169+
func (r *PluginMetricsRegistry) RegisterNamedPluginMetrics(pluginName string, namedMetrics map[string]metrics.Registerable) {
170+
r.mutex.Lock()
171+
defer r.mutex.Unlock()
172+
173+
// Initialize the plugin metrics map if needed
174+
if r.pluginMetricsMap[pluginName] == nil {
175+
r.pluginMetricsMap[pluginName] = make(map[string]interface{})
176+
}
177+
178+
for name, metric := range namedMetrics {
179+
// Store in the metrics map
180+
r.pluginMetricsMap[pluginName][name] = metric
181+
182+
// Register the metric
183+
legacyregistry.MustRegister(metric)
184+
}
185+
}
186+
187+
func (r *PluginMetricsRegistry) GetPluginSubsystem(pluginName string) string {
188+
return getPluginSubsystem(pluginName)
189+
}
190+
191+
func (r *PluginMetricsRegistry) GetPluginMetric(pluginName, metricName string) interface{} {
192+
r.mutex.RLock()
193+
defer r.mutex.RUnlock()
194+
195+
if pluginMap, exists := r.pluginMetricsMap[pluginName]; exists {
196+
if metric, exists := pluginMap[metricName]; exists {
197+
return metric
198+
}
199+
}
200+
return nil
201+
}
202+
203+
func (r *PluginMetricsRegistry) HandlePluginMetric(pluginName, metricName string) MetricsHandler {
204+
metric := r.GetPluginMetric(pluginName, metricName)
205+
return newMetricsHandler(metric, r.ShouldExport())
206+
}
207+
208+
type metricsHandler struct {
209+
metric interface{} // Could be *metrics.GaugeVec, *metrics.CounterVec, etc.
210+
shouldExport bool
211+
currentLabels []string
212+
}
213+
214+
func newMetricsHandler(metric interface{}, shouldExport bool) *metricsHandler {
215+
return &metricsHandler{
216+
metric: metric,
217+
shouldExport: shouldExport,
218+
}
219+
}
220+
221+
func (h *metricsHandler) WithLabelValues(lvs ...string) MetricsHandler {
222+
if !h.shouldExport || h.metric == nil {
223+
return h // Return no-op handler
224+
}
225+
226+
newHandler := &metricsHandler{
227+
metric: h.metric,
228+
shouldExport: h.shouldExport,
229+
currentLabels: lvs,
230+
}
231+
return newHandler
232+
}
233+
234+
func (h *metricsHandler) Set(val float64) {
235+
if !h.shouldExport || h.metric == nil {
236+
return
237+
}
238+
239+
if gaugeVec, ok := h.metric.(*metrics.GaugeVec); ok {
240+
gaugeVec.WithLabelValues(h.currentLabels...).Set(val)
241+
}
242+
}
243+
244+
func (h *metricsHandler) Inc() {
245+
if !h.shouldExport || h.metric == nil {
246+
return
247+
}
248+
249+
if counterVec, ok := h.metric.(*metrics.CounterVec); ok {
250+
counterVec.WithLabelValues(h.currentLabels...).Inc()
251+
}
252+
}
253+
254+
func (h *metricsHandler) Add(val float64) {
255+
if !h.shouldExport || h.metric == nil {
256+
return
257+
}
258+
259+
if counterVec, ok := h.metric.(*metrics.CounterVec); ok {
260+
counterVec.WithLabelValues(h.currentLabels...).Add(val)
261+
}
262+
}
263+
264+
func (h *metricsHandler) Observe(val float64) {
265+
if !h.shouldExport || h.metric == nil {
266+
return
267+
}
268+
269+
if histogramVec, ok := h.metric.(*metrics.HistogramVec); ok {
270+
histogramVec.WithLabelValues(h.currentLabels...).Observe(val)
271+
}
272+
}
273+
274+
var PluginRegistry = NewPluginMetricsRegistry()
275+
108276
// Register all metrics.
109277
func Register() {
110278
// Register the metrics.

metrics/metrics_test.go

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
/*
2+
Copyright 2021 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package metrics
18+
19+
import (
20+
"testing"
21+
)
22+
23+
func TestNormalizePluginName(t *testing.T) {
24+
tests := []struct {
25+
input string
26+
expected string
27+
}{
28+
{"LowNodeUtilization", "low_node_utilization"},
29+
{"RemovePodsViolatingNodeTaints", "remove_pods_violating_node_taints"},
30+
{"HighNodeUtilization", "high_node_utilization"},
31+
{"RemoveDuplicates", "remove_duplicates"},
32+
{"simple", "simple"},
33+
{"SimplePlugin", "simple_plugin"},
34+
{"ABC", "a_b_c"},
35+
}
36+
37+
for _, test := range tests {
38+
t.Run(test.input, func(t *testing.T) {
39+
result := normalizePluginName(test.input)
40+
if result != test.expected {
41+
t.Errorf("normalizePluginName(%q) = %q, expected %q", test.input, result, test.expected)
42+
}
43+
})
44+
}
45+
}
46+
47+
func TestGetPluginSubsystem(t *testing.T) {
48+
tests := []struct {
49+
pluginName string
50+
expected string
51+
}{
52+
{"LowNodeUtilization", "descheduler_low_node_utilization"},
53+
{"RemovePodsViolatingNodeTaints", "descheduler_remove_pods_violating_node_taints"},
54+
{"DefaultEvictor", "descheduler_default_evictor"},
55+
}
56+
57+
for _, test := range tests {
58+
t.Run(test.pluginName, func(t *testing.T) {
59+
result := getPluginSubsystem(test.pluginName)
60+
if result != test.expected {
61+
t.Errorf("getPluginSubsystem(%q) = %q, expected %q", test.pluginName, result, test.expected)
62+
}
63+
})
64+
}
65+
}
66+
67+
func TestPluginMetricsRegistry_GetPluginSubsystem(t *testing.T) {
68+
registry := NewPluginMetricsRegistry()
69+
70+
result := registry.GetPluginSubsystem("LowNodeUtilization")
71+
expected := "descheduler_low_node_utilization"
72+
73+
if result != expected {
74+
t.Errorf("GetPluginSubsystem(%q) = %q, expected %q", "LowNodeUtilization", result, expected)
75+
}
76+
}

pkg/descheduler/descheduler.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,7 @@ func (d *descheduler) runProfiles(ctx context.Context, client clientset.Interfac
425425
frameworkprofile.WithGetPodsAssignedToNodeFnc(d.getPodsAssignedToNode),
426426
frameworkprofile.WithMetricsCollector(d.metricsCollector),
427427
frameworkprofile.WithPrometheusClient(d.prometheusClient),
428+
frameworkprofile.WithShouldExportMetrics(!d.rs.DisableMetrics),
428429
)
429430
if err != nil {
430431
klog.ErrorS(err, "unable to create a profile", "profile", profile.Name)

pkg/framework/fake/fake.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ type HandleImpl struct {
2323
PodEvictorImpl *evictions.PodEvictor
2424
MetricsCollectorImpl *metricscollector.MetricsCollector
2525
PrometheusClientImpl promapi.Client
26+
ShouldExportMetricsImpl bool
2627
}
2728

2829
var _ frameworktypes.Handle = &HandleImpl{}

pkg/framework/plugins/nodeutilization/lownodeutilization.go

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,13 @@ package nodeutilization
1919
import (
2020
"context"
2121
"fmt"
22+
"strconv"
2223

2324
v1 "k8s.io/api/core/v1"
2425
"k8s.io/apimachinery/pkg/runtime"
2526
"k8s.io/klog/v2"
2627

28+
"sigs.k8s.io/descheduler/metrics"
2729
"sigs.k8s.io/descheduler/pkg/api"
2830
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
2931
nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node"
@@ -35,9 +37,10 @@ import (
3537

3638
const LowNodeUtilizationPluginName = "LowNodeUtilization"
3739

38-
// this lines makes sure that HighNodeUtilization implements the BalancePlugin
39-
// interface.
40+
// this lines makes sure that LowNodeUtilization implements the BalancePlugin
41+
// and Metrics interfaces.
4042
var _ frameworktypes.BalancePlugin = &LowNodeUtilization{}
43+
var _ frameworktypes.Metrics = &LowNodeUtilization{}
4144

4245
// LowNodeUtilization evicts pods from overutilized nodes to underutilized
4346
// nodes. Note that CPU/Memory requests are used to calculate nodes'
@@ -134,6 +137,16 @@ func (l *LowNodeUtilization) Name() string {
134137
return LowNodeUtilizationPluginName
135138
}
136139

140+
// HandleMetric returns a metrics handler for the specified metric name
141+
func (l *LowNodeUtilization) HandleMetric(metricName string) metrics.MetricsHandler {
142+
return metrics.PluginRegistry.HandlePluginMetric(LowNodeUtilizationPluginName, metricName)
143+
}
144+
145+
// RegisterMetrics registers the plugin's metrics
146+
func (l *LowNodeUtilization) RegisterMetrics() {
147+
RegisterMetrics(metrics.PluginRegistry)
148+
}
149+
137150
// Balance holds the main logic of the plugin. It evicts pods from over
138151
// utilized nodes to under utilized nodes. The goal here is to evenly
139152
// distribute pods across nodes.
@@ -179,6 +192,14 @@ func (l *LowNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fra
179192
)
180193
}
181194

195+
for key, rt := range thresholds {
196+
for i, t := range rt {
197+
for resource, value := range t {
198+
l.HandleMetric("thresholds").WithLabelValues(key, strconv.Itoa(i), resource.String()).Set(float64(value))
199+
}
200+
}
201+
}
202+
182203
// classify nodes in under and over utilized. we will later try to move
183204
// pods from the overutilized nodes to the underutilized ones.
184205
nodeGroups := classifier.Classify(
@@ -255,6 +276,10 @@ func (l *LowNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fra
255276
logger.V(1).Info("Criteria for a node above target utilization", l.overCriteria...)
256277
logger.V(1).Info("Number of overutilized nodes", "totalNumber", len(highNodes))
257278

279+
// Export classification metrics
280+
l.HandleMetric("classification").WithLabelValues("underutilized").Set(float64(len(lowNodes)))
281+
l.HandleMetric("classification").WithLabelValues("overutilized").Set(float64(len(highNodes)))
282+
258283
if len(lowNodes) == 0 {
259284
logger.V(1).Info(
260285
"No node is underutilized, nothing to do here, you might tune your thresholds further",

0 commit comments

Comments
 (0)