@@ -7,33 +7,108 @@ metadata:
77spec :
88 groups :
99 - name : recordingRules.rules
10+ interval : 30s
1011 rules :
12+ # Base metrics (CPU and Memory utilization)
# Per-instance CPU utilization: one minus the 1m rate of idle CPU seconds,
# averaged over every series that shares the same `instance` label.
1113 - record : descheduler:nodeutilization:cpu:avg1m
1214 expr : avg by (instance) (1 - rate(node_cpu_seconds_total{mode='idle'}[1m]))
1315
# Single value: the mean of the per-instance CPU utilization over the instances
# that join to a kube_node_role{role="worker"} series. label_replace copies that
# series' `node` label into `instance` so the join can match on `instance`.
# NOTE(review): the product leaves utilization unchanged only if the
# kube_node_role sample value is 1 - confirm.
1416 - record : descheduler:averageworkersutilization:cpu:avg1m
1517 expr : avg(descheduler:nodeutilization:cpu:avg1m * on(instance) group_left(node) label_replace(kube_node_role{role="worker"}, 'instance', "$1", 'node', '(.+)'))
1618
# Per-instance memory utilization: 1 - (1m average of MemAvailable bytes divided
# by the node's allocatable memory). The trailing `and on(instance) ... > 0`
# keeps only instances whose allocatable memory is positive, so a zero
# denominator never produces a sample.
# NOTE(review): the join assumes the `instance` label of
# node_memory_MemAvailable_bytes equals the kube `node` name - confirm.
19+ - record : descheduler:nodeutilization:memory:avg1m
20+ expr : |-
21+ (
22+ 1 - avg_over_time(node_memory_MemAvailable_bytes[1m]) /
23+ on(instance) label_replace(kube_node_status_allocatable{resource="memory"}, 'instance', "$1", 'node', '(.+)')
24+ ) and on(instance)
25+ label_replace(kube_node_status_allocatable{resource="memory"}, 'instance', "$1", 'node', '(.+)') > 0
26+
# Single value: the mean of the per-instance memory utilization over the
# instances that join to a kube_node_role{role="worker"} series (the `node`
# label is copied into `instance` by label_replace for the join).
# NOTE(review): assumes the kube_node_role sample value is 1 - confirm.
27+ - record : descheduler:averageworkersutilization:memory:avg1m
28+ expr : avg(descheduler:nodeutilization:memory:avg1m * on(instance) group_left(node) label_replace(kube_node_role{role="worker"}, 'instance', "$1", 'node', '(.+)'))
29+
30+ # Pressure metrics
1731 - record : descheduler:nodepressure:cpu:avg1m
1832 # Return the CPU pressure if CPU usage is over 70%; otherwise
1933 # return zero, to (partially) filter out false-positive
2034 # pressure spikes caused by CPU-limited pods.
2135 # See: https://github.com/kubernetes/enhancements/issues/5062
2236 expr : |-
23- avg by (instance) (
24- rate(node_pressure_cpu_waiting_seconds_total[1m])
25- ) and (
26- 1 - avg by (instance) (
27- rate(node_cpu_seconds_total{mode='idle'}[1m])
28- )
29- ) > 0.7
37+ (
38+ avg by (instance) (rate(node_pressure_cpu_waiting_seconds_total[1m]))
39+ and
40+ (1 - avg by (instance) (rate(node_cpu_seconds_total{mode='idle'}[1m]))) > 0.7
41+ )
3042 or
43+ (avg by (instance) (rate(node_pressure_cpu_waiting_seconds_total[1m])) * 0)
44+
# Single value: the mean of the per-instance CPU pressure over the instances
# that join to a kube_node_role{role="worker"} series (the `node` label is
# copied into `instance` by label_replace for the join).
# NOTE(review): assumes the kube_node_role sample value is 1 - confirm.
45+ - record : descheduler:averageworkerspressure:cpu:avg1m
46+ expr : avg(descheduler:nodepressure:cpu:avg1m * on(instance) group_left(node) label_replace(kube_node_role{role="worker"}, 'instance', "$1", 'node', '(.+)'))
47+
48+ - record : descheduler:nodepressure:memory:avg1m
49+ expr : |-
3150 avg by (instance) (
32- rate(node_pressure_cpu_waiting_seconds_total[1m])
33- ) * 0
51+ rate(node_pressure_memory_waiting_seconds_total[1m])
52+ )
53+
# Single value: the mean of the per-instance memory pressure over the instances
# that join to a kube_node_role{role="worker"} series (the `node` label is
# copied into `instance` by label_replace for the join).
# NOTE(review): assumes the kube_node_role sample value is 1 - confirm.
54+ - record : descheduler:averageworkerspressure:memory:avg1m
55+ expr : avg(descheduler:nodepressure:memory:avg1m * on(instance) group_left(node) label_replace(kube_node_role{role="worker"}, 'instance', "$1", 'node', '(.+)'))
3456
# Combined signal. First branch: the per-instance CPU utilization, emitted only
# while the worker-average CPU utilization is below 0.8 (`<` binds tighter than
# `and`, and `on()` matches the single average against every instance).
# Second branch: the per-instance CPU pressure, used via `or` for any instance
# the first branch did not produce.
3557 - record : descheduler:combined_utilization_and_pressure:avg1m
3658 expr : |-
3759 (descheduler:nodeutilization:cpu:avg1m and on() descheduler:averageworkersutilization:cpu:avg1m < 0.8)
3860 or
3961 (descheduler:nodepressure:cpu:avg1m)
62+
# descheduler:averageworkersutilization:memory:avg1m is recorded once, next to
# the other base memory metrics earlier in this group. It must not be declared a
# second time here: an identical record/expr pair in the same group is redundant
# and is reported by promtool's duplicate-rules lint.
65+
# How far an instance's memory utilization lies above the worker average.
# The first branch keeps (utilization - average) only where it is >= 0; the
# `or ... * 0` branch supplies 0 for every other instance (below the average,
# or when no average sample exists), so each instance always has a value.
66+ - record : descheduler:nodeutilization:memory:avg1m:positivedeviation
67+ expr : |-
68+ descheduler:nodeutilization:memory:avg1m - on() group_left() descheduler:averageworkersutilization:memory:avg1m
69+ and
70+ descheduler:nodeutilization:memory:avg1m - on() group_left() descheduler:averageworkersutilization:memory:avg1m >= 0
71+ or
72+ descheduler:nodeutilization:memory:avg1m * 0
73+
# How far an instance's CPU utilization lies above the worker average.
# The first branch keeps (utilization - average) only where it is >= 0; the
# `or ... * 0` branch supplies 0 for every other instance (below the average,
# or when no average sample exists), so each instance always has a value.
74+ - record : descheduler:nodeutilization:cpu:avg1m:positivedeviation
75+ expr : |-
76+ descheduler:nodeutilization:cpu:avg1m - on() group_left() descheduler:averageworkersutilization:cpu:avg1m
77+ and
78+ descheduler:nodeutilization:cpu:avg1m - on() group_left() descheduler:averageworkersutilization:cpu:avg1m >= 0
79+ or
80+ descheduler:nodeutilization:cpu:avg1m * 0
81+
# How far an instance's CPU pressure lies above the worker average pressure.
# The first branch keeps (pressure - average) only where it is >= 0; the
# `or ... * 0` branch supplies 0 for every other instance (below the average,
# or when no average sample exists), so each instance always has a value.
82+ - record : descheduler:nodepressure:cpu:avg1m:positivedeviation
83+ expr : |-
84+ descheduler:nodepressure:cpu:avg1m - on() group_left() descheduler:averageworkerspressure:cpu:avg1m
85+ and
86+ descheduler:nodepressure:cpu:avg1m - on() group_left() descheduler:averageworkerspressure:cpu:avg1m >= 0
87+ or
88+ descheduler:nodepressure:cpu:avg1m * 0
89+
# How far an instance's memory pressure lies above the worker average pressure.
# The first branch keeps (pressure - average) only where it is >= 0; the
# `or ... * 0` branch supplies 0 for every other instance (below the average,
# or when no average sample exists), so each instance always has a value.
90+ - record : descheduler:nodepressure:memory:avg1m:positivedeviation
91+ expr : |-
92+ descheduler:nodepressure:memory:avg1m - on() group_left() descheduler:averageworkerspressure:memory:avg1m
93+ and
94+ descheduler:nodepressure:memory:avg1m - on() group_left() descheduler:averageworkerspressure:memory:avg1m >= 0
95+ or
96+ descheduler:nodepressure:memory:avg1m * 0
97+
98+ # Ideal Point Positive Distance (Euclidean distance from ideal using positive deviations)
# Square root of the sum of the squared positive deviations of CPU utilization,
# CPU pressure, memory utilization and memory pressure. The `+` operators use
# default one-to-one label matching, so a sample is produced only for label sets
# present in all four input series.
99+ - record : descheduler:node:ideal_point_positive_distance:avg1m
100+ expr : |-
101+ sqrt(
102+ descheduler:nodeutilization:cpu:avg1m:positivedeviation ^ 2 +
103+ descheduler:nodepressure:cpu:avg1m:positivedeviation ^ 2 +
104+ descheduler:nodeutilization:memory:avg1m:positivedeviation ^ 2 +
105+ descheduler:nodepressure:memory:avg1m:positivedeviation ^ 2
106+ )
107+
108+ # "Sigmoid" Ideal Point Positive Distance (k=3.0) - not a true sigmoid: the distance is scaled linearly by 3 and capped at 1.0 by clamp_max (the input is a square root, so the result stays in [0,1])
109+ - record : descheduler:node:sigmoid_ideal_point_positive_distance:k3:avg1m
110+ expr : |-
111+ clamp_max(
112+ 3 * descheduler:node:ideal_point_positive_distance:avg1m,
113+ 1.0
114+ )
0 commit comments