|
| 1 | +/* |
| 2 | +Copyright 2022 The Kubernetes Authors. |
| 3 | +
|
| 4 | +Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | +you may not use this file except in compliance with the License. |
| 6 | +You may obtain a copy of the License at |
| 7 | +
|
| 8 | + http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | +
|
| 10 | +Unless required by applicable law or agreed to in writing, software |
| 11 | +distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | +See the License for the specific language governing permissions and |
| 14 | +limitations under the License. |
| 15 | +*/ |
| 16 | + |
| 17 | +package latencytracker |
| 18 | + |
| 19 | +import ( |
| 20 | + "maps" |
| 21 | + "slices" |
| 22 | + "time" |
| 23 | + |
| 24 | + ca_context "k8s.io/autoscaler/cluster-autoscaler/context" |
| 25 | + "k8s.io/autoscaler/cluster-autoscaler/core/scaledown" |
| 26 | + "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/status" |
| 27 | + "k8s.io/autoscaler/cluster-autoscaler/metrics" |
| 28 | + "k8s.io/klog/v2" |
| 29 | + |
| 30 | + processor "k8s.io/autoscaler/cluster-autoscaler/processors/status" |
| 31 | +) |
| 32 | + |
const (
	// scaleDownLatencyLogThreshold is the duration after which a scale-down
	// deletion is considered "slow". Deletions whose latency (time unneeded
	// beyond the removal threshold) exceeds this value are logged at a more
	// visible verbosity level (V(2) instead of V(6)) in Process.
	scaleDownLatencyLogThreshold = 3 * time.Minute
)
| 39 | + |
// unneededNodeState records, for a single tracked node, when it was first
// observed as unneeded and the removal threshold in effect for it.
type unneededNodeState struct {
	// unneededSince is the timestamp at which the node was first tracked as unneeded.
	unneededSince time.Time
	// removalThreshold is subtracted from the total unneeded duration when
	// computing the removal latency reported to metrics.
	removalThreshold time.Duration
}
| 44 | + |
// NodeLatencyTracker keeps track of nodes that are marked as unneeded, when they became unneeded,
// and removalThresholds to emit node removal latency metrics.
type NodeLatencyTracker struct {
	// unneededNodes maps node name -> tracking state for every node currently
	// considered unneeded.
	unneededNodes map[string]unneededNodeState
	// wrapped is an optional inner processor delegated to by Process/CleanUp;
	// it may be nil.
	wrapped processor.ScaleDownStatusProcessor
}
| 51 | + |
| 52 | +// NewNodeLatencyTracker creates a new tracker. |
| 53 | +func NewNodeLatencyTracker(wrapped processor.ScaleDownStatusProcessor) *NodeLatencyTracker { |
| 54 | + return &NodeLatencyTracker{ |
| 55 | + unneededNodes: make(map[string]unneededNodeState), |
| 56 | + wrapped: wrapped, |
| 57 | + } |
| 58 | +} |
| 59 | + |
| 60 | +// UpdateScaleDownCandidates updates tracked unneeded nodes and reports those that became needed again. |
| 61 | +func (t *NodeLatencyTracker) UpdateScaleDownCandidates(list []*scaledown.UnneededNode, timestamp time.Time) { |
| 62 | + currentSet := make(map[string]struct{}, len(list)) |
| 63 | + for _, candidate := range list { |
| 64 | + nodeName := candidate.Node.Name |
| 65 | + currentSet[nodeName] = struct{}{} |
| 66 | + if info, exists := t.unneededNodes[nodeName]; !exists { |
| 67 | + t.unneededNodes[nodeName] = unneededNodeState{ |
| 68 | + unneededSince: timestamp, |
| 69 | + removalThreshold: candidate.RemovalThreshold, |
| 70 | + } |
| 71 | + klog.V(6).Infof("Started tracking unneeded node %s at %v with removal threshold %v.", nodeName, timestamp, candidate.RemovalThreshold) |
| 72 | + } else { |
| 73 | + if info.removalThreshold != candidate.RemovalThreshold { |
| 74 | + info.removalThreshold = candidate.RemovalThreshold |
| 75 | + t.unneededNodes[nodeName] = info |
| 76 | + klog.V(6).Infof("Updated removal threshold for tracked node %s to %v.", nodeName, candidate.RemovalThreshold) |
| 77 | + } |
| 78 | + } |
| 79 | + } |
| 80 | + for nodeName := range t.unneededNodes { |
| 81 | + if _, exists := currentSet[nodeName]; !exists { |
| 82 | + delete(t.unneededNodes, nodeName) |
| 83 | + klog.V(6).Infof("Node %s is no longer unneeded (or was removed). Stopped tracking at %v.", nodeName, timestamp) |
| 84 | + } |
| 85 | + } |
| 86 | +} |
| 87 | + |
| 88 | +// Process updates unremovableNodes and reports node removal latency based on scale-down status. |
| 89 | +func (t *NodeLatencyTracker) Process(autoscalingCtx *ca_context.AutoscalingContext, status *status.ScaleDownStatus) { |
| 90 | + if t.wrapped != nil { |
| 91 | + t.wrapped.Process(autoscalingCtx, status) |
| 92 | + } |
| 93 | + for _, unremovableNode := range status.UnremovableNodes { |
| 94 | + nodeName := unremovableNode.Node.Name |
| 95 | + if info, exists := t.unneededNodes[nodeName]; exists { |
| 96 | + duration := time.Since(info.unneededSince) |
| 97 | + metrics.UpdateScaleDownNodeRemovalLatency(false, duration) |
| 98 | + klog.V(4).Infof("Node %q is unremovable, became needed again (unneeded for %s).", nodeName, duration) |
| 99 | + delete(t.unneededNodes, nodeName) |
| 100 | + } |
| 101 | + } |
| 102 | + for _, scaledDownNode := range status.ScaledDownNodes { |
| 103 | + nodeName := scaledDownNode.Node.Name |
| 104 | + if info, exists := t.unneededNodes[nodeName]; exists { |
| 105 | + duration := time.Since(info.unneededSince) |
| 106 | + latency := duration - info.removalThreshold |
| 107 | + metrics.UpdateScaleDownNodeRemovalLatency(true, latency) |
| 108 | + if latency > scaleDownLatencyLogThreshold { |
| 109 | + klog.V(2).Infof( |
| 110 | + "Observing deletion for node %s, unneeded for %s (removal threshold was %s).", |
| 111 | + nodeName, duration, info.removalThreshold, |
| 112 | + ) |
| 113 | + } else { |
| 114 | + klog.V(6).Infof( |
| 115 | + "Observing deletion for node %s, unneeded for %s (removal threshold was %s).", |
| 116 | + nodeName, duration, info.removalThreshold, |
| 117 | + ) |
| 118 | + } |
| 119 | + delete(t.unneededNodes, nodeName) |
| 120 | + } |
| 121 | + } |
| 122 | + if klog.V(6).Enabled() { |
| 123 | + for nodeName := range t.unneededNodes { |
| 124 | + klog.Infof("Node %q remains in unneeded list (not scaled down). Continuing to track latency.", nodeName) |
| 125 | + } |
| 126 | + } |
| 127 | +} |
| 128 | + |
| 129 | +// getTrackedNodes returns the names of all nodes currently tracked as unneeded. |
| 130 | +func (t *NodeLatencyTracker) getTrackedNodes() []string { |
| 131 | + return slices.Collect(maps.Keys(t.unneededNodes)) |
| 132 | +} |
| 133 | + |
| 134 | +// CleanUp cleans up internal structures. |
| 135 | +func (t *NodeLatencyTracker) CleanUp() { |
| 136 | + if t.wrapped != nil { |
| 137 | + t.wrapped.CleanUp() |
| 138 | + } |
| 139 | +} |
0 commit comments