Skip to content

Commit 92416ee

Browse files
authored
Merge pull request #113 from samuel-esp/retry-on-concurrent-updates
Feat: Retry When Detecting Concurrent Updates
2 parents d95f10f + 3358b86 commit 92416ee

File tree

6 files changed

+598
-47
lines changed

6 files changed

+598
-47
lines changed

README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,16 @@ calls made by Kube Downscaler to the Kubernetes API Server. It can only take int
464464
(default: 10). This setting should only be added to Kube Downscaler arguments if timeout
465465
issues are observed in the logs.
466466
467+
`--max-retries-on-conflict`
468+
469+
: Optional: Specifies the maximum number of retries KubeDownscaler should perform
470+
when encountering a conflict error (HTTP 409). These errors occur when one of the
471+
resources, just before being processed by Kube Downscaler, is modified by another entity,
472+
such as an HPA, CI/CD pipeline, or manual intervention. If enabled, Kube Downscaler will
473+
retry the update immediately, without waiting for the next iteration (default: 0). This
474+
argument is strongly recommended when using the `--once` argument to process large clusters.
475+
476+
467477
### Constrained Mode (Limited Access Mode)
468478
469479
The Constrained Mode (also known as Limited Access Mode) is designed for users who do not have full cluster access.

kube_downscaler/cmd.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,12 @@ def get_parser():
7272
help="Timeout to be used when kubedownscaler performs call to the Kubernetes API Server (default: 10s)",
7373
default=os.getenv("API_SERVER_TIMEOUT", 10),
7474
)
75+
parser.add_argument(
76+
"--max-retries-on-conflict",
77+
type=int,
78+
help="Maximum number of retries for handling concurrent update conflicts (default: 0)",
79+
default=os.getenv("MAX_RETRIES_ON_CONFLICT", 0)
80+
)
7581
upscale_group.add_argument(
7682
"--upscale-period",
7783
help="Default time period to scale up once (default: never)",

kube_downscaler/main.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ def main(args=None):
4343
args.upscale_target_only,
4444
args.dry_run,
4545
args.api_server_timeout,
46+
args.max_retries_on_conflict,
4647
args.downtime_replicas,
4748
args.deployment_time_annotation,
4849
args.enable_events,
@@ -66,6 +67,7 @@ def run_loop(
6667
upscale_target_only,
6768
dry_run,
6869
api_server_timeout,
70+
max_retries_on_conflict,
6971
downtime_replicas,
7072
deployment_time_annotation=None,
7173
enable_events=False,
@@ -102,6 +104,7 @@ def run_loop(
102104
admission_controller=admission_controller,
103105
constrained_downscaler=constrained_downscaler,
104106
api_server_timeout=api_server_timeout,
107+
max_retries_on_conflict=max_retries_on_conflict,
105108
downtime_replicas=downtime_replicas,
106109
deployment_time_annotation=deployment_time_annotation,
107110
enable_events=enable_events,

kube_downscaler/scaler.py

Lines changed: 79 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,14 @@
99
from typing import Pattern
1010

1111
import pykube
12-
from pykube import CronJob
12+
from pykube import CronJob, HTTPClient
1313
from pykube import Deployment
1414
from pykube import HorizontalPodAutoscaler
1515
from pykube import Namespace
1616
from pykube import StatefulSet
1717
from pykube import Job
1818
from pykube import CustomResourceDefinition
19+
from pykube.exceptions import HTTPError
1920
from pykube.objects import NamespacedAPIObject, APIObject
2021
from pykube import DaemonSet
2122
from pykube.objects import NamespacedAPIObject, PodDisruptionBudget
@@ -299,6 +300,28 @@ def get_resources(kind, api, namespaces: FrozenSet[str], excluded_namespaces):
299300
return resources, excluded_namespaces;
300301

301302

303+
def get_resource(kind, api, namespace, resource_name: str):
304+
305+
try:
306+
resource = kind.objects(api).filter(namespace=namespace).get_or_none(name=resource_name)
307+
if resource is None:
308+
logger.debug(f"{kind.endpoint} {namespace}/{resource_name} not found")
309+
except requests.HTTPError as e:
310+
resource = None
311+
if e.response.status_code == 404:
312+
logger.debug(
313+
f"{kind} {resource_name} not found in namespace {namespace} (404)"
314+
)
315+
if e.response.status_code == 403:
316+
logger.warning(
317+
f"KubeDownscaler is not authorized to retrieve {kind} {namespace}/{resource_name} (403)"
318+
)
319+
else:
320+
raise e
321+
322+
return resource
323+
324+
302325
def scale_jobs_without_admission_controller(plural, admission_controller, constrainted_downscaler):
303326
return (plural == "jobs" and admission_controller == "") or constrainted_downscaler
304327

@@ -864,6 +887,9 @@ def autoscale_resource(
864887
forced_uptime: bool,
865888
forced_downtime: bool,
866889
upscale_target_only: bool,
890+
max_retries_on_conflict: int,
891+
api: HTTPClient,
892+
kind: NamespacedAPIObject,
867893
dry_run: bool,
868894
now: datetime.datetime,
869895
grace_period: int = 0,
@@ -989,14 +1015,55 @@ def autoscale_resource(
9891015
else:
9901016
resource.update()
9911017
except Exception as e:
992-
logger.exception(
993-
f"Failed to process {resource.kind} {resource.namespace}/{resource.name}: {e}"
994-
)
1018+
if isinstance(e, HTTPError) and "the object has been modified" in str(e).lower():
1019+
logger.warning(
1020+
f"Unable to process {resource.kind} {resource.namespace}/{resource.name} because it was recently modified"
1021+
)
1022+
if max_retries_on_conflict > 0:
1023+
logger.info(
1024+
f"Retrying processing {resource.kind} {resource.namespace}/{resource.name} (Remaining Retries: {max_retries_on_conflict})")
1025+
max_retries_on_conflict = max_retries_on_conflict - 1
1026+
refreshed_resource = get_resource(kind, api, resource.namespace, resource.name)
1027+
if refreshed_resource is not None:
1028+
autoscale_resource(
1029+
refreshed_resource,
1030+
upscale_period,
1031+
downscale_period,
1032+
default_uptime,
1033+
default_downtime,
1034+
forced_uptime,
1035+
forced_downtime,
1036+
upscale_target_only,
1037+
max_retries_on_conflict,
1038+
api,
1039+
kind,
1040+
dry_run,
1041+
now,
1042+
grace_period,
1043+
downtime_replicas,
1044+
namespace_excluded=namespace_excluded,
1045+
deployment_time_annotation=deployment_time_annotation,
1046+
enable_events=enable_events,
1047+
matching_labels=matching_labels,
1048+
)
1049+
else:
1050+
logger.warning(
1051+
f"Retry process failed for {resource.kind} {resource.namespace}/{resource.name} because the resource cannot be found, it may have been deleted from the cluster")
1052+
else:
1053+
logger.warning(
1054+
f"Will retry processing {resource.kind} {resource.namespace}/{resource.name} in the next iteration, unless the --once argument is specified"
1055+
)
1056+
elif isinstance(e, HTTPError) and "not found" in str(e).lower():
1057+
logger.info(f"While waiting to process {resource.kind} {resource.namespace}/{resource.name}, the resource was removed from the cluster")
1058+
else:
1059+
logger.exception(
1060+
f"Failed to process {resource.kind} {resource.namespace}/{resource.name}: {e}"
1061+
)
9951062

9961063

9971064
def autoscale_resources(
998-
api,
999-
kind,
1065+
api: HTTPClient,
1066+
kind: NamespacedAPIObject,
10001067
namespace: FrozenSet[Pattern],
10011068
exclude_namespaces: FrozenSet[Pattern],
10021069
exclude_names: FrozenSet[str],
@@ -1008,6 +1075,7 @@ def autoscale_resources(
10081075
forced_uptime: bool,
10091076
upscale_target_only: bool,
10101077
constrained_downscaler: bool,
1078+
max_retries_on_conflict: int,
10111079
dry_run: bool,
10121080
now: datetime.datetime,
10131081
grace_period: int,
@@ -1115,6 +1183,9 @@ def autoscale_resources(
11151183
forced_uptime_for_namespace,
11161184
forced_downtime_for_namespace,
11171185
upscale_target_only,
1186+
max_retries_on_conflict,
1187+
api,
1188+
kind,
11181189
dry_run,
11191190
now,
11201191
grace_period,
@@ -1351,6 +1422,7 @@ def scale(
13511422
admission_controller: str,
13521423
constrained_downscaler: bool,
13531424
api_server_timeout: int,
1425+
max_retries_on_conflict: int,
13541426
downtime_replicas: int = 0,
13551427
deployment_time_annotation: Optional[str] = None,
13561428
enable_events: bool = False,
@@ -1379,6 +1451,7 @@ def scale(
13791451
forced_uptime,
13801452
upscale_target_only,
13811453
constrained_downscaler,
1454+
max_retries_on_conflict,
13821455
dry_run,
13831456
now,
13841457
grace_period,

0 commit comments

Comments
 (0)