Skip to content

Commit 92416ee

Browse files
authored
Merge pull request #113 from samuel-esp/retry-on-concurrent-updates
Feat: Retry When Detecting Concurrent Updates
2 parents d95f10f + 3358b86 commit 92416ee

File tree

6 files changed

+598
-47
lines changed

6 files changed

+598
-47
lines changed

README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,16 @@ calls made by Kube Downscaler to the Kubernetes API Server. It can only take int
464464
(default: 10). This setting should only be added to Kube Downscaler arguments if timeout
465465
issues are observed in the logs.
466466
467+
`--max-retries-on-conflict`
468+
469+
: Optional: Specifies the maximum number of retries KubeDownscaler should perform
470+
when encountering a conflict error (HTTP 409). These errors occur when one of the
471+
resources, just before being processed by Kube Downscaler, is modified by another entity,
472+
such as an HPA, CI/CD pipeline, or manual intervention. If enabled, Kube Downscaler will
473+
retry the update immediately, without waiting for the next iteration (default: 0). This
474+
argument is strongly recommended when using the `--once` argument to process large clusters.
475+
476+
467477
### Constrained Mode (Limited Access Mode)
468478
469479
The Constrained Mode (also known as Limited Access Mode) is designed for users who do not have full cluster access.

kube_downscaler/cmd.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,12 @@ def get_parser():
7272
help="Timeout to be used when kubedownscaler performs call to the Kubernetes API Server (default: 10s)",
7373
default=os.getenv("API_SERVER_TIMEOUT", 10),
7474
)
75+
parser.add_argument(
76+
"--max-retries-on-conflict",
77+
type=int,
78+
help="Maximum number of retries for handling concurrent update conflicts (default: 0)",
79+
default=os.getenv("MAX_RETRIES_ON_CONFLICT", 0)
80+
)
7581
upscale_group.add_argument(
7682
"--upscale-period",
7783
help="Default time period to scale up once (default: never)",

kube_downscaler/main.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ def main(args=None):
4343
args.upscale_target_only,
4444
args.dry_run,
4545
args.api_server_timeout,
46+
args.max_retries_on_conflict,
4647
args.downtime_replicas,
4748
args.deployment_time_annotation,
4849
args.enable_events,
@@ -66,6 +67,7 @@ def run_loop(
6667
upscale_target_only,
6768
dry_run,
6869
api_server_timeout,
70+
max_retries_on_conflict,
6971
downtime_replicas,
7072
deployment_time_annotation=None,
7173
enable_events=False,
@@ -102,6 +104,7 @@ def run_loop(
102104
admission_controller=admission_controller,
103105
constrained_downscaler=constrained_downscaler,
104106
api_server_timeout=api_server_timeout,
107+
max_retries_on_conflict=max_retries_on_conflict,
105108
downtime_replicas=downtime_replicas,
106109
deployment_time_annotation=deployment_time_annotation,
107110
enable_events=enable_events,

kube_downscaler/scaler.py

Lines changed: 79 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,14 @@
99
from typing import Pattern
1010

1111
import pykube
12-
from pykube import CronJob
12+
from pykube import CronJob, HTTPClient
1313
from pykube import Deployment
1414
from pykube import HorizontalPodAutoscaler
1515
from pykube import Namespace
1616
from pykube import StatefulSet
1717
from pykube import Job
1818
from pykube import CustomResourceDefinition
19+
from pykube.exceptions import HTTPError
1920
from pykube.objects import NamespacedAPIObject, APIObject
2021
from pykube import DaemonSet
2122
from pykube.objects import NamespacedAPIObject, PodDisruptionBudget
@@ -299,6 +300,28 @@ def get_resources(kind, api, namespaces: FrozenSet[str], excluded_namespaces):
299300
return resources, excluded_namespaces;
300301

301302

303+
def get_resource(kind, api, namespace, resource_name: str):
304+
305+
try:
306+
resource = kind.objects(api).filter(namespace=namespace).get_or_none(name=resource_name)
307+
if resource is None:
308+
logger.debug(f"{kind.endpoint} {namespace}/{resource_name} not found")
309+
except requests.HTTPError as e:
310+
resource = None
311+
if e.response.status_code == 404:
312+
logger.debug(
313+
f"{kind} {resource_name} not found in namespace {namespace} (404)"
314+
)
315+
if e.response.status_code == 403:
316+
logger.warning(
317+
f"KubeDownscaler is not authorized to retrieve {kind} {namespace}/{resource_name} (403)"
318+
)
319+
else:
320+
raise e
321+
322+
return resource
323+
324+
302325
def scale_jobs_without_admission_controller(plural, admission_controller, constrainted_downscaler):
303326
return (plural == "jobs" and admission_controller == "") or constrainted_downscaler
304327

@@ -864,6 +887,9 @@ def autoscale_resource(
864887
forced_uptime: bool,
865888
forced_downtime: bool,
866889
upscale_target_only: bool,
890+
max_retries_on_conflict: int,
891+
api: HTTPClient,
892+
kind: NamespacedAPIObject,
867893
dry_run: bool,
868894
now: datetime.datetime,
869895
grace_period: int = 0,
@@ -989,14 +1015,55 @@ def autoscale_resource(
9891015
else:
9901016
resource.update()
9911017
except Exception as e:
992-
logger.exception(
993-
f"Failed to process {resource.kind} {resource.namespace}/{resource.name}: {e}"
994-
)
1018+
if isinstance(e, HTTPError) and "the object has been modified" in str(e).lower():
1019+
logger.warning(
1020+
f"Unable to process {resource.kind} {resource.namespace}/{resource.name} because it was recently modified"
1021+
)
1022+
if max_retries_on_conflict > 0:
1023+
logger.info(
1024+
f"Retrying processing {resource.kind} {resource.namespace}/{resource.name} (Remaining Retries: {max_retries_on_conflict})")
1025+
max_retries_on_conflict = max_retries_on_conflict - 1
1026+
refreshed_resource = get_resource(kind, api, resource.namespace, resource.name)
1027+
if refreshed_resource is not None:
1028+
autoscale_resource(
1029+
refreshed_resource,
1030+
upscale_period,
1031+
downscale_period,
1032+
default_uptime,
1033+
default_downtime,
1034+
forced_uptime,
1035+
forced_downtime,
1036+
upscale_target_only,
1037+
max_retries_on_conflict,
1038+
api,
1039+
kind,
1040+
dry_run,
1041+
now,
1042+
grace_period,
1043+
downtime_replicas,
1044+
namespace_excluded=namespace_excluded,
1045+
deployment_time_annotation=deployment_time_annotation,
1046+
enable_events=enable_events,
1047+
matching_labels=matching_labels,
1048+
)
1049+
else:
1050+
logger.warning(
1051+
f"Retry process failed for {resource.kind} {resource.namespace}/{resource.name} because the resource cannot be found, it may have been deleted from the cluster")
1052+
else:
1053+
logger.warning(
1054+
f"Will retry processing {resource.kind} {resource.namespace}/{resource.name} in the next iteration, unless the --once argument is specified"
1055+
)
1056+
elif isinstance(e, HTTPError) and "not found" in str(e).lower():
1057+
logger.info(f"While waiting to process {resource.kind} {resource.namespace}/{resource.name}, the resource was removed from the cluster")
1058+
else:
1059+
logger.exception(
1060+
f"Failed to process {resource.kind} {resource.namespace}/{resource.name}: {e}"
1061+
)
9951062

9961063

9971064
def autoscale_resources(
998-
api,
999-
kind,
1065+
api: HTTPClient,
1066+
kind: NamespacedAPIObject,
10001067
namespace: FrozenSet[Pattern],
10011068
exclude_namespaces: FrozenSet[Pattern],
10021069
exclude_names: FrozenSet[str],
@@ -1008,6 +1075,7 @@ def autoscale_resources(
10081075
forced_uptime: bool,
10091076
upscale_target_only: bool,
10101077
constrained_downscaler: bool,
1078+
max_retries_on_conflict: int,
10111079
dry_run: bool,
10121080
now: datetime.datetime,
10131081
grace_period: int,
@@ -1115,6 +1183,9 @@ def autoscale_resources(
11151183
forced_uptime_for_namespace,
11161184
forced_downtime_for_namespace,
11171185
upscale_target_only,
1186+
max_retries_on_conflict,
1187+
api,
1188+
kind,
11181189
dry_run,
11191190
now,
11201191
grace_period,
@@ -1351,6 +1422,7 @@ def scale(
13511422
admission_controller: str,
13521423
constrained_downscaler: bool,
13531424
api_server_timeout: int,
1425+
max_retries_on_conflict: int,
13541426
downtime_replicas: int = 0,
13551427
deployment_time_annotation: Optional[str] = None,
13561428
enable_events: bool = False,
@@ -1379,6 +1451,7 @@ def scale(
13791451
forced_uptime,
13801452
upscale_target_only,
13811453
constrained_downscaler,
1454+
max_retries_on_conflict,
13821455
dry_run,
13831456
now,
13841457
grace_period,

0 commit comments

Comments
 (0)