Skip to content

Commit fde8e6b

Browse files
Merge branch 'next-minor'
2 parents c94e524 + fba1316 commit fde8e6b

File tree

26 files changed

+1192
-317
lines changed

26 files changed

+1192
-317
lines changed

core/agent/README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,10 @@ Exit codes and their meaning:
155155
- `9` - OS `exec()` error, like "file is not executable" and similar
156156
- `10` - default exit code once `validation-failed` status is set. It
157157
can be overridden by terminating the step with a non-zero exit code.
158-
- `11-31` - reserved to the agent implementation
158+
- `11` - error "Agent is busy" is returned if the number of running
159+
actions or events has reached `MAX_CONCURRENCY` (32) and after waiting
160+
`OVERLOAD_SLEEP` (500ms) that number did not decrease.
161+
- `12-31` - reserved for the agent implementation
159162
- `32-255` - **available to use** for action-specific error numbers
160163

161164
## File descriptors

core/agent/agent.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
"os"
2828
"os/signal"
2929
"syscall"
30+
"strconv"
3031
"time"
3132

3233
"github.com/go-redis/redis/v8"
@@ -52,6 +53,9 @@ var eventPaths flagStringSlice
5253
var pollingDuration = 5000 * time.Millisecond
5354
var taskExpireDuration = 8 * time.Hour
5455

56+
var maxConcurrency = 32 // default limit of spawned concurrent processes
57+
var overloadSleep = 500 * time.Millisecond // wait time before rejecting new processes
58+
5559
// Command arguments --actionsdir and --eventsdir can be repeated multiple
5660
// times. Each item is inserted into a []string.
5761
type flagStringSlice []string
@@ -98,6 +102,20 @@ func main() {
98102
}
99103
}
100104

105+
// Override max number of concurrent tasks and event handlers
106+
if v := os.Getenv("MAX_CONCURRENCY"); v != "" {
107+
if n, err := strconv.Atoi(v); err == nil && n > 0 {
108+
maxConcurrency = n
109+
}
110+
}
111+
// Override the wait time before rejecting tasks and events
112+
if v := os.Getenv("OVERLOAD_SLEEP"); v != "" {
113+
n, convError := time.ParseDuration(v)
114+
if convError == nil {
115+
overloadSleep = n
116+
}
117+
}
118+
101119
var signalChannel = make(chan os.Signal, 1)
102120
signal.Notify(signalChannel, syscall.SIGUSR1)
103121
var actionsCtx, cancelActions = context.WithCancel(ctx)

core/agent/hbuiltin.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"encoding/json"
2626
"log"
2727
"sync"
28+
"fmt"
2829

2930
"github.com/NethServer/ns8-core/core/agent/models"
3031
"github.com/go-redis/redis/v8"
@@ -137,3 +138,26 @@ func runCancelTask(rdb *redis.Client, task *models.Task, cancelFuncMap map[strin
137138
}
138139
log.Printf("task/%s/%s: action \"%s\" status is \"%s\" (%d) at step %s", agentPrefix, task.ID, task.Action, actionDescriptor.Status, exitCode, lastStep)
139140
}
141+
142+
func rejectAction(rdb *redis.Client, actionCtx context.Context, task *models.Task) {
143+
progressChannel := "progress/" + agentPrefix + "/task/" + task.ID
144+
outputKey := "task/" + agentPrefix + "/" + task.ID + "/output"
145+
errorKey := "task/" + agentPrefix + "/" + task.ID + "/error"
146+
exitCodeKey := "task/" + agentPrefix + "/" + task.ID + "/exit_code"
147+
actionDescriptor := models.Processor{Status: "pending"}
148+
publishStatus(rdb, progressChannel, actionDescriptor) // pending status
149+
actionOutput := ""
150+
actionError := fmt.Sprintf("Agent is busy. Action %s rejected!\n", task.Action)
151+
exitCode := 11
152+
actionDescriptor.Status = "aborted"
153+
log.Printf(SD_ERR+"Agent is busy. Action %s rejected!", task.Action)
154+
rdb.TxPipelined(ctx, func(pipe redis.Pipeliner) error {
155+
// Publish the action response
156+
pipe.Set(ctx, outputKey, actionOutput, taskExpireDuration)
157+
pipe.Set(ctx, errorKey, actionError, taskExpireDuration)
158+
pipe.Set(ctx, exitCodeKey, exitCode, taskExpireDuration)
159+
pipe.Expire(ctx, "task/" + agentPrefix + "/" + task.ID + "/context", taskExpireDuration)
160+
publishStatus(pipe, progressChannel, actionDescriptor) // aborted status
161+
return nil
162+
})
163+
}

core/agent/hevent.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ import (
2626
"os"
2727
"os/exec"
2828
"strings"
29-
"sync"
3029
"time"
3130

3231
"github.com/NethServer/ns8-core/core/agent/models"
@@ -58,13 +57,17 @@ func listenEventsAsync(ctx context.Context, complete chan int) {
5857

5958
pubsub := rdb.PSubscribe(ctx, "*/event/*")
6059

61-
var wg sync.WaitGroup
60+
wg := NewWorkersLimiter(maxConcurrency, overloadSleep)
6261
csyn := make(chan int, 1)
6362

6463
go func() {
6564
for msg := range pubsub.Channel(redis.WithChannelHealthCheckInterval(pollingDuration)) {
6665
if before, after, found := strings.Cut(msg.Channel, "/event/"); found {
67-
go runEvent(&wg, &models.Event{Source: before, Payload: msg.Payload, Name: after})
66+
if wg.ObserveOverload() {
67+
log.Printf(SD_ERR + "Agent is busy. Event %s rejected!", msg.Channel)
68+
} else {
69+
go runEvent(wg, &models.Event{Source: before, Payload: msg.Payload, Name: after})
70+
}
6871
}
6972
}
7073
csyn <- 1
@@ -80,7 +83,7 @@ func listenEventsAsync(ctx context.Context, complete chan int) {
8083
complete <- 1
8184
}
8285

83-
func runEvent(wg *sync.WaitGroup, event *models.Event) {
86+
func runEvent(wg *workersLimiter, event *models.Event) {
8487
wg.Add(1)
8588
defer wg.Done()
8689

core/agent/htask.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,6 @@ func runAction(rdb *redis.Client, actionCtx context.Context, task *models.Task)
289289

290290
func listenActionsAsync(actionsCtx context.Context, complete chan int) {
291291
defer func() { complete <- 1 }()
292-
var workersRegistry sync.WaitGroup
293292
brpopCtx, cancelBrpop := context.WithCancel(ctx)
294293
taskCancelFunctions := make(map[string]context.CancelFunc)
295294

@@ -318,6 +317,7 @@ func listenActionsAsync(actionsCtx context.Context, complete chan int) {
318317
MaxRetryBackoff: 5000 * time.Millisecond,
319318
})
320319

320+
workersRegistry := NewWorkersLimiter(maxConcurrency, overloadSleep)
321321
var tcMu sync.Mutex
322322
taskCh := make(chan models.Task)
323323
go readTasks(rdb, brpopCtx, taskCh)
@@ -338,6 +338,12 @@ func listenActionsAsync(actionsCtx context.Context, complete chan int) {
338338
workersRegistry.Wait()
339339
break MAINLOOP
340340
}
341+
342+
if workersRegistry.ObserveOverload() {
343+
rejectAction(rdb, ctx, &task)
344+
continue
345+
}
346+
341347
// Create a cancelable context for the task and
342348
// store its cancel function in a safe map
343349
taskCtx, taskCancelFunction := context.WithCancel(ctx)

core/agent/worklimit.go

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
/*
2+
* Copyright (C) 2025 Nethesis S.r.l.
3+
* http://www.nethesis.it - [email protected]
4+
*
5+
* This script is part of NethServer.
6+
*
7+
* NethServer is free software: you can redistribute it and/or modify
8+
* it under the terms of the GNU General Public License as published by
9+
* the Free Software Foundation, either version 3 of the License,
10+
* or any later version.
11+
*
12+
* NethServer is distributed in the hope that it will be useful,
13+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
* GNU General Public License for more details.
16+
*
17+
* You should have received a copy of the GNU General Public License
18+
* along with NethServer. If not, see COPYING.
19+
*/
20+
21+
package main
22+
23+
import (
24+
"sync"
25+
"sync/atomic"
26+
"time"
27+
)
28+
29+
// workersLimiter couples a sync.WaitGroup with an atomic counter of the
// currently running workers, so callers can both wait for completion
// (Wait) and check the running count against a configured soft limit
// (ObserveOverload).
type workersLimiter struct {
	wg      sync.WaitGroup
	counter atomic.Int64  // number of workers currently registered via Add/Done
	limit   int64         // soft cap on concurrently running workers
	delay   time.Duration // grace period before declaring overload
}

// NewWorkersLimiter returns a limiter that reports overload when the
// number of running workers has reached limit and does not drop below
// it within delay.
func NewWorkersLimiter(limit int, delay time.Duration) *workersLimiter {
	// counter starts at its zero value; the typed atomic.Int64 needs no
	// explicit initialization.
	return &workersLimiter{
		limit: int64(limit),
		delay: delay,
	}
}

// Add registers delta additional running workers.
func (w *workersLimiter) Add(delta int) {
	// call wg.Add first so it panics like the standard WaitGroup if delta causes it
	w.wg.Add(delta)
	w.counter.Add(int64(delta))
}

// Done marks one worker as finished.
func (w *workersLimiter) Done() {
	// call wg.Done first to preserve panic semantics
	w.wg.Done()
	w.counter.Add(-1)
}

// Wait blocks until every worker registered with Add has called Done.
func (w *workersLimiter) Wait() {
	w.wg.Wait()
}

// ObserveOverload checks if the current number of running workers exceeds
// the configured limit. If the limit is reached, it waits for a short
// delay before re-checking. Returns true if the system is still
// overloaded after the delay, signaling that new work should be rejected.
func (w *workersLimiter) ObserveOverload() bool {
	if w.counter.Load() < w.limit {
		return false
	}
	// At or over the limit: give running workers a chance to finish
	// before rejecting new work.
	time.Sleep(w.delay)
	return w.counter.Load() >= w.limit
}

0 commit comments

Comments
 (0)