Skip to content

Commit d14cf2a

Browse files
committed
Add extra telemetry to monitor failures
1 parent d715a75 commit d14cf2a

File tree

3 files changed

+7
-0
lines changed

3 files changed

+7
-0
lines changed

policy/handler.go

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@ import (
1212
"sync"
1313
"time"
1414

15+
metrics "github.com/armon/go-metrics"
1516
"github.com/google/go-cmp/cmp"
1617
hclog "github.com/hashicorp/go-hclog"
1718
"github.com/hashicorp/go-multierror"
@@ -228,6 +229,7 @@ func (h *Handler) handleTick(ctx context.Context, policy *sdk.ScalingPolicy) (*s
228229

229230
status, err := target.Status(policy.Target.Config)
230231
if err != nil {
232+
metrics.IncrCounter([]string{"target", "status", "failure_count"}, 1)
231233
h.log.Warn("failed to get target status", "error", err)
232234
return nil, err
233235
}

policy/manager.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -116,6 +116,7 @@ func (m *Manager) monitorPolicies(ctx context.Context, evalCh chan<- *sdk.Scalin
116116

117117
case err := <-m.policyIDsErrCh:
118118
m.log.Error("encountered an error monitoring policy IDs", "error", err)
119+
metrics.IncrCounter([]string{"policy", "manager", "failure_count"}, 1)
119120
if isUnrecoverableError(err) {
120121
return err
121122
}

policyeval/base_worker.go

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -114,6 +114,7 @@ func (w *BaseWorker) handlePolicy(ctx context.Context, eval *sdk.ScalingEvaluati
114114

115115
currentStatus, err := runTargetStatus(target, eval.Policy)
116116
if err != nil {
117+
metrics.IncrCounter([]string{"target", "status", "failure_count"}, 1)
117118
return fmt.Errorf("failed to get target status: %v", err)
118119
}
119120

@@ -181,6 +182,8 @@ func (w *BaseWorker) handlePolicy(ctx context.Context, eval *sdk.ScalingEvaluati
181182
"on_check_error", eval.Policy.OnCheckError,
182183
"error", err)
183184

185+
metrics.IncrCounterWithLabels([]string{"target", "status", "failure_count"}, 1, []metrics.Label{{Name: "check", Value: checkEval.Check.Name}})
186+
184187
// Define how to handle error.
185188
// Use check behaviour if set or fail iff the policy is set to fail.
186189
switch checkEval.Check.OnError {
@@ -287,6 +290,7 @@ func (w *BaseWorker) handlePolicy(ctx context.Context, eval *sdk.ScalingEvaluati
287290

288291
err = w.scaleTarget(logger, target, eval.Policy, *winner.action, currentStatus)
289292
if err != nil {
293+
metrics.IncrCounter([]string{"target", "scale", "failure_count"}, 1)
290294
return err
291295
}
292296

0 commit comments

Comments
 (0)