diff --git a/changelogs/fragments/461.yml b/changelogs/fragments/461.yml new file mode 100644 index 0000000..de61305 --- /dev/null +++ b/changelogs/fragments/461.yml @@ -0,0 +1,4 @@ +--- +minor_changes: + - prometheus - Remove unnecessary absence alerts. The general ExporterDown alert covers these scenarios. + - prometheus - Move the ExporterDown alert to its own common alerts file and enable it by default (no .example extension on the file name). diff --git a/prometheus/common/alert-rules.d/crunchy-alert-rules-common.yml b/prometheus/common/alert-rules.d/crunchy-alert-rules-common.yml new file mode 100644 index 0000000..c7dd016 --- /dev/null +++ b/prometheus/common/alert-rules.d/crunchy-alert-rules-common.yml @@ -0,0 +1,21 @@ +### +# +# Copyright © 2017-2025 Crunchy Data Solutions, Inc. All Rights Reserved. +# +### + +groups: +- name: alert-rules + rules: + +########## COMMON RULES ########## + - alert: ExporterDown + expr: avg_over_time(up[5m]) < 0.5 + for: 10s + labels: + service: system + severity: critical + severity_num: 300 + annotations: + description: 'Metrics exporter service for {{ $labels.job }} running on {{ $labels.instance }} has been down at least 50% of the time for the last 5 minutes. Service may be flapping or down.' + summary: 'Prometheus Exporter Service Down' diff --git a/prometheus/common/alert-rules.d/crunchy-alert-rules-etcd.yml.example b/prometheus/common/alert-rules.d/crunchy-alert-rules-etcd.yml.example index 0d25336..98922fb 100644 --- a/prometheus/common/alert-rules.d/crunchy-alert-rules-etcd.yml.example +++ b/prometheus/common/alert-rules.d/crunchy-alert-rules-etcd.yml.example @@ -56,36 +56,3 @@ groups: # severity_num: 300 # annotations: # description: 'The expected minimum count of etcd nodes was not found. 
Current count {{ $value }}' - -# Absence alerts must be configured per named job, otherwise there's no way to know which job is down -# Below is are some examples using the leader metric for a targets called "etcd#" for a 3 node etcd cluster - -# - alert: ETCDAbsent_etcd1 -# expr: absent(etcd_server_has_leader{job="ip11_etcd1"}) -# for: 10s -# labels: -# service: etcd -# severity: critical -# severity_num: 300 -# annotations: -# description: 'Leader metric is absent from target {{ $labels.job }}. Check that etcd is running on target host.' - -# - alert: ETCDAbsent_etcd2 -# expr: absent(etcd_server_has_leader{job="ip21_etcd2"}) -# for: 10s -# labels: -# service: etcd -# severity: critical -# severity_num: 300 -# annotations: -# description: 'Leader metric is absent from target {{ $labels.job }}. Check that etcd is running on target host.' - -# - alert: ETCDAbsent_etcd3 -# expr: absent(etcd_server_has_leader{job="ip31_etcd3"}) -# for: 10s -# labels: -# service: etcd -# severity: critical -# severity_num: 300 -# annotations: -# description: 'Leader metric is absent from target {{ $labels.job }}. Check that etcd is running on target host.' 
diff --git a/prometheus/common/alert-rules.d/crunchy-alert-rules-pg.yml.example b/prometheus/common/alert-rules.d/crunchy-alert-rules-pg.yml.example index 49d6540..5602342 100644 --- a/prometheus/common/alert-rules.d/crunchy-alert-rules-pg.yml.example +++ b/prometheus/common/alert-rules.d/crunchy-alert-rules-pg.yml.example @@ -164,18 +164,6 @@ groups: # summary: '{{ $labels.job }} has changed from replica to primary' -## Absence alerts must be configured per named job, otherwise there's no way to know which job is down -## Below is an example for a target job called "Prod" -# - alert: PGConnectionAbsent_Prod -# expr: absent(ccp_connection_stats_max_connections{job="Prod"}) -# for: 10s -# labels: -# service: postgresql -# severity: critical -# severity_num: 300 -# annotations: -# description: 'Connection metric is absent from target (Prod). Check that postgres_exporter can connect to PostgreSQL.' - ## Optional monitor for changes to pg_settings (postgresql.conf) system catalog. ## A similar metric is available for monitoring pg_hba.conf. See ccp_hba_settings_checksum.