Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions changelogs/fragments/461.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
minor_changes:
- prometheus - Remove unnecessary absence alerts. The general ExporterDown alert covers these scenarios.
- prometheus - Move the ExporterDown alert to its own common alerts file and enable it by default (no .example extension on the file name)
21 changes: 21 additions & 0 deletions prometheus/common/alert-rules.d/crunchy-alert-rules-common.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
###
#
# Copyright © 2017-2025 Crunchy Data Solutions, Inc. All Rights Reserved.
#
###

# Common Prometheus alerting rules that apply to every scraped exporter.
groups:
  - name: alert-rules
    rules:

      ########## COMMON RULES ##########

      # ExporterDown: the condition is true when the 5-minute average of the
      # `up` series for a target falls below 0.5, i.e. the target was seen as
      # up for less than half of that window.
      - alert: ExporterDown
        expr: avg_over_time(up[5m]) < 0.5
        # The condition above must hold continuously for this long before the
        # alert moves from pending to firing.
        for: 10s
        labels:
          service: system
          severity: critical
          # NOTE(review): presumably a numeric counterpart of `severity` for
          # downstream consumers - confirm against the alert routing config.
          severity_num: 300
        annotations:
          # {{ $labels.job }} and {{ $labels.instance }} are filled in from the
          # labels of the series that triggered the alert.
          description: 'Metrics exporter service for {{ $labels.job }} running on {{ $labels.instance }} has been down at least 50% of the time for the last 5 minutes. Service may be flapping or down.'
          summary: 'Prometheus Exporter Service Down'
Original file line number Diff line number Diff line change
Expand Up @@ -56,36 +56,3 @@ groups:
# severity_num: 300
# annotations:
# description: 'The expected minimum count of etcd nodes was not found. Current count {{ $value }}'

# Absence alerts must be configured per named job, otherwise there's no way to know which job is down
# Below are some examples using the leader metric for targets called "etcd#" in a 3-node etcd cluster

# - alert: ETCDAbsent_etcd1
# expr: absent(etcd_server_has_leader{job="ip11_etcd1"})
# for: 10s
# labels:
# service: etcd
# severity: critical
# severity_num: 300
# annotations:
# description: 'Leader metric is absent from target {{ $labels.job }}. Check that etcd is running on target host.'

# - alert: ETCDAbsent_etcd2
# expr: absent(etcd_server_has_leader{job="ip21_etcd2"})
# for: 10s
# labels:
# service: etcd
# severity: critical
# severity_num: 300
# annotations:
# description: 'Leader metric is absent from target {{ $labels.job }}. Check that etcd is running on target host.'

# - alert: ETCDAbsent_etcd3
# expr: absent(etcd_server_has_leader{job="ip31_etcd3"})
# for: 10s
# labels:
# service: etcd
# severity: critical
# severity_num: 300
# annotations:
# description: 'Leader metric is absent from target {{ $labels.job }}. Check that etcd is running on target host.'
Original file line number Diff line number Diff line change
Expand Up @@ -164,18 +164,6 @@ groups:
# summary: '{{ $labels.job }} has changed from replica to primary'


## Absence alerts must be configured per named job, otherwise there's no way to know which job is down
## Below is an example for a target job called "Prod"
# - alert: PGConnectionAbsent_Prod
# expr: absent(ccp_connection_stats_max_connections{job="Prod"})
# for: 10s
# labels:
# service: postgresql
# severity: critical
# severity_num: 300
# annotations:
# description: 'Connection metric is absent from target (Prod). Check that postgres_exporter can connect to PostgreSQL.'


## Optional monitor for changes to pg_settings (postgresql.conf) system catalog.
## A similar metric is available for monitoring pg_hba.conf. See ccp_hba_settings_checksum.
Expand Down