Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 78 additions & 36 deletions infra/k8s/monitoring/grafana/overlays/production/alerting/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,22 +25,28 @@ alerting:
period: '300'
region: ap-northeast-2
statistic: Average
- refId: B
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
type: reduce
expression: A
reducer: last
- refId: C
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
type: threshold
expression: B
conditions:
- evaluator:
params:
- 2147483648
type: lt
operator:
type: and
reducer:
type: last
type: classic_condition
for: 5m
labels:
severity: warning
Expand All @@ -66,22 +72,28 @@ alerting:
period: '300'
region: ap-northeast-2
statistic: Maximum
- refId: B
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
type: reduce
expression: A
reducer: last
- refId: C
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
type: threshold
expression: B
conditions:
- evaluator:
params:
- 150
type: gt
operator:
type: and
reducer:
type: last
type: classic_condition
for: 5m
labels:
severity: warning
Expand All @@ -107,22 +119,28 @@ alerting:
expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
intervalMs: 60000
maxDataPoints: 43200
- refId: B
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
type: reduce
expression: A
reducer: last
- refId: C
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
type: threshold
expression: B
conditions:
- evaluator:
params:
- 85
type: gt
operator:
type: and
reducer:
type: last
type: classic_condition
for: 5m
labels:
severity: warning
Expand All @@ -143,22 +161,28 @@ alerting:
expr: (1 - node_filesystem_avail_bytes{mountpoint="/",fstype!="tmpfs"} / node_filesystem_size_bytes{mountpoint="/",fstype!="tmpfs"}) * 100
intervalMs: 60000
maxDataPoints: 43200
- refId: B
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
type: reduce
expression: A
reducer: last
- refId: C
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
type: threshold
expression: B
conditions:
- evaluator:
params:
- 85
type: gt
operator:
type: and
reducer:
type: last
type: classic_condition
for: 5m
labels:
severity: warning
Expand All @@ -179,22 +203,28 @@ alerting:
expr: kube_node_status_condition{condition="Ready",status="true"} == 0
intervalMs: 60000
maxDataPoints: 43200
- refId: B
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
type: reduce
expression: A
reducer: last
- refId: C
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
type: threshold
expression: B
conditions:
- evaluator:
params:
- 0
type: gt
operator:
type: and
reducer:
type: last
type: classic_condition
for: 5m
labels:
severity: critical
Expand All @@ -220,22 +250,28 @@ alerting:
expr: rabbitmq_queue_messages_ready
intervalMs: 60000
maxDataPoints: 43200
- refId: B
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
type: reduce
expression: A
reducer: last
- refId: C
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
type: threshold
expression: B
conditions:
- evaluator:
params:
- 100
type: gt
operator:
type: and
reducer:
type: last
type: classic_condition
for: 5m
labels:
severity: warning
Expand All @@ -256,25 +292,31 @@ alerting:
expr: rabbitmq_queue_messages_unacked
intervalMs: 60000
maxDataPoints: 43200
- refId: B
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
type: reduce
expression: A
reducer: last
- refId: C
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
type: threshold
expression: B
conditions:
- evaluator:
params:
- 50
type: gt
operator:
type: and
reducer:
type: last
type: classic_condition
for: 5m
labels:
severity: warning
annotations:
summary: 'RabbitMQ unacked messages > 50'
description: '큐 {{ `{{ $labels.queue }}` }}에 미확인 메시지 {{ `{{ $values.A }}` }}개'
description: '큐 {{ `{{ $labels.queue }}` }}에 미확인 메시지 {{ `{{ $values.A }}` }}개'
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Grafana 10 이상에서 도입된 새로운 Alerting 엔진(Unified Alerting)에서는 `classic_condition` 대신 `reduce`와 `threshold` 표현식을 사용할 때, 알림 메시지(annotations)에서 수치 데이터를 참조하려면 range query인 `A` 대신 reduction 결과인 `B`를 사용하는 것이 권장됩니다. `$values.A`를 사용하면 시계열 데이터 전체가 참조되어 알림 메시지에 값이 정상적으로 표시되지 않을 수 있습니다. 이 변경 사항은 diff에 포함되지 않은 다른 alert rule의 description에도 동일하게 적용하는 것이 좋습니다.

              description: '큐 {{ `{{ $labels.queue }}` }}에 미확인 메시지 {{ `{{ $values.B }}` }}개'

Loading
Loading