summaryrefslogtreecommitdiff
path: root/roles/monitoring/prometheus/server/defaults/main/rules_coredns.yml
blob: 126a7ba42171f246474592fd4142763512311cce (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
---
# Extra CoreDNS rules; an empty list by default.
# NOTE(review): presumably combined with prometheus_server_rules_coredns by
# whatever consumes these defaults - confirm against the role's tasks/templates.
prometheus_server_rules_coredns_extra: []
# Prometheus alerting rules for CoreDNS.
#
# Each rule compares the growth of a counter over a 15m window against zero
# and sets `for: 0m`, so no pending period is configured.
#
# The `{{ '{{' }}` / `{{ '}}' }}` sequences are Jinja expressions that emit
# literal `{{` and `}}`. The rendered annotations therefore still contain the
# alert-template placeholders ($labels.instance, $value, $labels) instead of
# having them evaluated when this variable is templated.
prometheus_server_rules_coredns:
  # severity=critical: coredns_panics_total increased within the last 15m.
  - alert: CorednsPanicCount
    expr: increase(coredns_panics_total[15m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "CoreDNS Panic (instance {{ '{{' }} $labels.instance {{ '}}' }})"
      # Literal block scalar: the two-space indent before VALUE/LABELS is part
      # of the value, and `|-` strips the trailing newline.
      description: |-
        Number of CoreDNS panics encountered has been increasing in the last 15 minutes
          VALUE = {{ '{{' }} $value {{ '}}' }}
          LABELS = {{ '{{' }} $labels {{ '}}' }}

  # severity=critical: coredns_reload_failed_total increased within the last 15m.
  - alert: CorednsFailedReloadCount
    expr: increase(coredns_reload_failed_total[15m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "CoreDNS reload failed (instance {{ '{{' }} $labels.instance {{ '}}' }})"
      description: |-
        Number of CoreDNS failed reloads has been increasing in the last 15 minutes
          VALUE = {{ '{{' }} $value {{ '}}' }}
          LABELS = {{ '{{' }} $labels {{ '}}' }}

  # severity=warning: coredns_forward_healthcheck_broken_total increased
  # within the last 15m.
  - alert: CorednsBrokenForwardHealthchecks
    expr: increase(coredns_forward_healthcheck_broken_total[15m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "CoreDNS broken forward healthchecks (instance {{ '{{' }} $labels.instance {{ '}}' }})"
      description: |-
        Number of CoreDNS broken forward healthchecks has been increasing in the last 15 minutes
          VALUE = {{ '{{' }} $value {{ '}}' }}
          LABELS = {{ '{{' }} $labels {{ '}}' }}