summaryrefslogtreecommitdiff
path: root/roles/monitoring/prometheus/server
diff options
context:
space:
mode:
authorChristian Pointner <equinox@spreadspace.org>2024-07-29 20:32:54 +0200
committerChristian Pointner <equinox@spreadspace.org>2024-07-29 20:32:54 +0200
commit7da7c10f1d812129647b92849c2ab4d8c20b0a5f (patch)
tree0f33d7c21ea90e46bbaaa747a2f1bdfdab81967f /roles/monitoring/prometheus/server
parentadd qmk package for new keyboard (diff)
prometheus: add coredns
Diffstat (limited to 'roles/monitoring/prometheus/server')
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/main.yml1
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_coredns.yml29
2 files changed, 30 insertions, 0 deletions
diff --git a/roles/monitoring/prometheus/server/defaults/main/main.yml b/roles/monitoring/prometheus/server/defaults/main/main.yml
index 9f867568..e94aaaf5 100644
--- a/roles/monitoring/prometheus/server/defaults/main/main.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/main.yml
@@ -41,6 +41,7 @@ prometheus_server_rules:
nftables: "{{ prometheus_server_rules_nftables + prometheus_server_rules_nftables_extra }}"
whawty-nginx-sso: "{{ prometheus_server_rules_whawty_nginx_sso + prometheus_server_rules_whawty_nginx_sso_extra }}"
mosquitto: "{{ prometheus_server_rules_mosquitto + prometheus_server_rules_mosquitto_extra }}"
+ coredns: "{{ prometheus_server_rules_coredns + prometheus_server_rules_coredns_extra }}"
# prometheus_server_alertmanager:
# url: "127.0.0.1:9093"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_coredns.yml b/roles/monitoring/prometheus/server/defaults/main/rules_coredns.yml
new file mode 100644
index 00000000..126a7ba4
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_coredns.yml
@@ -0,0 +1,29 @@
+---
+prometheus_server_rules_coredns_extra: []  # extra rules, empty by default; main.yml concatenates this list onto the built-in list below
+prometheus_server_rules_coredns:  # built-in CoreDNS alerting rules; each entry checks one counter over a 15 minute window
+ - alert: CorednsPanicCount
+ expr: increase(coredns_panics_total[15m]) > 0  # true when the panic counter grew during the last 15 minutes
+ for: 0m  # zero hold time: no waiting period is configured before the alert fires
+ labels:
+ severity: critical
+ annotations:  # the quoted-brace Jinja expressions below render literal double braces, presumably so the $labels/$value placeholders reach Prometheus unexpanded
+ summary: CoreDNS Panic (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Number of CoreDNS panics encountered has been increasing in the last 15 minutes\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: CorednsFailedReloadCount
+ expr: increase(coredns_reload_failed_total[15m]) > 0  # true when the failed-reload counter grew during the last 15 minutes
+ for: 0m  # zero hold time, same as the panic rule
+ labels:
+ severity: critical
+ annotations:  # same brace-escaping scheme as the first rule
+ summary: CoreDNS reload failed (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Number of CoreDNS failed reloads has been increasing in the last 15 minutes\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: CorednsBrokenForwardHealthchecks
+ expr: increase(coredns_forward_healthcheck_broken_total[15m]) > 0  # true when the broken-healthcheck counter grew during the last 15 minutes
+ for: 0m  # zero hold time, same as the rules above
+ labels:
+ severity: warning  # the only rule in this list rated warning rather than critical
+ annotations:  # same brace-escaping scheme as the first rule
+ summary: CoreDNS broken forward healthchecks (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Number of CoreDNS broken forward healthchecks has been increasing in the last 15 minutes\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"