From 7da7c10f1d812129647b92849c2ab4d8c20b0a5f Mon Sep 17 00:00:00 2001
From: Christian Pointner
Date: Mon, 29 Jul 2024 20:32:54 +0200
Subject: promethues: add coredns

---
Summary of this patch: registers a `coredns` entry in
`prometheus_server_rules` (built from `prometheus_server_rules_coredns` plus
`prometheus_server_rules_coredns_extra`) and adds a defaults file defining
three alerts on 15-minute increases of `coredns_panics_total`,
`coredns_reload_failed_total` and `coredns_forward_healthcheck_broken_total`.

NOTE(review): line breaks and indentation below were reconstructed from a
whitespace-collapsed copy of this patch. Indentation depth and the number of
spaces inside the quoted `description` strings and in the commented-out
context lines are assumptions; confirm against the repository before applying.

 .../prometheus/server/defaults/main/main.yml       |  1 +
 .../server/defaults/main/rules_coredns.yml         | 29 ++++++++++++++++++++++
 2 files changed, 30 insertions(+)
 create mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_coredns.yml

(limited to 'roles/monitoring/prometheus/server')

diff --git a/roles/monitoring/prometheus/server/defaults/main/main.yml b/roles/monitoring/prometheus/server/defaults/main/main.yml
index 9f867568..e94aaaf5 100644
--- a/roles/monitoring/prometheus/server/defaults/main/main.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/main.yml
@@ -41,6 +41,7 @@ prometheus_server_rules:
   nftables: "{{ prometheus_server_rules_nftables + prometheus_server_rules_nftables_extra }}"
   whawty-nginx-sso: "{{ prometheus_server_rules_whawty_nginx_sso + prometheus_server_rules_whawty_nginx_sso_extra }}"
   mosquitto: "{{ prometheus_server_rules_mosquitto + prometheus_server_rules_mosquitto_extra }}"
+  coredns: "{{ prometheus_server_rules_coredns + prometheus_server_rules_coredns_extra }}"
 
 # prometheus_server_alertmanager:
 #   url: "127.0.0.1:9093"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_coredns.yml b/roles/monitoring/prometheus/server/defaults/main/rules_coredns.yml
new file mode 100644
index 00000000..126a7ba4
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_coredns.yml
@@ -0,0 +1,29 @@
+---
+prometheus_server_rules_coredns_extra: []
+prometheus_server_rules_coredns:
+  - alert: CorednsPanicCount
+    expr: increase(coredns_panics_total[15m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: CoreDNS Panic (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "Number of CoreDNS panics encountered has been increasing in the last 15 minutes\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: CorednsFailedReloadCount
+    expr: increase(coredns_reload_failed_total[15m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: CoreDNS reload failed (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "Number of CoreDNS failed reloads has been increasing in the last 15 minutes\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: CorednsBrokenForwardHealthchecks
+    expr: increase(coredns_forward_healthcheck_broken_total[15m]) > 0
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: CoreDNS broken forward healthchecks (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "Number of CoreDNS broken forward healthchecks has been increasing in the last 15 minutes\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
-- 
cgit v1.2.3