summary | refs | log | tree | commit | diff
path: root/roles/monitoring
diff options
context:
space:
mode:
author Christian Pointner <equinox@spreadspace.org> 2021-09-23 20:37:20 +0200
committer Christian Pointner <equinox@spreadspace.org> 2021-09-23 20:37:20 +0200
commit 486c84d53244e44ff72a3c2db42ee12afdb083e8 (patch)
tree d2d210290c92a68adccfceb53475c3fe9854427a /roles/monitoring
parent force blackbox exporter to ipv4 by default (diff)
add some more prometheus rules for blackbox exporter
Diffstat (limited to 'roles/monitoring')
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/main.yml 2
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml 47
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml 19
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml 10
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_nut.yml 3
-rw-r--r-- roles/monitoring/prometheus/server/tasks/main.yml 2
6 files changed, 80 insertions, 3 deletions
diff --git a/roles/monitoring/prometheus/server/defaults/main/main.yml b/roles/monitoring/prometheus/server/defaults/main/main.yml
index bae0cdba..09cd150c 100644
--- a/roles/monitoring/prometheus/server/defaults/main/main.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/main.yml
@@ -14,7 +14,9 @@ prometheus_server_rules:
prometheus: "{{ prometheus_server_rules_prometheus + ((prometheus_server_alertmanager is defined) | ternary(prometheus_server_rules_prometheus_alertmanager, [])) + prometheus_server_rules_prometheus_extra }}"
node: "{{ prometheus_server_rules_node + prometheus_server_rules_node_extra }}"
openwrt: "{{ prometheus_server_rules_openwrt + prometheus_server_rules_node_extra }}"
+ nut: "{{ prometheus_server_rules_nut + prometheus_server_rules_nut_extra }}"
nut/ups: "{{ prometheus_server_rules_nut__ups + prometheus_server_rules_nut__ups_extra }}"
+ blackbox: "{{ prometheus_server_rules_blackbox + prometheus_server_rules_blackbox_extra }}"
blackbox/ping: "{{ prometheus_server_rules_blackbox__ping + prometheus_server_rules_blackbox__ping_extra }}"
blackbox/https: "{{ prometheus_server_rules_blackbox__https + prometheus_server_rules_blackbox__https_extra }}"
blackbox/ssh: "{{ prometheus_server_rules_blackbox__ssh + prometheus_server_rules_blackbox__ssh_extra }}"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml
new file mode 100644
index 00000000..d5c1fd42
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml
@@ -0,0 +1,47 @@
+---
+prometheus_server_rules_blackbox_extra: []
+prometheus_server_rules_blackbox:
+ - alert: BlackboxProbeFailed
+ expr: probe_success == 0
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: Blackbox probe failed (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Probe failed\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxSlowProbe
+ expr: avg_over_time(probe_duration_seconds[1m]) > 1
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: Blackbox slow probe (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Blackbox probe took more than 1s to complete\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxSslCertificateWillExpireSoon
+ expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "SSL certificate expires in 30 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxSslCertificateWillExpireSoon
+ expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "SSL certificate expires in 3 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxSslCertificateExpired
+ expr: probe_ssl_earliest_cert_expiry - time() <= 0
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: Blackbox SSL certificate expired (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "SSL certificate has expired already\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml
index cfdc10bd..140e3b4f 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml
@@ -1,3 +1,20 @@
---
prometheus_server_rules_blackbox__https_extra: []
-prometheus_server_rules_blackbox__https: []
+prometheus_server_rules_blackbox__https:
+ - alert: BlackboxProbeHttpFailure
+ expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: Blackbox probe HTTP failure (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "HTTP status code is not 200-399\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxProbeSlowHttp
+ expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: Blackbox probe slow HTTP (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "HTTP request took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml
index 06ce8607..cc87b6b1 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml
@@ -1,3 +1,11 @@
---
prometheus_server_rules_blackbox__ping_extra: []
-prometheus_server_rules_blackbox__ping: []
+prometheus_server_rules_blackbox__ping:
+ - alert: BlackboxProbeSlowPing
+ expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: Blackbox probe slow ping (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Blackbox ping took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_nut.yml b/roles/monitoring/prometheus/server/defaults/main/rules_nut.yml
new file mode 100644
index 00000000..d8d64f64
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_nut.yml
@@ -0,0 +1,3 @@
+---
+prometheus_server_rules_nut_extra: []
+prometheus_server_rules_nut: []
diff --git a/roles/monitoring/prometheus/server/tasks/main.yml b/roles/monitoring/prometheus/server/tasks/main.yml
index 16167c9c..c0928cc3 100644
--- a/roles/monitoring/prometheus/server/tasks/main.yml
+++ b/roles/monitoring/prometheus/server/tasks/main.yml
@@ -83,7 +83,7 @@
state: directory
- name: generate rules files for all jobs
- loop: "{{ prometheus_server_jobs | union(['prometheus']) }}"
+ loop: "{{ prometheus_server_jobs | union(['prometheus']) | union(prometheus_server_jobs | select('match', '.*/.*') | map('dirname') | unique) }}"
template:
src: rules.yml.j2
dest: "/etc/prometheus/rules/{{ item }}.yml"