summaryrefslogtreecommitdiff
path: root/roles/monitoring/prometheus/server/defaults
diff options
context:
space:
mode:
authorChristian Pointner <equinox@spreadspace.org>2021-09-26 04:29:02 +0200
committerChristian Pointner <equinox@spreadspace.org>2021-09-26 04:29:02 +0200
commit223297c29d78e4fe95e2ed2338455708e4e0a9c3 (patch)
tree02534f529becf59835dd1ed5dee0f2321966a09b /roles/monitoring/prometheus/server/defaults
parentswitch to pascal for pressure (diff)
parentfix alert wording (diff)
Merge branch 'topic/prometheus-refactoring'
Diffstat (limited to 'roles/monitoring/prometheus/server/defaults')
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/main.yml11
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_blackbox-https.yml3
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ping.yml3
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ssh.yml3
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml3
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml74
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml4
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml4
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_node.yml38
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml4
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml1
11 files changed, 133 insertions, 15 deletions
diff --git a/roles/monitoring/prometheus/server/defaults/main/main.yml b/roles/monitoring/prometheus/server/defaults/main/main.yml
index 95b9da6d..1e0ccf78 100644
--- a/roles/monitoring/prometheus/server/defaults/main/main.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/main.yml
@@ -5,9 +5,8 @@
prometheus_server_retention: "15d"
-prometheus_server_jobs_generic:
+prometheus_server_jobs:
- node
-prometheus_server_jobs_special: []
#prometheus_server_jobs_extra: |
# - job_name: ...
@@ -16,9 +15,11 @@ prometheus_server_rules:
node: "{{ prometheus_server_rules_node + prometheus_server_rules_node_extra }}"
openwrt: "{{ prometheus_server_rules_openwrt + prometheus_server_rules_openwrt_extra }}"
nut: "{{ prometheus_server_rules_nut + prometheus_server_rules_nut_extra }}"
- "blackbox-ping": "{{ prometheus_server_rules_blackbox_ping + prometheus_server_rules_blackbox_ping_extra }}"
- "blackbox-https": "{{ prometheus_server_rules_blackbox_https + prometheus_server_rules_blackbox_https_extra }}"
- "blackbox-ssh": "{{ prometheus_server_rules_blackbox_ssh + prometheus_server_rules_blackbox_ssh_extra }}"
+ nut/ups: "{{ prometheus_server_rules_nut__ups + prometheus_server_rules_nut__ups_extra }}"
+ blackbox: "{{ prometheus_server_rules_blackbox + prometheus_server_rules_blackbox_extra }}"
+ blackbox/probe: "{{ prometheus_server_rules_blackbox__probe + prometheus_server_rules_blackbox__probe_extra }}"
+ ipmi: "{{ prometheus_server_rules_ipmi + prometheus_server_rules_ipmi_extra }}"
+ ipmi/remote: "{{ prometheus_server_rules_ipmi__remote + prometheus_server_rules_ipmi__remote_extra }}"
# prometheus_server_alertmanager:
# url: "127.0.0.1:9093"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-https.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-https.yml
deleted file mode 100644
index bb806075..00000000
--- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-https.yml
+++ /dev/null
@@ -1,3 +0,0 @@
----
-prometheus_server_rules_blackbox_https_extra: []
-prometheus_server_rules_blackbox_https: []
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ping.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ping.yml
deleted file mode 100644
index 56c122f5..00000000
--- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ping.yml
+++ /dev/null
@@ -1,3 +0,0 @@
----
-prometheus_server_rules_blackbox_ping_extra: []
-prometheus_server_rules_blackbox_ping: []
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ssh.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ssh.yml
deleted file mode 100644
index 727d2292..00000000
--- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ssh.yml
+++ /dev/null
@@ -1,3 +0,0 @@
----
-prometheus_server_rules_blackbox_ssh_extra: []
-prometheus_server_rules_blackbox_ssh: []
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml
new file mode 100644
index 00000000..99f2e83c
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml
@@ -0,0 +1,3 @@
+---
+prometheus_server_rules_blackbox_extra: []
+prometheus_server_rules_blackbox: []
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml
new file mode 100644
index 00000000..9f9d2292
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml
@@ -0,0 +1,74 @@
+---
+prometheus_server_rules_blackbox__probe_extra: []
+prometheus_server_rules_blackbox__probe:
+ - alert: BlackboxProbeFailed
+ expr: probe_success == 0
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: Blackbox probe failed (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Probe failed\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxSlowProbe
+ expr: avg_over_time(probe_duration_seconds[1m]) > 1
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: Blackbox slow probe (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Blackbox probe took more than 1s to complete\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxSslCertificateWillExpireSoon
+ expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "SSL certificate expires in 30 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxSslCertificateWillExpireSoon
+ expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "SSL certificate expires in 3 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxSslCertificateExpired
+ expr: probe_ssl_earliest_cert_expiry - time() <= 0
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: Blackbox SSL certificate expired (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "SSL certificate has expired already\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxProbeHttpFailure
+ expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: Blackbox probe HTTP failure (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "HTTP status code is not 200-399\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxProbeSlowHttp
+ expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: Blackbox probe slow HTTP (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "HTTP request took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxProbeSlowPing
+ expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: Blackbox probe slow ping (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Blackbox ping took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml
new file mode 100644
index 00000000..41dcd7e9
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml
@@ -0,0 +1,4 @@
+---
+prometheus_server_rules_ipmi_extra: []
+prometheus_server_rules_ipmi: []
+## TODO: add common IPMI alert rules
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml
new file mode 100644
index 00000000..1f9338ea
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml
@@ -0,0 +1,4 @@
+---
+prometheus_server_rules_ipmi__remote_extra: []
+prometheus_server_rules_ipmi__remote: []
+## TODO: add remote-IPMI specific alert rules
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
index ab7317ac..55641534 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
@@ -92,6 +92,15 @@ prometheus_server_rules_node:
summary: Host CPU steal noisy neighbor (instance {{ '{{' }} $labels.instance {{ '}}' }})
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+ - alert: HostSystemdNotRunning
+ expr: node_systemd_system_running == 0
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host systemd is not in running state (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "systemd is not in running state.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
- alert: HostSystemdServiceCrashed
expr: node_systemd_unit_state{state="failed"} == 1
for: 0m
@@ -99,7 +108,7 @@ prometheus_server_rules_node:
severity: warning
annotations:
summary: Host systemd service crashed (instance {{ '{{' }} $labels.instance {{ '}}' }})
- description: "systemd service crashed\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+ description: "The systemd service unit {{ '{{' }} $labels.name {{ '}}' }} is in failed state.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
- alert: HostPhysicalComponentTooHot
expr: node_hwmon_temp_celsius > 75
@@ -217,3 +226,30 @@ prometheus_server_rules_node:
annotations:
summary: Host clock not synchronising (instance {{ '{{' }} $labels.instance {{ '}}' }})
description: "Clock not synchronising.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: AptUpgradesPending
+ expr: sum by (instance) (apt_upgrades_pending) > 0
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host has upgradeable packages (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Host {{ '{{' }} $labels.instance {{ '}}' }} has {{ '{{' }} $value {{ '}}' }} upgradable packages.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: AptAutoremovePending
+ expr: sum by (instance) (apt_autoremove_pending) > 0
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host has packages that can be autoremoved (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Host {{ '{{' }} $labels.instance {{ '}}' }} has {{ '{{' }} $value {{ '}}' }} packages that can be autoremoved.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: HostNeedsRebooting
+ expr: node_reboot_required > 0
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host must be rebooted (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Host {{ '{{' }} $labels.instance {{ '}}' }} must be rebooted for security updates to take effect.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml b/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml
new file mode 100644
index 00000000..150a507e
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml
@@ -0,0 +1,4 @@
+---
+prometheus_server_rules_nut__ups_extra: []
+prometheus_server_rules_nut__ups: []
+## TODO: add NUT/UPS alert rules
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml b/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml
index 88d84f31..04b178f1 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml
@@ -1,3 +1,4 @@
---
prometheus_server_rules_openwrt_extra: []
prometheus_server_rules_openwrt: []
+## TODO: add openwrt specific alert rules