summaryrefslogtreecommitdiff
path: root/roles/monitoring/prometheus
diff options
context:
space:
mode:
authorChristian Pointner <equinox@spreadspace.org>2024-05-29 15:08:49 +0200
committerChristian Pointner <equinox@spreadspace.org>2024-05-29 15:08:49 +0200
commit9156c07b0246b44556019efe3546c79e01a303b9 (patch)
tree0d54a55099a230292cd240badbfa4fbee9601893 /roles/monitoring/prometheus
parentch-cm4-sensor: make install in iot possible (diff)
add prometheus alert for host reboots
Diffstat (limited to 'roles/monitoring/prometheus')
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_node.yml8
1 files changed, 8 insertions, 0 deletions
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
index fe6e781b..262f1af7 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
@@ -2,6 +2,14 @@
## https://awesome-prometheus-alerts.grep.to/rules#host-and-hardware
prometheus_server_rules_node_extra: []
prometheus_server_rules_node:
+ - alert: HostRebooted
+ expr: ((node_time_seconds - node_boot_time_seconds) / 60) < 15  # seconds since boot / 60 = minutes; matches while below 15
+ labels:
+ severity: warning
+ annotations:
+ summary: Host has rebooted less than 15 minutes ago (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "The node has been rebooted {{ '{{' }} $value {{ '}}' }} minutes ago. If this was not done intentionally please check if everything is alright.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
- alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 2m