From 9156c07b0246b44556019efe3546c79e01a303b9 Mon Sep 17 00:00:00 2001
From: Christian Pointner
Date: Wed, 29 May 2024 15:08:49 +0200
Subject: add prometheus alert for host reboots

---
 roles/monitoring/prometheus/server/defaults/main/rules_node.yml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
index fe6e781b..262f1af7 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
@@ -2,6 +2,15 @@
 ## https://awesome-prometheus-alerts.grep.to/rules#host-and-hardware
 prometheus_server_rules_node_extra: []
 prometheus_server_rules_node:
+  ## uptime in minutes (node_time_seconds - node_boot_time_seconds) below 15; no 'for:' delay is set on this rule
+  - alert: HostRebooted
+    expr: ((node_time_seconds - node_boot_time_seconds) / 60) < 15
+    labels:
+      severity: warning
+    annotations:
+      summary: Host has rebooted less than 15 minutes ago (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "The node has been rebooted {{ '{{' }} $value {{ '}}' }} minutes ago. If this was not done intentionally please check if everything is alright.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
   - alert: HostOutOfMemory
     expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
     for: 2m
-- 
cgit v1.2.3