summary refs log tree commit diff
path: root/roles/monitoring
diff options
context:
space:
mode:
author Christian Pointner <equinox@spreadspace.org> 2022-02-22 19:17:53 +0100
committer Christian Pointner <equinox@spreadspace.org> 2022-02-22 19:17:53 +0100
commit 21a3ac9e32e84800e0e33b01baa7c8696233e83b (patch)
tree 2709abc3c2ab454347eecd92a4369bd04a28249d /roles/monitoring
parent sk-cloudio: downgrade jitsi meet (diff)
prometheus: add alert for corrupted memory
Diffstat (limited to 'roles/monitoring')
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_node.yml 8
1 files changed, 8 insertions, 0 deletions
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
index 8a02e67b..525355d5 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
@@ -20,6 +20,14 @@ prometheus_server_rules_node:
summary: Host memory under memory pressure (instance {{ '{{' }} $labels.instance {{ '}}' }})
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+ - alert: HostMemoryHardwareCorrupted
+ expr: node_memory_HardwareCorrupted_bytes > 0
+ labels:
+ severity: warning
+ annotations:
+ summary: Host memory is corrupted (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "The node reports {{ '{{' }} $value {{ '}}' }} bytes of corrupted memory.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m