From 21a3ac9e32e84800e0e33b01baa7c8696233e83b Mon Sep 17 00:00:00 2001
From: Christian Pointner
Date: Tue, 22 Feb 2022 19:17:53 +0100
Subject: prometheus: add alert for corrupted memory

---
Reviewer note (not part of the commit message): the added rule matches
whenever node_memory_HardwareCorrupted_bytes is greater than zero and carries
severity "warning". Unlike the HostOutOfDiskSpace rule that follows it, the new
rule has no `for:` clause - confirm that this is intended.

 roles/monitoring/prometheus/server/defaults/main/rules_node.yml | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'roles/monitoring')

diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
index 8a02e67b..525355d5 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
@@ -20,6 +20,14 @@ prometheus_server_rules_node:
       summary: Host memory under memory pressure (instance {{ '{{' }} $labels.instance {{ '}}' }})
       description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
 
+  - alert: HostMemoryHardwareCorrupted
+    expr: node_memory_HardwareCorrupted_bytes > 0
+    labels:
+      severity: warning
+    annotations:
+      summary: Host memory is corrupted (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "The node reports {{ '{{' }} $value {{ '}}' }} bytes of corrupted memory.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
   - alert: HostOutOfDiskSpace
     expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
     for: 2m
-- 
cgit v1.2.3