summary | refs | log | tree | commit | diff
path: root/roles/monitoring/prometheus/server/defaults
diff options
context:
space:
mode:
author Christian Pointner <equinox@spreadspace.org> 2022-03-22 00:20:38 +0100
committer Christian Pointner <equinox@spreadspace.org> 2022-03-22 00:20:38 +0100
commit 31e3ea00dc1c8f22af13a3205835a678d269ca77 (patch)
tree 33fa11a425a3fce9aa5a98c92cb6de0b3cb3baa2 /roles/monitoring/prometheus/server/defaults
parent ch-mod: tempary disable montirong of currently not running nodes (diff)
prometheus: minor fixes and improvements
Diffstat (limited to 'roles/monitoring/prometheus/server/defaults')
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_node.yml 13
1 files changed, 11 insertions, 2 deletions
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
index 7de36758..2278c70a 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
@@ -2,6 +2,15 @@
## https://awesome-prometheus-alerts.grep.to/rules#host-and-hardware
prometheus_server_rules_node_extra: []
prometheus_server_rules_node:
+ - alert: NodeTextfileScrapeFailed
+ expr: node_textfile_scrape_error != 0
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Scraping metrics from textfiles failed (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "The textfile collector failed to scrape at least one file\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
- alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 2m
@@ -47,7 +56,7 @@ prometheus_server_rules_node:
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
- alert: HostOutOfInodes
- expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
+ expr: node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
for: 2m
labels:
severity: warning
@@ -56,7 +65,7 @@ prometheus_server_rules_node:
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
- alert: HostInodesWillFillIn24Hours
- expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
+ expr: node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
for: 2m
labels:
severity: warning