summary | refs | log | tree | commit | diff
path: root/roles/monitoring/prometheus/server/defaults
diff options
context:
space:
mode:
author Christian Pointner <equinox@spreadspace.org> 2021-09-27 23:40:51 +0200
committer Christian Pointner <equinox@spreadspace.org> 2021-09-27 23:40:51 +0200
commit 49114bef214614a636b7d529e41566fdabb1f2c6 (patch)
tree 6136d6708561b3708d8fe6e51508e54847f9b422 /roles/monitoring/prometheus/server/defaults
parent upgrade openwrt for ups monitors (diff)
finalize NUT monitoring
Diffstat (limited to 'roles/monitoring/prometheus/server/defaults')
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml 1
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml 56
2 files changed, 54 insertions, 3 deletions
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml
index 1f9338ea..b6163981 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml
@@ -1,4 +1,3 @@
---
prometheus_server_rules_ipmi__remote_extra: []
prometheus_server_rules_ipmi__remote: []
-## TODO: add remote-IPMI specific alert rules
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml b/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml
index 150a507e..274133e5 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml
@@ -1,4 +1,56 @@
---
prometheus_server_rules_nut__ups_extra: []
-prometheus_server_rules_nut__ups: []
-## TODO: add NUT/UPS alert rules
+prometheus_server_rules_nut__ups:  # alert rules for UPS metrics (network_ups_tools_*); {{ '{{' }} / {{ '}}' }} presumably keep the braces literal for Prometheus
+  - alert: UPSLoadHigh
+    expr: network_ups_tools_ups_load > 82
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: UPS load is high (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "The load of UPS {{ '{{' }} $labels.instance {{ '}}' }} is > 82 %.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: UPSLoadVeryHigh
+    expr: network_ups_tools_ups_load > 92
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      summary: UPS load is very high (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "The load of UPS {{ '{{' }} $labels.instance {{ '}}' }} is > 92 %.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: UPSStatusOverload
+    expr: network_ups_tools_ups_status{flag="OVER"} == 1
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      summary: UPS is overloaded (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "UPS {{ '{{' }} $labels.instance {{ '}}' }} is overloaded.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: UPSStatusOnBattery
+    expr: network_ups_tools_ups_status{flag="OB"} == 1
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: UPS is running on battery (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "UPS {{ '{{' }} $labels.instance {{ '}}' }} lost wall power and is running on battery.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: UPSStatusLowBattery
+    expr: network_ups_tools_ups_status{flag="LB"} == 1
+    for: 0m  # no pending period: the only rule here that does not wait before firing
+    labels:
+      severity: critical
+    annotations:
+      summary: UPS battery is low (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "UPS {{ '{{' }} $labels.instance {{ '}}' }} reports low battery.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: UPSStatusReplaceBattery
+    expr: network_ups_tools_ups_status{flag="RB"} == 1
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: UPS battery needs to be replaced (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "The battery of UPS {{ '{{' }} $labels.instance {{ '}}' }} needs to be replaced.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"