From 49114bef214614a636b7d529e41566fdabb1f2c6 Mon Sep 17 00:00:00 2001
From: Christian Pointner
Date: Mon, 27 Sep 2021 23:40:51 +0200
Subject: finalize NUT monitoring

---
Reviewer note (ignored by patch tools): the line breaks and YAML indentation
below were reconstructed from a whitespace-collapsed copy of this patch.
Indentation depth (2 spaces per level) is an assumption - TODO confirm against
the repository. Spacing inside quoted strings is kept exactly as found.
The hunk body matches its header: 2 context, 2 removed, 54 added lines.

 .../server/defaults/main/rules_nut__ups.yml        | 56 +++++++++++++++++++++-
 1 file changed, 54 insertions(+), 2 deletions(-)

(limited to 'roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml')

diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml b/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml
index 150a507e..274133e5 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml
@@ -1,4 +1,56 @@
 ---
 prometheus_server_rules_nut__ups_extra: []
-prometheus_server_rules_nut__ups: []
-## TODO: add NUT/UPS alert rules
+prometheus_server_rules_nut__ups:
+  - alert: UPSLoadHigh
+    expr: network_ups_tools_ups_load > 82
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: UPS load is high (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "The load of UPS {{ '{{' }} $labels.instance {{ '}}' }} is > 82 %.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: UPSLoadVeryHigh
+    expr: network_ups_tools_ups_load > 92
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      summary: UPS load is very high (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "The load of UPS {{ '{{' }} $labels.instance {{ '}}' }} is > 92 %.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: UPSStatusOverload
+    expr: network_ups_tools_ups_status{flag="OVER"} == 1
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      summary: UPS is overloaded (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "UPS {{ '{{' }} $labels.instance {{ '}}' }} is overloaded.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: UPSStatusOnBattery
+    expr: network_ups_tools_ups_status{flag="OB"} == 1
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: UPS is running on battery (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "UPS {{ '{{' }} $labels.instance {{ '}}' }} lost wall power and is running on battery.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: UPSStatusLowBattery
+    expr: network_ups_tools_ups_status{flag="LB"} == 1
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: UPS battery is low (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "UPS {{ '{{' }} $labels.instance {{ '}}' }} reports low battery.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: UPSStatusReplaceBattery
+    expr: network_ups_tools_ups_status{flag="RB"} == 1
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: UPS battery needs to be replaced (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "The battery of UPS {{ '{{' }} $labels.instance {{ '}}' }} needs to be replaced.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
-- 
cgit v1.2.3