summary | refs | log | tree | commit | diff
path: root/roles/monitoring/prometheus/server
diff options
context:
space:
mode:
author: Christian Pointner <equinox@spreadspace.org> 2021-09-27 21:41:57 +0200
committer: Christian Pointner <equinox@spreadspace.org> 2021-09-27 21:41:57 +0200
commit: 4d60167a0a935a141e6300bc1c1fb691a77c49c0 (patch)
tree: 247aeb9cd9a2010a062112ccf7ca4da9566044b1 /roles/monitoring/prometheus/server
parent: prometheus: add some openwrt specific alert rules (diff)
fix and finalize ipmi exporter
Diffstat (limited to 'roles/monitoring/prometheus/server')
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml 137
1 files changed, 135 insertions, 2 deletions
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml
index 41dcd7e9..31c15d51 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml
@@ -1,4 +1,137 @@
---
prometheus_server_rules_ipmi_extra: []
-prometheus_server_rules_ipmi: []
-## TODO: add common IPMI alert rules
+prometheus_server_rules_ipmi:
+  - alert: IpmiFailedToScrapeCollector
+    expr: ipmi_up == 0  # ipmi_up at 0 is what this rule treats as a failed collector scrape
+    for: 2m  # the expression must stay true for 2 minutes before the alert fires
+    labels:
+      severity: warning
+    annotations:  # NOTE(review): {{ '{{' }} / {{ '}}' }} presumably render to literal braces for Prometheus templating - confirm
+      summary: "IPMI collector failed to scrape (instance {{ '{{' }} $labels.instance {{ '}}' }})"
+      description: "The IPMI collector {{ '{{' }} $labels.collector {{ '}}' }} could not be scraped.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiSensorStateWarning
+    expr: ipmi_sensor_state == 1  # state value 1 is reported by this rule as "warning"
+    for: 2m  # the expression must stay true for 2 minutes before the alert fires
+    labels:
+      severity: warning
+    annotations:
+      summary: "IPMI Sensor state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }})"
+      description: "The IPMI sensor {{ '{{' }} $labels.name {{ '}}' }} has state: warning.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiSensorStateCritical
+    expr: ipmi_sensor_state == 2  # state value 2 is reported by this rule as "critical"
+    for: 2m  # the expression must stay true for 2 minutes before the alert fires
+    labels:
+      severity: critical
+    annotations:
+      summary: "IPMI Sensor state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }})"
+      description: "The IPMI sensor {{ '{{' }} $labels.name {{ '}}' }} has state: critical.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiFanSpeedStateWarning
+    expr: ipmi_fan_speed_state == 1  # state value 1 is reported by this rule as "warning"
+    for: 2m  # the expression must stay true for 2 minutes before the alert fires
+    labels:
+      severity: warning
+    annotations:
+      summary: "IPMI fan-speed state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }})"
+      description: "The IPMI fan speed {{ '{{' }} $labels.name {{ '}}' }} has state: warning.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"  # wording matches IpmiFanSpeedStateCritical ("fan speed")
+
+  - alert: IpmiFanSpeedStateCritical
+    expr: ipmi_fan_speed_state == 2  # state value 2 is reported by this rule as "critical"
+    for: 2m  # the expression must stay true for 2 minutes before the alert fires
+    labels:
+      severity: critical
+    annotations:
+      summary: "IPMI fan-speed state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }})"
+      description: "The IPMI fan speed {{ '{{' }} $labels.name {{ '}}' }} has state: critical.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiTemperatureStateWarning
+    expr: ipmi_temperature_state == 1  # state value 1 is reported by this rule as "warning"
+    for: 2m  # the expression must stay true for 2 minutes before the alert fires
+    labels:
+      severity: warning
+    annotations:
+      summary: "IPMI Temperature state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }})"
+      description: "The IPMI temperature {{ '{{' }} $labels.name {{ '}}' }} has state: warning.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiTemperatureStateCritical
+    expr: ipmi_temperature_state == 2  # state value 2 is reported by this rule as "critical"
+    for: 2m  # the expression must stay true for 2 minutes before the alert fires
+    labels:
+      severity: critical
+    annotations:
+      summary: "IPMI Temperature state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }})"
+      description: "The IPMI temperature {{ '{{' }} $labels.name {{ '}}' }} has state: critical.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiVoltageStateWarning
+    expr: ipmi_voltage_state == 1  # state value 1 is reported by this rule as "warning"
+    for: 2m  # the expression must stay true for 2 minutes before the alert fires
+    labels:
+      severity: warning
+    annotations:
+      summary: "IPMI Voltage state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }})"
+      description: "The IPMI voltage {{ '{{' }} $labels.name {{ '}}' }} has state: warning.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiVoltageStateCritical
+    expr: ipmi_voltage_state == 2  # state value 2 is reported by this rule as "critical"
+    for: 2m  # the expression must stay true for 2 minutes before the alert fires
+    labels:
+      severity: critical
+    annotations:
+      summary: "IPMI Voltage state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }})"
+      description: "The IPMI voltage {{ '{{' }} $labels.name {{ '}}' }} has state: critical.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiCurrentStateWarning
+    expr: ipmi_current_state == 1  # state value 1 is reported by this rule as "warning"
+    for: 2m  # the expression must stay true for 2 minutes before the alert fires
+    labels:
+      severity: warning
+    annotations:
+      summary: "IPMI Current state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }})"
+      description: "The IPMI current {{ '{{' }} $labels.name {{ '}}' }} has state: warning.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiCurrentStateCritical
+    expr: ipmi_current_state == 2  # state value 2 is reported by this rule as "critical"
+    for: 2m  # the expression must stay true for 2 minutes before the alert fires
+    labels:
+      severity: critical
+    annotations:
+      summary: "IPMI Current state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }})"
+      description: "The IPMI current {{ '{{' }} $labels.name {{ '}}' }} has state: critical.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiPowerStateWarning
+    expr: ipmi_power_state == 1  # state value 1 is reported by this rule as "warning"
+    for: 2m  # the expression must stay true for 2 minutes before the alert fires
+    labels:
+      severity: warning
+    annotations:
+      summary: "IPMI Power state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }})"
+      description: "The IPMI power {{ '{{' }} $labels.name {{ '}}' }} has state: warning.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiPowerStateCritical
+    expr: ipmi_power_state == 2  # state value 2 is reported by this rule as "critical"
+    for: 2m  # the expression must stay true for 2 minutes before the alert fires
+    labels:
+      severity: critical
+    annotations:
+      summary: "IPMI Power state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }})"
+      description: "The IPMI power {{ '{{' }} $labels.name {{ '}}' }} has state: critical.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiSystemEventLogManyEvents
+    # NOTE(review): ipmi_sel_logs_count is believed to be a gauge (current number of SEL entries) - confirm
+    # against the exporter. rate() is meant for counters and would read any drop (SEL cleared or trimmed)
+    # as a counter reset, producing a huge bogus rate. delta() is the gauge counterpart; 12 entries per
+    # 2m equals the former 0.1/s threshold, and $value is now the number of entries gained in the window.
+    expr: delta(ipmi_sel_logs_count[2m]) > 12
+    for: 1m  # the growth must persist for 1 minute before the alert fires
+    labels:
+      severity: warning
+    annotations:
+      summary: "IPMI SEL grows quickly (instance {{ '{{' }} $labels.instance {{ '}}' }})"
+      description: "The IPMI system log contains too many new events.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiSystemEventLogFull
+    expr: ipmi_sel_free_space_bytes < 100  # fewer than 100 bytes free is what this rule calls "full"
+    for: 0m  # no hold time: fires on the first evaluation where the expression is true
+    labels:
+      severity: warning
+    annotations:
+      summary: "IPMI SEL is full (instance {{ '{{' }} $labels.instance {{ '}}' }})"
+      description: "The IPMI system log ran out of space.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"