From 4d60167a0a935a141e6300bc1c1fb691a77c49c0 Mon Sep 17 00:00:00 2001
From: Christian Pointner
Date: Mon, 27 Sep 2021 21:41:57 +0200
Subject: fix and finalize ipmi exporter

---
Review notes (free text ahead of the diffstat; not part of the applied change):

  * This rendering puts one patch line on one text line again. The hunk still
    carries 2 context, 2 removed and 135 added lines, matching its header.
  * The description of IpmiFanSpeedStateWarning says "fan speed" (it used to
    say "fanspeed"), in line with IpmiFanSpeedStateCritical. The resulting
    file therefore no longer hashes to the post-image id on the index line.
  * {{ '{{' }} and {{ '}}' }} are Jinja2 expressions that emit literal double
    braces, which leaves $labels and $value for Prometheus to expand.
  * Each *_state metric is matched against 1 by its warning rule and against
    2 by its critical rule; both wait "for: 2m" before firing.
  * NOTE(review): IpmiSystemEventLogManyEvents applies rate() to
    ipmi_sel_logs_count. rate() is meant for counters; if that metric is a
    gauge, delta() or deriv() is the matching function - confirm against the
    exporter's metric documentation before changing the expression.

 .../prometheus/server/defaults/main/rules_ipmi.yml | 137 ++++++++++++++++++++-
 1 file changed, 135 insertions(+), 2 deletions(-)

(limited to 'roles/monitoring/prometheus/server/defaults/main')

diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml
index 41dcd7e9..31c15d51 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml
@@ -1,4 +1,137 @@
 ---
 prometheus_server_rules_ipmi_extra: []
-prometheus_server_rules_ipmi: []
-## TODO: add common IPMI alert rules
+prometheus_server_rules_ipmi:
+  - alert: IpmiFailedToScrapeCollector
+    expr: ipmi_up == 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: IPMI collector failed to scrape (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "The IPMI collector {{ '{{' }} $labels.collector {{ '}}' }} could not be scraped.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiSensorStateWarning
+    expr: ipmi_sensor_state == 1
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: IPMI Sensor state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "The IPMI sensor {{ '{{' }} $labels.name {{ '}}' }} has state: warning.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiSensorStateCritical
+    expr: ipmi_sensor_state == 2
+    for: 2m
+    labels:
+      severity: critical
+    annotations:
+      summary: IPMI Sensor state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "The IPMI sensor {{ '{{' }} $labels.name {{ '}}' }} has state: critical.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiFanSpeedStateWarning
+    expr: ipmi_fan_speed_state == 1
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: IPMI fan-speed state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "The IPMI fan speed {{ '{{' }} $labels.name {{ '}}' }} has state: warning.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiFanSpeedStateCritical
+    expr: ipmi_fan_speed_state == 2
+    for: 2m
+    labels:
+      severity: critical
+    annotations:
+      summary: IPMI fan-speed state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "The IPMI fan speed {{ '{{' }} $labels.name {{ '}}' }} has state: critical.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiTemperatureStateWarning
+    expr: ipmi_temperature_state == 1
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: IPMI Temperature state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "The IPMI temperature {{ '{{' }} $labels.name {{ '}}' }} has state: warning.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiTemperatureStateCritical
+    expr: ipmi_temperature_state == 2
+    for: 2m
+    labels:
+      severity: critical
+    annotations:
+      summary: IPMI Temperature state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "The IPMI temperature {{ '{{' }} $labels.name {{ '}}' }} has state: critical.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiVoltageStateWarning
+    expr: ipmi_voltage_state == 1
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: IPMI Voltage state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "The IPMI voltage {{ '{{' }} $labels.name {{ '}}' }} has state: warning.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiVoltageStateCritical
+    expr: ipmi_voltage_state == 2
+    for: 2m
+    labels:
+      severity: critical
+    annotations:
+      summary: IPMI Voltage state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "The IPMI voltage {{ '{{' }} $labels.name {{ '}}' }} has state: critical.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiCurrentStateWarning
+    expr: ipmi_current_state == 1
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: IPMI Current state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "The IPMI current {{ '{{' }} $labels.name {{ '}}' }} has state: warning.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiCurrentStateCritical
+    expr: ipmi_current_state == 2
+    for: 2m
+    labels:
+      severity: critical
+    annotations:
+      summary: IPMI Current state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "The IPMI current {{ '{{' }} $labels.name {{ '}}' }} has state: critical.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiPowerStateWarning
+    expr: ipmi_power_state == 1
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: IPMI Power state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "The IPMI power {{ '{{' }} $labels.name {{ '}}' }} has state: warning.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiPowerStateCritical
+    expr: ipmi_power_state == 2
+    for: 2m
+    labels:
+      severity: critical
+    annotations:
+      summary: IPMI Power state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "The IPMI power {{ '{{' }} $labels.name {{ '}}' }} has state: critical.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiSystemEventLogManyEvents
+    expr: rate(ipmi_sel_logs_count[2m]) > 0.1
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: IPMI SEL grows quickly (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "The IPMI system log contains too many new events.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: IpmiSystemEventLogFull
+    expr: ipmi_sel_free_space_bytes < 100
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: IPMI SEL is full (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "The IPMI system log ran out of space.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
-- 
cgit v1.2.3