summaryrefslogtreecommitdiff
path: root/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml
blob: 31c15d51edccf43c506c307a8e4f1ad9418a2ed4 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
---
# Extra IPMI alert rules; empty by default.
# NOTE(review): presumably combined with the default list below by the role's
# templates/tasks - confirm against the consumer of these variables.
prometheus_server_rules_ipmi_extra: []
# Default IPMI alerting rules. Each entry carries alert/expr/for/labels/annotations.
# The {{ '{{' }} ... {{ '}}' }} sequences in the annotations are Jinja expressions
# that emit literal double braces, so the alert template placeholders ($labels,
# $value) are not expanded when this file is templated.
prometheus_server_rules_ipmi:
  # Warning: ipmi_up has been 0 for 2 minutes; the description names the
  # affected collector via the `collector` label.
  - alert: IpmiFailedToScrapeCollector
    expr: 'ipmi_up == 0'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "IPMI collector failed to scrape (instance {{ '{{' }} $labels.instance {{ '}}' }})"
      # Literal block scalar; the two-space indent on the VALUE/LABELS lines is
      # part of the rendered text.
      description: |-
        The IPMI collector {{ '{{' }} $labels.collector {{ '}}' }} could not be scraped.
          VALUE = {{ '{{' }} $value {{ '}}' }}
          LABELS = {{ '{{' }} $labels {{ '}}' }}

  # Warning: ipmi_sensor_state has equalled 1 for 2 minutes.
  - alert: IpmiSensorStateWarning
    expr: 'ipmi_sensor_state == 1'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "IPMI Sensor state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }})"
      # Literal block scalar; the two-space indent on the VALUE/LABELS lines is
      # part of the rendered text.
      description: |-
        The IPMI sensor {{ '{{' }} $labels.name {{ '}}' }} has state: warning.
          VALUE = {{ '{{' }} $value {{ '}}' }}
          LABELS = {{ '{{' }} $labels {{ '}}' }}

  # Critical: ipmi_sensor_state has equalled 2 for 2 minutes.
  - alert: IpmiSensorStateCritical
    expr: 'ipmi_sensor_state == 2'
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "IPMI Sensor state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }})"
      # Literal block scalar; the two-space indent on the VALUE/LABELS lines is
      # part of the rendered text.
      description: |-
        The IPMI sensor {{ '{{' }} $labels.name {{ '}}' }} has state: critical.
          VALUE = {{ '{{' }} $value {{ '}}' }}
          LABELS = {{ '{{' }} $labels {{ '}}' }}

  # Warning: ipmi_fan_speed_state has equalled 1 for 2 minutes.
  - alert: IpmiFanSpeedStateWarning
    expr: 'ipmi_fan_speed_state == 1'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "IPMI fan-speed state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }})"
      # Wording uses "fan speed" (two words) to match the critical fan-speed
      # rule's description. The two-space indent on the VALUE/LABELS lines is
      # part of the rendered text.
      description: |-
        The IPMI fan speed {{ '{{' }} $labels.name {{ '}}' }} has state: warning.
          VALUE = {{ '{{' }} $value {{ '}}' }}
          LABELS = {{ '{{' }} $labels {{ '}}' }}

  # Critical: ipmi_fan_speed_state has equalled 2 for 2 minutes.
  - alert: IpmiFanSpeedStateCritical
    expr: 'ipmi_fan_speed_state == 2'
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "IPMI fan-speed state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }})"
      # Literal block scalar; the two-space indent on the VALUE/LABELS lines is
      # part of the rendered text.
      description: |-
        The IPMI fan speed {{ '{{' }} $labels.name {{ '}}' }} has state: critical.
          VALUE = {{ '{{' }} $value {{ '}}' }}
          LABELS = {{ '{{' }} $labels {{ '}}' }}

  # Warning: ipmi_temperature_state has equalled 1 for 2 minutes.
  - alert: IpmiTemperatureStateWarning
    expr: 'ipmi_temperature_state == 1'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "IPMI Temperature state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }})"
      # Literal block scalar; the two-space indent on the VALUE/LABELS lines is
      # part of the rendered text.
      description: |-
        The IPMI temperature {{ '{{' }} $labels.name {{ '}}' }} has state: warning.
          VALUE = {{ '{{' }} $value {{ '}}' }}
          LABELS = {{ '{{' }} $labels {{ '}}' }}

  # Critical: ipmi_temperature_state has equalled 2 for 2 minutes.
  - alert: IpmiTemperatureStateCritical
    expr: 'ipmi_temperature_state == 2'
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "IPMI Temperature state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }})"
      # Literal block scalar; the two-space indent on the VALUE/LABELS lines is
      # part of the rendered text.
      description: |-
        The IPMI temperature {{ '{{' }} $labels.name {{ '}}' }} has state: critical.
          VALUE = {{ '{{' }} $value {{ '}}' }}
          LABELS = {{ '{{' }} $labels {{ '}}' }}

  # Warning: ipmi_voltage_state has equalled 1 for 2 minutes.
  - alert: IpmiVoltageStateWarning
    expr: 'ipmi_voltage_state == 1'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "IPMI Voltage state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }})"
      # Literal block scalar; the two-space indent on the VALUE/LABELS lines is
      # part of the rendered text.
      description: |-
        The IPMI voltage {{ '{{' }} $labels.name {{ '}}' }} has state: warning.
          VALUE = {{ '{{' }} $value {{ '}}' }}
          LABELS = {{ '{{' }} $labels {{ '}}' }}

  # Critical: ipmi_voltage_state has equalled 2 for 2 minutes.
  - alert: IpmiVoltageStateCritical
    expr: 'ipmi_voltage_state == 2'
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "IPMI Voltage state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }})"
      # Literal block scalar; the two-space indent on the VALUE/LABELS lines is
      # part of the rendered text.
      description: |-
        The IPMI voltage {{ '{{' }} $labels.name {{ '}}' }} has state: critical.
          VALUE = {{ '{{' }} $value {{ '}}' }}
          LABELS = {{ '{{' }} $labels {{ '}}' }}

  # Warning: ipmi_current_state has equalled 1 for 2 minutes.
  - alert: IpmiCurrentStateWarning
    expr: 'ipmi_current_state == 1'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "IPMI Current state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }})"
      # Literal block scalar; the two-space indent on the VALUE/LABELS lines is
      # part of the rendered text.
      description: |-
        The IPMI current {{ '{{' }} $labels.name {{ '}}' }} has state: warning.
          VALUE = {{ '{{' }} $value {{ '}}' }}
          LABELS = {{ '{{' }} $labels {{ '}}' }}

  # Critical: ipmi_current_state has equalled 2 for 2 minutes.
  - alert: IpmiCurrentStateCritical
    expr: 'ipmi_current_state == 2'
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "IPMI Current state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }})"
      # Literal block scalar; the two-space indent on the VALUE/LABELS lines is
      # part of the rendered text.
      description: |-
        The IPMI current {{ '{{' }} $labels.name {{ '}}' }} has state: critical.
          VALUE = {{ '{{' }} $value {{ '}}' }}
          LABELS = {{ '{{' }} $labels {{ '}}' }}

  # Warning: ipmi_power_state has equalled 1 for 2 minutes.
  - alert: IpmiPowerStateWarning
    expr: 'ipmi_power_state == 1'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "IPMI Power state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }})"
      # Literal block scalar; the two-space indent on the VALUE/LABELS lines is
      # part of the rendered text.
      description: |-
        The IPMI power {{ '{{' }} $labels.name {{ '}}' }} has state: warning.
          VALUE = {{ '{{' }} $value {{ '}}' }}
          LABELS = {{ '{{' }} $labels {{ '}}' }}

  # Critical: ipmi_power_state has equalled 2 for 2 minutes.
  - alert: IpmiPowerStateCritical
    expr: 'ipmi_power_state == 2'
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "IPMI Power state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }})"
      # Literal block scalar; the two-space indent on the VALUE/LABELS lines is
      # part of the rendered text.
      description: |-
        The IPMI power {{ '{{' }} $labels.name {{ '}}' }} has state: critical.
          VALUE = {{ '{{' }} $value {{ '}}' }}
          LABELS = {{ '{{' }} $labels {{ '}}' }}

  # Warning: the number of System Event Log entries has been growing faster
  # than 0.1 entries per second (measured over a 2-minute window) for 1 minute.
  #
  # ipmi_sel_logs_count is a gauge (current number of SEL entries), so the
  # growth is measured with deriv(), which is meant for gauges. rate() is a
  # counter function: it would treat a cleared SEL as a counter reset and
  # could report bogus growth.
  - alert: IpmiSystemEventLogManyEvents
    expr: 'deriv(ipmi_sel_logs_count[2m]) > 0.1'
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "IPMI SEL grows quickly (instance {{ '{{' }} $labels.instance {{ '}}' }})"
      # Literal block scalar; the two-space indent on the VALUE/LABELS lines is
      # part of the rendered text.
      description: |-
        The IPMI system log contains too many new events.
          VALUE = {{ '{{' }} $value {{ '}}' }}
          LABELS = {{ '{{' }} $labels {{ '}}' }}

  # Warning: fires as soon as (for: 0m) ipmi_sel_free_space_bytes is below 100.
  - alert: IpmiSystemEventLogFull
    expr: 'ipmi_sel_free_space_bytes < 100'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "IPMI SEL is full (instance {{ '{{' }} $labels.instance {{ '}}' }})"
      # Literal block scalar; the two-space indent on the VALUE/LABELS lines is
      # part of the rendered text.
      description: |-
        The IPMI system log ran out of space.
          VALUE = {{ '{{' }} $value {{ '}}' }}
          LABELS = {{ '{{' }} $labels {{ '}}' }}