roles/monitoring/prometheus/server/defaults/main/rules_node.yml


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219

---
## https://awesome-prometheus-alerts.grep.to/rules#host-and-hardware
prometheus_server_rules_node_extra: []
prometheus_server_rules_node:
  - alert: HostOutOfMemory
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of memory (instance {{ '{{' }} $labels.instance {{ '}}' }})
      description: "Node memory is filling up (< 10% left)\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"

  - alert: HostMemoryUnderMemoryPressure
    expr: rate(node_vmstat_pgmajfault[1m]) > 1000
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host memory under memory pressure (instance {{ '{{' }} $labels.instance {{ '}}' }})
      description: "The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"

  - alert: HostOutOfDiskSpace
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of disk space (instance {{ '{{' }} $labels.instance {{ '}}' }})
      description: "Disk is almost full (< 10% left)\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"

  - alert: HostDiskWillFillIn24Hours
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host disk will fill in 24 hours (instance {{ '{{' }} $labels.instance {{ '}}' }})
      description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"

  - alert: HostOutOfInodes
    expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of inodes (instance {{ '{{' }} $labels.instance {{ '}}' }})
      description: "Disk is almost running out of available inodes (< 10% left)\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"

  - alert: HostInodesWillFillIn24Hours
    expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host inodes will fill in 24 hours (instance {{ '{{' }} $labels.instance {{ '}}' }})
      description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"

  - alert: HostUnusualDiskReadLatency
    expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host unusual disk read latency (instance {{ '{{' }} $labels.instance {{ '}}' }})
      description: "Disk latency is growing (read operations > 100ms)\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"

  - alert: HostUnusualDiskWriteLatency
    expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host unusual disk write latency (instance {{ '{{' }} $labels.instance {{ '}}' }})
      description: "Disk latency is growing (write operations > 100ms)\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"

  - alert: HostHighCpuLoad
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host high CPU load (instance {{ '{{' }} $labels.instance {{ '}}' }})
      description: "CPU load is > 80%\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"

  - alert: HostCpuStealNoisyNeighbor
    expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host CPU steal noisy neighbor (instance {{ '{{' }} $labels.instance {{ '}}' }})
      description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"

  - alert: HostSystemdServiceCrashed
    expr: node_systemd_unit_state{state="failed"} == 1
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host systemd service crashed (instance {{ '{{' }} $labels.instance {{ '}}' }})
      description: "systemd service crashed\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"

  - alert: HostPhysicalComponentTooHot
    expr: node_hwmon_temp_celsius > 75
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host physical component too hot (instance {{ '{{' }} $labels.instance {{ '}}' }})
      description: "Physical hardware component too hot\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"

  - alert: HostNodeOvertemperatureAlarm
    expr: node_hwmon_temp_crit_alarm_celsius == 1
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Host node overtemperature alarm (instance {{ '{{' }} $labels.instance {{ '}}' }})
      description: "Physical node temperature alarm triggered\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"

  - alert: HostRaidArrayGotInactive
    expr: node_md_state{state="inactive"} > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Host RAID array got inactive (instance {{ '{{' }} $labels.instance {{ '}}' }})
      description: "RAID array {{ '{{' }} $labels.device {{ '}}' }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"

  - alert: HostRaidDiskFailure
    expr: node_md_disks{state="failed"} > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host RAID disk failure (instance {{ '{{' }} $labels.instance {{ '}}' }})
      description: "At least one device in RAID array on {{ '{{' }} $labels.instance {{ '}}' }} failed. Array {{ '{{' }} $labels.md_device {{ '}}' }} needs attention and possibly a disk swap\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"

  - alert: HostOomKillDetected
    expr: increase(node_vmstat_oom_kill[1m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host OOM kill detected (instance {{ '{{' }} $labels.instance {{ '}}' }})
      description: "OOM kill detected\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"

  - alert: HostEdacCorrectableErrorsDetected
    expr: increase(node_edac_correctable_errors_total[1m]) > 0
    for: 0m
    labels:
      severity: info
    annotations:
      summary: Host EDAC Correctable Errors detected (instance {{ '{{' }} $labels.instance {{ '}}' }})
      description: "Host {{ '{{' }} $labels.instance {{ '}}' }} has had {{ '{{' }} printf \"%.0f\" $value {{ '}}' }} correctable memory errors reported by EDAC in the last 5 minutes.\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"

  - alert: HostEdacUncorrectableErrorsDetected
    expr: node_edac_uncorrectable_errors_total > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host EDAC Uncorrectable Errors detected (instance {{ '{{' }} $labels.instance {{ '}}' }})
      description: "Host {{ '{{' }} $labels.instance {{ '}}' }} has had {{ '{{' }} printf \"%.0f\" $value {{ '}}' }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"

  - alert: HostNetworkReceiveErrors
    expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host Network Receive Errors (instance {{ '{{' }} $labels.instance {{ '}}' }})
      description: "Host {{ '{{' }} $labels.instance {{ '}}' }} interface {{ '{{' }} $labels.device {{ '}}' }} has encountered {{ '{{' }} printf \"%.0f\" $value {{ '}}' }} receive errors in the last five minutes.\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"

  - alert: HostNetworkTransmitErrors
    expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host Network Transmit Errors (instance {{ '{{' }} $labels.instance {{ '}}' }})
      description: "Host {{ '{{' }} $labels.instance {{ '}}' }} interface {{ '{{' }} $labels.device {{ '}}' }} has encountered {{ '{{' }} printf \"%.0f\" $value {{ '}}' }} transmit errors in the last five minutes.\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"

  - alert: HostNetworkInterfaceSaturated
    expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: Host Network Interface Saturated (instance {{ '{{' }} $labels.instance {{ '}}' }})
      description: "The network interface \"{{ '{{' }} $labels.interface {{ '}}' }}\" on \"{{ '{{' }} $labels.instance {{ '}}' }}\" is getting overloaded.\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"

  - alert: HostConntrackLimit
    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host conntrack limit (instance {{ '{{' }} $labels.instance {{ '}}' }})
      description: "The number of conntrack is approching limit\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"

  - alert: HostClockSkew
    expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host clock skew (instance {{ '{{' }} $labels.instance {{ '}}' }})
      description: "Clock skew detected. Clock is out of sync.\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"

  - alert: HostClockNotSynchronising
    expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host clock not synchronising (instance {{ '{{' }} $labels.instance {{ '}}' }})
      description: "Clock not synchronising.\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"