summaryrefslogtreecommitdiff
path: root/roles
diff options
context:
space:
mode:
authorChristian Pointner <equinox@spreadspace.org>2021-10-15 22:52:47 +0200
committerChristian Pointner <equinox@spreadspace.org>2021-10-15 22:52:47 +0200
commit8ceb96b6425c4256a0efcaf73529421a32f99998 (patch)
tree6e8eb96d8feb3f17dd329ffc1eecc41b5c19a020 /roles
parentprometheus: fix smartmon textfile collector (diff)
add some alerts for smartmon collector
Diffstat (limited to 'roles')
-rw-r--r--roles/monitoring/prometheus/exporter/node/files/smartmon2
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_node.yml45
2 files changed, 46 insertions, 1 deletions
diff --git a/roles/monitoring/prometheus/exporter/node/files/smartmon b/roles/monitoring/prometheus/exporter/node/files/smartmon
index 1c39b492..fd03e45f 100644
--- a/roles/monitoring/prometheus/exporter/node/files/smartmon
+++ b/roles/monitoring/prometheus/exporter/node/files/smartmon
@@ -326,7 +326,7 @@ def collect_ata_error_count(device):
def collect_disks_smart_metrics(wakeup_disks):
- now = int(datetime.datetime.utcnow().timestamp())
+ now = int(datetime.datetime.now().timestamp())
for device in find_devices():
yield Metric('smartctl_run', device.base_labels, now)
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
index 55641534..79a474e8 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
@@ -253,3 +253,48 @@ prometheus_server_rules_node:
annotations:
summary: Host must be rebootet (instance {{ '{{' }} $labels.instance {{ '}}' }})
description: "Host {{ '{{' }} $labels.instance {{ '}}' }} must be rebootet for security uppdates to take effect.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: DeletedLibraryInUse  # warns while processes still map a library that was deleted on disk
+ expr: node_processes_linking_deleted_libraries  # no comparison: every series returned raises an alert -- NOTE(review): confirm the exporter omits libraries with zero users
+ for: 2m  # condition must hold for 2 minutes before the alert fires
+ labels:
+ severity: warning
+ annotations:
+ summary: Some processes still use a deleted library (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "The deleted library {{ '{{' }} $labels.library_name {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }} is still in use by {{ '{{' }} $value {{ '}}' }} processes.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: SmartmonMetricsOutdated  # warns when the smartmon textfile collector has stopped refreshing its metrics
+ expr: time() - smartmon_smartctl_run > 7200  # now minus the timestamp of the last smartctl run = age in seconds; alert once older than 2h
+ for: 0m  # no hold time: fires on the first evaluation that matches
+ labels:
+ severity: warning
+ annotations:
+ summary: Metrics from smartctl are too old (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "The exported values from smartctl on host {{ '{{' }} $labels.instance {{ '}}' }} are {{ '{{' }} $value {{ '}}' }} seconds old.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: SmartmonDeviceUnhealthy  # warns per device whose overall S.M.A.R.T. health metric is 0
+ expr: smartmon_device_smart_healthy == 0  # keeps only series whose value is exactly 0
+ for: 0m  # no hold time: fires on the first evaluation that matches
+ labels:
+ severity: warning
+ annotations:
+ summary: Host disks are unhealthy (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "S.M.A.R.T. reports unhealthy device {{ '{{' }} $labels.device {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }}.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: SmartmonDeviceErrors  # warns per device that reports a non-zero error count
+ expr: smartmon_device_errors > 0  # keeps only series with at least one recorded error
+ for: 0m  # no hold time: fires on the first evaluation that matches
+ labels:
+ severity: warning
+ annotations:
+ summary: Host disk reports S.M.A.R.T. errors (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "S.M.A.R.T. reports errors for disk {{ '{{' }} $labels.device {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }}.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: SmartmonAttributeBelowThreshold  # critical alert per S.M.A.R.T. attribute whose value is under its threshold
+ expr: smartmon_attr_value < smartmon_attr_threshold  # one-to-one match on identical label sets -- NOTE(review): smartmontools also counts value == threshold as failed; consider <=
+ for: 0m  # no hold time: fires on the first evaluation that matches
+ labels:
+ severity: critical
+ annotations:
+ summary: Host disk S.M.A.R.T. attribute is below its threshold (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "S.M.A.R.T. attribute {{ '{{' }} $labels.name {{ '}}' }} for disk {{ '{{' }} $labels.device {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }} has fallen below its threshold.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"