summaryrefslogtreecommitdiff
path: root/roles
diff options
context:
space:
mode:
authorChristian Pointner <equinox@spreadspace.org>2021-10-16 12:35:32 +0200
committerChristian Pointner <equinox@spreadspace.org>2021-10-16 12:35:32 +0200
commit7f9208af58e5143a02541be9e6b9ce5fa504156c (patch)
tree46128ffec7375541beecdd16edbe32c306d94e34 /roles
parentadd apt-repo from consol labs (diff)
prometheus: add some more smartmon alerts
Diffstat (limited to 'roles')
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_node.yml47
1 files changed, 46 insertions, 1 deletions
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
index ffe616b7..af47e7f7 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
@@ -309,10 +309,55 @@ prometheus_server_rules_node:
description: "S.M.A.R.T. reports errors for disk {{ '{{' }} $labels.device {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }}.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
- alert: SmartmonAttributeBelowThreshold
- expr: smartmon_attr_value < smartmon_attr_threshold
+ expr: smartmon_attr_value <= smartmon_attr_threshold
for: 0m
labels:
severity: critical
annotations:
summary: Host disk S.M.A.R.T. attribute is below its threshold (instance {{ '{{' }} $labels.instance {{ '}}' }})
description: "S.M.A.R.T. attribute {{ '{{' }} $labels.name {{ '}}' }} for disk {{ '{{' }} $labels.device {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }} has fallen below its threshold.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: SmartmonReallocatedSectorsCount
+ expr: smartmon_attr_raw_value{name="reallocated_sector_ct"} > 0
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host disk S.M.A.R.T. reports reallocated sectors (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "S.M.A.R.T. reports reallocated sectors for disk {{ '{{' }} $labels.device {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }}, the drive might be failing.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: SmartmonReallocationEventCount
+ expr: smartmon_attr_raw_value{name="reallocated_event_count"} > 0
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host disk S.M.A.R.T. reports reallocation events (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "S.M.A.R.T. reports reallocation events for disk {{ '{{' }} $labels.device {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }}, the drive might be failing.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: SmartmonCurrentPendingSectors
+ expr: smartmon_attr_raw_value{name="current_pending_sector"} > 0
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host disk S.M.A.R.T. reports current pending sectors (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "S.M.A.R.T. reports current pending sectors for disk {{ '{{' }} $labels.device {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }}, the drive might be failing.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: SmartmonOfflineUncorrectable
+ expr: smartmon_attr_raw_value{name="offline_uncorrectable"} > 0
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host disk S.M.A.R.T. reports offline uncorrectable errors (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "S.M.A.R.T. reports offline uncorrectable errors for disk {{ '{{' }} $labels.device {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }}, the drive might be failing.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: SmartmonSpinRetryCount
+ expr: smartmon_attr_raw_value{name="spin_retry_count"} > 0
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host disk S.M.A.R.T. reports spin-up retries (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "S.M.A.R.T. reports spin-up retries for disk {{ '{{' }} $labels.device {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }}, the drive might be failing.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"