summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_node.yml45
1 files changed, 23 insertions, 22 deletions
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
index 2278c70a..4d146119 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
@@ -343,32 +343,42 @@ prometheus_server_rules_node:
summary: Host disk reports S.M.A.R.T. errors (instance {{ '{{' }} $labels.instance {{ '}}' }})
description: "S.M.A.R.T. reports errors for disk {{ '{{' }} $labels.device {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }}.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
- - alert: SmartmonAttributeBelowThreshold
- expr: smartmon_attr_value <= smartmon_attr_threshold
+ ### selection of S.M.A.R.T. metrics based on: https://www.backblaze.com/blog/hard-drive-smart-stats/
+ - alert: SmartmonReallocatedSectorsCount
+ expr: delta(smartmon_attr_raw_value{name="reallocated_sector_ct"}[72h]) > 0
for: 0m
labels:
- severity: critical
+ severity: warning
annotations:
- summary: Host disk S.M.A.R.T. attribute is below it's threshold (instance {{ '{{' }} $labels.instance {{ '}}' }})
- description: "S.M.A.R.T. attribute {{ '{{' }} $labels.name {{ '}}' }} for disk {{ '{{' }} $labels.device {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }} has fallen below it's threshold.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+ summary: Host disk S.M.A.R.T. reports reallocated sectors (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "S.M.A.R.T. reports reallocated sectors within last 72 hours for disk {{ '{{' }} $labels.device {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }}, the drive might be failing.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
- - alert: SmartmonReallocatedSectorsCount
- expr: delta(smartmon_attr_raw_value{name="reallocated_sector_ct"}[24h]) > 0
+ - alert: SmartmonReallocatedSectorsCountHigh
+ expr: smartmon_attr_raw_value{name="reallocated_sector_ct"} > 100
for: 0m
labels:
severity: warning
annotations:
- summary: Host disk S.M.A.R.T. reports reallocated sectors (instance {{ '{{' }} $labels.instance {{ '}}' }})
- description: "S.M.A.R.T. reports multiple reallocated sectors within 24 hours for disk {{ '{{' }} $labels.device {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }}, the drive might be failing.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+ summary: Host disk S.M.A.R.T. reports more than 100 reallocated sectors (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "S.M.A.R.T. reports more than 100 reallocated sectors for disk {{ '{{' }} $labels.device {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }}, the drive might be failing.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: SmartmonReportedUncorrectableErrors
+ expr: delta(smartmon_attr_raw_value{name="reported_uncorrect"}[72h]) > 0
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host disk S.M.A.R.T. reports uncorrectable errors (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "S.M.A.R.T. reports uncorrectable errors within last 72 hours for disk {{ '{{' }} $labels.device {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }}, the drive might be failing.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
- - alert: SmartmonReallocationEventCount
- expr: delta(smartmon_attr_raw_value{name="reallocated_event_count"}[24h]) > 0
+ - alert: SmartmonCommandTimeouts
+ expr: delta(smartmon_attr_raw_value{name="command_timeout"}[72h]) > 0
for: 0m
labels:
severity: warning
annotations:
- summary: Host disk S.M.A.R.T. reports reallocation events (instance {{ '{{' }} $labels.instance {{ '}}' }})
- description: "S.M.A.R.T. reports multiple reallocation events within 24 hours for disk {{ '{{' }} $labels.device {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }}, the drive might be failing.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+ summary: Host disk S.M.A.R.T. reports command timeouts (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "S.M.A.R.T. reports command timeouts within last 72 hours for disk {{ '{{' }} $labels.device {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }}, the drive might be failing.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
- alert: SmartmonCurrentPendingSectors
expr: smartmon_attr_raw_value{name="current_pending_sector"} > 0
@@ -388,15 +398,6 @@ prometheus_server_rules_node:
summary: Host disk S.M.A.R.T. reports offline uncorrectable errors (instance {{ '{{' }} $labels.instance {{ '}}' }})
description: "S.M.A.R.T. reports offline uncorrectable errors for disk {{ '{{' }} $labels.device {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }}, the drive might be failing.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
- - alert: SmartmonSpinRetryCount
- expr: smartmon_attr_raw_value{name="spin_retry_count"} > 0
- for: 0m
- labels:
- severity: warning
- annotations:
- summary: Host disk S.M.A.R.T. reports spin-up retries (instance {{ '{{' }} $labels.instance {{ '}}' }})
- description: "S.M.A.R.T. reports spin-up retries for disk {{ '{{' }} $labels.device {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }}, the drive might be failing.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
-
- alert: SyncoidPullJobTooLongAgo
expr: time() - syncoid_pull_run > (24 * 3600)
for: 0m