summaryrefslogtreecommitdiff
path: root/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
diff options
context:
space:
mode:
Diffstat (limited to 'roles/monitoring/prometheus/server/defaults/main/rules_node.yml')
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_node.yml45
1 files changed, 45 insertions, 0 deletions
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
index 55641534..79a474e8 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
@@ -253,3 +253,48 @@ prometheus_server_rules_node:
annotations:
summary: Host must be rebootet (instance {{ '{{' }} $labels.instance {{ '}}' }})
description: "Host {{ '{{' }} $labels.instance {{ '}}' }} must be rebootet for security uppdates to take effect.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ # Warn when processes still use a library that has been deleted on the host.
+ # The description reports the series value as the number of such processes.
+ - alert: DeletedLibraryInUse
+ # Compare against 0 so that a series reporting zero processes does not raise an alert.
+ expr: node_processes_linking_deleted_libraries > 0
+ # The condition must hold for two minutes before the alert fires.
+ for: 2m
+ labels:
+ severity: warning
+ annotations:
+ summary: Some processes still use a deleted library (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "The deleted library {{ '{{' }} $labels.library_name {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }} is still in use by {{ '{{' }} $value {{ '}}' }} processes.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ # Warn when the last smartctl run is more than two hours (7200 seconds) in the past.
+ # NOTE(review): assumes smartmon_smartctl_run holds the Unix timestamp of the last run - confirm against the exporter.
+ - alert: SmartmonMetricsOutdated
+ # Age is "now minus last run". The reverse order yields a negative number for any
+ # past timestamp and could never exceed 7200, so the alert would never fire.
+ expr: time() - smartmon_smartctl_run > 7200
+ # 0m: no pending period, the alert fires on the first evaluation that matches.
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Metrics from smartctl are too old (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "The exported values from smartctl on host {{ '{{' }} $labels.instance {{ '}}' }} are {{ '{{' }} $value {{ '}}' }} seconds old.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ # Warn as soon as smartmon_device_smart_healthy equals 0 for a device.
+ # NOTE(review): presumably 0 means the S.M.A.R.T. overall health check failed - confirm against the exporter.
+ - alert: SmartmonDeviceUnhealthy
+ expr: smartmon_device_smart_healthy == 0
+ # 0m: no pending period, the alert fires on the first evaluation that matches.
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host disks are unhealthy (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "S.M.A.R.T. reports unhealthy device {{ '{{' }} $labels.device {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }}.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ # Warn as soon as smartmon_device_errors is greater than 0 for a device.
+ # NOTE(review): presumably this is an error count reported by the disk - confirm against the exporter.
+ - alert: SmartmonDeviceErrors
+ expr: smartmon_device_errors > 0
+ # 0m: no pending period, the alert fires on the first evaluation that matches.
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host disk reports S.M.A.R.T. errors (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "S.M.A.R.T. reports errors for disk {{ '{{' }} $labels.device {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }}.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ # Raise a critical alert when a S.M.A.R.T. attribute value drops below its threshold.
+ # NOTE(review): the comparison only matches series pairs with identical label sets;
+ # confirm that both metrics are exported with the same labels.
+ - alert: SmartmonAttributeBelowThreshold
+ expr: smartmon_attr_value < smartmon_attr_threshold
+ # 0m: no pending period, the alert fires on the first evaluation that matches.
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: Host disk S.M.A.R.T. attribute is below its threshold (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "S.M.A.R.T. attribute {{ '{{' }} $labels.name {{ '}}' }} for disk {{ '{{' }} $labels.device {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }} has fallen below its threshold.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"