summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2 17
-rw-r--r-- roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2 48
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_node.yml 13
3 files changed, 68 insertions, 10 deletions
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2
index aeddc903..b1a78dec 100644
--- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2
+++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2
@@ -30,15 +30,18 @@ def main():
if path not in processes_linking_deleted_libraries:
processes_linking_deleted_libraries[path] = {}
- if library in processes_linking_deleted_libraries[path]:
- processes_linking_deleted_libraries[path][library] += 1
- else:
- processes_linking_deleted_libraries[path][library] = 1
- except EnvironmentError as e:
+ if library in processes_linking_deleted_libraries[path]:
+ processes_linking_deleted_libraries[path][library] += 1
+ else:
+ processes_linking_deleted_libraries[path][library] = 1
+ except FileNotFoundError:
# Ignore non-existent files, since the files may have changed since
# we globbed.
- if e.errno != errno.ENOENT:
- sys.exit('Failed to open file: {0}'.format(path))
+ pass
+ except ProcessLookupError:
+ # If the process vanishes while we are collecting its linked libs,
+ # reading lines from the map file raises this error.
+ pass
num_processes_per_library = {}
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2
index b033faf0..2b60509c 100644
--- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2
+++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2
@@ -85,7 +85,7 @@ class Device(collections.namedtuple('DeviceBase', 'path opts')):
@property
def base_labels(self):
- return {'device': self.path, 'disk': self.type.partition('+')[2] or '0'}
+ return {'device': self.path, 'type': self.type}
def smartctl_select(self):
return ['--device', self.type, self.path]
@@ -203,6 +203,11 @@ def device_smart_capabilities(device):
(bool): True whenever SMART is available, False otherwise.
(bool): True whenever SMART is enabled, False otherwise.
"""
+
+ # NVME devices are SMART capable
+ if device.type == 'nvme':
+ return True, True
+
groups = device_info(device)
state = {
@@ -325,6 +330,44 @@ def collect_ata_error_count(device):
yield Metric('device_errors', device.base_labels, error_count)
+def collect_nvme_metrics(device):
+ # Fetch NVME metrics
+ attributes = smart_ctl(
+ '--attributes', *device.smartctl_select()
+ )
+
+ # replace multiple occurrences of whitespace with a single space
+ attributes = re.sub(r'[\t\x20]+', ' ', attributes)
+
+ # Turn smartctl output into a list of lines and skip to the table of
+ # SMART attributes.
+ attribute_lines = attributes.strip().split('\n')[6:]
+ for line in attribute_lines:
+ label, value = line.split(':')
+ if label == 'Available Spare':
+ yield Metric('available_spare_ratio', device.base_labels, value[0:-1])
+ elif label == 'Available Spare Threshold':
+ yield Metric('available_spare_threshold_ratio', device.base_labels, value[0:-1])
+ elif label == 'Percentage Used':
+ yield Metric('percentage_used_ratio', device.base_labels, value[0:-1])
+ elif label == 'Power Cycle':
+ yield Metric('power_cycles_total', device.base_labels, value)
+ elif label == 'Power On Hours':
+ yield Metric('power_on_hours_total', device.base_labels, value.replace(',', ''))
+ elif label == 'Temperature':
+ yield Metric('temperature_celcius', device.base_labels, value.replace(' Celsius', ''))
+ elif label == 'Unsafe Shutdowns':
+ yield Metric('unsafe_shutdowns_total', device.base_labels, value)
+ elif label == 'Media and Data Integrity Errors':
+ yield Metric('media_errors_total', device.base_labels, value)
+ elif label == 'Error Information Log Entries':
+ yield Metric('num_err_log_entries_total', device.base_labels, value)
+ elif label == 'Warning Comp. Temperature Time':
+ yield Metric('warning_temperature_time_total', device.base_labels, value)
+ elif label == 'Critical Comp. Temperature Time':
+ yield Metric('critical_temperature_time_total', device.base_labels, value)
+
+
def collect_disks_smart_metrics(wakeup_disks):
now = int(datetime.datetime.now().timestamp())
@@ -362,6 +405,9 @@ def collect_disks_smart_metrics(wakeup_disks):
yield from collect_ata_error_count(device)
+ if device.type == 'nvme':
+ yield from collect_nvme_metrics(device)
+
def main():
parser = argparse.ArgumentParser()
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
index 7de36758..2278c70a 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
@@ -2,6 +2,15 @@
## https://awesome-prometheus-alerts.grep.to/rules#host-and-hardware
prometheus_server_rules_node_extra: []
prometheus_server_rules_node:
+ - alert: NodeTextfileScrapeFailed
+ expr: node_textfile_scrape_error != 0
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Scraping metrics from textfiles failed (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "The textfile collector failed to scrape at least one file\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
- alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 2m
@@ -47,7 +56,7 @@ prometheus_server_rules_node:
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
- alert: HostOutOfInodes
- expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
+ expr: node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
for: 2m
labels:
severity: warning
@@ -56,7 +65,7 @@ prometheus_server_rules_node:
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
- alert: HostInodesWillFillIn24Hours
- expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
+ expr: node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
for: 2m
labels:
severity: warning