summaryrefslogtreecommitdiff
path: root/roles/monitoring
diff options
context:
space:
mode:
authorChristian Pointner <equinox@spreadspace.org>2021-09-24 15:05:48 +0200
committerChristian Pointner <equinox@spreadspace.org>2021-09-24 15:05:48 +0200
commit9a47d5c3ef94cb09338a1b64d4dc9365d526bb54 (patch)
tree86164b8dc69beb65a1e60ca70c4f52a08548b5e7 /roles/monitoring
parentfix some todos (diff)
refactor textfile collector script handling
Diffstat (limited to 'roles/monitoring')
-rw-r--r--roles/monitoring/prometheus/exporter/TODO4
-rw-r--r--roles/monitoring/prometheus/exporter/node/defaults/main.yml1
-rw-r--r--roles/monitoring/prometheus/exporter/node/files/smartmon391
-rw-r--r--roles/monitoring/prometheus/exporter/node/tasks/main.yml24
-rw-r--r--roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_script.yml2
-rw-r--r--roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.service.j23
-rw-r--r--roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.timer.j25
-rw-r--r--roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.service.j23
-rw-r--r--roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.timer.j25
-rw-r--r--roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.service.j229
-rw-r--r--roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.timer.j213
11 files changed, 447 insertions, 33 deletions
diff --git a/roles/monitoring/prometheus/exporter/TODO b/roles/monitoring/prometheus/exporter/TODO
index c02e5699..79ff8721 100644
--- a/roles/monitoring/prometheus/exporter/TODO
+++ b/roles/monitoring/prometheus/exporter/TODO
@@ -1,7 +1,3 @@
-Node Exporter - Text Collector Scripts:
- - https://github.com/prometheus-community/node-exporter-textfile-collector-scripts
- - https://packages.debian.org/bullseye/prometheus-node-exporter-collectors
-
IPMI Exporter:
- https://github.com/soundcloud/ipmi_exporter
- https://packages.debian.org/bullseye/prometheus-ipmi-exporter
diff --git a/roles/monitoring/prometheus/exporter/node/defaults/main.yml b/roles/monitoring/prometheus/exporter/node/defaults/main.yml
index 9309562f..870753c3 100644
--- a/roles/monitoring/prometheus/exporter/node/defaults/main.yml
+++ b/roles/monitoring/prometheus/exporter/node/defaults/main.yml
@@ -14,3 +14,4 @@ prometheus_exporter_node_extra_collectors:
prometheus_exporter_node_textfile_collector_scripts:
- deleted-libraries
+ - smartmon
diff --git a/roles/monitoring/prometheus/exporter/node/files/smartmon b/roles/monitoring/prometheus/exporter/node/files/smartmon
new file mode 100644
index 00000000..1c39b492
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/node/files/smartmon
@@ -0,0 +1,391 @@
+#!/usr/bin/env python3
+import argparse
+import collections
+import csv
+import datetime
+import decimal
+import re
+import shlex
+import subprocess
+import sys
+
# Splits one "Key: value" or "Key is: value" line into the named groups
# ``k`` (key) and ``v`` (value).
device_info_re = re.compile(
    r'^(?P<k>[^:]+?)(?:(?:\sis|):)\s*(?P<v>.*)$')

# Captures the error number of an "Error N [i] occurred" log entry.
ata_error_count_re = re.compile(
    r'^Error (\d+) \[\d+\] occurred', flags=re.MULTILINE)

# Matches a line that starts with "SMART" and ends in PASSED or OK.
self_test_re = re.compile(r'^SMART.*(PASSED|OK)$', flags=re.MULTILINE)

# Keys of the device information output mapped to the label names used on
# the ``device_info`` metric. Iteration order decides the label order.
device_info_map = {
    'Vendor': 'vendor',
    'Product': 'product',
    'Revision': 'revision',
    'Logical Unit id': 'lun_id',
    'Model Family': 'model_family',
    'Device Model': 'device_model',
    'Serial Number': 'serial_number',
    'Firmware Version': 'firmware_version',
}

# Lower-cased SMART attribute names that are exported; every other
# attribute is ignored. Kept in alphabetical order.
smart_attributes_whitelist = {
    'airflow_temperature_cel',
    'command_timeout',
    'current_pending_sector',
    'end_to_end_error',
    'erase_fail_count_total',
    'g_sense_error_rate',
    'hardware_ecc_recovered',
    'host_reads_32mib',
    'host_reads_mib',
    'host_writes_32mib',
    'host_writes_mib',
    'load_cycle_count',
    'media_wearout_indicator',
    'nand_writes_1gib',
    'offline_uncorrectable',
    'power_cycle_count',
    'power_on_hours',
    'program_fail_count',
    'raw_read_error_rate',
    'reallocated_event_count',
    'reallocated_sector_ct',
    'reported_uncorrect',
    'sata_downshift_count',
    'seek_error_rate',
    'spin_retry_count',
    'spin_up_time',
    'start_stop_count',
    'temperature_case',
    'temperature_celsius',
    'temperature_internal',
    'total_lbas_read',
    'total_lbas_written',
    'udma_crc_error_count',
    'unsafe_shutdown_count',
    'wear_leveling_count',
    'workld_host_reads_perc',
    'workld_media_wear_indic',
    'workload_minutes',
}

# One sample: metric name (without prefix), label dict and value.
Metric = collections.namedtuple('Metric', 'name labels value')

# One row of the SMART attribute table. The last field, ``raw_value``,
# receives all remaining columns of the row.
SmartAttribute = collections.namedtuple('SmartAttribute', [
    'id', 'name', 'flag', 'value', 'worst', 'threshold', 'type', 'updated',
    'when_failed', 'raw_value',
])
+
+
class Device(collections.namedtuple('DeviceBase', 'path opts')):
    """One device entry parsed from the smartctl scan output.

    ``path`` is the device path and ``opts`` the parsed scan options, which
    expose the device type as ``opts.type``.
    """

    @property
    def type(self):
        """Device type as given in the parsed scan options."""
        return self.opts.type

    @property
    def base_labels(self):
        """Labels attached to every metric reported for this device.

        ``disk`` is whatever follows the first '+' of the device type, or
        '0' when the type has no such suffix.
        """
        _, _, disk = self.type.partition('+')
        return {'device': self.path, 'disk': disk if disk else '0'}

    def smartctl_select(self):
        """Return the smartctl arguments that address this device."""
        return ['--device'] + [self.type, self.path]
+
+
def metric_key(metric, prefix=''):
    """Return the full metric name: ``prefix`` followed by ``metric.name``."""
    return '{prefix}{metric.name}'.format(prefix=prefix, metric=metric)


def _escape_label_value(value):
    """Escape a label value for the Prometheus text exposition format.

    Backslash, double quote and line feed are the characters that must be
    escaped inside a quoted label value.
    """
    # Backslash goes first so that the backslashes added by the other two
    # replacements are not escaped a second time.
    return (str(value)
            .replace('\\', '\\\\')
            .replace('"', '\\"')
            .replace('\n', '\\n'))


def metric_format(metric, prefix=''):
    """Render ``metric`` as one sample line of the text exposition format.

    Args:
        metric: Object with ``name``, ``labels`` (a dict) and ``value``.
        prefix: String put in front of the metric name.

    Returns:
        (str) ``<prefix><name>{<k>="<v>",...} <value>``.

    Raises:
        decimal.InvalidOperation: If ``metric.value`` is a string that
            cannot be converted to a decimal number.
    """
    key = metric_key(metric, prefix)
    labels = ','.join(
        '{k}="{v}"'.format(k=k, v=_escape_label_value(v))
        for k, v in metric.labels.items())
    # Decimal turns booleans into 1/0 and normalises numeric strings such
    # as '036'.
    value = decimal.Decimal(metric.value)

    return '{key}{{{labels}}} {value}'.format(
        key=key, labels=labels, value=value)
+
+
def metric_print_meta(metric, prefix=''):
    """Print the ``# HELP`` and ``# TYPE`` comment lines for ``metric``.

    Every metric is declared as a gauge.
    """
    key = metric_key(metric, prefix)
    help_line = '# HELP {key} SMART metric {metric.name}'.format(
        key=key, metric=metric)
    type_line = '# TYPE {key} gauge'.format(key=key)
    for line in (help_line, type_line):
        print(line)
+
+
def metric_print(metric, prefix=''):
    """Print ``metric`` as one formatted sample line on standard output."""
    sample_line = metric_format(metric, prefix)
    print(sample_line)
+
+
def smart_ctl(*args, check=True):
    """Run the smartctl binary and hand back what it wrote to stdout.

    Args:
        *args: Command line arguments passed to smartctl unchanged.
        check: Forwarded to ``subprocess.run``; when true, a non-zero exit
            status raises ``subprocess.CalledProcessError``.

    Returns:
        (str) Standard output of the smartctl process, decoded as UTF-8.
    """
    command = ['smartctl']
    command.extend(args)
    completed = subprocess.run(command, stdout=subprocess.PIPE, check=check)
    return completed.stdout.decode('utf-8')
+
+
def smart_ctl_version():
    """Return the smartctl version string.

    It is taken from the second whitespace-separated token of the first
    line printed by ``smartctl -V``.
    """
    first_line = smart_ctl('-V').split('\n')[0]
    return first_line.split()[1]
+
+
def find_devices():
    """Find SMART devices.

    Yields:
        (Device) One entry for every line of ``smartctl --scan-open`` output
        that still holds tokens after comments are removed.
    """
    option_parser = argparse.ArgumentParser()
    option_parser.add_argument('-d', '--device', dest='type')

    for raw_line in smart_ctl('--scan-open').split('\n'):
        line = raw_line.strip()
        # comments=True drops everything from a '#' onwards, so a line that
        # holds nothing but a comment yields no tokens and is skipped too.
        tokens = shlex.split(line, comments=True) if line else []
        if tokens:
            # First token is the device path, the rest are its options.
            yield Device(tokens[0], option_parser.parse_args(tokens[1:]))
+
+
def device_is_active(device):
    """Tell whether the given device is currently active.

    The device counts as inactive when ``smartctl --nocheck standby`` exits
    with a non-zero status.

    Args:
        device: (Device) Device in question.

    Returns:
        (bool) True if the device is active and False otherwise.
    """
    try:
        smart_ctl('--nocheck', 'standby', *device.smartctl_select())
    except subprocess.CalledProcessError:
        is_active = False
    else:
        is_active = True

    return is_active
+
+
def device_info(device):
    """Query a device for its basic model information.

    Runs ``smartctl --info``, drops the first three lines of the output and
    parses the remaining lines as key/value pairs. Lines that do not look
    like a key/value pair are ignored.

    Args:
        device: (Device) Device in question.

    Returns:
        (generator): Generator yielding two-element tuples:

            key (str): Key describing the value.
            value (str): Actual value.
    """
    output = smart_ctl('--info', *device.smartctl_select())
    body_lines = output.strip().split('\n')[3:]

    return (
        match.groups()
        for match in map(device_info_re.match, body_lines)
        if match is not None)
+
+
def device_smart_capabilities(device):
    """Report whether SMART is available and enabled on a device.

    Looks at every 'SMART support' entry of the device information and
    keeps the first word of each value.

    Args:
        device: (Device) Device in question.

    Returns:
        (tuple): tuple containing:

            (bool): True if SMART is available, False otherwise.
            (bool): True if SMART is enabled, False otherwise.
    """
    states = set()
    for key, value in device_info(device):
        if key == 'SMART support':
            states.add(value.split(' ', 1)[0])

    return 'Available' in states, 'Enabled' in states
+
+
def collect_device_info(device):
    """Collect basic device information.

    Args:
        device: (Device) Device in question.

    Yields:
        (Metric) A single ``device_info`` metric with value True whose labels
        are the device's base labels plus every mapped information key the
        device reported.
    """
    info = dict(device_info(device))

    labels = dict(device.base_labels)
    for info_key, label_name in device_info_map.items():
        if info_key in info:
            labels[label_name] = info[info_key]

    yield Metric('device_info', labels, True)
+
+
def collect_device_health_self_assessment(device):
    """Collect the result of the device health self assessment.

    Args:
        device: (Device) Device in question.

    Yields:
        (Metric) ``device_smart_healthy``, True when the ``smartctl --health``
        output contains a passing result line.
    """
    # The exit status is ignored on purpose (check=False); only the text
    # output decides the result.
    health_output = smart_ctl(
        '--health', *device.smartctl_select(), check=False)

    passed = self_test_re.search(health_output) is not None

    yield Metric('device_smart_healthy', device.base_labels, passed)
+
+
def collect_ata_metrics(device):
    """Collect the whitelisted SMART attributes of a device.

    Args:
        device: (Device) Device in question.

    Yields:
        (Metric) ``attr_value``, ``attr_worst``, ``attr_threshold`` and
        ``attr_raw_value`` for every whitelisted attribute, labelled with
        the lower-cased attribute name. Each name is reported only once.
    """
    # smartctl reports findings about the disk (such as attributes at or
    # below their threshold) through bits of its exit status, so a non-zero
    # status does not mean that the attribute table is missing. Raising on
    # it would abort the collector for exactly the disks that need
    # attention, hence check=False.
    attributes = smart_ctl(
        '--attributes', *device.smartctl_select(), check=False)

    # Replace multiple occurrences of whitespace with a single whitespace
    # so that the CSV parser recognizes individual columns properly.
    attributes = re.sub(r'[\t\x20]+', ' ', attributes)

    # Turn smartctl output into a list of lines and skip to the table of
    # SMART attributes.
    attribute_lines = attributes.strip().split('\n')[7:]

    # Some attributes have multiple IDs but have the same name. Don't
    # yield attributes that already have been reported before.
    seen = set()

    reader = csv.DictReader(
        (line.strip() for line in attribute_lines),
        fieldnames=SmartAttribute._fields[:-1],
        restkey=SmartAttribute._fields[-1], delimiter=' ')
    for entry in reader:
        # Rows shorter than the table layout leave missing columns as None;
        # treat a missing name as "not whitelisted" instead of crashing.
        name = (entry['name'] or '').lower()
        if name not in smart_attributes_whitelist or name in seen:
            continue

        # Keep only the leading number of the raw value. Values such as
        # "36 (Min/Max 24/40)" cannot be expressed as a metric value. The
        # raw value columns are absent when the row has no extra fields.
        raw_match = re.match(
            r'^(\d+)', ' '.join(entry.get('raw_value') or []))
        if not raw_match:
            continue
        entry['raw_value'] = raw_match.group(1)

        # Some device models report "---" in the threshold value where most
        # devices would report "000". Substitute it because the value has
        # to be convertible to a number further down.
        if entry['threshold'] == '---':
            entry['threshold'] = '0'

        labels = {
            'name': name,
            **device.base_labels,
        }

        for col in 'value', 'worst', 'threshold', 'raw_value':
            yield Metric(
                'attr_{col}'.format(col=col),
                labels, entry[col])

        seen.add(name)
+
+
def collect_ata_error_count(device):
    """Inspect the device error log and report the number of entries.

    The count is the error number of the first "Error N [i] occurred" line
    found in the ``smartctl -l xerror,1`` output, or 0 when there is none.

    Args:
        device: (Device) Device in question.

    Yields:
        (Metric) Device error count.
    """
    error_log = smart_ctl(
        '-l', 'xerror,1', *device.smartctl_select(), check=False)

    match = ata_error_count_re.search(error_log)
    if match is None:
        error_count = 0
    else:
        error_count = match.group(1)

    yield Metric('device_errors', device.base_labels, error_count)
+
+
def collect_disks_smart_metrics(wakeup_disks):
    """Collect all SMART metrics of every device found by smartctl.

    Args:
        wakeup_disks: (bool) When true, inactive devices are queried as
            well, which may spin them up.

    Yields:
        (Metric) Metrics of all devices, in collection order.
    """
    # Use an aware UTC datetime: calling timestamp() on the naive value
    # returned by utcnow() treats it as local time and yields a shifted
    # epoch on hosts that do not run in UTC.
    now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())

    for device in find_devices():
        yield Metric('smartctl_run', device.base_labels, now)

        is_active = device_is_active(device)

        yield Metric('device_active', device.base_labels, is_active)

        # Skip further metrics collection to prevent the disk from
        # spinning up.
        if not is_active and not wakeup_disks:
            continue

        yield from collect_device_info(device)

        smart_available, smart_enabled = device_smart_capabilities(device)

        yield Metric(
            'device_smart_available', device.base_labels, smart_available)
        yield Metric(
            'device_smart_enabled', device.base_labels, smart_enabled)

        # Stop here when the device does not report SMART as available;
        # the remaining collectors all query SMART data.
        if not smart_available:
            continue

        yield from collect_device_health_self_assessment(device)

        if device.type.startswith('sat'):
            yield from collect_ata_metrics(device)

            # NOTE(review): kept inside the ATA branch because it queries
            # the ATA error log; confirm against the original indentation.
            yield from collect_ata_error_count(device)
+
+
def main():
    """Print all SMART metrics in the text exposition format.

    The smartctl version metric is printed first. The device metrics follow
    sorted by metric name, each name preceded once by its HELP and TYPE
    lines. With ``-s``/``--wakeup-disks`` inactive disks are queried too.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '-s', '--wakeup-disks', dest='wakeup_disks', action='store_true')
    options = arg_parser.parse_args(sys.argv[1:])

    metric_prefix = 'smartmon_'

    version_metric = Metric(
        'smartctl_version', {'version': smart_ctl_version()}, True)
    metric_print_meta(version_metric, metric_prefix)
    metric_print(version_metric, metric_prefix)

    # Sorting groups the samples of one metric name together, so the meta
    # lines only have to be printed when the name changes.
    metrics = sorted(
        collect_disks_smart_metrics(options.wakeup_disks),
        key=lambda metric: metric.name)

    last_name = None
    for metric in metrics:
        if metric.name != last_name:
            metric_print_meta(metric, metric_prefix)
            last_name = metric.name

        metric_print(metric, metric_prefix)


if __name__ == '__main__':
    main()
diff --git a/roles/monitoring/prometheus/exporter/node/tasks/main.yml b/roles/monitoring/prometheus/exporter/node/tasks/main.yml
index 61e385f7..56903a33 100644
--- a/roles/monitoring/prometheus/exporter/node/tasks/main.yml
+++ b/roles/monitoring/prometheus/exporter/node/tasks/main.yml
@@ -2,7 +2,9 @@
## TODO: pin version
- name: install apt packages
apt:
- name: prom-exporter-node
+ name:
+ - prom-exporter-node
+ - moreutils
state: present
- name: create directory for textfile collector
@@ -34,27 +36,9 @@
- name: create directory for textfile collector scripts
file:
- path: /usr/local/lib/prometheus-node-exporter
+ path: /usr/local/share/prometheus-node-exporter
state: directory
-- name: install textfile collector script wrapper
- copy:
- content: |
- #!/bin/bash
-
- if [ -z "$1" ]; then
- echo "Please specify which collector script to call!"
- exit 1
- fi
- collector="$1"
-
- set -e
- rm -f "/var/lib/prometheus-node-exporter/textfile-collector/$collector.prom".*
- "/usr/local/lib/prometheus-node-exporter/$collector" > "/var/lib/prometheus-node-exporter/textfile-collector/$collector.prom.$$"
- mv "/var/lib/prometheus-node-exporter/textfile-collector/$collector.prom.$$" "/var/lib/prometheus-node-exporter/textfile-collector/$collector.prom"
- dest: /usr/local/lib/prometheus-node-exporter/run-collector
- mode: 0755
-
- name: install the apt textfile collector script
when: ansible_pkg_mgr == "apt"
vars:
diff --git a/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_script.yml b/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_script.yml
index 1a39bb4c..5c068fe7 100644
--- a/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_script.yml
+++ b/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_script.yml
@@ -2,7 +2,7 @@
- name: install the collector script
copy:
src: "{{ textfile_collector_name }}"
- dest: "/usr/local/lib/prometheus-node-exporter/{{ textfile_collector_name }}"
+ dest: "/usr/local/share/prometheus-node-exporter/{{ textfile_collector_name }}"
mode: 0755
- name: install systemd service units
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.service.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.service.j2
index b0e9d167..7eca94fb 100644
--- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.service.j2
+++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.service.j2
@@ -3,7 +3,8 @@ Description=Promethues node exporter textfile collector apt
[Service]
Type=oneshot
-ExecStart=/usr/local/lib/prometheus-node-exporter/run-collector apt
+Environment=TMPDIR=/var/lib/prometheus-node-exporter/textfile-collector
+ExecStart=bash -c "/usr/local/share/prometheus-node-exporter/apt | sponge /var/lib/prometheus-node-exporter/textfile-collector/apt.prom"
# systemd hardening-options
AmbientCapabilities=
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.timer.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.timer.j2
index b8a9c34e..dc473749 100644
--- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.timer.j2
+++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.timer.j2
@@ -2,9 +2,8 @@
Description=Promethues node exporter textfile collector apt
[Timer]
-OnBootSec=50s
-OnCalendar=*-*-* *:1/30:17
-AccuracySec=10s
+OnBootSec=10s
+OnUnitActiveSec=15min
[Install]
WantedBy=timers.target
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.service.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.service.j2
index 9dbc822f..7b15e558 100644
--- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.service.j2
+++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.service.j2
@@ -3,7 +3,8 @@ Description=Promethues node exporter textfile collector deleted-libraries
[Service]
Type=oneshot
-ExecStart=/usr/local/lib/prometheus-node-exporter/run-collector deleted-libraries
+Environment=TMPDIR=/var/lib/prometheus-node-exporter/textfile-collector
+ExecStart=bash -c "/usr/local/share/prometheus-node-exporter/deleted-libraries | sponge /var/lib/prometheus-node-exporter/textfile-collector/deleted-libraries.prom"
# systemd hardening-options
AmbientCapabilities=CAP_SYS_PTRACE
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.timer.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.timer.j2
index 1646ac73..c09acecf 100644
--- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.timer.j2
+++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.timer.j2
@@ -2,9 +2,8 @@
Description=Promethues node exporter textfile collector deleted-libraries
[Timer]
-OnBootSec=60s
-OnCalendar=*-*-* *:2/30:22
-AccuracySec=10s
+OnBootSec=20s
+OnUnitActiveSec=15min
[Install]
WantedBy=timers.target
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.service.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.service.j2
new file mode 100644
index 00000000..fc7c9f3f
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.service.j2
@@ -0,0 +1,29 @@
+[Unit]
+Description=Promethues node exporter textfile collector smartmon
+
+[Service]
+Type=oneshot
+Environment=TMPDIR=/var/lib/prometheus-node-exporter/textfile-collector
+Environment=LC_NUMERIC=C
+ExecStart=bash -c "/usr/local/share/prometheus-node-exporter/smartmon | sponge /var/lib/prometheus-node-exporter/textfile-collector/smartmon.prom"
+
+# systemd hardening-options
+AmbientCapabilities=
+CapabilityBoundingSet=
+LockPersonality=true
+MemoryDenyWriteExecute=true
+NoNewPrivileges=true
+PrivateTmp=true
+ProtectControlGroups=true
+ProtectHome=true
+ProtectKernelModules=true
+ProtectKernelTunables=true
+ProtectSystem=strict
+ReadWritePaths=/var/lib/prometheus-node-exporter/textfile-collector
+RemoveIPC=true
+RestrictNamespaces=true
+RestrictRealtime=true
+SystemCallArchitectures=native
+
+[Install]
+WantedBy=multi-user.target
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.timer.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.timer.j2
new file mode 100644
index 00000000..438da6b0
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.timer.j2
@@ -0,0 +1,13 @@
+[Unit]
+Description=Promethues node exporter textfile collector smartmon
+ConditionPathExists=/usr/sbin/smartctl
+ConditionPathExistsGlob=|/dev/sd*
+ConditionPathExistsGlob=|/dev/hd*
+ConditionPathExistsGlob=|/dev/nvme*
+
+[Timer]
+OnBootSec=30s
+OnUnitActiveSec=15min
+
+[Install]
+WantedBy=timers.target