From cde5169221233788f32d6909688b5861349c952d Mon Sep 17 00:00:00 2001
From: Christian Pointner
Date: Wed, 20 Oct 2021 23:09:14 +0200
Subject: move prometheus node-exporter text collector scripts to templates

---
 .../monitoring/prometheus/exporter/node/files/apt |  40 ---
 .../exporter/node/files/deleted-libraries         |  75 ----
 .../prometheus/exporter/node/files/smartmon       | 391 ---------------------
 .../node/tasks/textfile_collector_generic.yml     |   4 +-
 .../templates/textfile-collector-scripts/apt.j2   |  40 +++
 .../deleted-libraries.j2                          |  75 ++++
 .../textfile-collector-scripts/smartmon.j2        | 391 +++++++++++++++++++++
 7 files changed, 508 insertions(+), 508 deletions(-)
 delete mode 100755 roles/monitoring/prometheus/exporter/node/files/apt
 delete mode 100755 roles/monitoring/prometheus/exporter/node/files/deleted-libraries
 delete mode 100644 roles/monitoring/prometheus/exporter/node/files/smartmon
 create mode 100644 roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.j2
 create mode 100644 roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2
 create mode 100644 roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2

diff --git a/roles/monitoring/prometheus/exporter/node/files/apt b/roles/monitoring/prometheus/exporter/node/files/apt
deleted file mode 100755
index 015addb0..00000000
--- a/roles/monitoring/prometheus/exporter/node/files/apt
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/bin/bash
-#
-# Description: Expose metrics from apt updates.
-#
-# Author: Ben Kochie
-
-upgrades="$(/usr/bin/apt-get --just-print dist-upgrade \
-  | /usr/bin/awk -F'[()]' \
-      '/^Inst/ { sub("^[^ ]+ ", "", $2); gsub(" ","",$2);
-                 sub("\\[", " ", $2); sub("\\]", "", $2); print $2 }' \
-  | /usr/bin/sort \
-  | /usr/bin/uniq -c \
-  | awk '{ gsub(/\\\\/, "\\\\", $2); gsub(/"/, "\\\"", $2);
-           gsub(/\[/, "", $3); gsub(/\]/, "", $3);
-           print "apt_upgrades_pending{origin=\"" $2 "\",arch=\"" $NF "\"} " $1}'
-)"
-
-autoremove="$(/usr/bin/apt-get --just-print autoremove \
-  | /usr/bin/awk '/^Remv/{a++}END{printf "apt_autoremove_pending %d", a}'
-)"
-
-echo '# HELP apt_upgrades_pending Apt package pending updates by origin.'
-echo '# TYPE apt_upgrades_pending gauge'
-if [[ -n "${upgrades}" ]] ; then
-  echo "${upgrades}"
-else
-  echo 'apt_upgrades_pending{origin="",arch=""} 0'
-fi
-
-echo '# HELP apt_autoremove_pending Apt package pending autoremove.'
-echo '# TYPE apt_autoremove_pending gauge'
-echo "${autoremove}"
-
-echo '# HELP node_reboot_required Node reboot is required for software updates.'
-echo '# TYPE node_reboot_required gauge'
-if [[ -f '/run/reboot-required' ]] ; then
-  echo 'node_reboot_required 1'
-else
-  echo 'node_reboot_required 0'
-fi
diff --git a/roles/monitoring/prometheus/exporter/node/files/deleted-libraries b/roles/monitoring/prometheus/exporter/node/files/deleted-libraries
deleted file mode 100755
index e3e19cbd..00000000
--- a/roles/monitoring/prometheus/exporter/node/files/deleted-libraries
+++ /dev/null
@@ -1,75 +0,0 @@
-#!/usr/bin/env python3
-"""
-Script to count the number of deleted libraries that are linked by running
-processes and expose a summary as Prometheus metrics.
-
-The aim is to discover processes that are still using libraries that have since
-been updated, perhaps due security vulnerabilities.
-""" - -import errno -import glob -import os -import sys - - -def main(): - processes_linking_deleted_libraries = {} - - for path in glob.glob('/proc/*/maps'): - try: - with open(path, 'rb') as file: - for line in file: - part = line.decode().strip().split() - - if len(part) == 7: - library = part[5] - comment = part[6] - - if '/lib/' in library and '(deleted)' in comment: - if path not in processes_linking_deleted_libraries: - processes_linking_deleted_libraries[path] = {} - - if library in processes_linking_deleted_libraries[path]: - processes_linking_deleted_libraries[path][library] += 1 - else: - processes_linking_deleted_libraries[path][library] = 1 - except EnvironmentError as e: - # Ignore non-existent files, since the files may have changed since - # we globbed. - if e.errno != errno.ENOENT: - sys.exit('Failed to open file: {0}'.format(path)) - - num_processes_per_library = {} - - for process, library_count in processes_linking_deleted_libraries.items(): - libraries_seen = set() - for library, count in library_count.items(): - if library in libraries_seen: - continue - - libraries_seen.add(library) - if library in num_processes_per_library: - num_processes_per_library[library] += 1 - else: - num_processes_per_library[library] = 1 - - metric_name = 'node_processes_linking_deleted_libraries' - description = 'Count of running processes that link a deleted library' - print('# HELP {0} {1}'.format(metric_name, description)) - print('# TYPE {0} gauge'.format(metric_name)) - - for library, count in num_processes_per_library.items(): - dir_path, basename = os.path.split(library) - basename = basename.replace('"', '\\"') - dir_path = dir_path.replace('"', '\\"') - print('{0}{{library_path="{1}", library_name="{2}"}} {3}'.format( - metric_name, - dir_path, - basename, - count) - ) - - -if __name__ == "__main__": - main() diff --git a/roles/monitoring/prometheus/exporter/node/files/smartmon b/roles/monitoring/prometheus/exporter/node/files/smartmon deleted file mode 100644 index fd03e45f..00000000 --- a/roles/monitoring/prometheus/exporter/node/files/smartmon +++ /dev/null @@ -1,391 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import collections -import csv -import datetime -import decimal -import re -import shlex -import subprocess -import sys - -device_info_re = re.compile(r'^(?P[^:]+?)(?:(?:\sis|):)\s*(?P.*)$') - -ata_error_count_re = re.compile( - r'^Error (\d+) \[\d+\] occurred', re.MULTILINE) - -self_test_re = re.compile(r'^SMART.*(PASSED|OK)$', re.MULTILINE) - -device_info_map = { - 'Vendor': 'vendor', - 'Product': 'product', - 'Revision': 'revision', - 'Logical Unit id': 'lun_id', - 'Model Family': 'model_family', - 'Device Model': 'device_model', - 'Serial Number': 'serial_number', - 'Firmware Version': 'firmware_version', -} - -smart_attributes_whitelist = { - 'airflow_temperature_cel', - 'command_timeout', - 'current_pending_sector', - 'end_to_end_error', - 'erase_fail_count_total', - 'g_sense_error_rate', - 'hardware_ecc_recovered', - 'host_reads_mib', - 'host_reads_32mib', - 'host_writes_mib', - 'host_writes_32mib', - 'load_cycle_count', - 'media_wearout_indicator', - 'wear_leveling_count', - 'nand_writes_1gib', - 'offline_uncorrectable', - 'power_cycle_count', - 'power_on_hours', - 'program_fail_count', - 'raw_read_error_rate', - 'reallocated_event_count', - 'reallocated_sector_ct', - 'reported_uncorrect', - 'sata_downshift_count', - 'seek_error_rate', - 'spin_retry_count', - 'spin_up_time', - 'start_stop_count', - 'temperature_case', - 'temperature_celsius', - 
'temperature_internal', - 'total_lbas_read', - 'total_lbas_written', - 'udma_crc_error_count', - 'unsafe_shutdown_count', - 'workld_host_reads_perc', - 'workld_media_wear_indic', - 'workload_minutes', -} - -Metric = collections.namedtuple('Metric', 'name labels value') - -SmartAttribute = collections.namedtuple('SmartAttribute', [ - 'id', 'name', 'flag', 'value', 'worst', 'threshold', 'type', 'updated', - 'when_failed', 'raw_value', -]) - - -class Device(collections.namedtuple('DeviceBase', 'path opts')): - """Representation of a device as found by smartctl --scan output.""" - - @property - def type(self): - return self.opts.type - - @property - def base_labels(self): - return {'device': self.path, 'disk': self.type.partition('+')[2] or '0'} - - def smartctl_select(self): - return ['--device', self.type, self.path] - - -def metric_key(metric, prefix=''): - return '{prefix}{metric.name}'.format(prefix=prefix, metric=metric) - - -def metric_format(metric, prefix=''): - key = metric_key(metric, prefix) - labels = ','.join( - '{k}="{v}"'.format(k=k, v=v.replace('"', '\\"')) for k, v in metric.labels.items()) - value = decimal.Decimal(metric.value) - - return '{key}{{{labels}}} {value}'.format( - key=key, labels=labels, value=value) - - -def metric_print_meta(metric, prefix=''): - key = metric_key(metric, prefix) - print('# HELP {key} SMART metric {metric.name}'.format( - key=key, metric=metric)) - print('# TYPE {key} gauge'.format(key=key)) - - -def metric_print(metric, prefix=''): - print(metric_format(metric, prefix)) - - -def smart_ctl(*args, check=True): - """Wrapper around invoking the smartctl binary. - - Returns: - (str) Data piped to stdout by the smartctl subprocess. - """ - return subprocess.run( - ['smartctl', *args], stdout=subprocess.PIPE, check=check - ).stdout.decode('utf-8') - - -def smart_ctl_version(): - return smart_ctl('-V').split('\n')[0].split()[1] - - -def find_devices(): - """Find SMART devices. - - Yields: - (Device) Single device found by smartctl. - """ - parser = argparse.ArgumentParser() - parser.add_argument('-d', '--device', dest='type') - - devices = smart_ctl('--scan-open') - - for device in devices.split('\n'): - device = device.strip() - if not device: - continue - - tokens = shlex.split(device, comments=True) - if not tokens: - continue - - yield Device(tokens[0], parser.parse_args(tokens[1:])) - - -def device_is_active(device): - """Returns whenever the given device is currently active or not. - - Args: - device: (Device) Device in question. - - Returns: - (bool) True if the device is active and False otherwise. - """ - try: - smart_ctl('--nocheck', 'standby', *device.smartctl_select()) - except subprocess.CalledProcessError: - return False - - return True - - -def device_info(device): - """Query device for basic model information. - - Args: - device: (Device) Device in question. - - Returns: - (generator): Generator yielding: - - key (str): Key describing the value. - value (str): Actual value. - """ - info_lines = smart_ctl( - '--info', *device.smartctl_select() - ).strip().split('\n')[3:] - - matches = (device_info_re.match(line) for line in info_lines) - return (m.groups() for m in matches if m is not None) - - -def device_smart_capabilities(device): - """Returns SMART capabilities of the given device. - - Args: - device: (Device) Device in question. - - Returns: - (tuple): tuple containing: - - (bool): True whenever SMART is available, False otherwise. - (bool): True whenever SMART is enabled, False otherwise. 
- """ - groups = device_info(device) - - state = { - g[1].split(' ', 1)[0] - for g in groups if g[0] == 'SMART support'} - - smart_available = 'Available' in state - smart_enabled = 'Enabled' in state - - return smart_available, smart_enabled - - -def collect_device_info(device): - """Collect basic device information. - - Args: - device: (Device) Device in question. - - Yields: - (Metric) metrics describing general device information. - """ - values = dict(device_info(device)) - yield Metric('device_info', { - **device.base_labels, - **{v: values[k] for k, v in device_info_map.items() if k in values} - }, True) - - -def collect_device_health_self_assessment(device): - """Collect metric about the device health self assessment. - - Args: - device: (Device) Device in question. - - Yields: - (Metric) Device health self assessment. - """ - out = smart_ctl('--health', *device.smartctl_select(), check=False) - - self_assessment_passed = bool(self_test_re.search(out)) - - yield Metric( - 'device_smart_healthy', device.base_labels, self_assessment_passed) - - -def collect_ata_metrics(device): - # Fetch SMART attributes for the given device. - attributes = smart_ctl( - '--attributes', *device.smartctl_select() - ) - - # replace multiple occurrences of whitespace with a single whitespace - # so that the CSV Parser recognizes individual columns properly. - attributes = re.sub(r'[\t\x20]+', ' ', attributes) - - # Turn smartctl output into a list of lines and skip to the table of - # SMART attributes. - attribute_lines = attributes.strip().split('\n')[7:] - - # Some attributes have multiple IDs but have the same name. Don't - # yield attributes that already have been reported before. - seen = set() - - reader = csv.DictReader( - (line.strip() for line in attribute_lines), - fieldnames=SmartAttribute._fields[:-1], - restkey=SmartAttribute._fields[-1], delimiter=' ') - for entry in reader: - # We're only interested in the SMART attributes that are - # whitelisted here. - entry['name'] = entry['name'].lower() - if entry['name'] not in smart_attributes_whitelist: - continue - - # Ensure that only the numeric parts are fetched from the raw_value. - # Attributes such as 194 Temperature_Celsius reported by my SSD - # are in the format of "36 (Min/Max 24/40)" which can't be expressed - # properly as a prometheus metric. - m = re.match(r'^(\d+)', ' '.join(entry['raw_value'])) - if not m: - continue - entry['raw_value'] = m.group(1) - - # Some device models report "---" in the threshold value where most - # devices would report "000". We do the substitution here because - # downstream code expects values to be convertable to integer. - if entry['threshold'] == '---': - entry['threshold'] = '0' - - if entry['name'] in smart_attributes_whitelist and entry['name'] not in seen: - labels = { - 'name': entry['name'], - **device.base_labels, - } - - for col in 'value', 'worst', 'threshold', 'raw_value': - yield Metric( - 'attr_{col}'.format(col=col), - labels, entry[col]) - - seen.add(entry['name']) - - -def collect_ata_error_count(device): - """Inspect the device error log and report the amount of entries. - - Args: - device: (Device) Device in question. - - Yields: - (Metric) Device error count. 
- """ - error_log = smart_ctl( - '-l', 'xerror,1', *device.smartctl_select(), check=False) - - m = ata_error_count_re.search(error_log) - - error_count = m.group(1) if m is not None else 0 - - yield Metric('device_errors', device.base_labels, error_count) - - -def collect_disks_smart_metrics(wakeup_disks): - now = int(datetime.datetime.now().timestamp()) - - for device in find_devices(): - yield Metric('smartctl_run', device.base_labels, now) - - is_active = device_is_active(device) - - yield Metric('device_active', device.base_labels, is_active) - - # Skip further metrics collection to prevent the disk from - # spinning up. - if not is_active and not wakeup_disks: - continue - - yield from collect_device_info(device) - - smart_available, smart_enabled = device_smart_capabilities(device) - - yield Metric( - 'device_smart_available', device.base_labels, smart_available) - yield Metric( - 'device_smart_enabled', device.base_labels, smart_enabled) - - # Skip further metrics collection here if SMART is disabled - # on the device. Further smartctl invocations would fail - # anyways. - if not smart_available: - continue - - yield from collect_device_health_self_assessment(device) - - if device.type.startswith('sat'): - yield from collect_ata_metrics(device) - - yield from collect_ata_error_count(device) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('-s', '--wakeup-disks', dest='wakeup_disks', action='store_true') - args = parser.parse_args(sys.argv[1:]) - - version_metric = Metric('smartctl_version', { - 'version': smart_ctl_version() - }, True) - metric_print_meta(version_metric, 'smartmon_') - metric_print(version_metric, 'smartmon_') - - metrics = list(collect_disks_smart_metrics(args.wakeup_disks)) - metrics.sort(key=lambda i: i.name) - - previous_name = None - for m in metrics: - if m.name != previous_name: - metric_print_meta(m, 'smartmon_') - - previous_name = m.name - - metric_print(m, 'smartmon_') - - -if __name__ == '__main__': - main() diff --git a/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_generic.yml b/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_generic.yml index 5c068fe7..e5bf2ce5 100644 --- a/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_generic.yml +++ b/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_generic.yml @@ -1,7 +1,7 @@ --- - name: install the collector script - copy: - src: "{{ textfile_collector_name }}" + template: + src: "textfile-collector-scripts/{{ textfile_collector_name }}.j2" dest: "/usr/local/share/prometheus-node-exporter/{{ textfile_collector_name }}" mode: 0755 diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.j2 new file mode 100644 index 00000000..015addb0 --- /dev/null +++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.j2 @@ -0,0 +1,40 @@ +#!/bin/bash +# +# Description: Expose metrics from apt updates. 
+# +# Author: Ben Kochie + +upgrades="$(/usr/bin/apt-get --just-print dist-upgrade \ + | /usr/bin/awk -F'[()]' \ + '/^Inst/ { sub("^[^ ]+ ", "", $2); gsub(" ","",$2); + sub("\\[", " ", $2); sub("\\]", "", $2); print $2 }' \ + | /usr/bin/sort \ + | /usr/bin/uniq -c \ + | awk '{ gsub(/\\\\/, "\\\\", $2); gsub(/"/, "\\\"", $2); + gsub(/\[/, "", $3); gsub(/\]/, "", $3); + print "apt_upgrades_pending{origin=\"" $2 "\",arch=\"" $NF "\"} " $1}' +)" + +autoremove="$(/usr/bin/apt-get --just-print autoremove \ + | /usr/bin/awk '/^Remv/{a++}END{printf "apt_autoremove_pending %d", a}' +)" + +echo '# HELP apt_upgrades_pending Apt package pending updates by origin.' +echo '# TYPE apt_upgrades_pending gauge' +if [[ -n "${upgrades}" ]] ; then + echo "${upgrades}" +else + echo 'apt_upgrades_pending{origin="",arch=""} 0' +fi + +echo '# HELP apt_autoremove_pending Apt package pending autoremove.' +echo '# TYPE apt_autoremove_pending gauge' +echo "${autoremove}" + +echo '# HELP node_reboot_required Node reboot is required for software updates.' +echo '# TYPE node_reboot_required gauge' +if [[ -f '/run/reboot-required' ]] ; then + echo 'node_reboot_required 1' +else + echo 'node_reboot_required 0' +fi diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2 new file mode 100644 index 00000000..aeddc903 --- /dev/null +++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2 @@ -0,0 +1,75 @@ +#!/usr/bin/env {{ python_basename }} +""" +Script to count the number of deleted libraries that are linked by running +processes and expose a summary as Prometheus metrics. + +The aim is to discover processes that are still using libraries that have since +been updated, perhaps due security vulnerabilities. +""" + +import errno +import glob +import os +import sys + + +def main(): + processes_linking_deleted_libraries = {} + + for path in glob.glob('/proc/*/maps'): + try: + with open(path, 'rb') as file: + for line in file: + part = line.decode().strip().split() + + if len(part) == 7: + library = part[5] + comment = part[6] + + if '/lib/' in library and '(deleted)' in comment: + if path not in processes_linking_deleted_libraries: + processes_linking_deleted_libraries[path] = {} + + if library in processes_linking_deleted_libraries[path]: + processes_linking_deleted_libraries[path][library] += 1 + else: + processes_linking_deleted_libraries[path][library] = 1 + except EnvironmentError as e: + # Ignore non-existent files, since the files may have changed since + # we globbed. 
+ if e.errno != errno.ENOENT: + sys.exit('Failed to open file: {0}'.format(path)) + + num_processes_per_library = {} + + for process, library_count in processes_linking_deleted_libraries.items(): + libraries_seen = set() + for library, count in library_count.items(): + if library in libraries_seen: + continue + + libraries_seen.add(library) + if library in num_processes_per_library: + num_processes_per_library[library] += 1 + else: + num_processes_per_library[library] = 1 + + metric_name = 'node_processes_linking_deleted_libraries' + description = 'Count of running processes that link a deleted library' + print('# HELP {0} {1}'.format(metric_name, description)) + print('# TYPE {0} gauge'.format(metric_name)) + + for library, count in num_processes_per_library.items(): + dir_path, basename = os.path.split(library) + basename = basename.replace('"', '\\"') + dir_path = dir_path.replace('"', '\\"') + print('{0}{{ '{{' }}library_path="{1}", library_name="{2}"{{ '}}' }} {3}'.format( + metric_name, + dir_path, + basename, + count) + ) + + +if __name__ == "__main__": + main() diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2 new file mode 100644 index 00000000..b033faf0 --- /dev/null +++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2 @@ -0,0 +1,391 @@ +#!/usr/bin/env {{ python_basename }} +import argparse +import collections +import csv +import datetime +import decimal +import re +import shlex +import subprocess +import sys + +device_info_re = re.compile(r'^(?P[^:]+?)(?:(?:\sis|):)\s*(?P.*)$') + +ata_error_count_re = re.compile( + r'^Error (\d+) \[\d+\] occurred', re.MULTILINE) + +self_test_re = re.compile(r'^SMART.*(PASSED|OK)$', re.MULTILINE) + +device_info_map = { + 'Vendor': 'vendor', + 'Product': 'product', + 'Revision': 'revision', + 'Logical Unit id': 'lun_id', + 'Model Family': 'model_family', + 'Device Model': 'device_model', + 'Serial Number': 'serial_number', + 'Firmware Version': 'firmware_version', +} + +smart_attributes_whitelist = { + 'airflow_temperature_cel', + 'command_timeout', + 'current_pending_sector', + 'end_to_end_error', + 'erase_fail_count_total', + 'g_sense_error_rate', + 'hardware_ecc_recovered', + 'host_reads_mib', + 'host_reads_32mib', + 'host_writes_mib', + 'host_writes_32mib', + 'load_cycle_count', + 'media_wearout_indicator', + 'wear_leveling_count', + 'nand_writes_1gib', + 'offline_uncorrectable', + 'power_cycle_count', + 'power_on_hours', + 'program_fail_count', + 'raw_read_error_rate', + 'reallocated_event_count', + 'reallocated_sector_ct', + 'reported_uncorrect', + 'sata_downshift_count', + 'seek_error_rate', + 'spin_retry_count', + 'spin_up_time', + 'start_stop_count', + 'temperature_case', + 'temperature_celsius', + 'temperature_internal', + 'total_lbas_read', + 'total_lbas_written', + 'udma_crc_error_count', + 'unsafe_shutdown_count', + 'workld_host_reads_perc', + 'workld_media_wear_indic', + 'workload_minutes', +} + +Metric = collections.namedtuple('Metric', 'name labels value') + +SmartAttribute = collections.namedtuple('SmartAttribute', [ + 'id', 'name', 'flag', 'value', 'worst', 'threshold', 'type', 'updated', + 'when_failed', 'raw_value', +]) + + +class Device(collections.namedtuple('DeviceBase', 'path opts')): + """Representation of a device as found by smartctl --scan output.""" + + @property + def type(self): + return self.opts.type + + @property + def 
base_labels(self): + return {'device': self.path, 'disk': self.type.partition('+')[2] or '0'} + + def smartctl_select(self): + return ['--device', self.type, self.path] + + +def metric_key(metric, prefix=''): + return '{prefix}{metric.name}'.format(prefix=prefix, metric=metric) + + +def metric_format(metric, prefix=''): + key = metric_key(metric, prefix) + labels = ','.join( + '{k}="{v}"'.format(k=k, v=v.replace('"', '\\"')) for k, v in metric.labels.items()) + value = decimal.Decimal(metric.value) + + return '{key}{{ '{{{' }}labels{{ '}}}' }} {value}'.format( + key=key, labels=labels, value=value) + + +def metric_print_meta(metric, prefix=''): + key = metric_key(metric, prefix) + print('# HELP {key} SMART metric {metric.name}'.format( + key=key, metric=metric)) + print('# TYPE {key} gauge'.format(key=key)) + + +def metric_print(metric, prefix=''): + print(metric_format(metric, prefix)) + + +def smart_ctl(*args, check=True): + """Wrapper around invoking the smartctl binary. + + Returns: + (str) Data piped to stdout by the smartctl subprocess. + """ + return subprocess.run( + ['smartctl', *args], stdout=subprocess.PIPE, check=check + ).stdout.decode('utf-8') + + +def smart_ctl_version(): + return smart_ctl('-V').split('\n')[0].split()[1] + + +def find_devices(): + """Find SMART devices. + + Yields: + (Device) Single device found by smartctl. + """ + parser = argparse.ArgumentParser() + parser.add_argument('-d', '--device', dest='type') + + devices = smart_ctl('--scan-open') + + for device in devices.split('\n'): + device = device.strip() + if not device: + continue + + tokens = shlex.split(device, comments=True) + if not tokens: + continue + + yield Device(tokens[0], parser.parse_args(tokens[1:])) + + +def device_is_active(device): + """Returns whenever the given device is currently active or not. + + Args: + device: (Device) Device in question. + + Returns: + (bool) True if the device is active and False otherwise. + """ + try: + smart_ctl('--nocheck', 'standby', *device.smartctl_select()) + except subprocess.CalledProcessError: + return False + + return True + + +def device_info(device): + """Query device for basic model information. + + Args: + device: (Device) Device in question. + + Returns: + (generator): Generator yielding: + + key (str): Key describing the value. + value (str): Actual value. + """ + info_lines = smart_ctl( + '--info', *device.smartctl_select() + ).strip().split('\n')[3:] + + matches = (device_info_re.match(line) for line in info_lines) + return (m.groups() for m in matches if m is not None) + + +def device_smart_capabilities(device): + """Returns SMART capabilities of the given device. + + Args: + device: (Device) Device in question. + + Returns: + (tuple): tuple containing: + + (bool): True whenever SMART is available, False otherwise. + (bool): True whenever SMART is enabled, False otherwise. + """ + groups = device_info(device) + + state = { + g[1].split(' ', 1)[0] + for g in groups if g[0] == 'SMART support'} + + smart_available = 'Available' in state + smart_enabled = 'Enabled' in state + + return smart_available, smart_enabled + + +def collect_device_info(device): + """Collect basic device information. + + Args: + device: (Device) Device in question. + + Yields: + (Metric) metrics describing general device information. 
+ """ + values = dict(device_info(device)) + yield Metric('device_info', { + **device.base_labels, + **{v: values[k] for k, v in device_info_map.items() if k in values} + }, True) + + +def collect_device_health_self_assessment(device): + """Collect metric about the device health self assessment. + + Args: + device: (Device) Device in question. + + Yields: + (Metric) Device health self assessment. + """ + out = smart_ctl('--health', *device.smartctl_select(), check=False) + + self_assessment_passed = bool(self_test_re.search(out)) + + yield Metric( + 'device_smart_healthy', device.base_labels, self_assessment_passed) + + +def collect_ata_metrics(device): + # Fetch SMART attributes for the given device. + attributes = smart_ctl( + '--attributes', *device.smartctl_select() + ) + + # replace multiple occurrences of whitespace with a single whitespace + # so that the CSV Parser recognizes individual columns properly. + attributes = re.sub(r'[\t\x20]+', ' ', attributes) + + # Turn smartctl output into a list of lines and skip to the table of + # SMART attributes. + attribute_lines = attributes.strip().split('\n')[7:] + + # Some attributes have multiple IDs but have the same name. Don't + # yield attributes that already have been reported before. + seen = set() + + reader = csv.DictReader( + (line.strip() for line in attribute_lines), + fieldnames=SmartAttribute._fields[:-1], + restkey=SmartAttribute._fields[-1], delimiter=' ') + for entry in reader: + # We're only interested in the SMART attributes that are + # whitelisted here. + entry['name'] = entry['name'].lower() + if entry['name'] not in smart_attributes_whitelist: + continue + + # Ensure that only the numeric parts are fetched from the raw_value. + # Attributes such as 194 Temperature_Celsius reported by my SSD + # are in the format of "36 (Min/Max 24/40)" which can't be expressed + # properly as a prometheus metric. + m = re.match(r'^(\d+)', ' '.join(entry['raw_value'])) + if not m: + continue + entry['raw_value'] = m.group(1) + + # Some device models report "---" in the threshold value where most + # devices would report "000". We do the substitution here because + # downstream code expects values to be convertable to integer. + if entry['threshold'] == '---': + entry['threshold'] = '0' + + if entry['name'] in smart_attributes_whitelist and entry['name'] not in seen: + labels = { + 'name': entry['name'], + **device.base_labels, + } + + for col in 'value', 'worst', 'threshold', 'raw_value': + yield Metric( + 'attr_{col}'.format(col=col), + labels, entry[col]) + + seen.add(entry['name']) + + +def collect_ata_error_count(device): + """Inspect the device error log and report the amount of entries. + + Args: + device: (Device) Device in question. + + Yields: + (Metric) Device error count. + """ + error_log = smart_ctl( + '-l', 'xerror,1', *device.smartctl_select(), check=False) + + m = ata_error_count_re.search(error_log) + + error_count = m.group(1) if m is not None else 0 + + yield Metric('device_errors', device.base_labels, error_count) + + +def collect_disks_smart_metrics(wakeup_disks): + now = int(datetime.datetime.now().timestamp()) + + for device in find_devices(): + yield Metric('smartctl_run', device.base_labels, now) + + is_active = device_is_active(device) + + yield Metric('device_active', device.base_labels, is_active) + + # Skip further metrics collection to prevent the disk from + # spinning up. 
+ if not is_active and not wakeup_disks: + continue + + yield from collect_device_info(device) + + smart_available, smart_enabled = device_smart_capabilities(device) + + yield Metric( + 'device_smart_available', device.base_labels, smart_available) + yield Metric( + 'device_smart_enabled', device.base_labels, smart_enabled) + + # Skip further metrics collection here if SMART is disabled + # on the device. Further smartctl invocations would fail + # anyways. + if not smart_available: + continue + + yield from collect_device_health_self_assessment(device) + + if device.type.startswith('sat'): + yield from collect_ata_metrics(device) + + yield from collect_ata_error_count(device) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-s', '--wakeup-disks', dest='wakeup_disks', action='store_true') + args = parser.parse_args(sys.argv[1:]) + + version_metric = Metric('smartctl_version', { + 'version': smart_ctl_version() + }, True) + metric_print_meta(version_metric, 'smartmon_') + metric_print(version_metric, 'smartmon_') + + metrics = list(collect_disks_smart_metrics(args.wakeup_disks)) + metrics.sort(key=lambda i: i.name) + + previous_name = None + for m in metrics: + if m.name != previous_name: + metric_print_meta(m, 'smartmon_') + + previous_name = m.name + + metric_print(m, 'smartmon_') + + +if __name__ == '__main__': + main() -- cgit v1.2.3
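With the collector scripts now rendered through Ansible's template module (see the textfile_collector_generic.yml change above), the two Python collectors take their interpreter from the python_basename variable, and any literal Jinja2 delimiters inside a script have to be escaped; that is why deleted-libraries.j2 and smartmon.j2 wrap the brace escapes of their Python format strings in {{ '{{' }} / {{ '}}' }} expressions. A minimal sketch of that escaping pattern follows, using a hypothetical template name and assuming python_basename is defined elsewhere in the role:

#!/usr/bin/env {{ python_basename }}
# hypothetical-example.j2 -- illustration only, not part of this commit.
# The Python source needs literal "{{" / "}}" (str.format brace escapes); written
# bare they would be parsed by Jinja2, so the template emits them as string literals:
print('up{{ '{{' }}job="node"{{ '}}' }} 1'.format())

Rendered with python_basename set to python3, the last line becomes print('up{{job="node"}} 1'.format()), which prints up{job="node"} 1 when the collector runs, the same trick the two larger templates rely on.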