From cde5169221233788f32d6909688b5861349c952d Mon Sep 17 00:00:00 2001
From: Christian Pointner
Date: Wed, 20 Oct 2021 23:09:14 +0200
Subject: move prometheus node-exporter text collector scripts to templates

---
 .../monitoring/prometheus/exporter/node/files/apt |  40 ---
 .../exporter/node/files/deleted-libraries         |  75 ----
 .../prometheus/exporter/node/files/smartmon       | 391 ---------------------
 .../node/tasks/textfile_collector_generic.yml     |   4 +-
 .../templates/textfile-collector-scripts/apt.j2   |  40 +++
 .../deleted-libraries.j2                          |  75 ++++
 .../textfile-collector-scripts/smartmon.j2        | 391 +++++++++++++++++++++
 7 files changed, 508 insertions(+), 508 deletions(-)
 delete mode 100755 roles/monitoring/prometheus/exporter/node/files/apt
 delete mode 100755 roles/monitoring/prometheus/exporter/node/files/deleted-libraries
 delete mode 100644 roles/monitoring/prometheus/exporter/node/files/smartmon
 create mode 100644 roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.j2
 create mode 100644 roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2
 create mode 100644 roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2

diff --git a/roles/monitoring/prometheus/exporter/node/files/apt b/roles/monitoring/prometheus/exporter/node/files/apt
deleted file mode 100755
index 015addb0..00000000
--- a/roles/monitoring/prometheus/exporter/node/files/apt
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/bin/bash
-#
-# Description: Expose metrics from apt updates.
-#
-# Author: Ben Kochie
-
-upgrades="$(/usr/bin/apt-get --just-print dist-upgrade \
-  | /usr/bin/awk -F'[()]' \
-      '/^Inst/ { sub("^[^ ]+ ", "", $2); gsub(" ","",$2);
-                 sub("\\[", " ", $2); sub("\\]", "", $2); print $2 }' \
-  | /usr/bin/sort \
-  | /usr/bin/uniq -c \
-  | awk '{ gsub(/\\\\/, "\\\\", $2); gsub(/"/, "\\\"", $2);
-           gsub(/\[/, "", $3); gsub(/\]/, "", $3);
-           print "apt_upgrades_pending{origin=\"" $2 "\",arch=\"" $NF "\"} " $1}'
-)"
-
-autoremove="$(/usr/bin/apt-get --just-print autoremove \
-  | /usr/bin/awk '/^Remv/{a++}END{printf "apt_autoremove_pending %d", a}'
-)"
-
-echo '# HELP apt_upgrades_pending Apt package pending updates by origin.'
-echo '# TYPE apt_upgrades_pending gauge'
-if [[ -n "${upgrades}" ]] ; then
-  echo "${upgrades}"
-else
-  echo 'apt_upgrades_pending{origin="",arch=""} 0'
-fi
-
-echo '# HELP apt_autoremove_pending Apt package pending autoremove.'
-echo '# TYPE apt_autoremove_pending gauge'
-echo "${autoremove}"
-
-echo '# HELP node_reboot_required Node reboot is required for software updates.'
-echo '# TYPE node_reboot_required gauge'
-if [[ -f '/run/reboot-required' ]] ; then
-  echo 'node_reboot_required 1'
-else
-  echo 'node_reboot_required 0'
-fi
diff --git a/roles/monitoring/prometheus/exporter/node/files/deleted-libraries b/roles/monitoring/prometheus/exporter/node/files/deleted-libraries
deleted file mode 100755
index e3e19cbd..00000000
--- a/roles/monitoring/prometheus/exporter/node/files/deleted-libraries
+++ /dev/null
@@ -1,75 +0,0 @@
-#!/usr/bin/env python3
-"""
-Script to count the number of deleted libraries that are linked by running
-processes and expose a summary as Prometheus metrics.
-
-The aim is to discover processes that are still using libraries that have since
-been updated, perhaps due security vulnerabilities.
-""" - -import errno -import glob -import os -import sys - - -def main(): - processes_linking_deleted_libraries = {} - - for path in glob.glob('/proc/*/maps'): - try: - with open(path, 'rb') as file: - for line in file: - part = line.decode().strip().split() - - if len(part) == 7: - library = part[5] - comment = part[6] - - if '/lib/' in library and '(deleted)' in comment: - if path not in processes_linking_deleted_libraries: - processes_linking_deleted_libraries[path] = {} - - if library in processes_linking_deleted_libraries[path]: - processes_linking_deleted_libraries[path][library] += 1 - else: - processes_linking_deleted_libraries[path][library] = 1 - except EnvironmentError as e: - # Ignore non-existent files, since the files may have changed since - # we globbed. - if e.errno != errno.ENOENT: - sys.exit('Failed to open file: {0}'.format(path)) - - num_processes_per_library = {} - - for process, library_count in processes_linking_deleted_libraries.items(): - libraries_seen = set() - for library, count in library_count.items(): - if library in libraries_seen: - continue - - libraries_seen.add(library) - if library in num_processes_per_library: - num_processes_per_library[library] += 1 - else: - num_processes_per_library[library] = 1 - - metric_name = 'node_processes_linking_deleted_libraries' - description = 'Count of running processes that link a deleted library' - print('# HELP {0} {1}'.format(metric_name, description)) - print('# TYPE {0} gauge'.format(metric_name)) - - for library, count in num_processes_per_library.items(): - dir_path, basename = os.path.split(library) - basename = basename.replace('"', '\\"') - dir_path = dir_path.replace('"', '\\"') - print('{0}{{library_path="{1}", library_name="{2}"}} {3}'.format( - metric_name, - dir_path, - basename, - count) - ) - - -if __name__ == "__main__": - main() diff --git a/roles/monitoring/prometheus/exporter/node/files/smartmon b/roles/monitoring/prometheus/exporter/node/files/smartmon deleted file mode 100644 index fd03e45f..00000000 --- a/roles/monitoring/prometheus/exporter/node/files/smartmon +++ /dev/null @@ -1,391 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import collections -import csv -import datetime -import decimal -import re -import shlex -import subprocess -import sys - -device_info_re = re.compile(r'^(?P[^:]+?)(?:(?:\sis|):)\s*(?P.*)$') - -ata_error_count_re = re.compile( - r'^Error (\d+) \[\d+\] occurred', re.MULTILINE) - -self_test_re = re.compile(r'^SMART.*(PASSED|OK)$', re.MULTILINE) - -device_info_map = { - 'Vendor': 'vendor', - 'Product': 'product', - 'Revision': 'revision', - 'Logical Unit id': 'lun_id', - 'Model Family': 'model_family', - 'Device Model': 'device_model', - 'Serial Number': 'serial_number', - 'Firmware Version': 'firmware_version', -} - -smart_attributes_whitelist = { - 'airflow_temperature_cel', - 'command_timeout', - 'current_pending_sector', - 'end_to_end_error', - 'erase_fail_count_total', - 'g_sense_error_rate', - 'hardware_ecc_recovered', - 'host_reads_mib', - 'host_reads_32mib', - 'host_writes_mib', - 'host_writes_32mib', - 'load_cycle_count', - 'media_wearout_indicator', - 'wear_leveling_count', - 'nand_writes_1gib', - 'offline_uncorrectable', - 'power_cycle_count', - 'power_on_hours', - 'program_fail_count', - 'raw_read_error_rate', - 'reallocated_event_count', - 'reallocated_sector_ct', - 'reported_uncorrect', - 'sata_downshift_count', - 'seek_error_rate', - 'spin_retry_count', - 'spin_up_time', - 'start_stop_count', - 'temperature_case', - 'temperature_celsius', - 
'temperature_internal', - 'total_lbas_read', - 'total_lbas_written', - 'udma_crc_error_count', - 'unsafe_shutdown_count', - 'workld_host_reads_perc', - 'workld_media_wear_indic', - 'workload_minutes', -} - -Metric = collections.namedtuple('Metric', 'name labels value') - -SmartAttribute = collections.namedtuple('SmartAttribute', [ - 'id', 'name', 'flag', 'value', 'worst', 'threshold', 'type', 'updated', - 'when_failed', 'raw_value', -]) - - -class Device(collections.namedtuple('DeviceBase', 'path opts')): - """Representation of a device as found by smartctl --scan output.""" - - @property - def type(self): - return self.opts.type - - @property - def base_labels(self): - return {'device': self.path, 'disk': self.type.partition('+')[2] or '0'} - - def smartctl_select(self): - return ['--device', self.type, self.path] - - -def metric_key(metric, prefix=''): - return '{prefix}{metric.name}'.format(prefix=prefix, metric=metric) - - -def metric_format(metric, prefix=''): - key = metric_key(metric, prefix) - labels = ','.join( - '{k}="{v}"'.format(k=k, v=v.replace('"', '\\"')) for k, v in metric.labels.items()) - value = decimal.Decimal(metric.value) - - return '{key}{{{labels}}} {value}'.format( - key=key, labels=labels, value=value) - - -def metric_print_meta(metric, prefix=''): - key = metric_key(metric, prefix) - print('# HELP {key} SMART metric {metric.name}'.format( - key=key, metric=metric)) - print('# TYPE {key} gauge'.format(key=key)) - - -def metric_print(metric, prefix=''): - print(metric_format(metric, prefix)) - - -def smart_ctl(*args, check=True): - """Wrapper around invoking the smartctl binary. - - Returns: - (str) Data piped to stdout by the smartctl subprocess. - """ - return subprocess.run( - ['smartctl', *args], stdout=subprocess.PIPE, check=check - ).stdout.decode('utf-8') - - -def smart_ctl_version(): - return smart_ctl('-V').split('\n')[0].split()[1] - - -def find_devices(): - """Find SMART devices. - - Yields: - (Device) Single device found by smartctl. - """ - parser = argparse.ArgumentParser() - parser.add_argument('-d', '--device', dest='type') - - devices = smart_ctl('--scan-open') - - for device in devices.split('\n'): - device = device.strip() - if not device: - continue - - tokens = shlex.split(device, comments=True) - if not tokens: - continue - - yield Device(tokens[0], parser.parse_args(tokens[1:])) - - -def device_is_active(device): - """Returns whenever the given device is currently active or not. - - Args: - device: (Device) Device in question. - - Returns: - (bool) True if the device is active and False otherwise. - """ - try: - smart_ctl('--nocheck', 'standby', *device.smartctl_select()) - except subprocess.CalledProcessError: - return False - - return True - - -def device_info(device): - """Query device for basic model information. - - Args: - device: (Device) Device in question. - - Returns: - (generator): Generator yielding: - - key (str): Key describing the value. - value (str): Actual value. - """ - info_lines = smart_ctl( - '--info', *device.smartctl_select() - ).strip().split('\n')[3:] - - matches = (device_info_re.match(line) for line in info_lines) - return (m.groups() for m in matches if m is not None) - - -def device_smart_capabilities(device): - """Returns SMART capabilities of the given device. - - Args: - device: (Device) Device in question. - - Returns: - (tuple): tuple containing: - - (bool): True whenever SMART is available, False otherwise. - (bool): True whenever SMART is enabled, False otherwise. 
- """ - groups = device_info(device) - - state = { - g[1].split(' ', 1)[0] - for g in groups if g[0] == 'SMART support'} - - smart_available = 'Available' in state - smart_enabled = 'Enabled' in state - - return smart_available, smart_enabled - - -def collect_device_info(device): - """Collect basic device information. - - Args: - device: (Device) Device in question. - - Yields: - (Metric) metrics describing general device information. - """ - values = dict(device_info(device)) - yield Metric('device_info', { - **device.base_labels, - **{v: values[k] for k, v in device_info_map.items() if k in values} - }, True) - - -def collect_device_health_self_assessment(device): - """Collect metric about the device health self assessment. - - Args: - device: (Device) Device in question. - - Yields: - (Metric) Device health self assessment. - """ - out = smart_ctl('--health', *device.smartctl_select(), check=False) - - self_assessment_passed = bool(self_test_re.search(out)) - - yield Metric( - 'device_smart_healthy', device.base_labels, self_assessment_passed) - - -def collect_ata_metrics(device): - # Fetch SMART attributes for the given device. - attributes = smart_ctl( - '--attributes', *device.smartctl_select() - ) - - # replace multiple occurrences of whitespace with a single whitespace - # so that the CSV Parser recognizes individual columns properly. - attributes = re.sub(r'[\t\x20]+', ' ', attributes) - - # Turn smartctl output into a list of lines and skip to the table of - # SMART attributes. - attribute_lines = attributes.strip().split('\n')[7:] - - # Some attributes have multiple IDs but have the same name. Don't - # yield attributes that already have been reported before. - seen = set() - - reader = csv.DictReader( - (line.strip() for line in attribute_lines), - fieldnames=SmartAttribute._fields[:-1], - restkey=SmartAttribute._fields[-1], delimiter=' ') - for entry in reader: - # We're only interested in the SMART attributes that are - # whitelisted here. - entry['name'] = entry['name'].lower() - if entry['name'] not in smart_attributes_whitelist: - continue - - # Ensure that only the numeric parts are fetched from the raw_value. - # Attributes such as 194 Temperature_Celsius reported by my SSD - # are in the format of "36 (Min/Max 24/40)" which can't be expressed - # properly as a prometheus metric. - m = re.match(r'^(\d+)', ' '.join(entry['raw_value'])) - if not m: - continue - entry['raw_value'] = m.group(1) - - # Some device models report "---" in the threshold value where most - # devices would report "000". We do the substitution here because - # downstream code expects values to be convertable to integer. - if entry['threshold'] == '---': - entry['threshold'] = '0' - - if entry['name'] in smart_attributes_whitelist and entry['name'] not in seen: - labels = { - 'name': entry['name'], - **device.base_labels, - } - - for col in 'value', 'worst', 'threshold', 'raw_value': - yield Metric( - 'attr_{col}'.format(col=col), - labels, entry[col]) - - seen.add(entry['name']) - - -def collect_ata_error_count(device): - """Inspect the device error log and report the amount of entries. - - Args: - device: (Device) Device in question. - - Yields: - (Metric) Device error count. 
- """ - error_log = smart_ctl( - '-l', 'xerror,1', *device.smartctl_select(), check=False) - - m = ata_error_count_re.search(error_log) - - error_count = m.group(1) if m is not None else 0 - - yield Metric('device_errors', device.base_labels, error_count) - - -def collect_disks_smart_metrics(wakeup_disks): - now = int(datetime.datetime.now().timestamp()) - - for device in find_devices(): - yield Metric('smartctl_run', device.base_labels, now) - - is_active = device_is_active(device) - - yield Metric('device_active', device.base_labels, is_active) - - # Skip further metrics collection to prevent the disk from - # spinning up. - if not is_active and not wakeup_disks: - continue - - yield from collect_device_info(device) - - smart_available, smart_enabled = device_smart_capabilities(device) - - yield Metric( - 'device_smart_available', device.base_labels, smart_available) - yield Metric( - 'device_smart_enabled', device.base_labels, smart_enabled) - - # Skip further metrics collection here if SMART is disabled - # on the device. Further smartctl invocations would fail - # anyways. - if not smart_available: - continue - - yield from collect_device_health_self_assessment(device) - - if device.type.startswith('sat'): - yield from collect_ata_metrics(device) - - yield from collect_ata_error_count(device) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('-s', '--wakeup-disks', dest='wakeup_disks', action='store_true') - args = parser.parse_args(sys.argv[1:]) - - version_metric = Metric('smartctl_version', { - 'version': smart_ctl_version() - }, True) - metric_print_meta(version_metric, 'smartmon_') - metric_print(version_metric, 'smartmon_') - - metrics = list(collect_disks_smart_metrics(args.wakeup_disks)) - metrics.sort(key=lambda i: i.name) - - previous_name = None - for m in metrics: - if m.name != previous_name: - metric_print_meta(m, 'smartmon_') - - previous_name = m.name - - metric_print(m, 'smartmon_') - - -if __name__ == '__main__': - main() diff --git a/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_generic.yml b/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_generic.yml index 5c068fe7..e5bf2ce5 100644 --- a/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_generic.yml +++ b/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_generic.yml @@ -1,7 +1,7 @@ --- - name: install the collector script - copy: - src: "{{ textfile_collector_name }}" + template: + src: "textfile-collector-scripts/{{ textfile_collector_name }}.j2" dest: "/usr/local/share/prometheus-node-exporter/{{ textfile_collector_name }}" mode: 0755 diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.j2 new file mode 100644 index 00000000..015addb0 --- /dev/null +++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.j2 @@ -0,0 +1,40 @@ +#!/bin/bash +# +# Description: Expose metrics from apt updates. 
+# +# Author: Ben Kochie + +upgrades="$(/usr/bin/apt-get --just-print dist-upgrade \ + | /usr/bin/awk -F'[()]' \ + '/^Inst/ { sub("^[^ ]+ ", "", $2); gsub(" ","",$2); + sub("\\[", " ", $2); sub("\\]", "", $2); print $2 }' \ + | /usr/bin/sort \ + | /usr/bin/uniq -c \ + | awk '{ gsub(/\\\\/, "\\\\", $2); gsub(/"/, "\\\"", $2); + gsub(/\[/, "", $3); gsub(/\]/, "", $3); + print "apt_upgrades_pending{origin=\"" $2 "\",arch=\"" $NF "\"} " $1}' +)" + +autoremove="$(/usr/bin/apt-get --just-print autoremove \ + | /usr/bin/awk '/^Remv/{a++}END{printf "apt_autoremove_pending %d", a}' +)" + +echo '# HELP apt_upgrades_pending Apt package pending updates by origin.' +echo '# TYPE apt_upgrades_pending gauge' +if [[ -n "${upgrades}" ]] ; then + echo "${upgrades}" +else + echo 'apt_upgrades_pending{origin="",arch=""} 0' +fi + +echo '# HELP apt_autoremove_pending Apt package pending autoremove.' +echo '# TYPE apt_autoremove_pending gauge' +echo "${autoremove}" + +echo '# HELP node_reboot_required Node reboot is required for software updates.' +echo '# TYPE node_reboot_required gauge' +if [[ -f '/run/reboot-required' ]] ; then + echo 'node_reboot_required 1' +else + echo 'node_reboot_required 0' +fi diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2 new file mode 100644 index 00000000..aeddc903 --- /dev/null +++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2 @@ -0,0 +1,75 @@ +#!/usr/bin/env {{ python_basename }} +""" +Script to count the number of deleted libraries that are linked by running +processes and expose a summary as Prometheus metrics. + +The aim is to discover processes that are still using libraries that have since +been updated, perhaps due security vulnerabilities. +""" + +import errno +import glob +import os +import sys + + +def main(): + processes_linking_deleted_libraries = {} + + for path in glob.glob('/proc/*/maps'): + try: + with open(path, 'rb') as file: + for line in file: + part = line.decode().strip().split() + + if len(part) == 7: + library = part[5] + comment = part[6] + + if '/lib/' in library and '(deleted)' in comment: + if path not in processes_linking_deleted_libraries: + processes_linking_deleted_libraries[path] = {} + + if library in processes_linking_deleted_libraries[path]: + processes_linking_deleted_libraries[path][library] += 1 + else: + processes_linking_deleted_libraries[path][library] = 1 + except EnvironmentError as e: + # Ignore non-existent files, since the files may have changed since + # we globbed. 
+ if e.errno != errno.ENOENT: + sys.exit('Failed to open file: {0}'.format(path)) + + num_processes_per_library = {} + + for process, library_count in processes_linking_deleted_libraries.items(): + libraries_seen = set() + for library, count in library_count.items(): + if library in libraries_seen: + continue + + libraries_seen.add(library) + if library in num_processes_per_library: + num_processes_per_library[library] += 1 + else: + num_processes_per_library[library] = 1 + + metric_name = 'node_processes_linking_deleted_libraries' + description = 'Count of running processes that link a deleted library' + print('# HELP {0} {1}'.format(metric_name, description)) + print('# TYPE {0} gauge'.format(metric_name)) + + for library, count in num_processes_per_library.items(): + dir_path, basename = os.path.split(library) + basename = basename.replace('"', '\\"') + dir_path = dir_path.replace('"', '\\"') + print('{0}{{ '{{' }}library_path="{1}", library_name="{2}"{{ '}}' }} {3}'.format( + metric_name, + dir_path, + basename, + count) + ) + + +if __name__ == "__main__": + main() diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2 new file mode 100644 index 00000000..b033faf0 --- /dev/null +++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2 @@ -0,0 +1,391 @@ +#!/usr/bin/env {{ python_basename }} +import argparse +import collections +import csv +import datetime +import decimal +import re +import shlex +import subprocess +import sys + +device_info_re = re.compile(r'^(?P[^:]+?)(?:(?:\sis|):)\s*(?P.*)$') + +ata_error_count_re = re.compile( + r'^Error (\d+) \[\d+\] occurred', re.MULTILINE) + +self_test_re = re.compile(r'^SMART.*(PASSED|OK)$', re.MULTILINE) + +device_info_map = { + 'Vendor': 'vendor', + 'Product': 'product', + 'Revision': 'revision', + 'Logical Unit id': 'lun_id', + 'Model Family': 'model_family', + 'Device Model': 'device_model', + 'Serial Number': 'serial_number', + 'Firmware Version': 'firmware_version', +} + +smart_attributes_whitelist = { + 'airflow_temperature_cel', + 'command_timeout', + 'current_pending_sector', + 'end_to_end_error', + 'erase_fail_count_total', + 'g_sense_error_rate', + 'hardware_ecc_recovered', + 'host_reads_mib', + 'host_reads_32mib', + 'host_writes_mib', + 'host_writes_32mib', + 'load_cycle_count', + 'media_wearout_indicator', + 'wear_leveling_count', + 'nand_writes_1gib', + 'offline_uncorrectable', + 'power_cycle_count', + 'power_on_hours', + 'program_fail_count', + 'raw_read_error_rate', + 'reallocated_event_count', + 'reallocated_sector_ct', + 'reported_uncorrect', + 'sata_downshift_count', + 'seek_error_rate', + 'spin_retry_count', + 'spin_up_time', + 'start_stop_count', + 'temperature_case', + 'temperature_celsius', + 'temperature_internal', + 'total_lbas_read', + 'total_lbas_written', + 'udma_crc_error_count', + 'unsafe_shutdown_count', + 'workld_host_reads_perc', + 'workld_media_wear_indic', + 'workload_minutes', +} + +Metric = collections.namedtuple('Metric', 'name labels value') + +SmartAttribute = collections.namedtuple('SmartAttribute', [ + 'id', 'name', 'flag', 'value', 'worst', 'threshold', 'type', 'updated', + 'when_failed', 'raw_value', +]) + + +class Device(collections.namedtuple('DeviceBase', 'path opts')): + """Representation of a device as found by smartctl --scan output.""" + + @property + def type(self): + return self.opts.type + + @property + def 
base_labels(self): + return {'device': self.path, 'disk': self.type.partition('+')[2] or '0'} + + def smartctl_select(self): + return ['--device', self.type, self.path] + + +def metric_key(metric, prefix=''): + return '{prefix}{metric.name}'.format(prefix=prefix, metric=metric) + + +def metric_format(metric, prefix=''): + key = metric_key(metric, prefix) + labels = ','.join( + '{k}="{v}"'.format(k=k, v=v.replace('"', '\\"')) for k, v in metric.labels.items()) + value = decimal.Decimal(metric.value) + + return '{key}{{ '{{{' }}labels{{ '}}}' }} {value}'.format( + key=key, labels=labels, value=value) + + +def metric_print_meta(metric, prefix=''): + key = metric_key(metric, prefix) + print('# HELP {key} SMART metric {metric.name}'.format( + key=key, metric=metric)) + print('# TYPE {key} gauge'.format(key=key)) + + +def metric_print(metric, prefix=''): + print(metric_format(metric, prefix)) + + +def smart_ctl(*args, check=True): + """Wrapper around invoking the smartctl binary. + + Returns: + (str) Data piped to stdout by the smartctl subprocess. + """ + return subprocess.run( + ['smartctl', *args], stdout=subprocess.PIPE, check=check + ).stdout.decode('utf-8') + + +def smart_ctl_version(): + return smart_ctl('-V').split('\n')[0].split()[1] + + +def find_devices(): + """Find SMART devices. + + Yields: + (Device) Single device found by smartctl. + """ + parser = argparse.ArgumentParser() + parser.add_argument('-d', '--device', dest='type') + + devices = smart_ctl('--scan-open') + + for device in devices.split('\n'): + device = device.strip() + if not device: + continue + + tokens = shlex.split(device, comments=True) + if not tokens: + continue + + yield Device(tokens[0], parser.parse_args(tokens[1:])) + + +def device_is_active(device): + """Returns whenever the given device is currently active or not. + + Args: + device: (Device) Device in question. + + Returns: + (bool) True if the device is active and False otherwise. + """ + try: + smart_ctl('--nocheck', 'standby', *device.smartctl_select()) + except subprocess.CalledProcessError: + return False + + return True + + +def device_info(device): + """Query device for basic model information. + + Args: + device: (Device) Device in question. + + Returns: + (generator): Generator yielding: + + key (str): Key describing the value. + value (str): Actual value. + """ + info_lines = smart_ctl( + '--info', *device.smartctl_select() + ).strip().split('\n')[3:] + + matches = (device_info_re.match(line) for line in info_lines) + return (m.groups() for m in matches if m is not None) + + +def device_smart_capabilities(device): + """Returns SMART capabilities of the given device. + + Args: + device: (Device) Device in question. + + Returns: + (tuple): tuple containing: + + (bool): True whenever SMART is available, False otherwise. + (bool): True whenever SMART is enabled, False otherwise. + """ + groups = device_info(device) + + state = { + g[1].split(' ', 1)[0] + for g in groups if g[0] == 'SMART support'} + + smart_available = 'Available' in state + smart_enabled = 'Enabled' in state + + return smart_available, smart_enabled + + +def collect_device_info(device): + """Collect basic device information. + + Args: + device: (Device) Device in question. + + Yields: + (Metric) metrics describing general device information. 
+ """ + values = dict(device_info(device)) + yield Metric('device_info', { + **device.base_labels, + **{v: values[k] for k, v in device_info_map.items() if k in values} + }, True) + + +def collect_device_health_self_assessment(device): + """Collect metric about the device health self assessment. + + Args: + device: (Device) Device in question. + + Yields: + (Metric) Device health self assessment. + """ + out = smart_ctl('--health', *device.smartctl_select(), check=False) + + self_assessment_passed = bool(self_test_re.search(out)) + + yield Metric( + 'device_smart_healthy', device.base_labels, self_assessment_passed) + + +def collect_ata_metrics(device): + # Fetch SMART attributes for the given device. + attributes = smart_ctl( + '--attributes', *device.smartctl_select() + ) + + # replace multiple occurrences of whitespace with a single whitespace + # so that the CSV Parser recognizes individual columns properly. + attributes = re.sub(r'[\t\x20]+', ' ', attributes) + + # Turn smartctl output into a list of lines and skip to the table of + # SMART attributes. + attribute_lines = attributes.strip().split('\n')[7:] + + # Some attributes have multiple IDs but have the same name. Don't + # yield attributes that already have been reported before. + seen = set() + + reader = csv.DictReader( + (line.strip() for line in attribute_lines), + fieldnames=SmartAttribute._fields[:-1], + restkey=SmartAttribute._fields[-1], delimiter=' ') + for entry in reader: + # We're only interested in the SMART attributes that are + # whitelisted here. + entry['name'] = entry['name'].lower() + if entry['name'] not in smart_attributes_whitelist: + continue + + # Ensure that only the numeric parts are fetched from the raw_value. + # Attributes such as 194 Temperature_Celsius reported by my SSD + # are in the format of "36 (Min/Max 24/40)" which can't be expressed + # properly as a prometheus metric. + m = re.match(r'^(\d+)', ' '.join(entry['raw_value'])) + if not m: + continue + entry['raw_value'] = m.group(1) + + # Some device models report "---" in the threshold value where most + # devices would report "000". We do the substitution here because + # downstream code expects values to be convertable to integer. + if entry['threshold'] == '---': + entry['threshold'] = '0' + + if entry['name'] in smart_attributes_whitelist and entry['name'] not in seen: + labels = { + 'name': entry['name'], + **device.base_labels, + } + + for col in 'value', 'worst', 'threshold', 'raw_value': + yield Metric( + 'attr_{col}'.format(col=col), + labels, entry[col]) + + seen.add(entry['name']) + + +def collect_ata_error_count(device): + """Inspect the device error log and report the amount of entries. + + Args: + device: (Device) Device in question. + + Yields: + (Metric) Device error count. + """ + error_log = smart_ctl( + '-l', 'xerror,1', *device.smartctl_select(), check=False) + + m = ata_error_count_re.search(error_log) + + error_count = m.group(1) if m is not None else 0 + + yield Metric('device_errors', device.base_labels, error_count) + + +def collect_disks_smart_metrics(wakeup_disks): + now = int(datetime.datetime.now().timestamp()) + + for device in find_devices(): + yield Metric('smartctl_run', device.base_labels, now) + + is_active = device_is_active(device) + + yield Metric('device_active', device.base_labels, is_active) + + # Skip further metrics collection to prevent the disk from + # spinning up. 
+ if not is_active and not wakeup_disks: + continue + + yield from collect_device_info(device) + + smart_available, smart_enabled = device_smart_capabilities(device) + + yield Metric( + 'device_smart_available', device.base_labels, smart_available) + yield Metric( + 'device_smart_enabled', device.base_labels, smart_enabled) + + # Skip further metrics collection here if SMART is disabled + # on the device. Further smartctl invocations would fail + # anyways. + if not smart_available: + continue + + yield from collect_device_health_self_assessment(device) + + if device.type.startswith('sat'): + yield from collect_ata_metrics(device) + + yield from collect_ata_error_count(device) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-s', '--wakeup-disks', dest='wakeup_disks', action='store_true') + args = parser.parse_args(sys.argv[1:]) + + version_metric = Metric('smartctl_version', { + 'version': smart_ctl_version() + }, True) + metric_print_meta(version_metric, 'smartmon_') + metric_print(version_metric, 'smartmon_') + + metrics = list(collect_disks_smart_metrics(args.wakeup_disks)) + metrics.sort(key=lambda i: i.name) + + previous_name = None + for m in metrics: + if m.name != previous_name: + metric_print_meta(m, 'smartmon_') + + previous_name = m.name + + metric_print(m, 'smartmon_') + + +if __name__ == '__main__': + main() -- cgit v1.2.3
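With the collector scripts now rendered through Ansible's template module (see the textfile_collector_generic.yml change above), the two Python collectors take their interpreter from the python_basename variable, and any literal Jinja2 delimiters inside a script have to be escaped; that is why deleted-libraries.j2 and smartmon.j2 wrap the brace escapes of their Python format strings in {{ '{{' }} / {{ '}}' }} expressions. A minimal sketch of that escaping pattern follows, using a hypothetical template name and assuming python_basename is defined elsewhere in the role:

#!/usr/bin/env {{ python_basename }}
# hypothetical-example.j2 -- illustration only, not part of this commit.
# The Python source needs literal "{{" / "}}" (str.format brace escapes); written
# bare they would be parsed by Jinja2, so the template emits them as string literals:
print('up{{ '{{' }}job="node"{{ '}}' }} 1'.format())

Rendered with python_basename set to python3, the last line becomes print('up{{job="node"}} 1'.format()), which prints up{job="node"} 1 when the collector runs, the same trick the two larger templates rely on.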