From 037bd11b5a70bd33abbe03c73431c569c77bc5a3 Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Tue, 18 Jul 2023 23:40:15 +0200 Subject: prometheus/node: update texfile collector smartmon --- .../prometheus/exporter/node/tasks/main.yml | 5 + .../textfile-collector-scripts/smartmon.j2 | 360 ++++++++++++++------- .../textfile-collector-scripts/smartmon.service.j2 | 2 +- 3 files changed, 254 insertions(+), 113 deletions(-) (limited to 'roles/monitoring') diff --git a/roles/monitoring/prometheus/exporter/node/tasks/main.yml b/roles/monitoring/prometheus/exporter/node/tasks/main.yml index ba722da8..5af10326 100644 --- a/roles/monitoring/prometheus/exporter/node/tasks/main.yml +++ b/roles/monitoring/prometheus/exporter/node/tasks/main.yml @@ -50,6 +50,11 @@ dest: /etc/prometheus/exporter/node.locations notify: reload nginx +- name: install common deps for textfile collector scripts + apt: + name: "{{ python_basename }}-prometheus-client" + state: present + - name: create directory for textfile collector scripts file: path: /usr/local/share/prometheus-node-exporter diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2 index 2b60509c..829383e2 100644 --- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2 +++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2 @@ -1,13 +1,13 @@ #!/usr/bin/env {{ python_basename }} + import argparse import collections import csv -import datetime -import decimal import re import shlex import subprocess import sys +from prometheus_client import CollectorRegistry, Gauge, generate_latest device_info_re = re.compile(r'^(?P[^:]+?)(?:(?:\sis|):)\s*(?P.*)$') @@ -24,10 +24,11 @@ device_info_map = { 'Model Family': 'model_family', 'Device Model': 'device_model', 'Serial Number': 'serial_number', + 'Serial number': 'serial_number', 'Firmware Version': 'firmware_version', } -smart_attributes_whitelist = { +smart_attributes_whitelist = ( 'airflow_temperature_cel', 'command_timeout', 'current_pending_sector', @@ -66,10 +67,181 @@ smart_attributes_whitelist = { 'workld_host_reads_perc', 'workld_media_wear_indic', 'workload_minutes', +) + +registry = CollectorRegistry() +namespace = "smartmon" + +metrics = { + "smartctl_version": Gauge( + "smartctl_version", + "SMART metric smartctl_version", + ["version"], + namespace=namespace, + registry=registry, + ), + "device_active": Gauge( + "device_active", + "SMART metric device_active", + ["device", "disk"], + namespace=namespace, + registry=registry, + ), + "device_info": Gauge( + "device_info", + "SMART metric device_info", + [ + "device", + "disk", + "vendor", + "product", + "revision", + "lun_id", + "model_family", + "device_model", + "serial_number", + "firmware_version", + ], + namespace=namespace, + registry=registry, + ), + "device_smart_available": Gauge( + "device_smart_available", + "SMART metric device_smart_available", + ["device", "disk"], + namespace=namespace, + registry=registry, + ), + "device_smart_enabled": Gauge( + "device_smart_enabled", + "SMART metric device_smart_enabled", + ["device", "disk"], + namespace=namespace, + registry=registry, + ), + "device_smart_healthy": Gauge( + "device_smart_healthy", + "SMART metric device_smart_healthy", + ["device", "disk"], + namespace=namespace, + registry=registry, + ), + + # SMART attributes - ATA disks only + "attr_value": Gauge( + "attr_value", + "SMART metric attr_value", + ["device", "disk", "name"], + namespace=namespace, + registry=registry, + ), + "attr_worst": Gauge( + "attr_worst", + "SMART metric attr_worst", + ["device", "disk", "name"], + namespace=namespace, + registry=registry, + ), + "attr_threshold": Gauge( + "attr_threshold", + "SMART metric attr_threshold", + ["device", "disk", "name"], + namespace=namespace, + registry=registry, + ), + "attr_raw_value": Gauge( + "attr_raw_value", + "SMART metric attr_raw_value", + ["device", "disk", "name"], + namespace=namespace, + registry=registry, + ), + "device_errors": Gauge( + "device_errors", + "SMART metric device_errors", + ["device", "disk"], + namespace=namespace, + registry=registry, + ), + "available_spare_ratio": Gauge( + "available_spare_ratio", + "SMART metric available_spare_ratio", + ["device", "disk"], + namespace=namespace, + registry=registry, + ), + "available_spare_threshold_ratio": Gauge( + "available_spare_threshold_ratio", + "SMART metric available_spare_threshold_ratio", + ["device", "disk"], + namespace=namespace, + registry=registry, + ), + "percentage_used_ratio": Gauge( + "percentage_used_ratio", + "SMART metric percentage_used_ratio", + ["device", "disk"], + namespace=namespace, + registry=registry, + ), + "power_cycles_total": Gauge( + "power_cycles_total", + "SMART metric power_cycles_total", + ["device", "disk"], + namespace=namespace, + registry=registry, + ), + "power_on_hours_total": Gauge( + "power_on_hours_total", + "SMART metric power_on_hours_total", + ["device", "disk"], + namespace=namespace, + registry=registry, + ), + "temperature_celcius": Gauge( + "temperature_celcius", + "SMART metric temperature_celcius", + ["device", "disk"], + namespace=namespace, + registry=registry, + ), + "unsafe_shutdowns_total": Gauge( + "unsafe_shutdowns_total", + "SMART metric unsafe_shutdowns_total", + ["device", "disk"], + namespace=namespace, + registry=registry, + ), + "media_errors_total": Gauge( + "media_errors_total", + "SMART metric media_errors_total", + ["device", "disk"], + namespace=namespace, + registry=registry, + ), + "num_err_log_entries_total": Gauge( + "num_err_log_entries_total", + "SMART metric num_err_log_entries_total", + ["device", "disk"], + namespace=namespace, + registry=registry, + ), + "warning_temperature_time_total": Gauge( + "warning_temperature_time_total", + "SMART metric warning_temperature_time_total", + ["device", "disk"], + namespace=namespace, + registry=registry, + ), + "critical_temperature_time_total": Gauge( + "critical_temperature_time_total", + "SMART metric critical_temperature_time_total", + ["device", "disk"], + namespace=namespace, + registry=registry, + ), } -Metric = collections.namedtuple('Metric', 'name labels value') - SmartAttribute = collections.namedtuple('SmartAttribute', [ 'id', 'name', 'flag', 'value', 'worst', 'threshold', 'type', 'updated', 'when_failed', 'raw_value', @@ -85,37 +257,12 @@ class Device(collections.namedtuple('DeviceBase', 'path opts')): @property def base_labels(self): - return {'device': self.path, 'type': self.type} + return {'device': self.path, 'disk': self.type.partition('+')[2] or '0'} def smartctl_select(self): return ['--device', self.type, self.path] -def metric_key(metric, prefix=''): - return '{prefix}{metric.name}'.format(prefix=prefix, metric=metric) - - -def metric_format(metric, prefix=''): - key = metric_key(metric, prefix) - labels = ','.join( - '{k}="{v}"'.format(k=k, v=v.replace('"', '\\"')) for k, v in metric.labels.items()) - value = decimal.Decimal(metric.value) - - return '{key}{{ '{{{' }}labels{{ '}}}' }} {value}'.format( - key=key, labels=labels, value=value) - - -def metric_print_meta(metric, prefix=''): - key = metric_key(metric, prefix) - print('# HELP {key} SMART metric {metric.name}'.format( - key=key, metric=metric)) - print('# TYPE {key} gauge'.format(key=key)) - - -def metric_print(metric, prefix=''): - print(metric_format(metric, prefix)) - - def smart_ctl(*args, check=True): """Wrapper around invoking the smartctl binary. @@ -131,7 +278,7 @@ def smart_ctl_version(): return smart_ctl('-V').split('\n')[0].split()[1] -def find_devices(): +def find_devices(by_id): """Find SMART devices. Yields: @@ -140,7 +287,10 @@ def find_devices(): parser = argparse.ArgumentParser() parser.add_argument('-d', '--device', dest='type') - devices = smart_ctl('--scan-open') + args = ['--scan-open'] + if by_id: + args.extend(['-d', 'by-id']) + devices = smart_ctl(*args) for device in devices.split('\n'): device = device.strip() @@ -225,15 +375,20 @@ def collect_device_info(device): Args: device: (Device) Device in question. - - Yields: - (Metric) metrics describing general device information. """ values = dict(device_info(device)) - yield Metric('device_info', { - **device.base_labels, - **{v: values[k] for k, v in device_info_map.items() if k in values} - }, True) + metrics["device_info"].labels( + device.base_labels["device"], + device.base_labels["disk"], + values.get("Vendor", ""), + values.get("Product", ""), + values.get("Revision", ""), + values.get("Logical Unit id", ""), + values.get("Model Family", ""), + values.get("Device Model", ""), + values.get("Serial Number", ""), + values.get("Firmware Version", ""), + ).set(1) def collect_device_health_self_assessment(device): @@ -241,16 +396,13 @@ def collect_device_health_self_assessment(device): Args: device: (Device) Device in question. - - Yields: - (Metric) Device health self assessment. """ out = smart_ctl('--health', *device.smartctl_select(), check=False) self_assessment_passed = bool(self_test_re.search(out)) - - yield Metric( - 'device_smart_healthy', device.base_labels, self_assessment_passed) + metrics["device_smart_healthy"].labels( + device.base_labels["device"], device.base_labels["disk"] + ).set(self_assessment_passed) def collect_ata_metrics(device): @@ -298,15 +450,12 @@ def collect_ata_metrics(device): entry['threshold'] = '0' if entry['name'] in smart_attributes_whitelist and entry['name'] not in seen: - labels = { - 'name': entry['name'], - **device.base_labels, - } - for col in 'value', 'worst', 'threshold', 'raw_value': - yield Metric( - 'attr_{col}'.format(col=col), - labels, entry[col]) + metrics["attr_" + col].labels( + device.base_labels["device"], + device.base_labels["disk"], + entry["name"], + ).set(entry[col]) seen.add(entry['name']) @@ -316,9 +465,6 @@ def collect_ata_error_count(device): Args: device: (Device) Device in question. - - Yields: - (Metric) Device error count. """ error_log = smart_ctl( '-l', 'xerror,1', *device.smartctl_select(), check=False) @@ -326,8 +472,9 @@ def collect_ata_error_count(device): m = ata_error_count_re.search(error_log) error_count = m.group(1) if m is not None else 0 - - yield Metric('device_errors', device.base_labels, error_count) + metrics["device_errors"].labels( + device.base_labels["device"], device.base_labels["disk"] + ).set(error_count) def collect_nvme_metrics(device): @@ -336,7 +483,7 @@ def collect_nvme_metrics(device): '--attributes', *device.smartctl_select() ) - # replace multiple occurrences of whitespaces with a singel whitespace + # replace multiple occurrences of whitespaces with a single whitespace attributes = re.sub(r'[\t\x20]+', ' ', attributes) # Turn smartctl output into a list of lines and skip to the table of @@ -345,92 +492,81 @@ def collect_nvme_metrics(device): for line in attribute_lines: label, value = line.split(':') if label == 'Available Spare': - yield Metric('available_spare_ratio', device.base_labels, value[0:-1]) + metrics['available_spare_ratio'].labels(device.base_labels["device"], device.base_labels["disk"]).set(value[0:-1]) elif label == 'Available Spare Threshold': - yield Metric('available_spare_threshold_ratio', device.base_labels, value[0:-1]) + metrics['available_spare_threshold_ratio'].labels(device.base_labels["device"], device.base_labels["disk"]).set(value[0:-1]) elif label == 'Percentage Used': - yield Metric('percentage_used_ratio', device.base_labels, value[0:-1]) + metrics['percentage_used_ratio'].labels(device.base_labels["device"], device.base_labels["disk"]).set(value[0:-1]) elif label == 'Power Cycle': - yield Metric('power_cycles_total', device.base_labels, value) + metrics['power_cycles_total'].labels(device.base_labels["device"], device.base_labels["disk"]).set(value) elif label == 'Power On Hours': - yield Metric('power_on_hours_total', device.base_labels, value.replace(',', '')) + metrics['power_on_hours_total'].labels(device.base_labels["device"], device.base_labels["disk"]).set(value.replace(',', '')) elif label == 'Temperature': - yield Metric('temperature_celcius', device.base_labels, value.replace(' Celsius', '')) + metrics['temperature_celcius'].labels(device.base_labels["device"], device.base_labels["disk"]).set(value.replace(' Celsius', '')) elif label == 'Unsafe Shutdowns': - yield Metric('unsafe_shutdowns_total', device.base_labels, value) + metrics['unsafe_shutdowns_total'].labels(device.base_labels["device"], device.base_labels["disk"]).set(value) elif label == 'Media and Data Integrity Errors': - yield Metric('media_errors_total', device.base_labels, value) + metrics['media_errors_total'].labels(device.base_labels["device"], device.base_labels["disk"]).set(value) elif label == 'Error Information Log Entries': - yield Metric('num_err_log_entries_total', device.base_labels, value) + metrics['num_err_log_entries_total'].labels(device.base_labels["device"], device.base_labels["disk"]).set(value) elif label == 'Warning Comp. Temperature Time': - yield Metric('warning_temperature_time_total', device.base_labels, value) + metrics['warning_temperature_time_total'].labels(device.base_labels["device"], device.base_labels["disk"]).set(value) elif label == 'Critical Comp. Temperature Time': - yield Metric('critical_temperature_time_total', device.base_labels, value) - + metrics['critical_temperature_time_total'].labels(device.base_labels["device"], device.base_labels["disk"]).set(value) -def collect_disks_smart_metrics(wakeup_disks): - now = int(datetime.datetime.now().timestamp()) - - for device in find_devices(): - yield Metric('smartctl_run', device.base_labels, now) +def collect_disks_smart_metrics(wakeup_disks, by_id, include_nvme): + for device in find_devices(by_id): is_active = device_is_active(device) + metrics["device_active"].labels( + device.base_labels["device"], device.base_labels["disk"], + ).set(is_active) - yield Metric('device_active', device.base_labels, is_active) - - # Skip further metrics collection to prevent the disk from - # spinning up. + # Skip further metrics collection to prevent the disk from spinning up. if not is_active and not wakeup_disks: continue - yield from collect_device_info(device) + collect_device_info(device) smart_available, smart_enabled = device_smart_capabilities(device) - yield Metric( - 'device_smart_available', device.base_labels, smart_available) - yield Metric( - 'device_smart_enabled', device.base_labels, smart_enabled) + metrics["device_smart_available"].labels( + device.base_labels["device"], device.base_labels["disk"] + ).set(smart_available) - # Skip further metrics collection here if SMART is disabled - # on the device. Further smartctl invocations would fail - # anyways. + metrics["device_smart_enabled"].labels( + device.base_labels["device"], device.base_labels["disk"] + ).set(smart_enabled) + + # Skip further metrics collection here if SMART is disabled on the device. Further smartctl + # invocations would fail anyway. if not smart_available: continue - yield from collect_device_health_self_assessment(device) + collect_device_health_self_assessment(device) if device.type.startswith('sat'): - yield from collect_ata_metrics(device) - - yield from collect_ata_error_count(device) + collect_ata_metrics(device) + collect_ata_error_count(device) - if device.type == 'nvme': - yield from collect_nvme_metrics(device) + if include_nvme and device.type == 'nvme': + collect_nvme_metrics(device) def main(): parser = argparse.ArgumentParser() - parser.add_argument('-s', '--wakeup-disks', dest='wakeup_disks', action='store_true') + parser.add_argument('-s', '--wakeup-disks', dest='wakeup_disks', action='store_true', + help="Wake up disks to collect live stats") + parser.add_argument('--by-id', dest='by_id', action='store_true', + help="Use /dev/disk/by-id/X instead of /dev/sdX to index devices") + parser.add_argument('--include-nvme', dest='include_nvme', action='store_true', + help="Include metrics for NVMe drives") args = parser.parse_args(sys.argv[1:]) - version_metric = Metric('smartctl_version', { - 'version': smart_ctl_version() - }, True) - metric_print_meta(version_metric, 'smartmon_') - metric_print(version_metric, 'smartmon_') - - metrics = list(collect_disks_smart_metrics(args.wakeup_disks)) - metrics.sort(key=lambda i: i.name) - - previous_name = None - for m in metrics: - if m.name != previous_name: - metric_print_meta(m, 'smartmon_') - - previous_name = m.name + metrics["smartctl_version"].labels(smart_ctl_version()).set(1) - metric_print(m, 'smartmon_') + collect_disks_smart_metrics(args.wakeup_disks, args.by_id, args.include_nvme) + print(generate_latest(registry).decode(), end="") if __name__ == '__main__': diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.service.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.service.j2 index 8ee60c02..8d91677b 100644 --- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.service.j2 +++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.service.j2 @@ -5,7 +5,7 @@ Description=Promethues node exporter textfile collector smartmon Type=oneshot Environment=TMPDIR=/var/lib/prometheus-node-exporter/textfile-collector Environment=LC_NUMERIC=C -ExecStart=bash -o pipefail -c "/usr/local/share/prometheus-node-exporter/smartmon | sponge /var/lib/prometheus-node-exporter/textfile-collector/smartmon.prom" +ExecStart=bash -o pipefail -c "/usr/local/share/prometheus-node-exporter/smartmon --include-nvme | sponge /var/lib/prometheus-node-exporter/textfile-collector/smartmon.prom" TimeoutStartSec=30s # systemd hardening-options -- cgit v1.2.3