summary | refs | log | tree | commit | diff
path: root/roles/monitoring
diff options
context:
space:
mode:
author: Christian Pointner <equinox@spreadspace.org> 2023-07-18 23:40:15 +0200
committer: Christian Pointner <equinox@spreadspace.org> 2023-07-18 23:40:15 +0200
commit: 037bd11b5a70bd33abbe03c73431c569c77bc5a3 (patch)
tree: 2784ac402d153dd80469818ab91c920ae08d5df5 /roles/monitoring
parent: remove automatic selection for time-sync metrics and always use timex (diff)
prometheus/node: update textfile collector smartmon
Diffstat (limited to 'roles/monitoring')
-rw-r--r-- roles/monitoring/prometheus/exporter/node/tasks/main.yml 5
-rw-r--r-- roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2 360
-rw-r--r-- roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.service.j2 2
3 files changed, 254 insertions, 113 deletions
diff --git a/roles/monitoring/prometheus/exporter/node/tasks/main.yml b/roles/monitoring/prometheus/exporter/node/tasks/main.yml
index ba722da8..5af10326 100644
--- a/roles/monitoring/prometheus/exporter/node/tasks/main.yml
+++ b/roles/monitoring/prometheus/exporter/node/tasks/main.yml
@@ -50,6 +50,11 @@
dest: /etc/prometheus/exporter/node.locations
notify: reload nginx
+- name: install common deps for textfile collector scripts
+ apt:
+ name: "{{ python_basename }}-prometheus-client"
+ state: present
+
- name: create directory for textfile collector scripts
file:
path: /usr/local/share/prometheus-node-exporter
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2
index 2b60509c..829383e2 100644
--- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2
+++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2
@@ -1,13 +1,13 @@
#!/usr/bin/env {{ python_basename }}
+
import argparse
import collections
import csv
-import datetime
-import decimal
import re
import shlex
import subprocess
import sys
+from prometheus_client import CollectorRegistry, Gauge, generate_latest
device_info_re = re.compile(r'^(?P<k>[^:]+?)(?:(?:\sis|):)\s*(?P<v>.*)$')
@@ -24,10 +24,11 @@ device_info_map = {
'Model Family': 'model_family',
'Device Model': 'device_model',
'Serial Number': 'serial_number',
+ 'Serial number': 'serial_number',
'Firmware Version': 'firmware_version',
}
-smart_attributes_whitelist = {
+smart_attributes_whitelist = (
'airflow_temperature_cel',
'command_timeout',
'current_pending_sector',
@@ -66,10 +67,181 @@ smart_attributes_whitelist = {
'workld_host_reads_perc',
'workld_media_wear_indic',
'workload_minutes',
+)
+
+registry = CollectorRegistry()
+namespace = "smartmon"
+
+metrics = {
+ "smartctl_version": Gauge(
+ "smartctl_version",
+ "SMART metric smartctl_version",
+ ["version"],
+ namespace=namespace,
+ registry=registry,
+ ),
+ "device_active": Gauge(
+ "device_active",
+ "SMART metric device_active",
+ ["device", "disk"],
+ namespace=namespace,
+ registry=registry,
+ ),
+ "device_info": Gauge(
+ "device_info",
+ "SMART metric device_info",
+ [
+ "device",
+ "disk",
+ "vendor",
+ "product",
+ "revision",
+ "lun_id",
+ "model_family",
+ "device_model",
+ "serial_number",
+ "firmware_version",
+ ],
+ namespace=namespace,
+ registry=registry,
+ ),
+ "device_smart_available": Gauge(
+ "device_smart_available",
+ "SMART metric device_smart_available",
+ ["device", "disk"],
+ namespace=namespace,
+ registry=registry,
+ ),
+ "device_smart_enabled": Gauge(
+ "device_smart_enabled",
+ "SMART metric device_smart_enabled",
+ ["device", "disk"],
+ namespace=namespace,
+ registry=registry,
+ ),
+ "device_smart_healthy": Gauge(
+ "device_smart_healthy",
+ "SMART metric device_smart_healthy",
+ ["device", "disk"],
+ namespace=namespace,
+ registry=registry,
+ ),
+
+ # SMART attributes - ATA disks only
+ "attr_value": Gauge(
+ "attr_value",
+ "SMART metric attr_value",
+ ["device", "disk", "name"],
+ namespace=namespace,
+ registry=registry,
+ ),
+ "attr_worst": Gauge(
+ "attr_worst",
+ "SMART metric attr_worst",
+ ["device", "disk", "name"],
+ namespace=namespace,
+ registry=registry,
+ ),
+ "attr_threshold": Gauge(
+ "attr_threshold",
+ "SMART metric attr_threshold",
+ ["device", "disk", "name"],
+ namespace=namespace,
+ registry=registry,
+ ),
+ "attr_raw_value": Gauge(
+ "attr_raw_value",
+ "SMART metric attr_raw_value",
+ ["device", "disk", "name"],
+ namespace=namespace,
+ registry=registry,
+ ),
+ "device_errors": Gauge(
+ "device_errors",
+ "SMART metric device_errors",
+ ["device", "disk"],
+ namespace=namespace,
+ registry=registry,
+ ),
+ "available_spare_ratio": Gauge(
+ "available_spare_ratio",
+ "SMART metric available_spare_ratio",
+ ["device", "disk"],
+ namespace=namespace,
+ registry=registry,
+ ),
+ "available_spare_threshold_ratio": Gauge(
+ "available_spare_threshold_ratio",
+ "SMART metric available_spare_threshold_ratio",
+ ["device", "disk"],
+ namespace=namespace,
+ registry=registry,
+ ),
+ "percentage_used_ratio": Gauge(
+ "percentage_used_ratio",
+ "SMART metric percentage_used_ratio",
+ ["device", "disk"],
+ namespace=namespace,
+ registry=registry,
+ ),
+ "power_cycles_total": Gauge(
+ "power_cycles_total",
+ "SMART metric power_cycles_total",
+ ["device", "disk"],
+ namespace=namespace,
+ registry=registry,
+ ),
+ "power_on_hours_total": Gauge(
+ "power_on_hours_total",
+ "SMART metric power_on_hours_total",
+ ["device", "disk"],
+ namespace=namespace,
+ registry=registry,
+ ),
+ "temperature_celcius": Gauge(
+ "temperature_celcius",
+ "SMART metric temperature_celcius",
+ ["device", "disk"],
+ namespace=namespace,
+ registry=registry,
+ ),
+ "unsafe_shutdowns_total": Gauge(
+ "unsafe_shutdowns_total",
+ "SMART metric unsafe_shutdowns_total",
+ ["device", "disk"],
+ namespace=namespace,
+ registry=registry,
+ ),
+ "media_errors_total": Gauge(
+ "media_errors_total",
+ "SMART metric media_errors_total",
+ ["device", "disk"],
+ namespace=namespace,
+ registry=registry,
+ ),
+ "num_err_log_entries_total": Gauge(
+ "num_err_log_entries_total",
+ "SMART metric num_err_log_entries_total",
+ ["device", "disk"],
+ namespace=namespace,
+ registry=registry,
+ ),
+ "warning_temperature_time_total": Gauge(
+ "warning_temperature_time_total",
+ "SMART metric warning_temperature_time_total",
+ ["device", "disk"],
+ namespace=namespace,
+ registry=registry,
+ ),
+ "critical_temperature_time_total": Gauge(
+ "critical_temperature_time_total",
+ "SMART metric critical_temperature_time_total",
+ ["device", "disk"],
+ namespace=namespace,
+ registry=registry,
+ ),
}
-Metric = collections.namedtuple('Metric', 'name labels value')
-
SmartAttribute = collections.namedtuple('SmartAttribute', [
'id', 'name', 'flag', 'value', 'worst', 'threshold', 'type', 'updated',
'when_failed', 'raw_value',
@@ -85,37 +257,12 @@ class Device(collections.namedtuple('DeviceBase', 'path opts')):
@property
def base_labels(self):
- return {'device': self.path, 'type': self.type}
+ return {'device': self.path, 'disk': self.type.partition('+')[2] or '0'}
def smartctl_select(self):
return ['--device', self.type, self.path]
-def metric_key(metric, prefix=''):
- return '{prefix}{metric.name}'.format(prefix=prefix, metric=metric)
-
-
-def metric_format(metric, prefix=''):
- key = metric_key(metric, prefix)
- labels = ','.join(
- '{k}="{v}"'.format(k=k, v=v.replace('"', '\\"')) for k, v in metric.labels.items())
- value = decimal.Decimal(metric.value)
-
- return '{key}{{ '{{{' }}labels{{ '}}}' }} {value}'.format(
- key=key, labels=labels, value=value)
-
-
-def metric_print_meta(metric, prefix=''):
- key = metric_key(metric, prefix)
- print('# HELP {key} SMART metric {metric.name}'.format(
- key=key, metric=metric))
- print('# TYPE {key} gauge'.format(key=key))
-
-
-def metric_print(metric, prefix=''):
- print(metric_format(metric, prefix))
-
-
def smart_ctl(*args, check=True):
"""Wrapper around invoking the smartctl binary.
@@ -131,7 +278,7 @@ def smart_ctl_version():
return smart_ctl('-V').split('\n')[0].split()[1]
-def find_devices():
+def find_devices(by_id):
"""Find SMART devices.
Yields:
@@ -140,7 +287,10 @@ def find_devices():
parser = argparse.ArgumentParser()
parser.add_argument('-d', '--device', dest='type')
- devices = smart_ctl('--scan-open')
+ args = ['--scan-open']
+ if by_id:
+ args.extend(['-d', 'by-id'])
+ devices = smart_ctl(*args)
for device in devices.split('\n'):
device = device.strip()
@@ -225,15 +375,20 @@ def collect_device_info(device):
Args:
device: (Device) Device in question.
-
- Yields:
- (Metric) metrics describing general device information.
"""
values = dict(device_info(device))
- yield Metric('device_info', {
- **device.base_labels,
- **{v: values[k] for k, v in device_info_map.items() if k in values}
- }, True)
+ metrics["device_info"].labels(
+ device.base_labels["device"],
+ device.base_labels["disk"],
+ values.get("Vendor", ""),
+ values.get("Product", ""),
+ values.get("Revision", ""),
+ values.get("Logical Unit id", ""),
+ values.get("Model Family", ""),
+ values.get("Device Model", ""),
+ values.get("Serial Number", ""),
+ values.get("Firmware Version", ""),
+ ).set(1)
def collect_device_health_self_assessment(device):
@@ -241,16 +396,13 @@ def collect_device_health_self_assessment(device):
Args:
device: (Device) Device in question.
-
- Yields:
- (Metric) Device health self assessment.
"""
out = smart_ctl('--health', *device.smartctl_select(), check=False)
self_assessment_passed = bool(self_test_re.search(out))
-
- yield Metric(
- 'device_smart_healthy', device.base_labels, self_assessment_passed)
+ metrics["device_smart_healthy"].labels(
+ device.base_labels["device"], device.base_labels["disk"]
+ ).set(self_assessment_passed)
def collect_ata_metrics(device):
@@ -298,15 +450,12 @@ def collect_ata_metrics(device):
entry['threshold'] = '0'
if entry['name'] in smart_attributes_whitelist and entry['name'] not in seen:
- labels = {
- 'name': entry['name'],
- **device.base_labels,
- }
-
for col in 'value', 'worst', 'threshold', 'raw_value':
- yield Metric(
- 'attr_{col}'.format(col=col),
- labels, entry[col])
+ metrics["attr_" + col].labels(
+ device.base_labels["device"],
+ device.base_labels["disk"],
+ entry["name"],
+ ).set(entry[col])
seen.add(entry['name'])
@@ -316,9 +465,6 @@ def collect_ata_error_count(device):
Args:
device: (Device) Device in question.
-
- Yields:
- (Metric) Device error count.
"""
error_log = smart_ctl(
'-l', 'xerror,1', *device.smartctl_select(), check=False)
@@ -326,8 +472,9 @@ def collect_ata_error_count(device):
m = ata_error_count_re.search(error_log)
error_count = m.group(1) if m is not None else 0
-
- yield Metric('device_errors', device.base_labels, error_count)
+ metrics["device_errors"].labels(
+ device.base_labels["device"], device.base_labels["disk"]
+ ).set(error_count)
def collect_nvme_metrics(device):
@@ -336,7 +483,7 @@ def collect_nvme_metrics(device):
'--attributes', *device.smartctl_select()
)
- # replace multiple occurrences of whitespaces with a singel whitespace
+ # replace multiple occurrences of whitespaces with a single whitespace
attributes = re.sub(r'[\t\x20]+', ' ', attributes)
# Turn smartctl output into a list of lines and skip to the table of
@@ -345,92 +492,81 @@ def collect_nvme_metrics(device):
for line in attribute_lines:
label, value = line.split(':')
if label == 'Available Spare':
- yield Metric('available_spare_ratio', device.base_labels, value[0:-1])
+ metrics['available_spare_ratio'].labels(device.base_labels["device"], device.base_labels["disk"]).set(value[0:-1])
elif label == 'Available Spare Threshold':
- yield Metric('available_spare_threshold_ratio', device.base_labels, value[0:-1])
+ metrics['available_spare_threshold_ratio'].labels(device.base_labels["device"], device.base_labels["disk"]).set(value[0:-1])
elif label == 'Percentage Used':
- yield Metric('percentage_used_ratio', device.base_labels, value[0:-1])
+ metrics['percentage_used_ratio'].labels(device.base_labels["device"], device.base_labels["disk"]).set(value[0:-1])
elif label == 'Power Cycle':
- yield Metric('power_cycles_total', device.base_labels, value)
+ metrics['power_cycles_total'].labels(device.base_labels["device"], device.base_labels["disk"]).set(value)
elif label == 'Power On Hours':
- yield Metric('power_on_hours_total', device.base_labels, value.replace(',', ''))
+ metrics['power_on_hours_total'].labels(device.base_labels["device"], device.base_labels["disk"]).set(value.replace(',', ''))
elif label == 'Temperature':
- yield Metric('temperature_celcius', device.base_labels, value.replace(' Celsius', ''))
+ metrics['temperature_celcius'].labels(device.base_labels["device"], device.base_labels["disk"]).set(value.replace(' Celsius', ''))
elif label == 'Unsafe Shutdowns':
- yield Metric('unsafe_shutdowns_total', device.base_labels, value)
+ metrics['unsafe_shutdowns_total'].labels(device.base_labels["device"], device.base_labels["disk"]).set(value)
elif label == 'Media and Data Integrity Errors':
- yield Metric('media_errors_total', device.base_labels, value)
+ metrics['media_errors_total'].labels(device.base_labels["device"], device.base_labels["disk"]).set(value)
elif label == 'Error Information Log Entries':
- yield Metric('num_err_log_entries_total', device.base_labels, value)
+ metrics['num_err_log_entries_total'].labels(device.base_labels["device"], device.base_labels["disk"]).set(value)
elif label == 'Warning Comp. Temperature Time':
- yield Metric('warning_temperature_time_total', device.base_labels, value)
+ metrics['warning_temperature_time_total'].labels(device.base_labels["device"], device.base_labels["disk"]).set(value)
elif label == 'Critical Comp. Temperature Time':
- yield Metric('critical_temperature_time_total', device.base_labels, value)
-
+ metrics['critical_temperature_time_total'].labels(device.base_labels["device"], device.base_labels["disk"]).set(value)
-def collect_disks_smart_metrics(wakeup_disks):
- now = int(datetime.datetime.now().timestamp())
-
- for device in find_devices():
- yield Metric('smartctl_run', device.base_labels, now)
+def collect_disks_smart_metrics(wakeup_disks, by_id, include_nvme):
+ for device in find_devices(by_id):
is_active = device_is_active(device)
+ metrics["device_active"].labels(
+ device.base_labels["device"], device.base_labels["disk"],
+ ).set(is_active)
- yield Metric('device_active', device.base_labels, is_active)
-
- # Skip further metrics collection to prevent the disk from
- # spinning up.
+ # Skip further metrics collection to prevent the disk from spinning up.
if not is_active and not wakeup_disks:
continue
- yield from collect_device_info(device)
+ collect_device_info(device)
smart_available, smart_enabled = device_smart_capabilities(device)
- yield Metric(
- 'device_smart_available', device.base_labels, smart_available)
- yield Metric(
- 'device_smart_enabled', device.base_labels, smart_enabled)
+ metrics["device_smart_available"].labels(
+ device.base_labels["device"], device.base_labels["disk"]
+ ).set(smart_available)
- # Skip further metrics collection here if SMART is disabled
- # on the device. Further smartctl invocations would fail
- # anyways.
+ metrics["device_smart_enabled"].labels(
+ device.base_labels["device"], device.base_labels["disk"]
+ ).set(smart_enabled)
+
+ # Skip further metrics collection here if SMART is disabled on the device. Further smartctl
+ # invocations would fail anyway.
if not smart_available:
continue
- yield from collect_device_health_self_assessment(device)
+ collect_device_health_self_assessment(device)
if device.type.startswith('sat'):
- yield from collect_ata_metrics(device)
-
- yield from collect_ata_error_count(device)
+ collect_ata_metrics(device)
+ collect_ata_error_count(device)
- if device.type == 'nvme':
- yield from collect_nvme_metrics(device)
+ if include_nvme and device.type == 'nvme':
+ collect_nvme_metrics(device)
def main():
parser = argparse.ArgumentParser()
- parser.add_argument('-s', '--wakeup-disks', dest='wakeup_disks', action='store_true')
+ parser.add_argument('-s', '--wakeup-disks', dest='wakeup_disks', action='store_true',
+ help="Wake up disks to collect live stats")
+ parser.add_argument('--by-id', dest='by_id', action='store_true',
+ help="Use /dev/disk/by-id/X instead of /dev/sdX to index devices")
+ parser.add_argument('--include-nvme', dest='include_nvme', action='store_true',
+ help="Include metrics for NVMe drives")
args = parser.parse_args(sys.argv[1:])
- version_metric = Metric('smartctl_version', {
- 'version': smart_ctl_version()
- }, True)
- metric_print_meta(version_metric, 'smartmon_')
- metric_print(version_metric, 'smartmon_')
-
- metrics = list(collect_disks_smart_metrics(args.wakeup_disks))
- metrics.sort(key=lambda i: i.name)
-
- previous_name = None
- for m in metrics:
- if m.name != previous_name:
- metric_print_meta(m, 'smartmon_')
-
- previous_name = m.name
+ metrics["smartctl_version"].labels(smart_ctl_version()).set(1)
- metric_print(m, 'smartmon_')
+ collect_disks_smart_metrics(args.wakeup_disks, args.by_id, args.include_nvme)
+ print(generate_latest(registry).decode(), end="")
if __name__ == '__main__':
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.service.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.service.j2
index 8ee60c02..8d91677b 100644
--- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.service.j2
+++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.service.j2
@@ -5,7 +5,7 @@ Description=Promethues node exporter textfile collector smartmon
Type=oneshot
Environment=TMPDIR=/var/lib/prometheus-node-exporter/textfile-collector
Environment=LC_NUMERIC=C
-ExecStart=bash -o pipefail -c "/usr/local/share/prometheus-node-exporter/smartmon | sponge /var/lib/prometheus-node-exporter/textfile-collector/smartmon.prom"
+ExecStart=bash -o pipefail -c "/usr/local/share/prometheus-node-exporter/smartmon --include-nvme | sponge /var/lib/prometheus-node-exporter/textfile-collector/smartmon.prom"
TimeoutStartSec=30s
# systemd hardening-options