From 6320da1262c1f44ac773c6b6578a59ba286ce973 Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Fri, 24 Sep 2021 00:57:01 +0200 Subject: add some basic prometheus node exporter textfile collector scripts --- roles/monitoring/prometheus/exporter/node/defaults/main.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'roles/monitoring/prometheus/exporter/node/defaults') diff --git a/roles/monitoring/prometheus/exporter/node/defaults/main.yml b/roles/monitoring/prometheus/exporter/node/defaults/main.yml index 56227fbb..4a9b40cd 100644 --- a/roles/monitoring/prometheus/exporter/node/defaults/main.yml +++ b/roles/monitoring/prometheus/exporter/node/defaults/main.yml @@ -9,4 +9,7 @@ prometheus_exporter_node_timesync_collector: "{{ _prometheus_exporter_node_time_ prometheus_exporter_node_disable_collectors: [] prometheus_exporter_node_extra_collectors: -- "{{ prometheus_exporter_node_timesync_collector }}" + - "{{ prometheus_exporter_node_timesync_collector }}" + +prometheus_exporter_node_textfile_collector_scripts: + - deleted-libraries -- cgit v1.2.3 From bb9f5e0b165895e748ca1e6d83c1b3404c7cef71 Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Fri, 24 Sep 2021 12:56:42 +0200 Subject: also run new textcollector scripts after reboot --- roles/monitoring/prometheus/exporter/node/defaults/main.yml | 2 ++ .../exporter/node/templates/textfile-collector-scripts/apt.timer.j2 | 1 + .../templates/textfile-collector-scripts/deleted-libraries.timer.j2 | 1 + 3 files changed, 4 insertions(+) (limited to 'roles/monitoring/prometheus/exporter/node/defaults') diff --git a/roles/monitoring/prometheus/exporter/node/defaults/main.yml b/roles/monitoring/prometheus/exporter/node/defaults/main.yml index 4a9b40cd..2714a7fe 100644 --- a/roles/monitoring/prometheus/exporter/node/defaults/main.yml +++ b/roles/monitoring/prometheus/exporter/node/defaults/main.yml @@ -7,6 +7,8 @@ _prometheus_exporter_node_time_collector_map_: prometheus_exporter_node_timesync_collector: "{{ _prometheus_exporter_node_time_collector_map_[ntp_variant | default('')] }}" +## TODO: systemd state collector??? + prometheus_exporter_node_disable_collectors: [] prometheus_exporter_node_extra_collectors: - "{{ prometheus_exporter_node_timesync_collector }}" diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.timer.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.timer.j2 index 5e7d3062..b8a9c34e 100644 --- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.timer.j2 +++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.timer.j2 @@ -2,6 +2,7 @@ Description=Promethues node exporter textfile collector apt [Timer] +OnBootSec=50s OnCalendar=*-*-* *:1/30:17 AccuracySec=10s diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.timer.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.timer.j2 index 8f38050a..1646ac73 100644 --- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.timer.j2 +++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.timer.j2 @@ -2,6 +2,7 @@ Description=Promethues node exporter textfile collector deleted-libraries [Timer] +OnBootSec=60s OnCalendar=*-*-* *:2/30:22 AccuracySec=10s -- cgit v1.2.3 From 4ec26b272ab6090498a9eefa4a0efb06248b1ef4 Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Fri, 24 Sep 2021 14:11:35 +0200 Subject: prometheus: add monitoring for systemd units --- .../prometheus/exporter/node/defaults/main.yml | 1 + .../prometheus/server/defaults/main/rules_node.yml | 11 ++++++++++- .../prometheus/server/templates/jobs/node.j2 | 20 ++++++++++++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 roles/monitoring/prometheus/server/templates/jobs/node.j2 (limited to 'roles/monitoring/prometheus/exporter/node/defaults') diff --git a/roles/monitoring/prometheus/exporter/node/defaults/main.yml b/roles/monitoring/prometheus/exporter/node/defaults/main.yml index 2714a7fe..491e70f6 100644 --- a/roles/monitoring/prometheus/exporter/node/defaults/main.yml +++ b/roles/monitoring/prometheus/exporter/node/defaults/main.yml @@ -12,6 +12,7 @@ prometheus_exporter_node_timesync_collector: "{{ _prometheus_exporter_node_time_ prometheus_exporter_node_disable_collectors: [] prometheus_exporter_node_extra_collectors: - "{{ prometheus_exporter_node_timesync_collector }}" + - systemd prometheus_exporter_node_textfile_collector_scripts: - deleted-libraries diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml index 2c7f9319..64a7d562 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml @@ -92,6 +92,15 @@ prometheus_server_rules_node: summary: Host CPU steal noisy neighbor (instance {{ '{{' }} $labels.instance {{ '}}' }}) description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + - alert: HostSystemdNotRunning + expr: node_systemd_system_running == 0 + for: 0m + labels: + severity: warning + annotations: + summary: Host systemd is not in running state (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "systemd is not in running state.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + - alert: HostSystemdServiceCrashed expr: node_systemd_unit_state{state="failed"} == 1 for: 0m @@ -99,7 +108,7 @@ prometheus_server_rules_node: severity: warning annotations: summary: Host systemd service crashed (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "systemd service crashed\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + description: "The systemd service unit {{ '{{' }} $labels.name {{ '}}' }} is in failed state.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - alert: HostPhysicalComponentTooHot expr: node_hwmon_temp_celsius > 75 diff --git a/roles/monitoring/prometheus/server/templates/jobs/node.j2 b/roles/monitoring/prometheus/server/templates/jobs/node.j2 new file mode 100644 index 00000000..ba9eab31 --- /dev/null +++ b/roles/monitoring/prometheus/server/templates/jobs/node.j2 @@ -0,0 +1,20 @@ + - job_name: '{{ job }}' + metrics_path: /proxy + params: + module: + - {{ job }} + scheme: https + tls_config: + ca_file: /etc/ssl/prometheus/ca-crt.pem + cert_file: /etc/ssl/prometheus/server/scrape-crt.pem + key_file: /etc/ssl/prometheus/server/scrape-key.pem + file_sd_configs: + - files: + - "/etc/prometheus/targets/{{ job }}/*.yml" + metric_relabel_configs: + - source_labels: [ "mountpoint" ] + regex: ".*/\\.snapshot/.*" + action: drop + - source_labels: [ "__name__", "state" ] + regex: "node_systemd_unit_state;(activating|deactivating|inactive)" + action: drop -- cgit v1.2.3 From 9da269b334fc9a1949c787ea37a3d5879bc2b865 Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Fri, 24 Sep 2021 14:13:51 +0200 Subject: fix some todos --- roles/monitoring/prometheus/exporter/node/defaults/main.yml | 2 -- roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml | 1 + roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) (limited to 'roles/monitoring/prometheus/exporter/node/defaults') diff --git a/roles/monitoring/prometheus/exporter/node/defaults/main.yml b/roles/monitoring/prometheus/exporter/node/defaults/main.yml index 491e70f6..9309562f 100644 --- a/roles/monitoring/prometheus/exporter/node/defaults/main.yml +++ b/roles/monitoring/prometheus/exporter/node/defaults/main.yml @@ -7,8 +7,6 @@ _prometheus_exporter_node_time_collector_map_: prometheus_exporter_node_timesync_collector: "{{ _prometheus_exporter_node_time_collector_map_[ntp_variant | default('')] }}" -## TODO: systemd state collector??? - prometheus_exporter_node_disable_collectors: [] prometheus_exporter_node_extra_collectors: - "{{ prometheus_exporter_node_timesync_collector }}" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml b/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml index bccb0ca8..150a507e 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml @@ -1,3 +1,4 @@ --- prometheus_server_rules_nut__ups_extra: [] prometheus_server_rules_nut__ups: [] +## TODO: add NUT/UPS alert rules diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml b/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml index 88d84f31..04b178f1 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml @@ -1,3 +1,4 @@ --- prometheus_server_rules_openwrt_extra: [] prometheus_server_rules_openwrt: [] +## TODO: add openwrt specific alert rules -- cgit v1.2.3 From 9a47d5c3ef94cb09338a1b64d4dc9365d526bb54 Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Fri, 24 Sep 2021 15:05:48 +0200 Subject: refactor textfile collector script handling --- roles/monitoring/prometheus/exporter/TODO | 4 - .../prometheus/exporter/node/defaults/main.yml | 1 + .../prometheus/exporter/node/files/smartmon | 391 +++++++++++++++++++++ .../prometheus/exporter/node/tasks/main.yml | 24 +- .../node/tasks/textfile_collector_script.yml | 2 +- .../textfile-collector-scripts/apt.service.j2 | 3 +- .../textfile-collector-scripts/apt.timer.j2 | 5 +- .../deleted-libraries.service.j2 | 3 +- .../deleted-libraries.timer.j2 | 5 +- .../textfile-collector-scripts/smartmon.service.j2 | 29 ++ .../textfile-collector-scripts/smartmon.timer.j2 | 13 + 11 files changed, 447 insertions(+), 33 deletions(-) create mode 100644 roles/monitoring/prometheus/exporter/node/files/smartmon create mode 100644 roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.service.j2 create mode 100644 roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.timer.j2 (limited to 'roles/monitoring/prometheus/exporter/node/defaults') diff --git a/roles/monitoring/prometheus/exporter/TODO b/roles/monitoring/prometheus/exporter/TODO index c02e5699..79ff8721 100644 --- a/roles/monitoring/prometheus/exporter/TODO +++ b/roles/monitoring/prometheus/exporter/TODO @@ -1,7 +1,3 @@ -Node Exporter - Text Collector Scripts: - - https://github.com/prometheus-community/node-exporter-textfile-collector-scripts - - https://packages.debian.org/bullseye/prometheus-node-exporter-collectors - IPMI Exporter: - https://github.com/soundcloud/ipmi_exporter - https://packages.debian.org/bullseye/prometheus-ipmi-exporter diff --git a/roles/monitoring/prometheus/exporter/node/defaults/main.yml b/roles/monitoring/prometheus/exporter/node/defaults/main.yml index 9309562f..870753c3 100644 --- a/roles/monitoring/prometheus/exporter/node/defaults/main.yml +++ b/roles/monitoring/prometheus/exporter/node/defaults/main.yml @@ -14,3 +14,4 @@ prometheus_exporter_node_extra_collectors: prometheus_exporter_node_textfile_collector_scripts: - deleted-libraries + - smartmon diff --git a/roles/monitoring/prometheus/exporter/node/files/smartmon b/roles/monitoring/prometheus/exporter/node/files/smartmon new file mode 100644 index 00000000..1c39b492 --- /dev/null +++ b/roles/monitoring/prometheus/exporter/node/files/smartmon @@ -0,0 +1,391 @@ +#!/usr/bin/env python3 +import argparse +import collections +import csv +import datetime +import decimal +import re +import shlex +import subprocess +import sys + +device_info_re = re.compile(r'^(?P[^:]+?)(?:(?:\sis|):)\s*(?P.*)$') + +ata_error_count_re = re.compile( + r'^Error (\d+) \[\d+\] occurred', re.MULTILINE) + +self_test_re = re.compile(r'^SMART.*(PASSED|OK)$', re.MULTILINE) + +device_info_map = { + 'Vendor': 'vendor', + 'Product': 'product', + 'Revision': 'revision', + 'Logical Unit id': 'lun_id', + 'Model Family': 'model_family', + 'Device Model': 'device_model', + 'Serial Number': 'serial_number', + 'Firmware Version': 'firmware_version', +} + +smart_attributes_whitelist = { + 'airflow_temperature_cel', + 'command_timeout', + 'current_pending_sector', + 'end_to_end_error', + 'erase_fail_count_total', + 'g_sense_error_rate', + 'hardware_ecc_recovered', + 'host_reads_mib', + 'host_reads_32mib', + 'host_writes_mib', + 'host_writes_32mib', + 'load_cycle_count', + 'media_wearout_indicator', + 'wear_leveling_count', + 'nand_writes_1gib', + 'offline_uncorrectable', + 'power_cycle_count', + 'power_on_hours', + 'program_fail_count', + 'raw_read_error_rate', + 'reallocated_event_count', + 'reallocated_sector_ct', + 'reported_uncorrect', + 'sata_downshift_count', + 'seek_error_rate', + 'spin_retry_count', + 'spin_up_time', + 'start_stop_count', + 'temperature_case', + 'temperature_celsius', + 'temperature_internal', + 'total_lbas_read', + 'total_lbas_written', + 'udma_crc_error_count', + 'unsafe_shutdown_count', + 'workld_host_reads_perc', + 'workld_media_wear_indic', + 'workload_minutes', +} + +Metric = collections.namedtuple('Metric', 'name labels value') + +SmartAttribute = collections.namedtuple('SmartAttribute', [ + 'id', 'name', 'flag', 'value', 'worst', 'threshold', 'type', 'updated', + 'when_failed', 'raw_value', +]) + + +class Device(collections.namedtuple('DeviceBase', 'path opts')): + """Representation of a device as found by smartctl --scan output.""" + + @property + def type(self): + return self.opts.type + + @property + def base_labels(self): + return {'device': self.path, 'disk': self.type.partition('+')[2] or '0'} + + def smartctl_select(self): + return ['--device', self.type, self.path] + + +def metric_key(metric, prefix=''): + return '{prefix}{metric.name}'.format(prefix=prefix, metric=metric) + + +def metric_format(metric, prefix=''): + key = metric_key(metric, prefix) + labels = ','.join( + '{k}="{v}"'.format(k=k, v=v.replace('"', '\\"')) for k, v in metric.labels.items()) + value = decimal.Decimal(metric.value) + + return '{key}{{{labels}}} {value}'.format( + key=key, labels=labels, value=value) + + +def metric_print_meta(metric, prefix=''): + key = metric_key(metric, prefix) + print('# HELP {key} SMART metric {metric.name}'.format( + key=key, metric=metric)) + print('# TYPE {key} gauge'.format(key=key)) + + +def metric_print(metric, prefix=''): + print(metric_format(metric, prefix)) + + +def smart_ctl(*args, check=True): + """Wrapper around invoking the smartctl binary. + + Returns: + (str) Data piped to stdout by the smartctl subprocess. + """ + return subprocess.run( + ['smartctl', *args], stdout=subprocess.PIPE, check=check + ).stdout.decode('utf-8') + + +def smart_ctl_version(): + return smart_ctl('-V').split('\n')[0].split()[1] + + +def find_devices(): + """Find SMART devices. + + Yields: + (Device) Single device found by smartctl. + """ + parser = argparse.ArgumentParser() + parser.add_argument('-d', '--device', dest='type') + + devices = smart_ctl('--scan-open') + + for device in devices.split('\n'): + device = device.strip() + if not device: + continue + + tokens = shlex.split(device, comments=True) + if not tokens: + continue + + yield Device(tokens[0], parser.parse_args(tokens[1:])) + + +def device_is_active(device): + """Returns whenever the given device is currently active or not. + + Args: + device: (Device) Device in question. + + Returns: + (bool) True if the device is active and False otherwise. + """ + try: + smart_ctl('--nocheck', 'standby', *device.smartctl_select()) + except subprocess.CalledProcessError: + return False + + return True + + +def device_info(device): + """Query device for basic model information. + + Args: + device: (Device) Device in question. + + Returns: + (generator): Generator yielding: + + key (str): Key describing the value. + value (str): Actual value. + """ + info_lines = smart_ctl( + '--info', *device.smartctl_select() + ).strip().split('\n')[3:] + + matches = (device_info_re.match(line) for line in info_lines) + return (m.groups() for m in matches if m is not None) + + +def device_smart_capabilities(device): + """Returns SMART capabilities of the given device. + + Args: + device: (Device) Device in question. + + Returns: + (tuple): tuple containing: + + (bool): True whenever SMART is available, False otherwise. + (bool): True whenever SMART is enabled, False otherwise. + """ + groups = device_info(device) + + state = { + g[1].split(' ', 1)[0] + for g in groups if g[0] == 'SMART support'} + + smart_available = 'Available' in state + smart_enabled = 'Enabled' in state + + return smart_available, smart_enabled + + +def collect_device_info(device): + """Collect basic device information. + + Args: + device: (Device) Device in question. + + Yields: + (Metric) metrics describing general device information. + """ + values = dict(device_info(device)) + yield Metric('device_info', { + **device.base_labels, + **{v: values[k] for k, v in device_info_map.items() if k in values} + }, True) + + +def collect_device_health_self_assessment(device): + """Collect metric about the device health self assessment. + + Args: + device: (Device) Device in question. + + Yields: + (Metric) Device health self assessment. + """ + out = smart_ctl('--health', *device.smartctl_select(), check=False) + + self_assessment_passed = bool(self_test_re.search(out)) + + yield Metric( + 'device_smart_healthy', device.base_labels, self_assessment_passed) + + +def collect_ata_metrics(device): + # Fetch SMART attributes for the given device. + attributes = smart_ctl( + '--attributes', *device.smartctl_select() + ) + + # replace multiple occurrences of whitespace with a single whitespace + # so that the CSV Parser recognizes individual columns properly. + attributes = re.sub(r'[\t\x20]+', ' ', attributes) + + # Turn smartctl output into a list of lines and skip to the table of + # SMART attributes. + attribute_lines = attributes.strip().split('\n')[7:] + + # Some attributes have multiple IDs but have the same name. Don't + # yield attributes that already have been reported before. + seen = set() + + reader = csv.DictReader( + (line.strip() for line in attribute_lines), + fieldnames=SmartAttribute._fields[:-1], + restkey=SmartAttribute._fields[-1], delimiter=' ') + for entry in reader: + # We're only interested in the SMART attributes that are + # whitelisted here. + entry['name'] = entry['name'].lower() + if entry['name'] not in smart_attributes_whitelist: + continue + + # Ensure that only the numeric parts are fetched from the raw_value. + # Attributes such as 194 Temperature_Celsius reported by my SSD + # are in the format of "36 (Min/Max 24/40)" which can't be expressed + # properly as a prometheus metric. + m = re.match(r'^(\d+)', ' '.join(entry['raw_value'])) + if not m: + continue + entry['raw_value'] = m.group(1) + + # Some device models report "---" in the threshold value where most + # devices would report "000". We do the substitution here because + # downstream code expects values to be convertable to integer. + if entry['threshold'] == '---': + entry['threshold'] = '0' + + if entry['name'] in smart_attributes_whitelist and entry['name'] not in seen: + labels = { + 'name': entry['name'], + **device.base_labels, + } + + for col in 'value', 'worst', 'threshold', 'raw_value': + yield Metric( + 'attr_{col}'.format(col=col), + labels, entry[col]) + + seen.add(entry['name']) + + +def collect_ata_error_count(device): + """Inspect the device error log and report the amount of entries. + + Args: + device: (Device) Device in question. + + Yields: + (Metric) Device error count. + """ + error_log = smart_ctl( + '-l', 'xerror,1', *device.smartctl_select(), check=False) + + m = ata_error_count_re.search(error_log) + + error_count = m.group(1) if m is not None else 0 + + yield Metric('device_errors', device.base_labels, error_count) + + +def collect_disks_smart_metrics(wakeup_disks): + now = int(datetime.datetime.utcnow().timestamp()) + + for device in find_devices(): + yield Metric('smartctl_run', device.base_labels, now) + + is_active = device_is_active(device) + + yield Metric('device_active', device.base_labels, is_active) + + # Skip further metrics collection to prevent the disk from + # spinning up. + if not is_active and not wakeup_disks: + continue + + yield from collect_device_info(device) + + smart_available, smart_enabled = device_smart_capabilities(device) + + yield Metric( + 'device_smart_available', device.base_labels, smart_available) + yield Metric( + 'device_smart_enabled', device.base_labels, smart_enabled) + + # Skip further metrics collection here if SMART is disabled + # on the device. Further smartctl invocations would fail + # anyways. + if not smart_available: + continue + + yield from collect_device_health_self_assessment(device) + + if device.type.startswith('sat'): + yield from collect_ata_metrics(device) + + yield from collect_ata_error_count(device) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-s', '--wakeup-disks', dest='wakeup_disks', action='store_true') + args = parser.parse_args(sys.argv[1:]) + + version_metric = Metric('smartctl_version', { + 'version': smart_ctl_version() + }, True) + metric_print_meta(version_metric, 'smartmon_') + metric_print(version_metric, 'smartmon_') + + metrics = list(collect_disks_smart_metrics(args.wakeup_disks)) + metrics.sort(key=lambda i: i.name) + + previous_name = None + for m in metrics: + if m.name != previous_name: + metric_print_meta(m, 'smartmon_') + + previous_name = m.name + + metric_print(m, 'smartmon_') + + +if __name__ == '__main__': + main() diff --git a/roles/monitoring/prometheus/exporter/node/tasks/main.yml b/roles/monitoring/prometheus/exporter/node/tasks/main.yml index 61e385f7..56903a33 100644 --- a/roles/monitoring/prometheus/exporter/node/tasks/main.yml +++ b/roles/monitoring/prometheus/exporter/node/tasks/main.yml @@ -2,7 +2,9 @@ ## TODO: pin version - name: install apt packages apt: - name: prom-exporter-node + name: + - prom-exporter-node + - moreutils state: present - name: create directory for textfile collector @@ -34,27 +36,9 @@ - name: create directory for textfile collector scripts file: - path: /usr/local/lib/prometheus-node-exporter + path: /usr/local/share/prometheus-node-exporter state: directory -- name: install textfile collector script wrapper - copy: - content: | - #!/bin/bash - - if [ -z "$1" ]; then - echo "Please specify which collector script to call!" - exit 1 - fi - collector="$1" - - set -e - rm -f "/var/lib/prometheus-node-exporter/textfile-collector/$collector.prom".* - "/usr/local/lib/prometheus-node-exporter/$collector" > "/var/lib/prometheus-node-exporter/textfile-collector/$collector.prom.$$" - mv "/var/lib/prometheus-node-exporter/textfile-collector/$collector.prom.$$" "/var/lib/prometheus-node-exporter/textfile-collector/$collector.prom" - dest: /usr/local/lib/prometheus-node-exporter/run-collector - mode: 0755 - - name: install the apt textfile collector script when: ansible_pkg_mgr == "apt" vars: diff --git a/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_script.yml b/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_script.yml index 1a39bb4c..5c068fe7 100644 --- a/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_script.yml +++ b/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_script.yml @@ -2,7 +2,7 @@ - name: install the collector script copy: src: "{{ textfile_collector_name }}" - dest: "/usr/local/lib/prometheus-node-exporter/{{ textfile_collector_name }}" + dest: "/usr/local/share/prometheus-node-exporter/{{ textfile_collector_name }}" mode: 0755 - name: install systemd service units diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.service.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.service.j2 index b0e9d167..7eca94fb 100644 --- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.service.j2 +++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.service.j2 @@ -3,7 +3,8 @@ Description=Promethues node exporter textfile collector apt [Service] Type=oneshot -ExecStart=/usr/local/lib/prometheus-node-exporter/run-collector apt +Environment=TMPDIR=/var/lib/prometheus-node-exporter/textfile-collector +ExecStart=bash -c "/usr/local/share/prometheus-node-exporter/apt | sponge /var/lib/prometheus-node-exporter/textfile-collector/apt.prom" # systemd hardening-options AmbientCapabilities= diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.timer.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.timer.j2 index b8a9c34e..dc473749 100644 --- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.timer.j2 +++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.timer.j2 @@ -2,9 +2,8 @@ Description=Promethues node exporter textfile collector apt [Timer] -OnBootSec=50s -OnCalendar=*-*-* *:1/30:17 -AccuracySec=10s +OnBootSec=10s +OnUnitActiveSec=15min [Install] WantedBy=timers.target diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.service.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.service.j2 index 9dbc822f..7b15e558 100644 --- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.service.j2 +++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.service.j2 @@ -3,7 +3,8 @@ Description=Promethues node exporter textfile collector deleted-libraries [Service] Type=oneshot -ExecStart=/usr/local/lib/prometheus-node-exporter/run-collector deleted-libraries +Environment=TMPDIR=/var/lib/prometheus-node-exporter/textfile-collector +ExecStart=bash -c "/usr/local/share/prometheus-node-exporter/deleted-libraries | sponge /var/lib/prometheus-node-exporter/textfile-collector/deleted-libraries.prom" # systemd hardening-options AmbientCapabilities=CAP_SYS_PTRACE diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.timer.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.timer.j2 index 1646ac73..c09acecf 100644 --- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.timer.j2 +++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.timer.j2 @@ -2,9 +2,8 @@ Description=Promethues node exporter textfile collector deleted-libraries [Timer] -OnBootSec=60s -OnCalendar=*-*-* *:2/30:22 -AccuracySec=10s +OnBootSec=20s +OnUnitActiveSec=15min [Install] WantedBy=timers.target diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.service.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.service.j2 new file mode 100644 index 00000000..fc7c9f3f --- /dev/null +++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.service.j2 @@ -0,0 +1,29 @@ +[Unit] +Description=Promethues node exporter textfile collector smartmon + +[Service] +Type=oneshot +Environment=TMPDIR=/var/lib/prometheus-node-exporter/textfile-collector +Environment=LC_NUMERIC=C +ExecStart=bash -c "/usr/local/share/prometheus-node-exporter/smartmon | sponge /var/lib/prometheus-node-exporter/textfile-collector/smartmon.prom" + +# systemd hardening-options +AmbientCapabilities= +CapabilityBoundingSet= +LockPersonality=true +MemoryDenyWriteExecute=true +NoNewPrivileges=true +PrivateTmp=true +ProtectControlGroups=true +ProtectHome=true +ProtectKernelModules=true +ProtectKernelTunables=true +ProtectSystem=strict +ReadWritePaths=/var/lib/prometheus-node-exporter/textfile-collector +RemoveIPC=true +RestrictNamespaces=true +RestrictRealtime=true +SystemCallArchitectures=native + +[Install] +WantedBy=multi-user.target diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.timer.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.timer.j2 new file mode 100644 index 00000000..438da6b0 --- /dev/null +++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.timer.j2 @@ -0,0 +1,13 @@ +[Unit] +Description=Promethues node exporter textfile collector smartmon +ConditionPathExists=/usr/sbin/smartctl +ConditionPathExistsGlob=|/dev/sd* +ConditionPathExistsGlob=|/dev/hd* +ConditionPathExistsGlob=|/dev/nvme* + +[Timer] +OnBootSec=30s +OnUnitActiveSec=15min + +[Install] +WantedBy=timers.target -- cgit v1.2.3 From 4921bb0dc32811aa40cf07ec8ad83f6f197ada0e Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Sat, 25 Sep 2021 19:11:28 +0200 Subject: disabling smartmon textfile collector by default since this can lead to idempotence issues with systemd that don't have smartcl installed --- roles/monitoring/prometheus/exporter/node/defaults/main.yml | 1 - .../prometheus/exporter/node/tasks/textfile_collector_script.yml | 3 +++ .../node/templates/textfile-collector-scripts/smartmon.timer.j2 | 4 ---- 3 files changed, 3 insertions(+), 5 deletions(-) (limited to 'roles/monitoring/prometheus/exporter/node/defaults') diff --git a/roles/monitoring/prometheus/exporter/node/defaults/main.yml b/roles/monitoring/prometheus/exporter/node/defaults/main.yml index 870753c3..9309562f 100644 --- a/roles/monitoring/prometheus/exporter/node/defaults/main.yml +++ b/roles/monitoring/prometheus/exporter/node/defaults/main.yml @@ -14,4 +14,3 @@ prometheus_exporter_node_extra_collectors: prometheus_exporter_node_textfile_collector_scripts: - deleted-libraries - - smartmon diff --git a/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_script.yml b/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_script.yml index 5c068fe7..80390a15 100644 --- a/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_script.yml +++ b/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_script.yml @@ -19,3 +19,6 @@ name: "prometheus-node-exporter_{{ textfile_collector_name }}.timer" state: started enabled: yes + + +## TODO: install deps for textfile collectors: i.e. smartmontools for collector smartmon diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.timer.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.timer.j2 index 438da6b0..576f5a9f 100644 --- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.timer.j2 +++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.timer.j2 @@ -1,9 +1,5 @@ [Unit] Description=Promethues node exporter textfile collector smartmon -ConditionPathExists=/usr/sbin/smartctl -ConditionPathExistsGlob=|/dev/sd* -ConditionPathExistsGlob=|/dev/hd* -ConditionPathExistsGlob=|/dev/nvme* [Timer] OnBootSec=30s -- cgit v1.2.3