From 31e3ea00dc1c8f22af13a3205835a678d269ca77 Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Tue, 22 Mar 2022 00:20:38 +0100 Subject: prometheus: minor fixes and improvements --- .../deleted-libraries.j2 | 17 ++++---- .../textfile-collector-scripts/smartmon.j2 | 48 +++++++++++++++++++++- .../prometheus/server/defaults/main/rules_node.yml | 13 +++++- 3 files changed, 68 insertions(+), 10 deletions(-) diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2 index aeddc903..b1a78dec 100644 --- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2 +++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2 @@ -30,15 +30,18 @@ def main(): if path not in processes_linking_deleted_libraries: processes_linking_deleted_libraries[path] = {} - if library in processes_linking_deleted_libraries[path]: - processes_linking_deleted_libraries[path][library] += 1 - else: - processes_linking_deleted_libraries[path][library] = 1 - except EnvironmentError as e: + if library in processes_linking_deleted_libraries[path]: + processes_linking_deleted_libraries[path][library] += 1 + else: + processes_linking_deleted_libraries[path][library] = 1 + except FileNotFoundError: # Ignore non-existent files, since the files may have changed since # we globbed. - if e.errno != errno.ENOENT: - sys.exit('Failed to open file: {0}'.format(path)) + pass + except ProcessLookupError: + # If process vanishes while collecting the linked libs reading lines from + # the map file yields this error. 
+ pass num_processes_per_library = {} diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2 index b033faf0..2b60509c 100644 --- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2 +++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2 @@ -85,7 +85,7 @@ class Device(collections.namedtuple('DeviceBase', 'path opts')): @property def base_labels(self): - return {'device': self.path, 'disk': self.type.partition('+')[2] or '0'} + return {'device': self.path, 'type': self.type} def smartctl_select(self): return ['--device', self.type, self.path] @@ -203,6 +203,11 @@ def device_smart_capabilities(device): (bool): True whenever SMART is available, False otherwise. (bool): True whenever SMART is enabled, False otherwise. """ + + # NVME devices are SMART capable + if device.type == 'nvme': + return True, True + groups = device_info(device) state = { @@ -325,6 +330,44 @@ def collect_ata_error_count(device): yield Metric('device_errors', device.base_labels, error_count) +def collect_nvme_metrics(device): + # Fetch NVME metrics + attributes = smart_ctl( + '--attributes', *device.smartctl_select() + ) + + # replace multiple occurrences of whitespaces with a single whitespace + attributes = re.sub(r'[\t\x20]+', ' ', attributes) + + # Turn smartctl output into a list of lines and skip to the table of + # SMART attributes. 
+ attribute_lines = attributes.strip().split('\n')[6:] + for line in attribute_lines: + label, value = line.split(':') + if label == 'Available Spare': + yield Metric('available_spare_ratio', device.base_labels, value[0:-1]) + elif label == 'Available Spare Threshold': + yield Metric('available_spare_threshold_ratio', device.base_labels, value[0:-1]) + elif label == 'Percentage Used': + yield Metric('percentage_used_ratio', device.base_labels, value[0:-1]) + elif label == 'Power Cycle': + yield Metric('power_cycles_total', device.base_labels, value) + elif label == 'Power On Hours': + yield Metric('power_on_hours_total', device.base_labels, value.replace(',', '')) + elif label == 'Temperature': + yield Metric('temperature_celcius', device.base_labels, value.replace(' Celsius', '')) + elif label == 'Unsafe Shutdowns': + yield Metric('unsafe_shutdowns_total', device.base_labels, value) + elif label == 'Media and Data Integrity Errors': + yield Metric('media_errors_total', device.base_labels, value) + elif label == 'Error Information Log Entries': + yield Metric('num_err_log_entries_total', device.base_labels, value) + elif label == 'Warning Comp. Temperature Time': + yield Metric('warning_temperature_time_total', device.base_labels, value) + elif label == 'Critical Comp. 
Temperature Time': + yield Metric('critical_temperature_time_total', device.base_labels, value) + + def collect_disks_smart_metrics(wakeup_disks): now = int(datetime.datetime.now().timestamp()) @@ -362,6 +405,9 @@ def collect_disks_smart_metrics(wakeup_disks): yield from collect_ata_error_count(device) + if device.type == 'nvme': + yield from collect_nvme_metrics(device) + def main(): parser = argparse.ArgumentParser() diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml index 7de36758..2278c70a 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml @@ -2,6 +2,15 @@ ## https://awesome-prometheus-alerts.grep.to/rules#host-and-hardware prometheus_server_rules_node_extra: [] prometheus_server_rules_node: + - alert: NodeTextfileScrapeFailed + expr: node_textfile_scrape_error != 0 + for: 0m + labels: + severity: warning + annotations: + summary: Scraping metrics from textfiles failed (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "The textfile collector failed to scrape at least one file\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + - alert: HostOutOfMemory expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10 for: 2m @@ -47,7 +56,7 @@ prometheus_server_rules_node: description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - alert: HostOutOfInodes - expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 + expr: node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, 
mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 for: 2m labels: severity: warning @@ -56,7 +65,7 @@ prometheus_server_rules_node: description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - alert: HostInodesWillFillIn24Hours - expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 + expr: node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 for: 2m labels: severity: warning -- cgit v1.2.3