summaryrefslogtreecommitdiff
path: root/roles/monitoring/prometheus/exporter
diff options
context:
space:
mode:
Diffstat (limited to 'roles/monitoring/prometheus/exporter')
-rw-r--r-- roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2 | 17
-rw-r--r-- roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2 | 48
2 files changed, 57 insertions, 8 deletions
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2
index aeddc903..b1a78dec 100644
--- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2
+++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2
@@ -30,15 +30,18 @@ def main():
if path not in processes_linking_deleted_libraries:
processes_linking_deleted_libraries[path] = {}
- if library in processes_linking_deleted_libraries[path]:
- processes_linking_deleted_libraries[path][library] += 1
- else:
- processes_linking_deleted_libraries[path][library] = 1
- except EnvironmentError as e:
+ if library in processes_linking_deleted_libraries[path]:
+ processes_linking_deleted_libraries[path][library] += 1
+ else:
+ processes_linking_deleted_libraries[path][library] = 1
+ except FileNotFoundError:
# Ignore non-existent files, since the files may have changed since
# we globbed.
- if e.errno != errno.ENOENT:
- sys.exit('Failed to open file: {0}'.format(path))
+ pass
+ except ProcessLookupError:
+ # If the process vanishes while we are collecting its linked libraries,
+ # reading lines from the maps file raises this error.
+ pass
num_processes_per_library = {}
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2
index b033faf0..2b60509c 100644
--- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2
+++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2
@@ -85,7 +85,7 @@ class Device(collections.namedtuple('DeviceBase', 'path opts')):
@property
def base_labels(self):
- return {'device': self.path, 'disk': self.type.partition('+')[2] or '0'}
+ return {'device': self.path, 'type': self.type}
def smartctl_select(self):
return ['--device', self.type, self.path]
@@ -203,6 +203,11 @@ def device_smart_capabilities(device):
(bool): True whenever SMART is available, False otherwise.
(bool): True whenever SMART is enabled, False otherwise.
"""
+
+ # NVME devices are SMART capable
+ if device.type == 'nvme':
+ return True, True
+
groups = device_info(device)
state = {
@@ -325,6 +330,44 @@ def collect_ata_error_count(device):
yield Metric('device_errors', device.base_labels, error_count)
+def collect_nvme_metrics(device):
+    """Yield SMART health metrics for an NVMe device.
+
+    Runs ``smartctl --attributes`` for the device and turns selected
+    ``Label: value`` lines of its output into metrics.
+
+    Args:
+        device: Device whose ``smartctl_select()`` arguments are passed to
+            smartctl and whose ``base_labels`` are attached to each metric.
+
+    Yields:
+        Metric: One metric per recognised line. The value is the text after
+            the colon with surrounding whitespace, thousands separators and
+            any unit suffix removed.
+    """
+    # Maps the smartctl label to (metric name, unit suffix to strip).
+    # NOTE: the '*_ratio' metrics carry the percentage exactly as printed by
+    # smartctl; only the '%' sign is removed, the number is not divided by 100.
+    nvme_metrics = {
+        'Available Spare': ('available_spare_ratio', '%'),
+        'Available Spare Threshold': ('available_spare_threshold_ratio', '%'),
+        'Percentage Used': ('percentage_used_ratio', '%'),
+        # smartctl prints the plural form; the singular spelling is kept so
+        # that any output previously matched keeps matching.
+        'Power Cycle': ('power_cycles_total', ''),
+        'Power Cycles': ('power_cycles_total', ''),
+        'Power On Hours': ('power_on_hours_total', ''),
+        # The metric name is misspelled ("celcius"); it is left unchanged so
+        # that existing consumers of the metric are not broken.
+        'Temperature': ('temperature_celcius', ' Celsius'),
+        'Unsafe Shutdowns': ('unsafe_shutdowns_total', ''),
+        'Media and Data Integrity Errors': ('media_errors_total', ''),
+        'Error Information Log Entries': ('num_err_log_entries_total', ''),
+        'Warning Comp. Temperature Time': ('warning_temperature_time_total', ''),
+        'Critical Comp. Temperature Time': ('critical_temperature_time_total', ''),
+    }
+
+    # Fetch NVME metrics
+    attributes = smart_ctl(
+        '--attributes', *device.smartctl_select()
+    )
+
+    # Replace multiple occurrences of whitespace with a single space so the
+    # labels can be compared against plain strings.
+    attributes = re.sub(r'[\t\x20]+', ' ', attributes)
+
+    # Turn smartctl output into a list of lines and skip to the table of
+    # SMART attributes.
+    attribute_lines = attributes.strip().split('\n')[6:]
+    for line in attribute_lines:
+        # partition() never raises: lines without a colon are skipped and
+        # additional colons stay part of the value.
+        label, separator, value = line.partition(':')
+        if not separator:
+            continue
+
+        known = nvme_metrics.get(label.strip())
+        if known is None:
+            continue
+        metric_name, unit_suffix = known
+
+        value = value.strip()
+        if unit_suffix and value.endswith(unit_suffix):
+            value = value[:-len(unit_suffix)]
+        # smartctl groups digits with ',' (e.g. "1,234"), which is not a
+        # valid sample value.
+        value = value.replace(',', '').strip()
+        if not value:
+            continue
+
+        yield Metric(metric_name, device.base_labels, value)
+
+
def collect_disks_smart_metrics(wakeup_disks):
now = int(datetime.datetime.now().timestamp())
@@ -362,6 +405,9 @@ def collect_disks_smart_metrics(wakeup_disks):
yield from collect_ata_error_count(device)
+ if device.type == 'nvme':
+ yield from collect_nvme_metrics(device)
+
def main():
parser = argparse.ArgumentParser()