diff options
Diffstat (limited to 'roles/monitoring/prometheus/exporter')
2 files changed, 57 insertions, 8 deletions
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2 index aeddc903..b1a78dec 100644 --- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2 +++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.j2 @@ -30,15 +30,18 @@ def main(): if path not in processes_linking_deleted_libraries: processes_linking_deleted_libraries[path] = {} - if library in processes_linking_deleted_libraries[path]: - processes_linking_deleted_libraries[path][library] += 1 - else: - processes_linking_deleted_libraries[path][library] = 1 - except EnvironmentError as e: + if library in processes_linking_deleted_libraries[path]: + processes_linking_deleted_libraries[path][library] += 1 + else: + processes_linking_deleted_libraries[path][library] = 1 + except FileNotFoundError: # Ignore non-existent files, since the files may have changed since # we globbed. - if e.errno != errno.ENOENT: - sys.exit('Failed to open file: {0}'.format(path)) + pass + except ProcessLookupError: + # If process vanishes while collecting the linked libs reading lines from + # the map file yields this error. 
+ pass num_processes_per_library = {} diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2 index b033faf0..2b60509c 100644 --- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2 +++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.j2 @@ -85,7 +85,7 @@ class Device(collections.namedtuple('DeviceBase', 'path opts')): @property def base_labels(self): - return {'device': self.path, 'disk': self.type.partition('+')[2] or '0'} + return {'device': self.path, 'type': self.type} def smartctl_select(self): return ['--device', self.type, self.path] @@ -203,6 +203,11 @@ def device_smart_capabilities(device): (bool): True whenever SMART is available, False otherwise. (bool): True whenever SMART is enabled, False otherwise. """ + + # NVME devices are SMART capable + if device.type == 'nvme': + return True, True + groups = device_info(device) state = { @@ -325,6 +330,44 @@ def collect_ata_error_count(device): yield Metric('device_errors', device.base_labels, error_count) +def collect_nvme_metrics(device): + # Fetch NVME metrics + attributes = smart_ctl( + '--attributes', *device.smartctl_select() + ) + + # replace multiple occurrences of whitespaces with a single whitespace + attributes = re.sub(r'[\t\x20]+', ' ', attributes) + + # Turn smartctl output into a list of lines and skip to the table of + # SMART attributes. 
+ attribute_lines = attributes.strip().split('\n')[6:] + for line in attribute_lines: + label, value = line.split(':') + if label == 'Available Spare': + yield Metric('available_spare_ratio', device.base_labels, value[0:-1]) + elif label == 'Available Spare Threshold': + yield Metric('available_spare_threshold_ratio', device.base_labels, value[0:-1]) + elif label == 'Percentage Used': + yield Metric('percentage_used_ratio', device.base_labels, value[0:-1]) + elif label == 'Power Cycle': + yield Metric('power_cycles_total', device.base_labels, value) + elif label == 'Power On Hours': + yield Metric('power_on_hours_total', device.base_labels, value.replace(',', '')) + elif label == 'Temperature': + yield Metric('temperature_celcius', device.base_labels, value.replace(' Celsius', '')) + elif label == 'Unsafe Shutdowns': + yield Metric('unsafe_shutdowns_total', device.base_labels, value) + elif label == 'Media and Data Integrity Errors': + yield Metric('media_errors_total', device.base_labels, value) + elif label == 'Error Information Log Entries': + yield Metric('num_err_log_entries_total', device.base_labels, value) + elif label == 'Warning Comp. Temperature Time': + yield Metric('warning_temperature_time_total', device.base_labels, value) + elif label == 'Critical Comp. Temperature Time': + yield Metric('critical_temperature_time_total', device.base_labels, value) + + def collect_disks_smart_metrics(wakeup_disks): now = int(datetime.datetime.now().timestamp()) @@ -362,6 +405,9 @@ def collect_disks_smart_metrics(wakeup_disks): yield from collect_ata_error_count(device) + if device.type == 'nvme': + yield from collect_nvme_metrics(device) + def main(): parser = argparse.ArgumentParser() |