From 8412f714d05fff3e8b9e43d90e18937c8f0c91e4 Mon Sep 17 00:00:00 2001
From: Christian Pointner
Date: Wed, 19 Jul 2023 00:51:42 +0200
Subject: prometheus/node: update textfile collector apt_info (replaces apt)

Replace the bash-based apt textfile collector with the python-based
apt_info script.

The old prometheus-node-exporter_apt.timer is disabled and stopped, and
the old unit files, script and apt.prom output are removed on hosts that
still have them. This cleanup and the 'reload systemd' handler are marked
TODO and can be dropped once all hosts have been migrated.

The variable prometheus_exporter_node_install_apt_textfile_collector_script
is renamed to
prometheus_exporter_node_install_apt_info_textfile_collector_script.

---
 .../prometheus/exporter/node/defaults/main.yml     |   2 +-
 .../prometheus/exporter/node/handlers/main.yml     |   5 +
 .../prometheus/exporter/node/tasks/main.yml        |   9 +-
 .../node/tasks/textfile_collector_apt_info.yml     |  30 ++++++
 .../templates/textfile-collector-scripts/apt.j2    |  40 --------
 .../textfile-collector-scripts/apt.service.j2      |  33 -------
 .../textfile-collector-scripts/apt.timer.j2        |   9 --
 .../textfile-collector-scripts/apt_info.j2         | 104 +++++++++++++++++++++
 .../textfile-collector-scripts/apt_info.service.j2 |  33 +++++++
 .../textfile-collector-scripts/apt_info.timer.j2   |   9 ++
 .../prometheus/server/defaults/main/rules_node.yml |   2 +-
 11 files changed, 188 insertions(+), 88 deletions(-)
 create mode 100644 roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_apt_info.yml
 delete mode 100644 roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.j2
 delete mode 100644 roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.service.j2
 delete mode 100644 roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.timer.j2
 create mode 100644 roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt_info.j2
 create mode 100644 roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt_info.service.j2
 create mode 100644 roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt_info.timer.j2

diff --git a/roles/monitoring/prometheus/exporter/node/defaults/main.yml b/roles/monitoring/prometheus/exporter/node/defaults/main.yml
index b9e4a6c7..e06909bf 100644
--- a/roles/monitoring/prometheus/exporter/node/defaults/main.yml
+++ b/roles/monitoring/prometheus/exporter/node/defaults/main.yml
@@ -5,7 +5,7 @@ prometheus_exporter_node_disable_collectors: []
 prometheus_exporter_node_extra_collectors:
   - systemd
 
-prometheus_exporter_node_install_apt_textfile_collector_script: "{{ ansible_pkg_mgr == 'apt' }}"
+prometheus_exporter_node_install_apt_info_textfile_collector_script: "{{ ansible_pkg_mgr == 'apt' }}"
 prometheus_exporter_node_textfile_collector_scripts:
   - deleted-libraries
 # - smartmon
diff --git a/roles/monitoring/prometheus/exporter/node/handlers/main.yml b/roles/monitoring/prometheus/exporter/node/handlers/main.yml
index 8f5cb37c..1b6f8668 100644
--- a/roles/monitoring/prometheus/exporter/node/handlers/main.yml
+++ b/roles/monitoring/prometheus/exporter/node/handlers/main.yml
@@ -13,3 +13,8 @@
   file:
     path: /run/prometheus-node-exporter_sensors/state
     state: absent
+
+### TODO: remove this once apt textfile collector has been replaced on all hosts
+- name: reload systemd
+  systemd:
+    daemon_reload: yes
diff --git a/roles/monitoring/prometheus/exporter/node/tasks/main.yml b/roles/monitoring/prometheus/exporter/node/tasks/main.yml
index 5af10326..adb2be85 100644
--- a/roles/monitoring/prometheus/exporter/node/tasks/main.yml
+++ b/roles/monitoring/prometheus/exporter/node/tasks/main.yml
@@ -60,11 +60,12 @@
     path: /usr/local/share/prometheus-node-exporter
     state: directory
 
-- name: install the apt textfile collector script
-  when: prometheus_exporter_node_install_apt_textfile_collector_script
+- name: install the apt_info textfile collector script
+  when: prometheus_exporter_node_install_apt_info_textfile_collector_script
   vars:
-    textfile_collector_name: "apt"
-  include_tasks: textfile_collector_generic.yml
+    textfile_collector_name: "apt_info"
+  include_tasks: textfile_collector_apt_info.yml
+# include_tasks: textfile_collector_generic.yml
 
 - name: install all other textfile collector scripts
   loop: "{{ prometheus_exporter_node_textfile_collector_scripts }}"
diff --git a/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_apt_info.yml b/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_apt_info.yml
new file mode 100644
index 00000000..b452c66e
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_apt_info.yml
@@ -0,0 +1,30 @@
+---
+## this is overkill since all apt-based machines that are controlled via ansible will have this installed
+# - name: make sure python-apt is installed
+#   apt:
+#     name: "{{ python_basename }}-apt"
+#     state: present
+
+## TODO: remove this once all hosts have been migrated
+- name: make sure the systemd timer for apt textfile collector is disabled and stopped
+  systemd:
+    service: prometheus-node-exporter_apt.timer
+    enabled: no
+    state: stopped
+  register: result_systemd_stop
+  failed_when: "result_systemd_stop is failed and 'Could not find the requested service' not in result_systemd_stop.msg"
+
+- name: remove files from apt textfile collector
+  loop:
+    - /etc/systemd/system/prometheus-node-exporter_apt.timer
+    - /etc/systemd/system/prometheus-node-exporter_apt.service
+    - /usr/local/share/prometheus-node-exporter/apt
+    - /var/lib/prometheus-node-exporter/textfile-collector/apt.prom
+  file:
+    path: "{{ item }}"
+    state: absent
+  notify: reload systemd
+##
+
+- name: install the apt_info textfile collector script
+  include_tasks: textfile_collector_generic.yml
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.j2
deleted file mode 100644
index 015addb0..00000000
--- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.j2
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/bin/bash
-#
-# Description: Expose metrics from apt updates.
-#
-# Author: Ben Kochie
-
-upgrades="$(/usr/bin/apt-get --just-print dist-upgrade \
-  | /usr/bin/awk -F'[()]' \
-      '/^Inst/ { sub("^[^ ]+ ", "", $2); gsub(" ","",$2);
-                 sub("\\[", " ", $2); sub("\\]", "", $2); print $2 }' \
-  | /usr/bin/sort \
-  | /usr/bin/uniq -c \
-  | awk '{ gsub(/\\\\/, "\\\\", $2); gsub(/"/, "\\\"", $2);
-           gsub(/\[/, "", $3); gsub(/\]/, "", $3);
-           print "apt_upgrades_pending{origin=\"" $2 "\",arch=\"" $NF "\"} " $1}'
-)"
-
-autoremove="$(/usr/bin/apt-get --just-print autoremove \
-  | /usr/bin/awk '/^Remv/{a++}END{printf "apt_autoremove_pending %d", a}'
-)"
-
-echo '# HELP apt_upgrades_pending Apt package pending updates by origin.'
-echo '# TYPE apt_upgrades_pending gauge'
-if [[ -n "${upgrades}" ]] ; then
-  echo "${upgrades}"
-else
-  echo 'apt_upgrades_pending{origin="",arch=""} 0'
-fi
-
-echo '# HELP apt_autoremove_pending Apt package pending autoremove.'
-echo '# TYPE apt_autoremove_pending gauge'
-echo "${autoremove}"
-
-echo '# HELP node_reboot_required Node reboot is required for software updates.'
-echo '# TYPE node_reboot_required gauge'
-if [[ -f '/run/reboot-required' ]] ; then
-  echo 'node_reboot_required 1'
-else
-  echo 'node_reboot_required 0'
-fi
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.service.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.service.j2
deleted file mode 100644
index c60439c4..00000000
--- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.service.j2
+++ /dev/null
@@ -1,33 +0,0 @@
-[Unit]
-Description=Promethues node exporter textfile collector apt
-
-[Service]
-Type=oneshot
-Environment=TMPDIR=/var/lib/prometheus-node-exporter/textfile-collector
-ExecStart=bash -o pipefail -c "/usr/local/share/prometheus-node-exporter/apt | sponge /var/lib/prometheus-node-exporter/textfile-collector/apt.prom"
-TimeoutStartSec=30s
-
-# systemd hardening-options
-AmbientCapabilities=
-CapabilityBoundingSet=
-DeviceAllow=/dev/null rw
-DevicePolicy=strict
-LockPersonality=true
-MemoryDenyWriteExecute=true
-NoNewPrivileges=true
-PrivateDevices=true
-PrivateTmp=true
-ProtectControlGroups=true
-ProtectHome=true
-ProtectKernelModules=true
-ProtectKernelTunables=true
-ProtectSystem=strict
-ReadWritePaths=/var/lib/prometheus-node-exporter/textfile-collector
-RemoveIPC=true
-RestrictNamespaces=true
-RestrictRealtime=true
-RestrictAddressFamilies=AF_UNIX
-SystemCallArchitectures=native
-
-[Install]
-WantedBy=multi-user.target
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.timer.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.timer.j2
deleted file mode 100644
index dc473749..00000000
--- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.timer.j2
+++ /dev/null
@@ -1,9 +0,0 @@
-[Unit]
-Description=Promethues node exporter textfile collector apt
-
-[Timer]
-OnBootSec=10s
-OnUnitActiveSec=15min
-
-[Install]
-WantedBy=timers.target
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt_info.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt_info.j2
new file mode 100644
index 00000000..20cfdf4f
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt_info.j2
@@ -0,0 +1,104 @@
+#!/usr/bin/env {{ python_basename }}
+#
+# Description: Expose metrics from apt. This is inspired by and
+# intended to be a replacement for the original apt.sh.
+#
+# Dependencies: python3-apt, python3-prometheus-client
+#
+# Authors: Kyle Fazzari
+#          Daniel Swarbrick
+
+import apt
+import collections
+import contextlib
+import os
+from prometheus_client import CollectorRegistry, Gauge, generate_latest
+
+_UpgradeInfo = collections.namedtuple("_UpgradeInfo", ["labels", "count"])
+
+
+def _convert_candidates_to_upgrade_infos(candidates):
+    changes_dict = collections.defaultdict(lambda: collections.defaultdict(int))
+
+    for candidate in candidates:
+        origins = sorted(
+            {f"{o.origin}:{o.codename}/{o.archive}" for o in candidate.origins}
+        )
+        changes_dict[",".join(origins)][candidate.architecture] += 1
+
+    changes_list = list()
+    for origin in sorted(changes_dict.keys()):
+        for arch in sorted(changes_dict[origin].keys()):
+            changes_list.append(
+                _UpgradeInfo(
+                    labels=dict(origin=origin, arch=arch),
+                    count=changes_dict[origin][arch],
+                )
+            )
+
+    return changes_list
+
+
+def _write_pending_upgrades(registry, cache):
+    # Discount any changes that apply to packages that aren't installed (e.g.
+    # count an upgrade to package A that adds a new dependency on package B as
+    # only one upgrade, not two). See the following issue for more details:
+    # https://github.com/prometheus-community/node-exporter-textfile-collector-scripts/issues/85
+    candidates = {
+        p.candidate for p in cache.get_changes() if p.is_installed and p.marked_upgrade
+    }
+    upgrade_list = _convert_candidates_to_upgrade_infos(candidates)
+
+    if upgrade_list:
+        g = Gauge('apt_upgrades_pending', "Apt packages pending updates by origin",
+                  ['origin', 'arch'], registry=registry)
+        for change in upgrade_list:
+            g.labels(change.labels['origin'], change.labels['arch']).set(change.count)
+
+
+def _write_held_upgrades(registry, cache):
+    held_candidates = {p.candidate for p in cache if p.is_upgradable and p.marked_keep}
+    upgrade_list = _convert_candidates_to_upgrade_infos(held_candidates)
+
+    if upgrade_list:
+        g = Gauge('apt_upgrades_held', "Apt packages pending updates but held back.",
+                  ['origin', 'arch'], registry=registry)
+        for change in upgrade_list:
+            g.labels(change.labels['origin'], change.labels['arch']).set(change.count)
+
+
+def _write_autoremove_pending(registry, cache):
+    autoremovable_packages = {p for p in cache if p.is_auto_removable}
+    g = Gauge('apt_autoremove_pending', "Apt packages pending autoremoval.",
+              registry=registry)
+    g.set(len(autoremovable_packages))
+
+
+def _write_reboot_required(registry):
+    g = Gauge('node_reboot_required', "Node reboot is required for software updates.",
+              registry=registry)
+    g.set(int(os.path.isfile('/run/reboot-required')))
+
+
+def _main():
+    cache = apt.cache.Cache()
+
+    # First of all, attempt to update the index. If we don't have permission
+    # to do so (or it fails for some reason), it's not the end of the world,
+    # we'll operate on the old index.
+    with contextlib.suppress(apt.cache.LockFailedException, apt.cache.FetchFailedException):
+        cache.update()
+
+    cache.open()
+    cache.upgrade(True)
+
+    registry = CollectorRegistry()
+    _write_pending_upgrades(registry, cache)
+    _write_held_upgrades(registry, cache)
+    _write_autoremove_pending(registry, cache)
+    _write_reboot_required(registry)
+    print(generate_latest(registry).decode(), end='')
+
+
+if __name__ == "__main__":
+    _main()
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt_info.service.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt_info.service.j2
new file mode 100644
index 00000000..70211c1a
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt_info.service.j2
@@ -0,0 +1,33 @@
+[Unit]
+Description=Prometheus node exporter textfile collector apt_info
+
+[Service]
+Type=oneshot
+Environment=TMPDIR=/var/lib/prometheus-node-exporter/textfile-collector
+ExecStart=bash -o pipefail -c "/usr/local/share/prometheus-node-exporter/apt_info | sponge /var/lib/prometheus-node-exporter/textfile-collector/apt_info.prom"
+TimeoutStartSec=30s
+
+# systemd hardening-options
+AmbientCapabilities=
+CapabilityBoundingSet=
+DeviceAllow=/dev/null rw
+DevicePolicy=strict
+LockPersonality=true
+MemoryDenyWriteExecute=true
+NoNewPrivileges=true
+PrivateDevices=true
+PrivateTmp=true
+ProtectControlGroups=true
+ProtectHome=true
+ProtectKernelModules=true
+ProtectKernelTunables=true
+ProtectSystem=strict
+ReadWritePaths=/var/lib/prometheus-node-exporter/textfile-collector
+RemoveIPC=true
+RestrictNamespaces=true
+RestrictRealtime=true
+RestrictAddressFamilies=AF_UNIX
+SystemCallArchitectures=native
+
+[Install]
+WantedBy=multi-user.target
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt_info.timer.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt_info.timer.j2
new file mode 100644
index 00000000..d7881d38
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt_info.timer.j2
@@ -0,0 +1,9 @@
+[Unit]
+Description=Prometheus node exporter textfile collector apt_info
+
+[Timer]
+OnBootSec=10s
+OnUnitActiveSec=15min
+
+[Install]
+WantedBy=timers.target
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
index 4deeeebe..0f9e025e 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
@@ -281,7 +281,7 @@ prometheus_server_rules_node:
       summary: Metrics from a textfile collector are too old (instance {{ '{{' }} $labels.instance {{ '}}' }})
       description: "The exported values from textfile {{ '{{' }} $labels.file {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }} are {{ '{{' }} $value {{ '}}' }} seconds old.\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"
 
-  ## textfile-collector: apt
+  ## textfile-collector: apt_info
   - alert: AptUpgradesPending
     expr: sum by (instance) (apt_upgrades_pending) > 0
     for: 0m
-- 
cgit v1.2.3