summary refs log tree commit diff
path: root/roles/monitoring/prometheus
diff options
context:
space:
mode:
author Christian Pointner <equinox@spreadspace.org> 2023-07-19 00:51:42 +0200
committer Christian Pointner <equinox@spreadspace.org> 2023-07-19 00:51:42 +0200
commit 8412f714d05fff3e8b9e43d90e18937c8f0c91e4 (patch)
tree ab0d220364f2ae4e74e0da10453edae8ba4b783c /roles/monitoring/prometheus
parent prometheus/node: satisfy linter... (diff)
prometheus/node: update textfile collector apt_info (replaces apt)
Diffstat (limited to 'roles/monitoring/prometheus')
-rw-r--r-- roles/monitoring/prometheus/exporter/node/defaults/main.yml 2
-rw-r--r-- roles/monitoring/prometheus/exporter/node/handlers/main.yml 5
-rw-r--r-- roles/monitoring/prometheus/exporter/node/tasks/main.yml 9
-rw-r--r-- roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_apt_info.yml 30
-rw-r--r-- roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.j2 40
-rw-r--r-- roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt_info.j2 104
-rw-r--r-- roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt_info.service.j2 (renamed from roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.service.j2) 4
-rw-r--r-- roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt_info.timer.j2 (renamed from roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.timer.j2) 2
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_node.yml 2
9 files changed, 149 insertions, 49 deletions
diff --git a/roles/monitoring/prometheus/exporter/node/defaults/main.yml b/roles/monitoring/prometheus/exporter/node/defaults/main.yml
index b9e4a6c7..e06909bf 100644
--- a/roles/monitoring/prometheus/exporter/node/defaults/main.yml
+++ b/roles/monitoring/prometheus/exporter/node/defaults/main.yml
@@ -5,7 +5,7 @@ prometheus_exporter_node_disable_collectors: []
prometheus_exporter_node_extra_collectors:
- systemd
-prometheus_exporter_node_install_apt_textfile_collector_script: "{{ ansible_pkg_mgr == 'apt' }}"
+prometheus_exporter_node_install_apt_info_textfile_collector_script: "{{ ansible_pkg_mgr == 'apt' }}"
prometheus_exporter_node_textfile_collector_scripts:
- deleted-libraries
# - smartmon
diff --git a/roles/monitoring/prometheus/exporter/node/handlers/main.yml b/roles/monitoring/prometheus/exporter/node/handlers/main.yml
index 8f5cb37c..1b6f8668 100644
--- a/roles/monitoring/prometheus/exporter/node/handlers/main.yml
+++ b/roles/monitoring/prometheus/exporter/node/handlers/main.yml
@@ -13,3 +13,8 @@
file:
path: /run/prometheus-node-exporter_sensors/state
state: absent
+
+### TODO: remove this once apt textfile collector has been replaced on all hosts
+- name: reload systemd
+ systemd:
+ daemon_reload: yes
diff --git a/roles/monitoring/prometheus/exporter/node/tasks/main.yml b/roles/monitoring/prometheus/exporter/node/tasks/main.yml
index 5af10326..adb2be85 100644
--- a/roles/monitoring/prometheus/exporter/node/tasks/main.yml
+++ b/roles/monitoring/prometheus/exporter/node/tasks/main.yml
@@ -60,11 +60,12 @@
path: /usr/local/share/prometheus-node-exporter
state: directory
-- name: install the apt textfile collector script
- when: prometheus_exporter_node_install_apt_textfile_collector_script
+- name: install the apt_info textfile collector script
+ when: prometheus_exporter_node_install_apt_info_textfile_collector_script
vars:
- textfile_collector_name: "apt"
- include_tasks: textfile_collector_generic.yml
+ textfile_collector_name: "apt_info"
+ include_tasks: textfile_collector_apt_info.yml
+# include_tasks: textfile_collector_generic.yml
- name: install all other textfile collector scripts
loop: "{{ prometheus_exporter_node_textfile_collector_scripts }}"
diff --git a/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_apt_info.yml b/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_apt_info.yml
new file mode 100644
index 00000000..b452c66e
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_apt_info.yml
@@ -0,0 +1,30 @@
+---
+## this is overkill since all apt-based machines that are controlled via ansible will have this installed
+# - name: make sure python-apt is installed
+# apt:
+# name: "{{ python_basename }}-apt"
+# state: present
+
+## TODO: remove this once all hosts have been migrated
+- name: make sure the systemd timer for apt textfile collector is disabled and stopped
+ systemd:
+ service: prometheus-node-exporter_apt.timer
+ enabled: no
+ state: stopped
+ register: result_systemd_stop
+ failed_when: "result_systemd_stop is failed and 'Could not find the requested service' not in result_systemd_stop.msg"
+
+- name: remove files from apt textfile collector
+ loop:
+ - /etc/systemd/system/prometheus-node-exporter_apt.timer
+ - /etc/systemd/system/prometheus-node-exporter_apt.service
+ - /usr/local/share/prometheus-node-exporter/apt
+ - /var/lib/prometheus-node-exporter/textfile-collector/apt.prom
+ file:
+ path: "{{ item }}"
+ state: absent
+ notify: reload systemd
+##
+
+- name: install the apt_info textfile collector script
+ include_tasks: textfile_collector_generic.yml
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.j2
deleted file mode 100644
index 015addb0..00000000
--- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.j2
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/bin/bash
-#
-# Description: Expose metrics from apt updates.
-#
-# Author: Ben Kochie <superq@gmail.com>
-
-upgrades="$(/usr/bin/apt-get --just-print dist-upgrade \
- | /usr/bin/awk -F'[()]' \
- '/^Inst/ { sub("^[^ ]+ ", "", $2); gsub(" ","",$2);
- sub("\\[", " ", $2); sub("\\]", "", $2); print $2 }' \
- | /usr/bin/sort \
- | /usr/bin/uniq -c \
- | awk '{ gsub(/\\\\/, "\\\\", $2); gsub(/"/, "\\\"", $2);
- gsub(/\[/, "", $3); gsub(/\]/, "", $3);
- print "apt_upgrades_pending{origin=\"" $2 "\",arch=\"" $NF "\"} " $1}'
-)"
-
-autoremove="$(/usr/bin/apt-get --just-print autoremove \
- | /usr/bin/awk '/^Remv/{a++}END{printf "apt_autoremove_pending %d", a}'
-)"
-
-echo '# HELP apt_upgrades_pending Apt package pending updates by origin.'
-echo '# TYPE apt_upgrades_pending gauge'
-if [[ -n "${upgrades}" ]] ; then
- echo "${upgrades}"
-else
- echo 'apt_upgrades_pending{origin="",arch=""} 0'
-fi
-
-echo '# HELP apt_autoremove_pending Apt package pending autoremove.'
-echo '# TYPE apt_autoremove_pending gauge'
-echo "${autoremove}"
-
-echo '# HELP node_reboot_required Node reboot is required for software updates.'
-echo '# TYPE node_reboot_required gauge'
-if [[ -f '/run/reboot-required' ]] ; then
- echo 'node_reboot_required 1'
-else
- echo 'node_reboot_required 0'
-fi
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt_info.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt_info.j2
new file mode 100644
index 00000000..20cfdf4f
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt_info.j2
@@ -0,0 +1,104 @@
+#!/usr/bin/env {{ python_basename }}
+#
+# Description: Expose metrics from apt. This is inspired by and
+# intended to be a replacement for the original apt.sh.
+#
+# Dependencies: python3-apt, python3-prometheus-client
+#
+# Authors: Kyle Fazzari <kyrofa@ubuntu.com>
+# Daniel Swarbrick <dswarbrick@debian.org>
+
+import apt
+import collections
+import contextlib
+import os
+from prometheus_client import CollectorRegistry, Gauge, generate_latest
+
+_UpgradeInfo = collections.namedtuple("_UpgradeInfo", ["labels", "count"])
+
+
+def _convert_candidates_to_upgrade_infos(candidates):
+ changes_dict = collections.defaultdict(lambda: collections.defaultdict(int))
+
+ for candidate in candidates:
+ origins = sorted(
+ {f"{o.origin}:{o.codename}/{o.archive}" for o in candidate.origins}
+ )
+ changes_dict[",".join(origins)][candidate.architecture] += 1
+
+ changes_list = list()
+ for origin in sorted(changes_dict.keys()):
+ for arch in sorted(changes_dict[origin].keys()):
+ changes_list.append(
+ _UpgradeInfo(
+ labels=dict(origin=origin, arch=arch),
+ count=changes_dict[origin][arch],
+ )
+ )
+
+ return changes_list
+
+
+def _write_pending_upgrades(registry, cache):
+ # Discount any changes that apply to packages that aren't installed (e.g.
+ # count an upgrade to package A that adds a new dependency on package B as
+ # only one upgrade, not two). See the following issue for more details:
+ # https://github.com/prometheus-community/node-exporter-textfile-collector-scripts/issues/85
+ candidates = {
+ p.candidate for p in cache.get_changes() if p.is_installed and p.marked_upgrade
+ }
+ upgrade_list = _convert_candidates_to_upgrade_infos(candidates)
+
+ if upgrade_list:
+ g = Gauge('apt_upgrades_pending', "Apt packages pending updates by origin",
+ ['origin', 'arch'], registry=registry)
+ for change in upgrade_list:
+ g.labels(change.labels['origin'], change.labels['arch']).set(change.count)
+
+
+def _write_held_upgrades(registry, cache):
+ held_candidates = {p.candidate for p in cache if p.is_upgradable and p.marked_keep}
+ upgrade_list = _convert_candidates_to_upgrade_infos(held_candidates)
+
+ if upgrade_list:
+ g = Gauge('apt_upgrades_held', "Apt packages pending updates but held back.",
+ ['origin', 'arch'], registry=registry)
+ for change in upgrade_list:
+ g.labels(change.labels['origin'], change.labels['arch']).set(change.count)
+
+
+def _write_autoremove_pending(registry, cache):
+ autoremovable_packages = {p for p in cache if p.is_auto_removable}
+ g = Gauge('apt_autoremove_pending', "Apt packages pending autoremoval.",
+ registry=registry)
+ g.set(len(autoremovable_packages))
+
+
+def _write_reboot_required(registry):
+ g = Gauge('node_reboot_required', "Node reboot is required for software updates.",
+ registry=registry)
+ g.set(int(os.path.isfile('/run/reboot-required')))
+
+
+def _main():
+ cache = apt.cache.Cache()
+
+ # First of all, attempt to update the index. If we don't have permission
+ # to do so (or it fails for some reason), it's not the end of the world,
+ # we'll operate on the old index.
+ with contextlib.suppress(apt.cache.LockFailedException, apt.cache.FetchFailedException):
+ cache.update()
+
+ cache.open()
+ cache.upgrade(True)
+
+ registry = CollectorRegistry()
+ _write_pending_upgrades(registry, cache)
+ _write_held_upgrades(registry, cache)
+ _write_autoremove_pending(registry, cache)
+ _write_reboot_required(registry)
+ print(generate_latest(registry).decode(), end='')
+
+
+if __name__ == "__main__":
+ _main()
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.service.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt_info.service.j2
index c60439c4..70211c1a 100644
--- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.service.j2
+++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt_info.service.j2
@@ -1,10 +1,10 @@
[Unit]
-Description=Promethues node exporter textfile collector apt
+Description=Prometheus node exporter textfile collector apt_info
[Service]
Type=oneshot
Environment=TMPDIR=/var/lib/prometheus-node-exporter/textfile-collector
-ExecStart=bash -o pipefail -c "/usr/local/share/prometheus-node-exporter/apt | sponge /var/lib/prometheus-node-exporter/textfile-collector/apt.prom"
+ExecStart=bash -o pipefail -c "/usr/local/share/prometheus-node-exporter/apt_info | sponge /var/lib/prometheus-node-exporter/textfile-collector/apt_info.prom"
TimeoutStartSec=30s
# systemd hardening-options
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.timer.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt_info.timer.j2
index dc473749..d7881d38 100644
--- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.timer.j2
+++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt_info.timer.j2
@@ -1,5 +1,5 @@
[Unit]
-Description=Promethues node exporter textfile collector apt
+Description=Prometheus node exporter textfile collector apt_info
[Timer]
OnBootSec=10s
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
index 4deeeebe..0f9e025e 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
@@ -281,7 +281,7 @@ prometheus_server_rules_node:
summary: Metrics from a textfile collector are too old (instance {{ '{{' }} $labels.instance {{ '}}' }})
description: "The exported values from textfile {{ '{{' }} $labels.file {{ '}}' }} on host {{ '{{' }} $labels.instance {{ '}}' }} are {{ '{{' }} $value {{ '}}' }} seconds old.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
- ## textfile-collector: apt
+ ## textfile-collector: apt_info
- alert: AptUpgradesPending
expr: sum by (instance) (apt_upgrades_pending) > 0
for: 0m