summary refs log tree commit diff
path: root/roles/monitoring/prometheus
diff options
context:
space:
mode:
author Christian Pointner <equinox@spreadspace.org> 2023-05-26 21:34:14 +0200
committer Christian Pointner <equinox@spreadspace.org> 2023-07-17 21:51:22 +0200
commit 695131994b5a749e129fb304e8ba709acd37afe8 (patch)
tree 02111746a1cd0cfc31c5736170b12aafadcb771e /roles/monitoring/prometheus
parent make textfile collector for apt packages configurable (diff)
add support for chrony_exporter (replaces textfile collector)
Diffstat (limited to 'roles/monitoring/prometheus')
-rw-r--r-- roles/monitoring/prometheus/exporter/chrony/defaults/main.yml | 6
-rw-r--r-- roles/monitoring/prometheus/exporter/chrony/handlers/main.yml | 15
-rw-r--r-- roles/monitoring/prometheus/exporter/chrony/tasks/main.yml | 65
-rw-r--r-- roles/monitoring/prometheus/exporter/chrony/templates/service.j2 | 31
-rw-r--r-- roles/monitoring/prometheus/exporter/meta/main.yml | 2
-rw-r--r-- roles/monitoring/prometheus/exporter/node/defaults/main.yml | 1
-rw-r--r-- roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/chrony.j2 | 138
-rw-r--r-- roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/chrony.service.j2 | 33
-rw-r--r-- roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/chrony.timer.j2 | 9
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/main.yml | 1
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_chrony.yml | 14
11 files changed, 134 insertions, 181 deletions
diff --git a/roles/monitoring/prometheus/exporter/chrony/defaults/main.yml b/roles/monitoring/prometheus/exporter/chrony/defaults/main.yml
new file mode 100644
index 00000000..699ed580
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/chrony/defaults/main.yml
@@ -0,0 +1,6 @@
+---
+# prometheus_exporter_chrony_version:
+
+prometheus_exporter_chrony_enable_collectors:
+  - sources
+  - tracking
diff --git a/roles/monitoring/prometheus/exporter/chrony/handlers/main.yml b/roles/monitoring/prometheus/exporter/chrony/handlers/main.yml
new file mode 100644
index 00000000..0c940ca9
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/chrony/handlers/main.yml
@@ -0,0 +1,15 @@
+---
+- name: restart prometheus-chrony-exporter
+  service:
+    name: prometheus-chrony-exporter
+    state: restarted
+
+- name: reload nginx
+  service:
+    name: nginx
+    state: reloaded
+
+### TODO: remove this once all hosts have been migrated
+- name: reload systemd
+  systemd:
+    daemon_reload: yes
diff --git a/roles/monitoring/prometheus/exporter/chrony/tasks/main.yml b/roles/monitoring/prometheus/exporter/chrony/tasks/main.yml
new file mode 100644
index 00000000..f15037ec
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/chrony/tasks/main.yml
@@ -0,0 +1,65 @@
+---
+- name: generate apt pin file for exporter-chrony package
+  when: prometheus_exporter_chrony_version is defined
+  copy:
+    dest: "/etc/apt/preferences.d/prom-exporter-chrony.pref"
+    content: |
+      Package: prom-exporter-chrony
+      Pin: version {{ prometheus_exporter_chrony_version }}-1
+      Pin-Priority: 1001
+
+- name: remove apt pin file for exporter-chrony package
+  when: prometheus_exporter_chrony_version is not defined
+  file:
+    path: "/etc/apt/preferences.d/prom-exporter-chrony.pref"
+    state: absent
+
+- name: install apt packages
+  apt:
+    name: "prom-exporter-chrony{% if prometheus_exporter_chrony_version is defined %}={{ prometheus_exporter_chrony_version }}-1{% endif %}"
+    state: present
+    allow_downgrade: yes
+  notify: restart prometheus-chrony-exporter
+
+- name: generate systemd service unit
+  template:
+    src: service.j2
+    dest: /etc/systemd/system/prometheus-chrony-exporter.service
+  notify: restart prometheus-chrony-exporter
+
+- name: make sure prometheus-chrony-exporter is enabled and started
+  systemd:
+    name: prometheus-chrony-exporter.service
+    daemon_reload: yes
+    state: started
+    enabled: yes
+
+- name: register exporter
+  copy:
+    content: |
+      location = /chrony {
+        proxy_pass http://127.0.0.1:9123/metrics;
+      }
+    dest: /etc/prometheus/exporter/chrony.locations
+  notify: reload nginx
+
+
+## TODO: remove these tasks once all hosts have been migrated
+- name: make sure the systemd timer for chrony textfile collector is disabled and stopped
+  systemd:
+    service: prometheus-node-exporter_chrony.timer
+    enabled: no
+    state: stopped
+  register: result_systemd_stop
+  failed_when: "result_systemd_stop is failed and 'Could not find the requested service' not in result_systemd_stop.msg"
+
+- name: remove files from chrony textfile collector
+  loop:
+    - /etc/systemd/system/prometheus-node-exporter_chrony.timer
+    - /etc/systemd/system/prometheus-node-exporter_chrony.service
+    - /usr/local/share/prometheus-node-exporter/chrony
+    - /var/lib/prometheus-node-exporter/textfile-collector/chrony.prom
+  file:
+    path: "{{ item }}"
+    state: absent
+  notify: reload systemd
diff --git a/roles/monitoring/prometheus/exporter/chrony/templates/service.j2 b/roles/monitoring/prometheus/exporter/chrony/templates/service.j2
new file mode 100644
index 00000000..cb806649
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/chrony/templates/service.j2
@@ -0,0 +1,31 @@
+[Unit]
+Description=Prometheus chrony exporter
+
+[Service]
+Restart=always
+User=_chrony
+ExecStart=/usr/bin/prometheus-chrony-exporter --web.listen-address="127.0.0.1:9123" --chrony.address=unix:///run/chrony/chronyd.sock {% for collector in prometheus_exporter_chrony_enable_collectors %} --collector.{{ collector }}{% endfor %}{{ '' }}
+
+# systemd hardening-options
+AmbientCapabilities=
+CapabilityBoundingSet=
+DeviceAllow=/dev/null rw
+DevicePolicy=strict
+LockPersonality=true
+MemoryDenyWriteExecute=true
+NoNewPrivileges=true
+PrivateDevices=true
+PrivateTmp=true
+ProtectControlGroups=true
+ProtectHome=true
+ProtectKernelModules=true
+ProtectKernelTunables=true
+ProtectSystem=strict
+ReadWritePaths=/run/chrony
+RemoveIPC=true
+RestrictNamespaces=true
+RestrictRealtime=true
+SystemCallArchitectures=native
+
+[Install]
+WantedBy=multi-user.target
diff --git a/roles/monitoring/prometheus/exporter/meta/main.yml b/roles/monitoring/prometheus/exporter/meta/main.yml
index 4a427770..10a251f4 100644
--- a/roles/monitoring/prometheus/exporter/meta/main.yml
+++ b/roles/monitoring/prometheus/exporter/meta/main.yml
@@ -23,4 +23,6 @@ dependencies:
when: "'standalone-kubelet' in (prometheus_exporters_default | union(prometheus_exporters_extra))"
- role: monitoring/prometheus/exporter/modbus
when: "'modbus' in (prometheus_exporters_default | union(prometheus_exporters_extra))"
+ - role: monitoring/prometheus/exporter/chrony
+ when: "'chrony' in (prometheus_exporters_default | union(prometheus_exporters_extra))"
- role: monitoring/prometheus/exporter/register
diff --git a/roles/monitoring/prometheus/exporter/node/defaults/main.yml b/roles/monitoring/prometheus/exporter/node/defaults/main.yml
index 3b961a4f..ab4cee38 100644
--- a/roles/monitoring/prometheus/exporter/node/defaults/main.yml
+++ b/roles/monitoring/prometheus/exporter/node/defaults/main.yml
@@ -18,7 +18,6 @@ prometheus_exporter_node_install_apt_textfile_collector_script: "{{ ansible_pkg_
prometheus_exporter_node_textfile_collector_scripts:
- deleted-libraries
# - smartmon
-# - chrony
# - sensors
# prometheus_exporter_node_textfile_collector__sensors:
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/chrony.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/chrony.j2
deleted file mode 100644
index 95c6a5d3..00000000
--- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/chrony.j2
+++ /dev/null
@@ -1,138 +0,0 @@
-#!/usr/bin/env {{ python_basename }}
-#
-# Description: Extract chronyd metrics from chronyc -c.
-# Author: Aanchal Malhotra <aanch...@bu.edu>
-#
-# Works with chrony version 2.4 and higher
-#
-# this is from: https://www.mail-archive.com/chrony-users@chrony.tuxfamily.org/msg02179.html
-
-import subprocess
-import sys
-
-chrony_sourcestats_cmd = ['chronyc', '-n', '-c', 'sourcestats']
-chrony_source_cmd = ['chronyc', '-n', '-c', 'sources']
-chrony_tracking_cmd = ['chronyc', '-n', '-c', 'tracking']
-
-metrics_fields = [
-    "Name/IP Address",
-    "NP",
-    "NR",
-    "Span",
-    "Frequency",
-    "Freq Skew",
-    "Offset",
-    "Std Dev"]
-
-status_types = {'x': 0, '?': 1, '-': 2, '+': 3, '*': 4}
-
-metrics_source = {
-    "*": "synchronized (system peer)",
-    "+": "synchronized",
-    "?": "unreachable",
-    "x": "Falseticker",
-    "-": "reference clock"}
-
-metrics_mode = {
-    '^': "server",
-    '=': "peer",
-    "#": "reference clock"}
-
-
-def get_cmdoutput(command):
-    proc = subprocess.Popen(command, stdout=subprocess.PIPE)
-    out, err = proc.communicate()
-    return_code = proc.poll()
-    if return_code:
-        raise RuntimeError('Call to "{}" returned error: \
- {}'.format(command, return_code))
-    return out.decode("utf-8")
-
-
-def printPrometheusformat(metric, values):
-    print("# HELP chronyd_%s chronyd metric for %s" % (metric, metric))
-    print("# TYPE chronyd_%s gauge" % (metric))
-    for labels in values:
-        if labels is None:
-            print("chronyd_%s %f" % (metric, values[labels]))
-        else:
-            print("chronyd_%s{{ '{%' }}s} %f" % (metric, labels, values[labels]))
-
-
-def printPrometheusscalar(metric, value):
-    print("# HELP chronyd_%s chronyd metric for %s" % (metric, metric))
-    print("# TYPE chronyd_%s gauge" % (metric))
-    print("chronyd_%s %f" % (metric, value))
-
-
-def printPrometheusEnum(metric, name):
-    print("# HELP chronyd_%s enum for %s" % (metric, metric))
-    print("# TYPE chronyd_%s gauge" % (metric))
-    print("chronyd_%s{value=\"%s\"} 1" % (metric, name))
-
-
-def weight(value):
-    val_int = int(value, 8)
-    return bin(val_int).count('1')/8.0
-
-
-def main(argv):
-    peer_status_metrics = {}
-    peer_reach_metrics = {}
-    offset_metrics = {}
-    freq_skew_metrics = {}
-    freq_metrics = {}
-    std_dev_metrics = {}
-    chrony_sourcestats = get_cmdoutput(chrony_sourcestats_cmd)
-    for line in chrony_sourcestats.split('\n'):
-        if (len(line)) > 0:
-            x = line.split(',')
-            common_labels = "remote=\"%s\"" % (x[0])
-            freq_metrics[common_labels] = float(x[4])
-            freq_skew_metrics[common_labels] = float(x[5])
-            std_dev_metrics[common_labels] = float(x[7])
-
-    printPrometheusformat('freq_skew_ppm', freq_skew_metrics)
-    printPrometheusformat('freq_ppm', freq_metrics)
-    printPrometheusformat('std_dev_seconds', std_dev_metrics)
-
-    chrony_source = get_cmdoutput(chrony_source_cmd)
-    for line in chrony_source.split('\n'):
-        if (len(line)) > 0:
-            x = line.split(',')
-            stratum = x[3]
-            reach = x[5]
-            mode = metrics_mode[x[0]]
-            common_labels = "remote=\"%s\"" % (x[2])
-            peer_labels = "%s,stratum=\"%s\",mode=\"%s\"" % (
-                common_labels,
-                stratum,
-                mode,
-            )
-            peer_status_metrics[peer_labels] = float(status_types[x[1]])
-            peer_reach_metrics[peer_labels] = weight(reach)
-            offset_metrics[common_labels] = float(x[8])
-
-    printPrometheusformat('peer_status', peer_status_metrics)
-    printPrometheusformat('offset_seconds', offset_metrics)
-    printPrometheusformat('peer_reachable', peer_reach_metrics)
-
-    chrony_tracking_stats = get_cmdoutput(chrony_tracking_cmd).rstrip()
-    fields = chrony_tracking_stats.split(",")
-    printPrometheusEnum("tracking_source", fields[1])
-    printPrometheusscalar("tracking_stratum", float(fields[2]))
-    printPrometheusscalar("tracking_ref_time", float(fields[3]))
-    printPrometheusscalar("tracking_system_time", float(fields[4]))
-    printPrometheusscalar("tracking_last_offset", float(fields[5]))
-    printPrometheusscalar("tracking_rms_offset", float(fields[6]))
-    printPrometheusscalar("tracking_frequency_error", float(fields[7]))
-    printPrometheusscalar("tracking_frequency_residual", float(fields[8]))
-    printPrometheusscalar("tracking_frequency_skew", float(fields[9]))
-    printPrometheusscalar("tracking_root_delay", float(fields[10]))
-    printPrometheusscalar("tracking_root_dispersion", float(fields[11]))
-    printPrometheusscalar("tracking_update_interval", float(fields[12]))
-    printPrometheusEnum("tracking_leap_status", fields[13])
-
-
-if __name__ == "__main__":
-    main(sys.argv[1:])
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/chrony.service.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/chrony.service.j2
deleted file mode 100644
index 49b15185..00000000
--- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/chrony.service.j2
+++ /dev/null
@@ -1,33 +0,0 @@
-[Unit]
-Description=Promethues node exporter textfile collector chrony
-
-[Service]
-Type=oneshot
-Environment=TMPDIR=/var/lib/prometheus-node-exporter/textfile-collector
-ExecStart=bash -o pipefail -c "/usr/local/share/prometheus-node-exporter/chrony | sponge /var/lib/prometheus-node-exporter/textfile-collector/chrony.prom"
-TimeoutStartSec=30s
-
-# systemd hardening-options
-AmbientCapabilities=CAP_DAC_OVERRIDE
-CapabilityBoundingSet=CAP_DAC_OVERRIDE
-DeviceAllow=/dev/null rw
-DevicePolicy=strict
-LockPersonality=true
-MemoryDenyWriteExecute=true
-NoNewPrivileges=true
-PrivateDevices=true
-PrivateTmp=true
-ProtectControlGroups=true
-ProtectHome=true
-ProtectKernelModules=true
-ProtectKernelTunables=true
-ProtectSystem=strict
-ReadWritePaths=/var/lib/prometheus-node-exporter/textfile-collector /var/run/chrony
-RemoveIPC=true
-RestrictNamespaces=true
-RestrictRealtime=true
-RestrictAddressFamilies=AF_UNIX
-SystemCallArchitectures=native
-
-[Install]
-WantedBy=multi-user.target
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/chrony.timer.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/chrony.timer.j2
deleted file mode 100644
index eecc70e2..00000000
--- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/chrony.timer.j2
+++ /dev/null
@@ -1,9 +0,0 @@
-[Unit]
-Description=Promethues node exporter textfile collector chrony
-
-[Timer]
-OnBootSec=40s
-OnUnitActiveSec=2min
-
-[Install]
-WantedBy=timers.target
diff --git a/roles/monitoring/prometheus/server/defaults/main/main.yml b/roles/monitoring/prometheus/server/defaults/main/main.yml
index d778bad8..1e0dcf32 100644
--- a/roles/monitoring/prometheus/server/defaults/main/main.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/main.yml
@@ -22,6 +22,7 @@ prometheus_server_rules:
prometheus: "{{ prometheus_server_rules_prometheus + ((prometheus_server_alertmanager is defined) | ternary(prometheus_server_rules_prometheus_alertmanager, [])) + prometheus_server_rules_prometheus_extra }}"
node: "{{ prometheus_server_rules_node + prometheus_server_rules_node_extra }}"
openwrt: "{{ prometheus_server_rules_openwrt + prometheus_server_rules_openwrt_extra }}"
+ chrony: "{{ prometheus_server_rules_chrony + prometheus_server_rules_chrony_extra }}"
nut: "{{ prometheus_server_rules_nut + prometheus_server_rules_nut_extra }}"
nut/ups: "{{ prometheus_server_rules_nut__ups + prometheus_server_rules_nut__ups_extra }}"
blackbox: "{{ prometheus_server_rules_blackbox + prometheus_server_rules_blackbox_extra }}"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_chrony.yml b/roles/monitoring/prometheus/server/defaults/main/rules_chrony.yml
new file mode 100644
index 00000000..e845a60b
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_chrony.yml
@@ -0,0 +1,14 @@
+---
+prometheus_server_rules_chrony_extra: []
+prometheus_server_rules_chrony:
+  - record: instance:chrony_clock_error_seconds:abs
+    expr: abs(chrony_tracking_last_offset_seconds) + chrony_tracking_root_dispersion_seconds + (0.5 * chrony_tracking_root_delay_seconds)
+
+  - alert: ChronyUnreachable
+    expr: chrony_up == 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Unable to scrape chrony metrics (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "The chrony process might have crashed.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"