diff options
Diffstat (limited to 'roles/monitoring/prometheus')
11 files changed, 134 insertions, 181 deletions
diff --git a/roles/monitoring/prometheus/exporter/chrony/defaults/main.yml b/roles/monitoring/prometheus/exporter/chrony/defaults/main.yml
new file mode 100644
index 00000000..699ed580
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/chrony/defaults/main.yml
@@ -0,0 +1,6 @@
+---
+# prometheus_exporter_chrony_version:
+
+prometheus_exporter_chrony_enable_collectors:
+  - sources
+  - tracking
diff --git a/roles/monitoring/prometheus/exporter/chrony/handlers/main.yml b/roles/monitoring/prometheus/exporter/chrony/handlers/main.yml
new file mode 100644
index 00000000..0c940ca9
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/chrony/handlers/main.yml
@@ -0,0 +1,15 @@
+---
+- name: restart prometheus-chrony-exporter
+  service:
+    name: prometheus-chrony-exporter
+    state: restarted
+
+- name: reload nginx
+  service:
+    name: nginx
+    state: reloaded
+
+### TODO: remove this once all hosts have been migrated
+- name: reload systemd
+  systemd:
+    daemon_reload: yes
diff --git a/roles/monitoring/prometheus/exporter/chrony/tasks/main.yml b/roles/monitoring/prometheus/exporter/chrony/tasks/main.yml
new file mode 100644
index 00000000..f15037ec
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/chrony/tasks/main.yml
@@ -0,0 +1,65 @@
+---
+- name: generate apt pin file for exporter-chrony package
+  when: prometheus_exporter_chrony_version is defined
+  copy:
+    dest: "/etc/apt/preferences.d/prom-exporter-chrony.pref"
+    content: |
+      Package: prom-exporter-chrony
+      Pin: version {{ prometheus_exporter_chrony_version }}-1
+      Pin-Priority: 1001
+
+- name: remove apt pin file for exporter-chrony package
+  when: prometheus_exporter_chrony_version is not defined
+  file:
+    path: "/etc/apt/preferences.d/prom-exporter-chrony.pref"
+    state: absent
+
+- name: install apt packages
+  apt:
+    name: "prom-exporter-chrony{% if prometheus_exporter_chrony_version is defined %}={{ prometheus_exporter_chrony_version }}-1{% endif %}"
+    state: present
+    allow_downgrade: yes
+  notify: restart prometheus-chrony-exporter
+
+- name: generate systemd service unit
+  template:
+    src: service.j2
+    dest: /etc/systemd/system/prometheus-chrony-exporter.service
+  notify: restart prometheus-chrony-exporter
+
+- name: make sure prometheus-chrony-exporter is enabled and started
+  systemd:
+    name: prometheus-chrony-exporter.service
+    daemon_reload: yes
+    state: started
+    enabled: yes
+
+- name: register exporter
+  copy:
+    content: |
+      location = /chrony {
+        proxy_pass http://127.0.0.1:9123/metrics;
+      }
+    dest: /etc/prometheus/exporter/chrony.locations
+  notify: reload nginx
+
+
+## TODO: remove these tasks once all hosts have been migrated
+- name: make sure the systemd timer for chrony textfile collector is disabled and stopped
+  systemd:
+    service: prometheus-node-exporter_chrony.timer
+    enabled: no
+    state: stopped
+  register: result_systemd_stop
+  failed_when: "result_systemd_stop is failed and 'Could not find the requested service' not in result_systemd_stop.msg"
+
+- name: remove files from chrony textfile collector
+  loop:
+    - /etc/systemd/system/prometheus-node-exporter_chrony.timer
+    - /etc/systemd/system/prometheus-node-exporter_chrony.service
+    - /usr/local/share/prometheus-node-exporter/chrony
+    - /var/lib/prometheus-node-exporter/textfile-collector/chrony.prom
+  file:
+    path: "{{ item }}"
+    state: absent
+  notify: reload systemd
diff --git a/roles/monitoring/prometheus/exporter/chrony/templates/service.j2 b/roles/monitoring/prometheus/exporter/chrony/templates/service.j2
new file mode 100644
index 00000000..cb806649
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/chrony/templates/service.j2
@@ -0,0 +1,31 @@
+[Unit]
+Description=Prometheus chrony exporter
+
+[Service]
+Restart=always
+User=_chrony
+ExecStart=/usr/bin/prometheus-chrony-exporter --web.listen-address="127.0.0.1:9123" --chrony.address=unix:///run/chrony/chronyd.sock {% for collector in prometheus_exporter_chrony_enable_collectors %} --collector.{{ collector }}{% endfor %}{{ '' }}
+
+# systemd hardening-options
+AmbientCapabilities=
+CapabilityBoundingSet=
+DeviceAllow=/dev/null rw
+DevicePolicy=strict
+LockPersonality=true
+MemoryDenyWriteExecute=true
+NoNewPrivileges=true
+PrivateDevices=true
+PrivateTmp=true
+ProtectControlGroups=true
+ProtectHome=true
+ProtectKernelModules=true
+ProtectKernelTunables=true
+ProtectSystem=strict
+ReadWritePaths=/run/chrony
+RemoveIPC=true
+RestrictNamespaces=true
+RestrictRealtime=true
+SystemCallArchitectures=native
+
+[Install]
+WantedBy=multi-user.target
diff --git a/roles/monitoring/prometheus/exporter/meta/main.yml b/roles/monitoring/prometheus/exporter/meta/main.yml
index 4a427770..10a251f4 100644
--- a/roles/monitoring/prometheus/exporter/meta/main.yml
+++ b/roles/monitoring/prometheus/exporter/meta/main.yml
@@ -23,4 +23,6 @@ dependencies:
     when: "'standalone-kubelet' in (prometheus_exporters_default | union(prometheus_exporters_extra))"
   - role: monitoring/prometheus/exporter/modbus
     when: "'modbus' in (prometheus_exporters_default | union(prometheus_exporters_extra))"
+  - role: monitoring/prometheus/exporter/chrony
+    when: "'chrony' in (prometheus_exporters_default | union(prometheus_exporters_extra))"
   - role: monitoring/prometheus/exporter/register
diff --git a/roles/monitoring/prometheus/exporter/node/defaults/main.yml b/roles/monitoring/prometheus/exporter/node/defaults/main.yml
index 3b961a4f..ab4cee38 100644
--- a/roles/monitoring/prometheus/exporter/node/defaults/main.yml
+++ b/roles/monitoring/prometheus/exporter/node/defaults/main.yml
@@ -18,7 +18,6 @@ prometheus_exporter_node_install_apt_textfile_collector_script: "{{ ansible_pkg_
 prometheus_exporter_node_textfile_collector_scripts:
   - deleted-libraries
 # - smartmon
-# - chrony
 # - sensors
 
 # prometheus_exporter_node_textfile_collector__sensors:
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/chrony.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/chrony.j2
deleted file mode 100644
index 95c6a5d3..00000000
--- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/chrony.j2
+++ /dev/null
@@ -1,138 +0,0 @@
-#!/usr/bin/env {{ python_basename }}
-#
-# Description: Extract chronyd metrics from chronyc -c.
-# Author: Aanchal Malhotra <aanch...@bu.edu>
-#
-# Works with chrony version 2.4 and higher
-#
-# this is from: https://www.mail-archive.com/chrony-users@chrony.tuxfamily.org/msg02179.html
-
-import subprocess
-import sys
-
-chrony_sourcestats_cmd = ['chronyc', '-n', '-c', 'sourcestats']
-chrony_source_cmd = ['chronyc', '-n', '-c', 'sources']
-chrony_tracking_cmd = ['chronyc', '-n', '-c', 'tracking']
-
-metrics_fields = [
-    "Name/IP Address",
-    "NP",
-    "NR",
-    "Span",
-    "Frequency",
-    "Freq Skew",
-    "Offset",
-    "Std Dev"]
-
-status_types = {'x': 0, '?': 1, '-': 2, '+': 3, '*': 4}
-
-metrics_source = {
-    "*": "synchronized (system peer)",
-    "+": "synchronized",
-    "?": "unreachable",
-    "x": "Falseticker",
-    "-": "reference clock"}
-
-metrics_mode = {
-    '^': "server",
-    '=': "peer",
-    "#": "reference clock"}
-
-
-def get_cmdoutput(command):
-    proc = subprocess.Popen(command, stdout=subprocess.PIPE)
-    out, err = proc.communicate()
-    return_code = proc.poll()
-    if return_code:
-        raise RuntimeError('Call to "{}" returned error: \
-            {}'.format(command, return_code))
-    return out.decode("utf-8")
-
-
-def printPrometheusformat(metric, values):
-    print("# HELP chronyd_%s chronyd metric for %s" % (metric, metric))
-    print("# TYPE chronyd_%s gauge" % (metric))
-    for labels in values:
-        if labels is None:
-            print("chronyd_%s %f" % (metric, values[labels]))
-        else:
-            print("chronyd_%s{{ '{%' }}s} %f" % (metric, labels, values[labels]))
-
-
-def printPrometheusscalar(metric, value):
-    print("# HELP chronyd_%s chronyd metric for %s" % (metric, metric))
-    print("# TYPE chronyd_%s gauge" % (metric))
-    print("chronyd_%s %f" % (metric, value))
-
-
-def printPrometheusEnum(metric, name):
-    print("# HELP chronyd_%s enum for %s" % (metric, metric))
-    print("# TYPE chronyd_%s gauge" % (metric))
-    print("chronyd_%s{value=\"%s\"} 1" % (metric, name))
-
-
-def weight(value):
-    val_int = int(value, 8)
-    return bin(val_int).count('1')/8.0
-
-
-def main(argv):
-    peer_status_metrics = {}
-    peer_reach_metrics = {}
-    offset_metrics = {}
-    freq_skew_metrics = {}
-    freq_metrics = {}
-    std_dev_metrics = {}
-    chrony_sourcestats = get_cmdoutput(chrony_sourcestats_cmd)
-    for line in chrony_sourcestats.split('\n'):
-        if (len(line)) > 0:
-            x = line.split(',')
-            common_labels = "remote=\"%s\"" % (x[0])
-            freq_metrics[common_labels] = float(x[4])
-            freq_skew_metrics[common_labels] = float(x[5])
-            std_dev_metrics[common_labels] = float(x[7])
-
-    printPrometheusformat('freq_skew_ppm', freq_skew_metrics)
-    printPrometheusformat('freq_ppm', freq_metrics)
-    printPrometheusformat('std_dev_seconds', std_dev_metrics)
-
-    chrony_source = get_cmdoutput(chrony_source_cmd)
-    for line in chrony_source.split('\n'):
-        if (len(line)) > 0:
-            x = line.split(',')
-            stratum = x[3]
-            reach = x[5]
-            mode = metrics_mode[x[0]]
-            common_labels = "remote=\"%s\"" % (x[2])
-            peer_labels = "%s,stratum=\"%s\",mode=\"%s\"" % (
-                common_labels,
-                stratum,
-                mode,
-            )
-            peer_status_metrics[peer_labels] = float(status_types[x[1]])
-            peer_reach_metrics[peer_labels] = weight(reach)
-            offset_metrics[common_labels] = float(x[8])
-
-    printPrometheusformat('peer_status', peer_status_metrics)
-    printPrometheusformat('offset_seconds', offset_metrics)
-    printPrometheusformat('peer_reachable', peer_reach_metrics)
-
-    chrony_tracking_stats = get_cmdoutput(chrony_tracking_cmd).rstrip()
-    fields = chrony_tracking_stats.split(",")
-    printPrometheusEnum("tracking_source", fields[1])
-    printPrometheusscalar("tracking_stratum", float(fields[2]))
-    printPrometheusscalar("tracking_ref_time", float(fields[3]))
-    printPrometheusscalar("tracking_system_time", float(fields[4]))
-    printPrometheusscalar("tracking_last_offset", float(fields[5]))
-    printPrometheusscalar("tracking_rms_offset", float(fields[6]))
-    printPrometheusscalar("tracking_frequency_error", float(fields[7]))
-    printPrometheusscalar("tracking_frequency_residual", float(fields[8]))
-    printPrometheusscalar("tracking_frequency_skew", float(fields[9]))
-    printPrometheusscalar("tracking_root_delay", float(fields[10]))
-    printPrometheusscalar("tracking_root_dispersion", float(fields[11]))
-    printPrometheusscalar("tracking_update_interval", float(fields[12]))
-    printPrometheusEnum("tracking_leap_status", fields[13])
-
-
-if __name__ == "__main__":
-    main(sys.argv[1:])
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/chrony.service.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/chrony.service.j2
deleted file mode 100644
index 49b15185..00000000
--- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/chrony.service.j2
+++ /dev/null
@@ -1,33 +0,0 @@
-[Unit]
-Description=Promethues node exporter textfile collector chrony
-
-[Service]
-Type=oneshot
-Environment=TMPDIR=/var/lib/prometheus-node-exporter/textfile-collector
-ExecStart=bash -o pipefail -c "/usr/local/share/prometheus-node-exporter/chrony | sponge /var/lib/prometheus-node-exporter/textfile-collector/chrony.prom"
-TimeoutStartSec=30s
-
-# systemd hardening-options
-AmbientCapabilities=CAP_DAC_OVERRIDE
-CapabilityBoundingSet=CAP_DAC_OVERRIDE
-DeviceAllow=/dev/null rw
-DevicePolicy=strict
-LockPersonality=true
-MemoryDenyWriteExecute=true
-NoNewPrivileges=true
-PrivateDevices=true
-PrivateTmp=true
-ProtectControlGroups=true
-ProtectHome=true
-ProtectKernelModules=true
-ProtectKernelTunables=true
-ProtectSystem=strict
-ReadWritePaths=/var/lib/prometheus-node-exporter/textfile-collector /var/run/chrony
-RemoveIPC=true
-RestrictNamespaces=true
-RestrictRealtime=true
-RestrictAddressFamilies=AF_UNIX
-SystemCallArchitectures=native
-
-[Install]
-WantedBy=multi-user.target
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/chrony.timer.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/chrony.timer.j2
deleted file mode 100644
index eecc70e2..00000000
--- a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/chrony.timer.j2
+++ /dev/null
@@ -1,9 +0,0 @@
-[Unit]
-Description=Promethues node exporter textfile collector chrony
-
-[Timer]
-OnBootSec=40s
-OnUnitActiveSec=2min
-
-[Install]
-WantedBy=timers.target
diff --git a/roles/monitoring/prometheus/server/defaults/main/main.yml b/roles/monitoring/prometheus/server/defaults/main/main.yml
index d778bad8..1e0dcf32 100644
--- a/roles/monitoring/prometheus/server/defaults/main/main.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/main.yml
@@ -22,6 +22,7 @@ prometheus_server_rules:
   prometheus: "{{ prometheus_server_rules_prometheus + ((prometheus_server_alertmanager is defined) | ternary(prometheus_server_rules_prometheus_alertmanager, [])) + prometheus_server_rules_prometheus_extra }}"
   node: "{{ prometheus_server_rules_node + prometheus_server_rules_node_extra }}"
   openwrt: "{{ prometheus_server_rules_openwrt + prometheus_server_rules_openwrt_extra }}"
+  chrony: "{{ prometheus_server_rules_chrony + prometheus_server_rules_chrony_extra }}"
   nut: "{{ prometheus_server_rules_nut + prometheus_server_rules_nut_extra }}"
   nut/ups: "{{ prometheus_server_rules_nut__ups + prometheus_server_rules_nut__ups_extra }}"
   blackbox: "{{ prometheus_server_rules_blackbox + prometheus_server_rules_blackbox_extra }}"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_chrony.yml b/roles/monitoring/prometheus/server/defaults/main/rules_chrony.yml
new file mode 100644
index 00000000..e845a60b
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_chrony.yml
@@ -0,0 +1,14 @@
+---
+prometheus_server_rules_chrony_extra: []
+prometheus_server_rules_chrony:
+  - record: instance:chrony_clock_error_seconds:abs
+    expr: abs(chrony_tracking_last_offset_seconds) + chrony_tracking_root_dispersion_seconds + (0.5 * chrony_tracking_root_delay_seconds)
+
+  - alert: ChronyUnreachable
+    expr: chrony_up == 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Unable to scrape chrony metrics (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "The chrony process might have crashed.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"