summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristian Pointner <equinox@spreadspace.org>2021-09-26 04:29:02 +0200
committerChristian Pointner <equinox@spreadspace.org>2021-09-26 04:29:02 +0200
commit223297c29d78e4fe95e2ed2338455708e4e0a9c3 (patch)
tree02534f529becf59835dd1ed5dee0f2321966a09b
parentswitch to pascal for pressure (diff)
parentfix alert wording (diff)
Merge branch 'topic/prometheus-refactoring'
-rw-r--r--chaos-at-home/ch-testvm-prometheus.yml1
-rw-r--r--inventory/group_vars/chaos-at-home-ups/vars.yml6
-rw-r--r--inventory/group_vars/ele-ups/vars.yml10
-rw-r--r--inventory/group_vars/promzone-chaos-at-home/vars.yml9
-rw-r--r--inventory/group_vars/promzone-elevate-festival/vars.yml4
-rw-r--r--inventory/host_vars/ch-mon.yml36
-rw-r--r--roles/monitoring/prometheus/alertmanager/tasks/main.yml1
-rw-r--r--roles/monitoring/prometheus/exporter/TODO8
-rw-r--r--roles/monitoring/prometheus/exporter/base/defaults/main.yml2
-rw-r--r--roles/monitoring/prometheus/exporter/base/handlers/main.yml6
-rw-r--r--roles/monitoring/prometheus/exporter/base/tasks/main.yml29
-rw-r--r--roles/monitoring/prometheus/exporter/base/templates/nginx-vhost.j219
-rw-r--r--roles/monitoring/prometheus/exporter/blackbox/defaults/main.yml7
-rw-r--r--roles/monitoring/prometheus/exporter/blackbox/handlers/main.yml7
-rw-r--r--roles/monitoring/prometheus/exporter/blackbox/tasks/main.yml15
-rw-r--r--roles/monitoring/prometheus/exporter/blackbox/templates/config.yml.j22
-rw-r--r--roles/monitoring/prometheus/exporter/ipmi/defaults/main.yml26
-rw-r--r--roles/monitoring/prometheus/exporter/ipmi/handlers/main.yml15
-rw-r--r--roles/monitoring/prometheus/exporter/ipmi/tasks/main.yml42
-rw-r--r--roles/monitoring/prometheus/exporter/ipmi/templates/config.yml.j24
-rw-r--r--roles/monitoring/prometheus/exporter/ipmi/templates/service.j2 (renamed from roles/monitoring/prometheus/exporter/base/templates/service.j2)8
-rw-r--r--roles/monitoring/prometheus/exporter/meta/main.yml10
-rw-r--r--roles/monitoring/prometheus/exporter/mikrotik/handlers/main.yml7
-rw-r--r--roles/monitoring/prometheus/exporter/mikrotik/tasks/main.yml11
-rw-r--r--roles/monitoring/prometheus/exporter/mikrotik/templates/config.yml.j24
-rw-r--r--roles/monitoring/prometheus/exporter/node/defaults/main.yml6
-rwxr-xr-xroles/monitoring/prometheus/exporter/node/files/apt40
-rwxr-xr-xroles/monitoring/prometheus/exporter/node/files/deleted-libraries75
-rw-r--r--roles/monitoring/prometheus/exporter/node/files/smartmon391
-rw-r--r--roles/monitoring/prometheus/exporter/node/handlers/main.yml7
-rw-r--r--roles/monitoring/prometheus/exporter/node/tasks/main.yml32
-rw-r--r--roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_script.yml24
-rw-r--r--roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.service.j231
-rw-r--r--roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.timer.j29
-rw-r--r--roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.service.j231
-rw-r--r--roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.timer.j29
-rw-r--r--roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.service.j229
-rw-r--r--roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.timer.j29
-rw-r--r--roles/monitoring/prometheus/exporter/nut/handlers/main.yml7
-rw-r--r--roles/monitoring/prometheus/exporter/nut/tasks/main.yml15
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/main.yml11
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_blackbox-https.yml3
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ping.yml3
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ssh.yml3
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml3
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml74
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml4
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml4
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_node.yml38
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml4
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml1
-rw-r--r--roles/monitoring/prometheus/server/filter_plugins/prometheus.py33
-rw-r--r--roles/monitoring/prometheus/server/tasks/main.yml65
-rw-r--r--roles/monitoring/prometheus/server/templates/job-snippets/blackbox-https.j214
-rw-r--r--roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ping.j214
-rw-r--r--roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ssh.j214
-rw-r--r--roles/monitoring/prometheus/server/templates/jobs/generic.j2 (renamed from roles/monitoring/prometheus/server/templates/job-snippets/generic.j2)7
-rw-r--r--roles/monitoring/prometheus/server/templates/jobs/node.j217
-rw-r--r--roles/monitoring/prometheus/server/templates/jobs/openwrt.j2 (renamed from roles/monitoring/prometheus/server/templates/job-snippets/openwrt.j2)2
-rw-r--r--roles/monitoring/prometheus/server/templates/prometheus.yml.j29
-rw-r--r--roles/monitoring/prometheus/server/templates/targets/blackbox-https.yml.j24
-rw-r--r--roles/monitoring/prometheus/server/templates/targets/blackbox-ping.yml.j24
-rw-r--r--roles/monitoring/prometheus/server/templates/targets/blackbox-ssh.yml.j24
-rw-r--r--roles/monitoring/prometheus/server/templates/targets/blackbox/probe.yml.j25
-rw-r--r--roles/monitoring/prometheus/server/templates/targets/generic.yml.j24
-rw-r--r--roles/monitoring/prometheus/server/templates/targets/ipmi/remote.yml.j25
-rw-r--r--roles/monitoring/prometheus/server/templates/targets/nut.yml.j217
-rw-r--r--roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j217
68 files changed, 1123 insertions, 264 deletions
diff --git a/chaos-at-home/ch-testvm-prometheus.yml b/chaos-at-home/ch-testvm-prometheus.yml
index 3fd99d41..c0f33b8f 100644
--- a/chaos-at-home/ch-testvm-prometheus.yml
+++ b/chaos-at-home/ch-testvm-prometheus.yml
@@ -7,6 +7,7 @@
- role: core/sshd/base
- role: core/zsh
- role: core/ntp
+ - role: nginx/base
- role: apt-repo/spreadspace
- role: monitoring/prometheus/exporter
# - role: kubernetes/base
diff --git a/inventory/group_vars/chaos-at-home-ups/vars.yml b/inventory/group_vars/chaos-at-home-ups/vars.yml
index 5ff68452..7b60e893 100644
--- a/inventory/group_vars/chaos-at-home-ups/vars.yml
+++ b/inventory/group_vars/chaos-at-home-ups/vars.yml
@@ -11,8 +11,8 @@ prometheus_scrape_endpoint: "{{ network_mgmt_zone.prefix | ipaddr(network_mgmt_z
prometheus_exporters_default:
- openwrt
-prometheus_special_job_nut:
- - exporter_hostname: ch-mon
- instance: "ups-{{ ups_name }}"
+prometheus_job_multitarget_nut__ups:
+ ch-mon:
+ - instance: "ups-{{ ups_name }}"
ups: "{{ ups_name }}"
server: "{{ network_mgmt_zone.prefix | ipaddr(network_mgmt_zone.offsets[inventory_hostname]) | ipaddr('address') }}"
diff --git a/inventory/group_vars/ele-ups/vars.yml b/inventory/group_vars/ele-ups/vars.yml
index cbee3ee8..28a5eaff 100644
--- a/inventory/group_vars/ele-ups/vars.yml
+++ b/inventory/group_vars/ele-ups/vars.yml
@@ -14,8 +14,8 @@ prometheus_scrape_endpoint: "{{ network_mgmt_zone.prefix | ipaddr(network_mgmt_z
prometheus_exporters_default:
- openwrt
-prometheus_special_job_nut:
- exporter_hostname: ele-mon
- instance: "ups-{{ ups_name }}"
- ups: "{{ ups_name }}"
- server: "{{ network_mgmt_zone.prefix | ipaddr(network_mgmt_zone.offsets[inventory_hostname]) | ipaddr('address') }}"
+prometheus_job_multitarget_nut__ups:
+ ele-mon:
+ - instance: "ups-{{ ups_name }}"
+ ups: "{{ ups_name }}"
+ server: "{{ network_mgmt_zone.prefix | ipaddr(network_mgmt_zone.offsets[inventory_hostname]) | ipaddr('address') }}"
diff --git a/inventory/group_vars/promzone-chaos-at-home/vars.yml b/inventory/group_vars/promzone-chaos-at-home/vars.yml
index fcb04716..529bf3e7 100644
--- a/inventory/group_vars/promzone-chaos-at-home/vars.yml
+++ b/inventory/group_vars/promzone-chaos-at-home/vars.yml
@@ -6,14 +6,13 @@ prometheus_exporters_default:
- node
prometheus_server: ch-mon
-prometheus_server_jobs_generic:
+prometheus_server_jobs:
- node
- openwrt
-prometheus_server_jobs_special:
- nut
- - blackbox-ping
- - blackbox-https
- - blackbox-ssh
+ - nut/ups
+ - blackbox
+ - blackbox/probe
prometheus_zone_name: chaos@home
prometheus_zone_targets: "{{ groups['promzone-chaos-at-home'] }}"
diff --git a/inventory/group_vars/promzone-elevate-festival/vars.yml b/inventory/group_vars/promzone-elevate-festival/vars.yml
index e94943d7..b3321614 100644
--- a/inventory/group_vars/promzone-elevate-festival/vars.yml
+++ b/inventory/group_vars/promzone-elevate-festival/vars.yml
@@ -6,9 +6,11 @@ prometheus_exporters_default:
- node
prometheus_server: ele-mon
-prometheus_server_jobs_generic:
+prometheus_server_jobs:
- node
- openwrt
+ - nut
+ - nut/ups
prometheus_zone_name: Elevate Festival
prometheus_zone_targets: "{{ groups['promzone-elevate-festival'] }}"
diff --git a/inventory/host_vars/ch-mon.yml b/inventory/host_vars/ch-mon.yml
index 7d8e334b..b2402d0c 100644
--- a/inventory/host_vars/ch-mon.yml
+++ b/inventory/host_vars/ch-mon.yml
@@ -76,23 +76,25 @@ prometheus_exporter_blackbox_modules_extra:
icmp:
prober: icmp
-prometheus_special_job_blackbox_ping:
- - exporter_hostname: ch-mon
- instance: "ping-magentagw"
- address: 62.99.185.129
- - exporter_hostname: ch-mon
- instance: "ping-quad9"
- address: 9.9.9.9
-
-prometheus_special_job_blackbox_https:
- - exporter_hostname: ch-mon
- instance: "https-web.chaos-at-home.org"
- address: web.chaos-at-home.org
-
-prometheus_special_job_blackbox_ssh:
- - exporter_hostname: ch-mon
- instance: "ssh-{{ inventory_hostname }}"
- address: "{{ network_zones.svc.prefix | ipaddr(network_zones.svc.offsets[inventory_hostname]) | ipaddr('address') }}:{{ ansible_port | default(22) }}"
+prometheus_job_multitarget_blackbox__probe:
+ ch-mon:
+ - instance: "ping-magentagw"
+ target: 62.99.185.129
+ module: icmp
+ - instance: "ping-quad9"
+ target: 9.9.9.9
+ module: icmp
+
+ - instance: "https-pan.chaos-at-home.org"
+ target: "https://pan.chaos-at-home.org"
+ module: http_tls_2xx
+ - instance: "https-mimas.chaos-at-home.org"
+ target: "https://mimas.chaos-at-home.org"
+ module: http_tls_2xx
+
+ - instance: "ssh-{{ inventory_hostname }}"
+ target: "{{ network_zones.svc.prefix | ipaddr(network_zones.svc.offsets[inventory_hostname]) | ipaddr('address') }}:{{ ansible_port | default(22) }}"
+ module: ssh_banner
promethues_alertmanager_smtp:
diff --git a/roles/monitoring/prometheus/alertmanager/tasks/main.yml b/roles/monitoring/prometheus/alertmanager/tasks/main.yml
index fe8ce9ca..0dce6ef4 100644
--- a/roles/monitoring/prometheus/alertmanager/tasks/main.yml
+++ b/roles/monitoring/prometheus/alertmanager/tasks/main.yml
@@ -6,6 +6,7 @@
- spreadspace_apt_repo_components is defined
- "'prometheus' in spreadspace_apt_repo_components"
+ ## TODO: pin version
- name: install apt packages
apt:
name: prom-alertmanager
diff --git a/roles/monitoring/prometheus/exporter/TODO b/roles/monitoring/prometheus/exporter/TODO
index c02e5699..57179464 100644
--- a/roles/monitoring/prometheus/exporter/TODO
+++ b/roles/monitoring/prometheus/exporter/TODO
@@ -1,11 +1,3 @@
-Node Exporter - Text Collector Scripts:
- - https://github.com/prometheus-community/node-exporter-textfile-collector-scripts
- - https://packages.debian.org/bullseye/prometheus-node-exporter-collectors
-
-IPMI Exporter:
- - https://github.com/soundcloud/ipmi_exporter
- - https://packages.debian.org/bullseye/prometheus-ipmi-exporter
-
Postfix Exporter:
- https://github.com/kumina/postfix_exporter
- https://packages.debian.org/bullseye/prometheus-postfix-exporter
diff --git a/roles/monitoring/prometheus/exporter/base/defaults/main.yml b/roles/monitoring/prometheus/exporter/base/defaults/main.yml
index 963763a5..613943d8 100644
--- a/roles/monitoring/prometheus/exporter/base/defaults/main.yml
+++ b/roles/monitoring/prometheus/exporter/base/defaults/main.yml
@@ -1,2 +1,2 @@
---
-prometheus_exporter_listen: ":9999"
+prometheus_exporter_listen: "9999"
diff --git a/roles/monitoring/prometheus/exporter/base/handlers/main.yml b/roles/monitoring/prometheus/exporter/base/handlers/main.yml
index ebd760cf..d4e42ca0 100644
--- a/roles/monitoring/prometheus/exporter/base/handlers/main.yml
+++ b/roles/monitoring/prometheus/exporter/base/handlers/main.yml
@@ -1,5 +1,5 @@
---
-- name: restart prometheus-exporter-exporter
+- name: reload nginx
service:
- name: prometheus-exporter-exporter
- state: restarted
+ name: nginx
+ state: reloaded
diff --git a/roles/monitoring/prometheus/exporter/base/tasks/main.yml b/roles/monitoring/prometheus/exporter/base/tasks/main.yml
index 9a214f39..5f42867d 100644
--- a/roles/monitoring/prometheus/exporter/base/tasks/main.yml
+++ b/roles/monitoring/prometheus/exporter/base/tasks/main.yml
@@ -6,16 +6,6 @@
- spreadspace_apt_repo_components is defined
- "'prometheus' in spreadspace_apt_repo_components"
-- name: install apt packages
- apt:
- name: prom-exporter-exporter
- state: present
-
-- name: create configuration directories
- file:
- path: /etc/prometheus/exporter/enabled
- state: directory
-
- name: add user for prometheus-exporter
user:
name: prometheus-exporter
@@ -26,15 +16,10 @@
- name: create TLS certificate and key
import_tasks: tls.yml
-- name: generate systemd service unit
- template:
- src: service.j2
- dest: /etc/systemd/system/prometheus-exporter-exporter.service
- notify: restart prometheus-exporter-exporter
-
-- name: make sure prometheus-exporter-exporter is enabled and started
- systemd:
- name: prometheus-exporter-exporter.service
- daemon_reload: yes
- state: started
- enabled: yes
+- name: configure nginx vhost
+ import_role:
+ name: nginx/vhost
+ vars:
+ nginx_vhost:
+ name: prometheus-exporter
+ content: "{{ lookup('template', 'nginx-vhost.j2') }}"
diff --git a/roles/monitoring/prometheus/exporter/base/templates/nginx-vhost.j2 b/roles/monitoring/prometheus/exporter/base/templates/nginx-vhost.j2
new file mode 100644
index 00000000..70e65b29
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/base/templates/nginx-vhost.j2
@@ -0,0 +1,19 @@
+server {
+ listen {{ prometheus_exporter_listen }} ssl;
+ server_name _;
+
+ ssl_certificate /etc/ssl/prometheus/exporter/crt.pem;
+ ssl_certificate_key /etc/ssl/prometheus/exporter/key.pem;
+ ssl_client_certificate /etc/ssl/prometheus/ca-crt.pem;
+ ssl_verify_client on;
+
+ root /nonexistent;
+
+ location = / {
+ return 404 'please specify the exporter you want to reach!';
+ }
+
+ include snippets/proxy-nobuff.conf;
+
+ include /etc/prometheus/exporter/*.locations;
+}
diff --git a/roles/monitoring/prometheus/exporter/blackbox/defaults/main.yml b/roles/monitoring/prometheus/exporter/blackbox/defaults/main.yml
index 4e7d8d9a..73b9fde1 100644
--- a/roles/monitoring/prometheus/exporter/blackbox/defaults/main.yml
+++ b/roles/monitoring/prometheus/exporter/blackbox/defaults/main.yml
@@ -2,23 +2,30 @@
prometheus_exporter_blackbox_modules:
tcp_connect:
prober: tcp
+ tcp:
+ preferred_ip_protocol: "ip4"
tcp_tls_connect:
prober: tcp
tcp:
+ preferred_ip_protocol: "ip4"
tls: true
tls_config:
insecure_skip_verify: true
http_2xx:
prober: http
+ http:
+ preferred_ip_protocol: "ip4"
http_tls_2xx:
prober: http
http:
+ preferred_ip_protocol: "ip4"
fail_if_not_ssl: true
tls_config:
insecure_skip_verify: true
ssh_banner:
prober: tcp
tcp:
+ preferred_ip_protocol: "ip4"
query_response:
- expect: "^SSH-2.0-"
- send: "SSH-2.0-blackbox-ssh-check"
diff --git a/roles/monitoring/prometheus/exporter/blackbox/handlers/main.yml b/roles/monitoring/prometheus/exporter/blackbox/handlers/main.yml
index 99a416e2..12250769 100644
--- a/roles/monitoring/prometheus/exporter/blackbox/handlers/main.yml
+++ b/roles/monitoring/prometheus/exporter/blackbox/handlers/main.yml
@@ -9,8 +9,7 @@
name: prometheus-blackbox-exporter
state: reloaded
-- name: reload prometheus-exporter-exporter
+- name: reload nginx
service:
- name: prometheus-exporter-exporter
- ## TODO: implement reload once exporter_exporter supports this...
- state: restarted
+ name: nginx
+ state: reloaded
diff --git a/roles/monitoring/prometheus/exporter/blackbox/tasks/main.yml b/roles/monitoring/prometheus/exporter/blackbox/tasks/main.yml
index 7ecd8113..c4cabfce 100644
--- a/roles/monitoring/prometheus/exporter/blackbox/tasks/main.yml
+++ b/roles/monitoring/prometheus/exporter/blackbox/tasks/main.yml
@@ -1,4 +1,5 @@
---
+ ## TODO: pin version
- name: install apt packages
apt:
name: prom-exporter-blackbox
@@ -31,9 +32,11 @@
- name: register exporter
copy:
content: |
- method: http
- http:
- port: 9115
- path: '/probe'
- dest: /etc/prometheus/exporter/enabled/blackbox.yml
- notify: reload prometheus-exporter-exporter
+ location = /blackbox {
+ proxy_pass http://127.0.0.1:9115/metrics;
+ }
+ location = /blackbox/probe {
+ proxy_pass http://127.0.0.1:9115/probe;
+ }
+ dest: /etc/prometheus/exporter/blackbox.locations
+ notify: reload nginx
diff --git a/roles/monitoring/prometheus/exporter/blackbox/templates/config.yml.j2 b/roles/monitoring/prometheus/exporter/blackbox/templates/config.yml.j2
index 01e3f7a0..0ff9db13 100644
--- a/roles/monitoring/prometheus/exporter/blackbox/templates/config.yml.j2
+++ b/roles/monitoring/prometheus/exporter/blackbox/templates/config.yml.j2
@@ -1,4 +1,4 @@
# {{ ansible_managed }}
modules:
- {{ prometheus_exporter_blackbox_modules | combine(prometheus_exporter_blackbox_modules_extra) | to_nice_yaml(indent=2) | indent(2)}}
+ {{ prometheus_exporter_blackbox_modules | combine(prometheus_exporter_blackbox_modules_extra) | to_nice_yaml(indent=2) | indent(2) }}
diff --git a/roles/monitoring/prometheus/exporter/ipmi/defaults/main.yml b/roles/monitoring/prometheus/exporter/ipmi/defaults/main.yml
new file mode 100644
index 00000000..6cf14f76
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/ipmi/defaults/main.yml
@@ -0,0 +1,26 @@
+---
+prometheus_exporter_ipmi_modules:
+ default:
+ collectors: []
+ # collectors:
+ # - bmc
+ # - ipmi
+ # - chassis
+ # - dcmi
+ # - sel
+ # - sm-lan-mode
+ # exclude_sensor_ids:
+ # - 2
+ # - 29
+ # - 32
+ # thatspecialhost:
+ # user: "some_user"
+ # pass: "secret_pw"
+ # privilege: "admin"
+ # driver: "LAN"
+ # collectors:
+ # - ipmi
+ # - sel
+ # custom_args:
+ # ipmi:
+ # - "--bridge-sensors"
diff --git a/roles/monitoring/prometheus/exporter/ipmi/handlers/main.yml b/roles/monitoring/prometheus/exporter/ipmi/handlers/main.yml
new file mode 100644
index 00000000..a8eb55b3
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/ipmi/handlers/main.yml
@@ -0,0 +1,15 @@
+---
+- name: restart prometheus-ipmi-exporter
+ service:
+ name: prometheus-ipmi-exporter
+ state: restarted
+
+- name: reload prometheus-ipmi-exporter
+ service:
+ name: prometheus-ipmi-exporter
+ state: reloaded
+
+- name: reload nginx
+ service:
+ name: nginx
+ state: reloaded
diff --git a/roles/monitoring/prometheus/exporter/ipmi/tasks/main.yml b/roles/monitoring/prometheus/exporter/ipmi/tasks/main.yml
new file mode 100644
index 00000000..91318f16
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/ipmi/tasks/main.yml
@@ -0,0 +1,42 @@
+---
+ ## TODO: pin version
+- name: install apt packages
+ apt:
+ name: prom-exporter-ipmi
+ state: present
+
+- name: create config directory
+ file:
+ path: /etc/prometheus/exporter/ipmi
+ state: directory
+
+- name: generate configuration
+ template:
+ src: config.yml.j2
+ dest: /etc/prometheus/exporter/ipmi/config.yml
+ notify: reload prometheus-ipmi-exporter
+
+- name: generate systemd service unit
+ template:
+ src: service.j2
+ dest: /etc/systemd/system/prometheus-ipmi-exporter.service
+ notify: restart prometheus-ipmi-exporter
+
+- name: make sure prometheus-ipmi-exporter is enabled and started
+ systemd:
+ name: prometheus-ipmi-exporter.service
+ daemon_reload: yes
+ state: started
+ enabled: yes
+
+- name: register exporter
+ copy:
+ content: |
+ location = /ipmi {
+ proxy_pass http://127.0.0.1:9290/metrics;
+ }
+ location = /ipmi/remote {
+ proxy_pass http://127.0.0.1:9290/ipmi;
+ }
+ dest: /etc/prometheus/exporter/ipmi.locations
+ notify: reload nginx
diff --git a/roles/monitoring/prometheus/exporter/ipmi/templates/config.yml.j2 b/roles/monitoring/prometheus/exporter/ipmi/templates/config.yml.j2
new file mode 100644
index 00000000..32d0b34a
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/ipmi/templates/config.yml.j2
@@ -0,0 +1,4 @@
+# {{ ansible_managed }}
+
+modules:
+ {{ prometheus_exporter_ipmi_modules | to_nice_yaml(indent=2) | indent(2) }}
diff --git a/roles/monitoring/prometheus/exporter/base/templates/service.j2 b/roles/monitoring/prometheus/exporter/ipmi/templates/service.j2
index c24baf43..465215e8 100644
--- a/roles/monitoring/prometheus/exporter/base/templates/service.j2
+++ b/roles/monitoring/prometheus/exporter/ipmi/templates/service.j2
@@ -1,12 +1,13 @@
[Unit]
-Description=Prometheus exporter proxy
+Description=Prometheus ipmi exporter
[Service]
Restart=always
User=prometheus-exporter
-ExecStart=/usr/bin/prometheus-exporter-exporter -config.dirs=/etc/prometheus/exporter/enabled -config.file="" -web.listen-address="" -web.tls.listen-address="{{ prometheus_exporter_listen }}" -web.tls.cert="/etc/ssl/prometheus/exporter/crt.pem" -web.tls.key="/etc/ssl/prometheus/exporter/key.pem" --web.tls.ca="/etc/ssl/prometheus/ca-crt.pem" -web.tls.verify
-{# TODO: implement reloading once the exporter_exporter supports this #}
+ExecStart=/usr/bin/prometheus-ipmi-exporter --web.listen-address="127.0.0.1:9290" --config.file=/etc/prometheus/exporter/ipmi/config.yml --freeipmi.path="/usr/sbin"
+ExecReload=/bin/kill -HUP $MAINPID
+{# TODO: test which hardening options need to be removed for IPMI to work... #}
# systemd hardening-options
AmbientCapabilities=
CapabilityBoundingSet=
@@ -17,7 +18,6 @@ MemoryDenyWriteExecute=true
NoNewPrivileges=true
PrivateDevices=true
PrivateTmp=true
-PrivateUsers=true
ProtectControlGroups=true
ProtectHome=true
ProtectKernelModules=true
diff --git a/roles/monitoring/prometheus/exporter/meta/main.yml b/roles/monitoring/prometheus/exporter/meta/main.yml
index 22131422..68fce6cb 100644
--- a/roles/monitoring/prometheus/exporter/meta/main.yml
+++ b/roles/monitoring/prometheus/exporter/meta/main.yml
@@ -1,11 +1,13 @@
---
dependencies:
- role: monitoring/prometheus/exporter/base
- - role: monitoring/prometheus/exporter/node
- when: "'node' in (prometheus_exporters_default | union(prometheus_exporters_extra))"
- role: monitoring/prometheus/exporter/blackbox
when: "'blackbox' in (prometheus_exporters_default | union(prometheus_exporters_extra))"
- - role: monitoring/prometheus/exporter/nut
- when: "'nut' in (prometheus_exporters_default | union(prometheus_exporters_extra))"
+ - role: monitoring/prometheus/exporter/ipmi
+ when: "'ipmi' in (prometheus_exporters_default | union(prometheus_exporters_extra))"
- role: monitoring/prometheus/exporter/mikrotik
when: "'mikrotik' in (prometheus_exporters_default | union(prometheus_exporters_extra))"
+ - role: monitoring/prometheus/exporter/node
+ when: "'node' in (prometheus_exporters_default | union(prometheus_exporters_extra))"
+ - role: monitoring/prometheus/exporter/nut
+ when: "'nut' in (prometheus_exporters_default | union(prometheus_exporters_extra))"
diff --git a/roles/monitoring/prometheus/exporter/mikrotik/handlers/main.yml b/roles/monitoring/prometheus/exporter/mikrotik/handlers/main.yml
index cb85d0d9..c5844220 100644
--- a/roles/monitoring/prometheus/exporter/mikrotik/handlers/main.yml
+++ b/roles/monitoring/prometheus/exporter/mikrotik/handlers/main.yml
@@ -4,8 +4,7 @@
name: prometheus-mikrotik-exporter
state: restarted
-- name: reload prometheus-exporter-exporter
+- name: reload nginx
service:
- name: prometheus-exporter-exporter
- ## TODO: implement reload once exporter_exporter supports this...
- state: restarted
+ name: nginx
+ state: reloaded
diff --git a/roles/monitoring/prometheus/exporter/mikrotik/tasks/main.yml b/roles/monitoring/prometheus/exporter/mikrotik/tasks/main.yml
index c3ffe31b..72c78e4a 100644
--- a/roles/monitoring/prometheus/exporter/mikrotik/tasks/main.yml
+++ b/roles/monitoring/prometheus/exporter/mikrotik/tasks/main.yml
@@ -1,4 +1,5 @@
---
+ ## TODO: pin version
- name: install apt packages
apt:
name: prom-exporter-mikrotik
@@ -34,8 +35,8 @@
- name: register exporter
copy:
content: |
- method: http
- http:
- port: 9436
- dest: /etc/prometheus/exporter/enabled/mikrotik.yml
- notify: reload prometheus-exporter-exporter
+ location = /mikrotik {
+ proxy_pass http://127.0.0.1:9436/metrics;
+ }
+ dest: /etc/prometheus/exporter/mikrotik.locations
+ notify: reload nginx
diff --git a/roles/monitoring/prometheus/exporter/mikrotik/templates/config.yml.j2 b/roles/monitoring/prometheus/exporter/mikrotik/templates/config.yml.j2
index a2dc1c71..576ee12f 100644
--- a/roles/monitoring/prometheus/exporter/mikrotik/templates/config.yml.j2
+++ b/roles/monitoring/prometheus/exporter/mikrotik/templates/config.yml.j2
@@ -1,7 +1,7 @@
# {{ ansible_managed }}
devices:
- {{ prometheus_exporter_mikrotik_devices | to_nice_yaml(indent=2) | indent(2)}}
+ {{ prometheus_exporter_mikrotik_devices | to_nice_yaml(indent=2) | indent(2) }}
features:
- {{ prometheus_exporter_mikrotik_features | to_nice_yaml(indent=2) | indent(2)}}
+ {{ prometheus_exporter_mikrotik_features | to_nice_yaml(indent=2) | indent(2) }}
diff --git a/roles/monitoring/prometheus/exporter/node/defaults/main.yml b/roles/monitoring/prometheus/exporter/node/defaults/main.yml
index 56227fbb..9309562f 100644
--- a/roles/monitoring/prometheus/exporter/node/defaults/main.yml
+++ b/roles/monitoring/prometheus/exporter/node/defaults/main.yml
@@ -9,4 +9,8 @@ prometheus_exporter_node_timesync_collector: "{{ _prometheus_exporter_node_time_
prometheus_exporter_node_disable_collectors: []
prometheus_exporter_node_extra_collectors:
-- "{{ prometheus_exporter_node_timesync_collector }}"
+ - "{{ prometheus_exporter_node_timesync_collector }}"
+ - systemd
+
+prometheus_exporter_node_textfile_collector_scripts:
+ - deleted-libraries
diff --git a/roles/monitoring/prometheus/exporter/node/files/apt b/roles/monitoring/prometheus/exporter/node/files/apt
new file mode 100755
index 00000000..015addb0
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/node/files/apt
@@ -0,0 +1,40 @@
+#!/bin/bash
+#
+# Description: Expose metrics from apt updates.
+#
+# Author: Ben Kochie <superq@gmail.com>
+
+upgrades="$(/usr/bin/apt-get --just-print dist-upgrade \
+ | /usr/bin/awk -F'[()]' \
+ '/^Inst/ { sub("^[^ ]+ ", "", $2); gsub(" ","",$2);
+ sub("\\[", " ", $2); sub("\\]", "", $2); print $2 }' \
+ | /usr/bin/sort \
+ | /usr/bin/uniq -c \
+ | awk '{ gsub(/\\\\/, "\\\\", $2); gsub(/"/, "\\\"", $2);
+ gsub(/\[/, "", $3); gsub(/\]/, "", $3);
+ print "apt_upgrades_pending{origin=\"" $2 "\",arch=\"" $NF "\"} " $1}'
+)"
+
+autoremove="$(/usr/bin/apt-get --just-print autoremove \
+ | /usr/bin/awk '/^Remv/{a++}END{printf "apt_autoremove_pending %d", a}'
+)"
+
+echo '# HELP apt_upgrades_pending Apt package pending updates by origin.'
+echo '# TYPE apt_upgrades_pending gauge'
+if [[ -n "${upgrades}" ]] ; then
+ echo "${upgrades}"
+else
+ echo 'apt_upgrades_pending{origin="",arch=""} 0'
+fi
+
+echo '# HELP apt_autoremove_pending Apt package pending autoremove.'
+echo '# TYPE apt_autoremove_pending gauge'
+echo "${autoremove}"
+
+echo '# HELP node_reboot_required Node reboot is required for software updates.'
+echo '# TYPE node_reboot_required gauge'
+if [[ -f '/run/reboot-required' ]] ; then
+ echo 'node_reboot_required 1'
+else
+ echo 'node_reboot_required 0'
+fi
diff --git a/roles/monitoring/prometheus/exporter/node/files/deleted-libraries b/roles/monitoring/prometheus/exporter/node/files/deleted-libraries
new file mode 100755
index 00000000..e3e19cbd
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/node/files/deleted-libraries
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+"""
+Script to count the number of deleted libraries that are linked by running
+processes and expose a summary as Prometheus metrics.
+
+The aim is to discover processes that are still using libraries that have since
+been updated, perhaps due security vulnerabilities.
+"""
+
+import errno
+import glob
+import os
+import sys
+
+
+def main():
+ processes_linking_deleted_libraries = {}
+
+ for path in glob.glob('/proc/*/maps'):
+ try:
+ with open(path, 'rb') as file:
+ for line in file:
+ part = line.decode().strip().split()
+
+ if len(part) == 7:
+ library = part[5]
+ comment = part[6]
+
+ if '/lib/' in library and '(deleted)' in comment:
+ if path not in processes_linking_deleted_libraries:
+ processes_linking_deleted_libraries[path] = {}
+
+ if library in processes_linking_deleted_libraries[path]:
+ processes_linking_deleted_libraries[path][library] += 1
+ else:
+ processes_linking_deleted_libraries[path][library] = 1
+ except EnvironmentError as e:
+ # Ignore non-existent files, since the files may have changed since
+ # we globbed.
+ if e.errno != errno.ENOENT:
+ sys.exit('Failed to open file: {0}'.format(path))
+
+ num_processes_per_library = {}
+
+ for process, library_count in processes_linking_deleted_libraries.items():
+ libraries_seen = set()
+ for library, count in library_count.items():
+ if library in libraries_seen:
+ continue
+
+ libraries_seen.add(library)
+ if library in num_processes_per_library:
+ num_processes_per_library[library] += 1
+ else:
+ num_processes_per_library[library] = 1
+
+ metric_name = 'node_processes_linking_deleted_libraries'
+ description = 'Count of running processes that link a deleted library'
+ print('# HELP {0} {1}'.format(metric_name, description))
+ print('# TYPE {0} gauge'.format(metric_name))
+
+ for library, count in num_processes_per_library.items():
+ dir_path, basename = os.path.split(library)
+ basename = basename.replace('"', '\\"')
+ dir_path = dir_path.replace('"', '\\"')
+ print('{0}{{library_path="{1}", library_name="{2}"}} {3}'.format(
+ metric_name,
+ dir_path,
+ basename,
+ count)
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/roles/monitoring/prometheus/exporter/node/files/smartmon b/roles/monitoring/prometheus/exporter/node/files/smartmon
new file mode 100644
index 00000000..1c39b492
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/node/files/smartmon
@@ -0,0 +1,391 @@
+#!/usr/bin/env python3
+import argparse
+import collections
+import csv
+import datetime
+import decimal
+import re
+import shlex
+import subprocess
+import sys
+
+device_info_re = re.compile(r'^(?P<k>[^:]+?)(?:(?:\sis|):)\s*(?P<v>.*)$')
+
+ata_error_count_re = re.compile(
+ r'^Error (\d+) \[\d+\] occurred', re.MULTILINE)
+
+self_test_re = re.compile(r'^SMART.*(PASSED|OK)$', re.MULTILINE)
+
+device_info_map = {
+ 'Vendor': 'vendor',
+ 'Product': 'product',
+ 'Revision': 'revision',
+ 'Logical Unit id': 'lun_id',
+ 'Model Family': 'model_family',
+ 'Device Model': 'device_model',
+ 'Serial Number': 'serial_number',
+ 'Firmware Version': 'firmware_version',
+}
+
+smart_attributes_whitelist = {
+ 'airflow_temperature_cel',
+ 'command_timeout',
+ 'current_pending_sector',
+ 'end_to_end_error',
+ 'erase_fail_count_total',
+ 'g_sense_error_rate',
+ 'hardware_ecc_recovered',
+ 'host_reads_mib',
+ 'host_reads_32mib',
+ 'host_writes_mib',
+ 'host_writes_32mib',
+ 'load_cycle_count',
+ 'media_wearout_indicator',
+ 'wear_leveling_count',
+ 'nand_writes_1gib',
+ 'offline_uncorrectable',
+ 'power_cycle_count',
+ 'power_on_hours',
+ 'program_fail_count',
+ 'raw_read_error_rate',
+ 'reallocated_event_count',
+ 'reallocated_sector_ct',
+ 'reported_uncorrect',
+ 'sata_downshift_count',
+ 'seek_error_rate',
+ 'spin_retry_count',
+ 'spin_up_time',
+ 'start_stop_count',
+ 'temperature_case',
+ 'temperature_celsius',
+ 'temperature_internal',
+ 'total_lbas_read',
+ 'total_lbas_written',
+ 'udma_crc_error_count',
+ 'unsafe_shutdown_count',
+ 'workld_host_reads_perc',
+ 'workld_media_wear_indic',
+ 'workload_minutes',
+}
+
+Metric = collections.namedtuple('Metric', 'name labels value')
+
+SmartAttribute = collections.namedtuple('SmartAttribute', [
+ 'id', 'name', 'flag', 'value', 'worst', 'threshold', 'type', 'updated',
+ 'when_failed', 'raw_value',
+])
+
+
+class Device(collections.namedtuple('DeviceBase', 'path opts')):
+ """Representation of a device as found by smartctl --scan output."""
+
+ @property
+ def type(self):
+ return self.opts.type
+
+ @property
+ def base_labels(self):
+ return {'device': self.path, 'disk': self.type.partition('+')[2] or '0'}
+
+ def smartctl_select(self):
+ return ['--device', self.type, self.path]
+
+
+def metric_key(metric, prefix=''):
+ return '{prefix}{metric.name}'.format(prefix=prefix, metric=metric)
+
+
+def metric_format(metric, prefix=''):
+ key = metric_key(metric, prefix)
+ labels = ','.join(
+ '{k}="{v}"'.format(k=k, v=v.replace('"', '\\"')) for k, v in metric.labels.items())
+ value = decimal.Decimal(metric.value)
+
+ return '{key}{{{labels}}} {value}'.format(
+ key=key, labels=labels, value=value)
+
+
+def metric_print_meta(metric, prefix=''):
+ key = metric_key(metric, prefix)
+ print('# HELP {key} SMART metric {metric.name}'.format(
+ key=key, metric=metric))
+ print('# TYPE {key} gauge'.format(key=key))
+
+
+def metric_print(metric, prefix=''):
+ print(metric_format(metric, prefix))
+
+
+def smart_ctl(*args, check=True):
+ """Wrapper around invoking the smartctl binary.
+
+ Returns:
+ (str) Data piped to stdout by the smartctl subprocess.
+ """
+ return subprocess.run(
+ ['smartctl', *args], stdout=subprocess.PIPE, check=check
+ ).stdout.decode('utf-8')
+
+
+def smart_ctl_version():
+ return smart_ctl('-V').split('\n')[0].split()[1]
+
+
+def find_devices():
+ """Find SMART devices.
+
+ Yields:
+ (Device) Single device found by smartctl.
+ """
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-d', '--device', dest='type')
+
+ devices = smart_ctl('--scan-open')
+
+ for device in devices.split('\n'):
+ device = device.strip()
+ if not device:
+ continue
+
+ tokens = shlex.split(device, comments=True)
+ if not tokens:
+ continue
+
+ yield Device(tokens[0], parser.parse_args(tokens[1:]))
+
+
+def device_is_active(device):
+    """Returns whether the given device is currently active.
+
+ Args:
+ device: (Device) Device in question.
+
+ Returns:
+ (bool) True if the device is active and False otherwise.
+ """
+ try:
+ smart_ctl('--nocheck', 'standby', *device.smartctl_select())
+ except subprocess.CalledProcessError:
+ return False
+
+ return True
+
+
+def device_info(device):
+ """Query device for basic model information.
+
+ Args:
+ device: (Device) Device in question.
+
+ Returns:
+ (generator): Generator yielding:
+
+ key (str): Key describing the value.
+ value (str): Actual value.
+ """
+ info_lines = smart_ctl(
+ '--info', *device.smartctl_select()
+ ).strip().split('\n')[3:]
+
+ matches = (device_info_re.match(line) for line in info_lines)
+ return (m.groups() for m in matches if m is not None)
+
+
+def device_smart_capabilities(device):
+ """Returns SMART capabilities of the given device.
+
+ Args:
+ device: (Device) Device in question.
+
+ Returns:
+ (tuple): tuple containing:
+
+ (bool): True whenever SMART is available, False otherwise.
+ (bool): True whenever SMART is enabled, False otherwise.
+ """
+ groups = device_info(device)
+
+ state = {
+ g[1].split(' ', 1)[0]
+ for g in groups if g[0] == 'SMART support'}
+
+ smart_available = 'Available' in state
+ smart_enabled = 'Enabled' in state
+
+ return smart_available, smart_enabled
+
+
+def collect_device_info(device):
+ """Collect basic device information.
+
+ Args:
+ device: (Device) Device in question.
+
+ Yields:
+ (Metric) metrics describing general device information.
+ """
+ values = dict(device_info(device))
+ yield Metric('device_info', {
+ **device.base_labels,
+ **{v: values[k] for k, v in device_info_map.items() if k in values}
+ }, True)
+
+
+def collect_device_health_self_assessment(device):
+ """Collect metric about the device health self assessment.
+
+ Args:
+ device: (Device) Device in question.
+
+ Yields:
+ (Metric) Device health self assessment.
+ """
+ out = smart_ctl('--health', *device.smartctl_select(), check=False)
+
+ self_assessment_passed = bool(self_test_re.search(out))
+
+ yield Metric(
+ 'device_smart_healthy', device.base_labels, self_assessment_passed)
+
+
+def collect_ata_metrics(device):
+ # Fetch SMART attributes for the given device.
+ attributes = smart_ctl(
+ '--attributes', *device.smartctl_select()
+ )
+
+ # replace multiple occurrences of whitespace with a single whitespace
+ # so that the CSV Parser recognizes individual columns properly.
+ attributes = re.sub(r'[\t\x20]+', ' ', attributes)
+
+ # Turn smartctl output into a list of lines and skip to the table of
+ # SMART attributes.
+ attribute_lines = attributes.strip().split('\n')[7:]
+
+ # Some attributes have multiple IDs but have the same name. Don't
+ # yield attributes that already have been reported before.
+ seen = set()
+
+ reader = csv.DictReader(
+ (line.strip() for line in attribute_lines),
+ fieldnames=SmartAttribute._fields[:-1],
+ restkey=SmartAttribute._fields[-1], delimiter=' ')
+ for entry in reader:
+ # We're only interested in the SMART attributes that are
+ # whitelisted here.
+ entry['name'] = entry['name'].lower()
+ if entry['name'] not in smart_attributes_whitelist:
+ continue
+
+ # Ensure that only the numeric parts are fetched from the raw_value.
+ # Attributes such as 194 Temperature_Celsius reported by my SSD
+ # are in the format of "36 (Min/Max 24/40)" which can't be expressed
+ # properly as a prometheus metric.
+ m = re.match(r'^(\d+)', ' '.join(entry['raw_value']))
+ if not m:
+ continue
+ entry['raw_value'] = m.group(1)
+
+ # Some device models report "---" in the threshold value where most
+ # devices would report "000". We do the substitution here because
+        # downstream code expects values to be convertible to integer.
+ if entry['threshold'] == '---':
+ entry['threshold'] = '0'
+
+ if entry['name'] in smart_attributes_whitelist and entry['name'] not in seen:
+ labels = {
+ 'name': entry['name'],
+ **device.base_labels,
+ }
+
+ for col in 'value', 'worst', 'threshold', 'raw_value':
+ yield Metric(
+ 'attr_{col}'.format(col=col),
+ labels, entry[col])
+
+ seen.add(entry['name'])
+
+
+def collect_ata_error_count(device):
+ """Inspect the device error log and report the amount of entries.
+
+ Args:
+ device: (Device) Device in question.
+
+ Yields:
+ (Metric) Device error count.
+ """
+ error_log = smart_ctl(
+ '-l', 'xerror,1', *device.smartctl_select(), check=False)
+
+ m = ata_error_count_re.search(error_log)
+
+ error_count = m.group(1) if m is not None else 0
+
+ yield Metric('device_errors', device.base_labels, error_count)
+
+
+def collect_disks_smart_metrics(wakeup_disks):
+ now = int(datetime.datetime.utcnow().timestamp())
+
+ for device in find_devices():
+ yield Metric('smartctl_run', device.base_labels, now)
+
+ is_active = device_is_active(device)
+
+ yield Metric('device_active', device.base_labels, is_active)
+
+ # Skip further metrics collection to prevent the disk from
+ # spinning up.
+ if not is_active and not wakeup_disks:
+ continue
+
+ yield from collect_device_info(device)
+
+ smart_available, smart_enabled = device_smart_capabilities(device)
+
+ yield Metric(
+ 'device_smart_available', device.base_labels, smart_available)
+ yield Metric(
+ 'device_smart_enabled', device.base_labels, smart_enabled)
+
+ # Skip further metrics collection here if SMART is disabled
+ # on the device. Further smartctl invocations would fail
+ # anyways.
+ if not smart_available:
+ continue
+
+ yield from collect_device_health_self_assessment(device)
+
+ if device.type.startswith('sat'):
+ yield from collect_ata_metrics(device)
+
+ yield from collect_ata_error_count(device)
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-s', '--wakeup-disks', dest='wakeup_disks', action='store_true')
+ args = parser.parse_args(sys.argv[1:])
+
+ version_metric = Metric('smartctl_version', {
+ 'version': smart_ctl_version()
+ }, True)
+ metric_print_meta(version_metric, 'smartmon_')
+ metric_print(version_metric, 'smartmon_')
+
+ metrics = list(collect_disks_smart_metrics(args.wakeup_disks))
+ metrics.sort(key=lambda i: i.name)
+
+ previous_name = None
+ for m in metrics:
+ if m.name != previous_name:
+ metric_print_meta(m, 'smartmon_')
+
+ previous_name = m.name
+
+ metric_print(m, 'smartmon_')
+
+
+if __name__ == '__main__':
+ main()
diff --git a/roles/monitoring/prometheus/exporter/node/handlers/main.yml b/roles/monitoring/prometheus/exporter/node/handlers/main.yml
index 3e1b2000..56056ea6 100644
--- a/roles/monitoring/prometheus/exporter/node/handlers/main.yml
+++ b/roles/monitoring/prometheus/exporter/node/handlers/main.yml
@@ -4,8 +4,7 @@
name: prometheus-node-exporter
state: restarted
-- name: reload prometheus-exporter-exporter
+- name: reload nginx
service:
- name: prometheus-exporter-exporter
- ## TODO: implement reload once exporter_exporter supports this...
- state: restarted
+ name: nginx
+ state: reloaded
diff --git a/roles/monitoring/prometheus/exporter/node/tasks/main.yml b/roles/monitoring/prometheus/exporter/node/tasks/main.yml
index 8392e580..2811c759 100644
--- a/roles/monitoring/prometheus/exporter/node/tasks/main.yml
+++ b/roles/monitoring/prometheus/exporter/node/tasks/main.yml
@@ -1,7 +1,10 @@
---
+ ## TODO: pin version
- name: install apt packages
apt:
- name: prom-exporter-node
+ name:
+ - prom-exporter-node
+ - moreutils
state: present
- name: create directory for textfile collector
@@ -25,8 +28,25 @@
- name: register exporter
copy:
content: |
- method: http
- http:
- port: 9100
- dest: /etc/prometheus/exporter/enabled/node.yml
- notify: reload prometheus-exporter-exporter
+ location = /node {
+ proxy_pass http://127.0.0.1:9100/metrics;
+ }
+ dest: /etc/prometheus/exporter/node.locations
+ notify: reload nginx
+
+- name: create directory for textfile collector scripts
+ file:
+ path: /usr/local/share/prometheus-node-exporter
+ state: directory
+
+- name: install the apt textfile collector script
+ when: ansible_pkg_mgr == "apt"
+ vars:
+ textfile_collector_name: "apt"
+ include_tasks: textfile_collector_script.yml
+
+- name: install all other textfile collector scripts
+ loop: "{{ prometheus_exporter_node_textfile_collector_scripts }}"
+ loop_control:
+ loop_var: textfile_collector_name
+ include_tasks: textfile_collector_script.yml
diff --git a/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_script.yml b/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_script.yml
new file mode 100644
index 00000000..80390a15
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/node/tasks/textfile_collector_script.yml
@@ -0,0 +1,24 @@
+---
+- name: install the collector script
+ copy:
+ src: "{{ textfile_collector_name }}"
+ dest: "/usr/local/share/prometheus-node-exporter/{{ textfile_collector_name }}"
+ mode: 0755
+
+- name: install systemd service units
+ loop:
+ - service
+ - timer
+ template:
+ src: "textfile-collector-scripts/{{ textfile_collector_name }}.{{ item }}.j2"
+ dest: "/etc/systemd/system/prometheus-node-exporter_{{ textfile_collector_name }}.{{ item }}"
+
+- name: make sure the systemd timer is enabled and started
+ systemd:
+ daemon_reload: yes
+ name: "prometheus-node-exporter_{{ textfile_collector_name }}.timer"
+ state: started
+ enabled: yes
+
+
+## TODO: install deps for textfile collectors: i.e. smartmontools for collector smartmon
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.service.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.service.j2
new file mode 100644
index 00000000..7eca94fb
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.service.j2
@@ -0,0 +1,31 @@
+[Unit]
+Description=Prometheus node exporter textfile collector apt
+
+[Service]
+Type=oneshot
+Environment=TMPDIR=/var/lib/prometheus-node-exporter/textfile-collector
+ExecStart=bash -c "/usr/local/share/prometheus-node-exporter/apt | sponge /var/lib/prometheus-node-exporter/textfile-collector/apt.prom"
+
+# systemd hardening-options
+AmbientCapabilities=
+CapabilityBoundingSet=
+DeviceAllow=/dev/null rw
+DevicePolicy=strict
+LockPersonality=true
+MemoryDenyWriteExecute=true
+NoNewPrivileges=true
+PrivateDevices=true
+PrivateTmp=true
+ProtectControlGroups=true
+ProtectHome=true
+ProtectKernelModules=true
+ProtectKernelTunables=true
+ProtectSystem=strict
+ReadWritePaths=/var/lib/prometheus-node-exporter/textfile-collector
+RemoveIPC=true
+RestrictNamespaces=true
+RestrictRealtime=true
+SystemCallArchitectures=native
+
+[Install]
+WantedBy=multi-user.target
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.timer.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.timer.j2
new file mode 100644
index 00000000..dc473749
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/apt.timer.j2
@@ -0,0 +1,9 @@
+[Unit]
+Description=Prometheus node exporter textfile collector apt
+
+[Timer]
+OnBootSec=10s
+OnUnitActiveSec=15min
+
+[Install]
+WantedBy=timers.target
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.service.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.service.j2
new file mode 100644
index 00000000..7b15e558
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.service.j2
@@ -0,0 +1,31 @@
+[Unit]
+Description=Prometheus node exporter textfile collector deleted-libraries
+
+[Service]
+Type=oneshot
+Environment=TMPDIR=/var/lib/prometheus-node-exporter/textfile-collector
+ExecStart=bash -c "/usr/local/share/prometheus-node-exporter/deleted-libraries | sponge /var/lib/prometheus-node-exporter/textfile-collector/deleted-libraries.prom"
+
+# systemd hardening-options
+AmbientCapabilities=CAP_SYS_PTRACE
+CapabilityBoundingSet=CAP_SYS_PTRACE
+DeviceAllow=/dev/null rw
+DevicePolicy=strict
+LockPersonality=true
+MemoryDenyWriteExecute=true
+NoNewPrivileges=true
+PrivateDevices=true
+PrivateTmp=true
+ProtectControlGroups=true
+ProtectHome=true
+ProtectKernelModules=true
+ProtectKernelTunables=true
+ProtectSystem=strict
+ReadWritePaths=/var/lib/prometheus-node-exporter/textfile-collector
+RemoveIPC=true
+RestrictNamespaces=true
+RestrictRealtime=true
+SystemCallArchitectures=native
+
+[Install]
+WantedBy=multi-user.target
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.timer.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.timer.j2
new file mode 100644
index 00000000..c09acecf
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/deleted-libraries.timer.j2
@@ -0,0 +1,9 @@
+[Unit]
+Description=Prometheus node exporter textfile collector deleted-libraries
+
+[Timer]
+OnBootSec=20s
+OnUnitActiveSec=15min
+
+[Install]
+WantedBy=timers.target
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.service.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.service.j2
new file mode 100644
index 00000000..fc7c9f3f
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.service.j2
@@ -0,0 +1,29 @@
+[Unit]
+Description=Prometheus node exporter textfile collector smartmon
+
+[Service]
+Type=oneshot
+Environment=TMPDIR=/var/lib/prometheus-node-exporter/textfile-collector
+Environment=LC_NUMERIC=C
+ExecStart=bash -c "/usr/local/share/prometheus-node-exporter/smartmon | sponge /var/lib/prometheus-node-exporter/textfile-collector/smartmon.prom"
+
+# systemd hardening-options
+AmbientCapabilities=
+CapabilityBoundingSet=
+LockPersonality=true
+MemoryDenyWriteExecute=true
+NoNewPrivileges=true
+PrivateTmp=true
+ProtectControlGroups=true
+ProtectHome=true
+ProtectKernelModules=true
+ProtectKernelTunables=true
+ProtectSystem=strict
+ReadWritePaths=/var/lib/prometheus-node-exporter/textfile-collector
+RemoveIPC=true
+RestrictNamespaces=true
+RestrictRealtime=true
+SystemCallArchitectures=native
+
+[Install]
+WantedBy=multi-user.target
diff --git a/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.timer.j2 b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.timer.j2
new file mode 100644
index 00000000..576f5a9f
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/node/templates/textfile-collector-scripts/smartmon.timer.j2
@@ -0,0 +1,9 @@
+[Unit]
+Description=Prometheus node exporter textfile collector smartmon
+
+[Timer]
+OnBootSec=30s
+OnUnitActiveSec=15min
+
+[Install]
+WantedBy=timers.target
diff --git a/roles/monitoring/prometheus/exporter/nut/handlers/main.yml b/roles/monitoring/prometheus/exporter/nut/handlers/main.yml
index 6e10f43b..edd87ed5 100644
--- a/roles/monitoring/prometheus/exporter/nut/handlers/main.yml
+++ b/roles/monitoring/prometheus/exporter/nut/handlers/main.yml
@@ -4,8 +4,7 @@
name: prometheus-nut-exporter
state: restarted
-- name: reload prometheus-exporter-exporter
+- name: reload nginx
service:
- name: prometheus-exporter-exporter
- ## TODO: implement reload once exporter_exporter supports this...
- state: restarted
+ name: nginx
+ state: reloaded
diff --git a/roles/monitoring/prometheus/exporter/nut/tasks/main.yml b/roles/monitoring/prometheus/exporter/nut/tasks/main.yml
index 519ac7a0..f602472d 100644
--- a/roles/monitoring/prometheus/exporter/nut/tasks/main.yml
+++ b/roles/monitoring/prometheus/exporter/nut/tasks/main.yml
@@ -1,4 +1,5 @@
---
+ ## TODO: pin version
- name: install apt packages
apt:
name: prom-exporter-nut
@@ -20,9 +21,11 @@
- name: register exporter
copy:
content: |
- method: http
- http:
- port: 9199
- path: /ups_metrics
- dest: /etc/prometheus/exporter/enabled/nut.yml
- notify: reload prometheus-exporter-exporter
+ location = /nut {
+ proxy_pass http://127.0.0.1:9199/metrics;
+ }
+ location = /nut/ups {
+ proxy_pass http://127.0.0.1:9199/ups_metrics;
+ }
+ dest: /etc/prometheus/exporter/nut.locations
+ notify: reload nginx
diff --git a/roles/monitoring/prometheus/server/defaults/main/main.yml b/roles/monitoring/prometheus/server/defaults/main/main.yml
index 95b9da6d..1e0ccf78 100644
--- a/roles/monitoring/prometheus/server/defaults/main/main.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/main.yml
@@ -5,9 +5,8 @@
prometheus_server_retention: "15d"
-prometheus_server_jobs_generic:
+prometheus_server_jobs:
- node
-prometheus_server_jobs_special: []
#prometheus_server_jobs_extra: |
# - job_name: ...
@@ -16,9 +15,11 @@ prometheus_server_rules:
node: "{{ prometheus_server_rules_node + prometheus_server_rules_node_extra }}"
openwrt: "{{ prometheus_server_rules_openwrt + prometheus_server_rules_node_extra }}"
nut: "{{ prometheus_server_rules_nut + prometheus_server_rules_nut_extra }}"
- "blackbox-ping": "{{ prometheus_server_rules_blackbox_ping + prometheus_server_rules_blackbox_ping_extra }}"
- "blackbox-https": "{{ prometheus_server_rules_blackbox_https + prometheus_server_rules_blackbox_https_extra }}"
- "blackbox-ssh": "{{ prometheus_server_rules_blackbox_ssh + prometheus_server_rules_blackbox_ssh_extra }}"
+ nut/ups: "{{ prometheus_server_rules_nut__ups + prometheus_server_rules_nut__ups_extra }}"
+ blackbox: "{{ prometheus_server_rules_blackbox + prometheus_server_rules_blackbox_extra }}"
+ blackbox/probe: "{{ prometheus_server_rules_blackbox__probe + prometheus_server_rules_blackbox__probe_extra }}"
+ ipmi: "{{ prometheus_server_rules_ipmi + prometheus_server_rules_ipmi_extra }}"
+ ipmi/remote: "{{ prometheus_server_rules_ipmi__remote + prometheus_server_rules_ipmi__remote_extra }}"
# prometheus_server_alertmanager:
# url: "127.0.0.1:9093"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-https.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-https.yml
deleted file mode 100644
index bb806075..00000000
--- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-https.yml
+++ /dev/null
@@ -1,3 +0,0 @@
----
-prometheus_server_rules_blackbox_https_extra: []
-prometheus_server_rules_blackbox_https: []
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ping.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ping.yml
deleted file mode 100644
index 56c122f5..00000000
--- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ping.yml
+++ /dev/null
@@ -1,3 +0,0 @@
----
-prometheus_server_rules_blackbox_ping_extra: []
-prometheus_server_rules_blackbox_ping: []
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ssh.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ssh.yml
deleted file mode 100644
index 727d2292..00000000
--- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ssh.yml
+++ /dev/null
@@ -1,3 +0,0 @@
----
-prometheus_server_rules_blackbox_ssh_extra: []
-prometheus_server_rules_blackbox_ssh: []
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml
new file mode 100644
index 00000000..99f2e83c
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml
@@ -0,0 +1,3 @@
+---
+prometheus_server_rules_blackbox_extra: []
+prometheus_server_rules_blackbox: []
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml
new file mode 100644
index 00000000..9f9d2292
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml
@@ -0,0 +1,74 @@
+---
+prometheus_server_rules_blackbox__probe_extra: []
+prometheus_server_rules_blackbox__probe:
+ - alert: BlackboxProbeFailed
+ expr: probe_success == 0
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: Blackbox probe failed (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Probe failed\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxSlowProbe
+ expr: avg_over_time(probe_duration_seconds[1m]) > 1
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: Blackbox slow probe (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Blackbox probe took more than 1s to complete\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxSslCertificateWillExpireSoon
+ expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "SSL certificate expires in 30 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxSslCertificateWillExpireSoon
+ expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "SSL certificate expires in 3 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxSslCertificateExpired
+ expr: probe_ssl_earliest_cert_expiry - time() <= 0
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: Blackbox SSL certificate expired (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "SSL certificate has expired already\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxProbeHttpFailure
+ expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: Blackbox probe HTTP failure (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "HTTP status code is not 200-399\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxProbeSlowHttp
+ expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: Blackbox probe slow HTTP (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "HTTP request took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxProbeSlowPing
+ expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: Blackbox probe slow ping (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Blackbox ping took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml
new file mode 100644
index 00000000..41dcd7e9
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml
@@ -0,0 +1,4 @@
+---
+prometheus_server_rules_ipmi_extra: []
+prometheus_server_rules_ipmi: []
+## TODO: add common IPMI alert rules
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml
new file mode 100644
index 00000000..1f9338ea
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml
@@ -0,0 +1,4 @@
+---
+prometheus_server_rules_ipmi__remote_extra: []
+prometheus_server_rules_ipmi__remote: []
+## TODO: add remote-IPMI specific alert rules
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
index ab7317ac..55641534 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
@@ -92,6 +92,15 @@ prometheus_server_rules_node:
summary: Host CPU steal noisy neighbor (instance {{ '{{' }} $labels.instance {{ '}}' }})
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+ - alert: HostSystemdNotRunning
+ expr: node_systemd_system_running == 0
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host systemd is not in running state (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "systemd is not in running state.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
- alert: HostSystemdServiceCrashed
expr: node_systemd_unit_state{state="failed"} == 1
for: 0m
@@ -99,7 +108,7 @@ prometheus_server_rules_node:
severity: warning
annotations:
summary: Host systemd service crashed (instance {{ '{{' }} $labels.instance {{ '}}' }})
- description: "systemd service crashed\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+ description: "The systemd service unit {{ '{{' }} $labels.name {{ '}}' }} is in failed state.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
- alert: HostPhysicalComponentTooHot
expr: node_hwmon_temp_celsius > 75
@@ -217,3 +226,30 @@ prometheus_server_rules_node:
annotations:
summary: Host clock not synchronising (instance {{ '{{' }} $labels.instance {{ '}}' }})
description: "Clock not synchronising.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: AptUpgradesPending
+ expr: sum by (instance) (apt_upgrades_pending) > 0
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host has upgradeable packages (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Host {{ '{{' }} $labels.instance {{ '}}' }} has {{ '{{' }} $value {{ '}}' }} upgradable packages.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: AptAutoremovePending
+ expr: sum by (instance) (apt_autoremove_pending) > 0
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host has packages that can be autoremoved (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Host {{ '{{' }} $labels.instance {{ '}}' }} has {{ '{{' }} $value {{ '}}' }} packages that can be autoremoved.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: HostNeedsRebooting
+ expr: node_reboot_required > 0
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
      summary: Host must be rebooted (instance {{ '{{' }} $labels.instance {{ '}}' }})
      description: "Host {{ '{{' }} $labels.instance {{ '}}' }} must be rebooted for security updates to take effect.\n  VALUE = {{ '{{' }} $value {{ '}}' }}\n  LABELS = {{ '{{' }} $labels {{ '}}' }}"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml b/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml
new file mode 100644
index 00000000..150a507e
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml
@@ -0,0 +1,4 @@
+---
+prometheus_server_rules_nut__ups_extra: []
+prometheus_server_rules_nut__ups: []
+## TODO: add NUT/UPS alert rules
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml b/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml
index 88d84f31..04b178f1 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml
@@ -1,3 +1,4 @@
---
prometheus_server_rules_openwrt_extra: []
prometheus_server_rules_openwrt: []
+## TODO: add openwrt specific alert rules
diff --git a/roles/monitoring/prometheus/server/filter_plugins/prometheus.py b/roles/monitoring/prometheus/server/filter_plugins/prometheus.py
index 5a8722c2..d91ef619 100644
--- a/roles/monitoring/prometheus/server/filter_plugins/prometheus.py
+++ b/roles/monitoring/prometheus/server/filter_plugins/prometheus.py
@@ -6,38 +6,31 @@ from functools import partial
from ansible import errors
-def prometheus_generic_job_targets(hostvars, jobs, targets):
+def prometheus_job_targets(hostvars, jobs, targets):
try:
result = []
for job in jobs:
for target in targets:
- enabled = job in hostvars[target]['prometheus_exporters_default'] or job in hostvars[target]['prometheus_exporters_extra']
- result.append({'job': job, 'instance': target, 'enabled': enabled})
+ multitarget_config_varname = 'prometheus_job_multitarget_' + job.replace('-', '_').replace('/', '__')
+ if multitarget_config_varname in hostvars[target]:
+ for exporter_hostname, configs in hostvars[target][multitarget_config_varname].items():
+ for config in configs:
+ result.append({'job': job, 'instance': config['instance'], 'enabled': True,
+ 'exporter_hostname': exporter_hostname, 'config': config})
+
+ else:
+ enabled = job in hostvars[target]['prometheus_exporters_default'] or job in hostvars[target]['prometheus_exporters_extra']
+ result.append({'job': job, 'instance': target, 'enabled': enabled})
return result
except Exception as e:
- raise errors.AnsibleFilterError("prometheus_generic_job_targets(): %s" % str(e))
-
-
-def prometheus_special_job_targets(hostvars, jobs, targets):
- try:
- result = []
- for job in jobs:
- for target in targets:
- config_varname = 'prometheus_special_job_' + job.replace('-', '_')
- if config_varname in hostvars[target]:
- for config in hostvars[target][config_varname]:
- result.append({'job': job, 'instance': config['instance'], 'config': config})
- return result
- except Exception as e:
- raise errors.AnsibleFilterError("prometheus_special_job_targets(): %s" % str(e))
+ raise errors.AnsibleFilterError("prometheus_job_targets(): %s" % str(e))
class FilterModule(object):
''' prometheus filters '''
filter_map = {
- 'prometheus_generic_job_targets': prometheus_generic_job_targets,
- 'prometheus_special_job_targets': prometheus_special_job_targets,
+ 'prometheus_job_targets': prometheus_job_targets,
}
def filters(self):
diff --git a/roles/monitoring/prometheus/server/tasks/main.yml b/roles/monitoring/prometheus/server/tasks/main.yml
index d0ccd8af..16167c9c 100644
--- a/roles/monitoring/prometheus/server/tasks/main.yml
+++ b/roles/monitoring/prometheus/server/tasks/main.yml
@@ -13,6 +13,7 @@
include_role:
name: "storage/{{ prometheus_server_storage.type }}/volume"
+ ## TODO: pin version
- name: install apt packages
apt:
name: prom-server
@@ -37,50 +38,52 @@
- name: create configuration directories
loop:
- - jobs
- rules
- targets
file:
path: "/etc/prometheus/{{ item }}"
state: directory
-- name: create sub-directroy for all exporter types in jobs directory
- loop: "{{ prometheus_server_jobs_generic + prometheus_server_jobs_special }}"
+- name: create sub-directories for all jobs in targets directory
+ loop: "{{ prometheus_server_jobs }}"
file:
- path: "/etc/prometheus/jobs/{{ item }}"
+ path: "/etc/prometheus/targets/{{ item }}"
state: directory
-- name: generate generic targets config
- loop: "{{ prometheus_zone_targets }}"
- loop_control:
- loop_var: target
- template:
- src: targets/generic.yml.j2
- dest: "/etc/prometheus/targets/{{ target }}.yml"
- notify: reload prometheus
+- name: enable/disable job targets
+ vars:
+ job_targets: "{{ hostvars | prometheus_job_targets(prometheus_server_jobs, prometheus_zone_targets) }}"
+ block:
+ - name: install files for enabled targets
+ loop: "{{ job_targets }}"
+ loop_control:
+ loop_var: target
+ label: "{{ target.job }} -> {{ target.instance }}"
+ when: target.enabled
+ template:
+ src: "{{ lookup('first_found', {'paths': ['templates/targets'], 'files': [target.job + '.yml.j2', 'generic.yml.j2']}) }}"
+ dest: "/etc/prometheus/targets/{{ target.job }}/{{ target.instance }}.yml"
+ notify: reload prometheus
-- name: enable targets for generic jobs
- loop: "{{ hostvars | prometheus_generic_job_targets(prometheus_server_jobs_generic, prometheus_zone_targets) }}"
- loop_control:
- label: "{{ item.job }} -> {{ item.instance }}"
- file:
- src: "{{ item.enabled | ternary('/etc/prometheus/targets/' + item.instance + '.yml', omit) }}"
- path: "/etc/prometheus/jobs/{{ item.job }}/{{ item.instance }}.yml"
- state: "{{ item.enabled | ternary('link', 'absent') }}"
- notify: reload prometheus
+ - name: remove files for disabled targets
+ loop: "{{ job_targets }}"
+ loop_control:
+ loop_var: target
+ label: "{{ target.job }} -> {{ target.instance }}"
+ when: not target.enabled
+ file:
+ path: "/etc/prometheus/targets/{{ target.job }}/{{ target.instance }}.yml"
+ state: absent
+ notify: reload prometheus
-- name: enable targets for special jobs
- loop: "{{ hostvars | prometheus_special_job_targets(prometheus_server_jobs_special, prometheus_zone_targets) }}"
- loop_control:
- loop_var: target
- label: "{{ target.job }} -> {{ target.instance }}"
- template:
- src: "targets/{{ target.job }}.yml.j2"
- dest: "/etc/prometheus/jobs/{{ target.job }}/{{ target.instance }}.yml"
- notify: reload prometheus
+- name: create sub-directories for all jobs in rules directory
+ loop: "{{ prometheus_server_jobs | select('match', '.*/.*') | map('dirname') | unique }}"
+ file:
+ path: "/etc/prometheus/rules/{{ item }}"
+ state: directory
- name: generate rules files for all jobs
- loop: "{{ (prometheus_server_jobs_generic + prometheus_server_jobs_special) | union(['prometheus']) }}"
+ loop: "{{ prometheus_server_jobs | union(['prometheus']) }}"
template:
src: rules.yml.j2
dest: "/etc/prometheus/rules/{{ item }}.yml"
diff --git a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-https.j2 b/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-https.j2
deleted file mode 100644
index 0a6d2dfa..00000000
--- a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-https.j2
+++ /dev/null
@@ -1,14 +0,0 @@
- - job_name: '{{ job }}'
- metrics_path: /proxy
- params:
- module:
- - blackbox
- - http_tls_2xx
- scheme: https
- tls_config:
- ca_file: /etc/ssl/prometheus/ca-crt.pem
- cert_file: /etc/ssl/prometheus/server/scrape-crt.pem
- key_file: /etc/ssl/prometheus/server/scrape-key.pem
- file_sd_configs:
- - files:
- - "/etc/prometheus/jobs/{{ job }}/*.yml"
diff --git a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ping.j2 b/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ping.j2
deleted file mode 100644
index 7f4f12df..00000000
--- a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ping.j2
+++ /dev/null
@@ -1,14 +0,0 @@
- - job_name: '{{ job }}'
- metrics_path: /proxy
- params:
- module:
- - blackbox
- - icmp
- scheme: https
- tls_config:
- ca_file: /etc/ssl/prometheus/ca-crt.pem
- cert_file: /etc/ssl/prometheus/server/scrape-crt.pem
- key_file: /etc/ssl/prometheus/server/scrape-key.pem
- file_sd_configs:
- - files:
- - "/etc/prometheus/jobs/{{ job }}/*.yml"
diff --git a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ssh.j2 b/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ssh.j2
deleted file mode 100644
index 18381e32..00000000
--- a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ssh.j2
+++ /dev/null
@@ -1,14 +0,0 @@
- - job_name: '{{ job }}'
- metrics_path: /proxy
- params:
- module:
- - blackbox
- - ssh_banner
- scheme: https
- tls_config:
- ca_file: /etc/ssl/prometheus/ca-crt.pem
- cert_file: /etc/ssl/prometheus/server/scrape-crt.pem
- key_file: /etc/ssl/prometheus/server/scrape-key.pem
- file_sd_configs:
- - files:
- - "/etc/prometheus/jobs/{{ job }}/*.yml"
diff --git a/roles/monitoring/prometheus/server/templates/job-snippets/generic.j2 b/roles/monitoring/prometheus/server/templates/jobs/generic.j2
index 87992eeb..65a95007 100644
--- a/roles/monitoring/prometheus/server/templates/job-snippets/generic.j2
+++ b/roles/monitoring/prometheus/server/templates/jobs/generic.j2
@@ -1,8 +1,5 @@
- job_name: '{{ job }}'
- metrics_path: /proxy
- params:
- module:
- - {{ job }}
+ metrics_path: /{{ job }}
scheme: https
tls_config:
ca_file: /etc/ssl/prometheus/ca-crt.pem
@@ -10,4 +7,4 @@
key_file: /etc/ssl/prometheus/server/scrape-key.pem
file_sd_configs:
- files:
- - "/etc/prometheus/jobs/{{ job }}/*.yml"
+ - "/etc/prometheus/targets/{{ job }}/*.yml"
diff --git a/roles/monitoring/prometheus/server/templates/jobs/node.j2 b/roles/monitoring/prometheus/server/templates/jobs/node.j2
new file mode 100644
index 00000000..1b14e1f6
--- /dev/null
+++ b/roles/monitoring/prometheus/server/templates/jobs/node.j2
@@ -0,0 +1,17 @@
+ - job_name: '{{ job }}'
+ metrics_path: /{{ job }}
+ scheme: https
+ tls_config:
+ ca_file: /etc/ssl/prometheus/ca-crt.pem
+ cert_file: /etc/ssl/prometheus/server/scrape-crt.pem
+ key_file: /etc/ssl/prometheus/server/scrape-key.pem
+ file_sd_configs:
+ - files:
+ - "/etc/prometheus/targets/{{ job }}/*.yml"
+ metric_relabel_configs:
+ - source_labels: [ "mountpoint" ]
+ regex: ".*/\\.snapshot/.*"
+ action: drop
+ - source_labels: [ "__name__", "state" ]
+ regex: "node_systemd_unit_state;(activating|deactivating|inactive)"
+ action: drop
diff --git a/roles/monitoring/prometheus/server/templates/job-snippets/openwrt.j2 b/roles/monitoring/prometheus/server/templates/jobs/openwrt.j2
index 493a4fdb..e93f8be7 100644
--- a/roles/monitoring/prometheus/server/templates/job-snippets/openwrt.j2
+++ b/roles/monitoring/prometheus/server/templates/jobs/openwrt.j2
@@ -2,4 +2,4 @@
scheme: http
file_sd_configs:
- files:
- - "/etc/prometheus/jobs/{{ job }}/*.yml"
+ - "/etc/prometheus/targets/{{ job }}/*.yml"
diff --git a/roles/monitoring/prometheus/server/templates/prometheus.yml.j2 b/roles/monitoring/prometheus/server/templates/prometheus.yml.j2
index 4cfcc498..e73ca354 100644
--- a/roles/monitoring/prometheus/server/templates/prometheus.yml.j2
+++ b/roles/monitoring/prometheus/server/templates/prometheus.yml.j2
@@ -6,6 +6,9 @@ global:
rule_files:
- /etc/prometheus/rules/*.yml
+{% for subdir in (prometheus_server_jobs | select('match', '.*/.*') | map('dirname') | unique) %}
+ - /etc/prometheus/rules/{{ subdir }}/*.yml
+{% endfor %}
{% if prometheus_server_alertmanager is defined %}
alerting:
@@ -25,7 +28,7 @@ scrape_configs:
static_configs:
- targets: ['localhost:9090']
labels:
- instance: "{{ inventory_hostname }}"
+ instance: '{{ inventory_hostname }}'
{% if prometheus_server_alertmanager is defined %}
- job_name: 'alertmanager'
@@ -35,9 +38,9 @@ scrape_configs:
static_configs:
- targets: ['{{ prometheus_server_alertmanager.url }}']
{% endif %}
-{% for job in (prometheus_server_jobs_generic + prometheus_server_jobs_special) %}
+{% for job in (prometheus_server_jobs) %}
-{% include 'job-snippets/' + (lookup('first_found', {'paths': ['templates/job-snippets'], 'files': [job + '.j2', 'generic.j2']}) | basename) %}{{ '' }}
+{% include lookup('first_found', {'paths': ['templates/jobs'], 'files': [job + '.j2', 'generic.j2']}) | relpath(template_fullpath | dirname) %}{{ '' }}
{% endfor %}
{% if prometheus_server_jobs_extra is defined %}
diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox-https.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox-https.yml.j2
deleted file mode 100644
index e843de36..00000000
--- a/roles/monitoring/prometheus/server/templates/targets/blackbox-https.yml.j2
+++ /dev/null
@@ -1,4 +0,0 @@
-- targets: [ "{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}" ]
- labels:
- instance: "{{ target.instance }}"
- __param_target: {{ target.config.address }}
diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox-ping.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox-ping.yml.j2
deleted file mode 100644
index e843de36..00000000
--- a/roles/monitoring/prometheus/server/templates/targets/blackbox-ping.yml.j2
+++ /dev/null
@@ -1,4 +0,0 @@
-- targets: [ "{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}" ]
- labels:
- instance: "{{ target.instance }}"
- __param_target: {{ target.config.address }}
diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox-ssh.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox-ssh.yml.j2
deleted file mode 100644
index e843de36..00000000
--- a/roles/monitoring/prometheus/server/templates/targets/blackbox-ssh.yml.j2
+++ /dev/null
@@ -1,4 +0,0 @@
-- targets: [ "{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}" ]
- labels:
- instance: "{{ target.instance }}"
- __param_target: {{ target.config.address }}
diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox/probe.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox/probe.yml.j2
new file mode 100644
index 00000000..4e336873
--- /dev/null
+++ b/roles/monitoring/prometheus/server/templates/targets/blackbox/probe.yml.j2
@@ -0,0 +1,5 @@
+- targets: [ '{{ hostvars[target.exporter_hostname].prometheus_scrape_endpoint }}' ]
+ labels:
+ instance: '{{ target.instance }}'
+ __param_target: '{{ target.config.target }}'
+ __param_module: '{{ target.config.module }}'
diff --git a/roles/monitoring/prometheus/server/templates/targets/generic.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/generic.yml.j2
index e83b6bf4..6591362b 100644
--- a/roles/monitoring/prometheus/server/templates/targets/generic.yml.j2
+++ b/roles/monitoring/prometheus/server/templates/targets/generic.yml.j2
@@ -1,3 +1,3 @@
-- targets: [ "{{ hostvars[target].prometheus_scrape_endpoint }}" ]
+- targets: [ '{{ hostvars[target.instance].prometheus_scrape_endpoint }}' ]
labels:
- instance: "{{ target }}"
+ instance: '{{ target.instance }}'
diff --git a/roles/monitoring/prometheus/server/templates/targets/ipmi/remote.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/ipmi/remote.yml.j2
new file mode 100644
index 00000000..4e336873
--- /dev/null
+++ b/roles/monitoring/prometheus/server/templates/targets/ipmi/remote.yml.j2
@@ -0,0 +1,5 @@
+- targets: [ '{{ hostvars[target.exporter_hostname].prometheus_scrape_endpoint }}' ]
+ labels:
+ instance: '{{ target.instance }}'
+ __param_target: '{{ target.config.target }}'
+ __param_module: '{{ target.config.module }}'
diff --git a/roles/monitoring/prometheus/server/templates/targets/nut.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/nut.yml.j2
deleted file mode 100644
index da3de3d7..00000000
--- a/roles/monitoring/prometheus/server/templates/targets/nut.yml.j2
+++ /dev/null
@@ -1,17 +0,0 @@
-- targets: [ "{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}" ]
- labels:
- instance: "{{ target.instance }}"
- __param_ups: {{ target.config.ups }}
- __param_server: {{ target.config.server | default('127.0.0.1') }}
-{% if 'username' in target.config %}
- __param_username: {{ target.config.username }}
-{% endif %}
-{% if 'password' in target.config %}
- __param_password: {{ target.config.password }}
-{% endif %}
-{% if 'variables' in target.config %}
- __param_variables: {{ target.config.variables }}
-{% endif %}
-{% if 'statuses' in target.config %}
- __param_statuses: {{ target.config.statuses }}
-{% endif %}
diff --git a/roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2
new file mode 100644
index 00000000..c60077c7
--- /dev/null
+++ b/roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2
@@ -0,0 +1,17 @@
+- targets: [ '{{ hostvars[target.exporter_hostname].prometheus_scrape_endpoint }}' ]
+ labels:
+ instance: '{{ target.instance }}'
+ __param_ups: '{{ target.config.ups }}'
+ __param_server: '{{ target.config.server | default('127.0.0.1') }}'
+{% if 'username' in target.config %}
+ __param_username: '{{ target.config.username }}'
+{% endif %}
+{% if 'password' in target.config %}
+ __param_password: '{{ target.config.password }}'
+{% endif %}
+{% if 'variables' in target.config %}
+ __param_variables: '{{ target.config.variables }}'
+{% endif %}
+{% if 'statuses' in target.config %}
+ __param_statuses: '{{ target.config.statuses }}'
+{% endif %}