summary refs log tree commit diff
path: root/roles/monitoring/prometheus/server
diff options
context:
space:
mode:
Diffstat (limited to 'roles/monitoring/prometheus/server')
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/main.yml 11
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_blackbox-https.yml 3
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ping.yml 3
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ssh.yml 3
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml 3
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml 74
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml 4
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml 4
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_node.yml 38
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml 4
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml 1
-rw-r--r-- roles/monitoring/prometheus/server/filter_plugins/prometheus.py 33
-rw-r--r-- roles/monitoring/prometheus/server/tasks/main.yml 65
-rw-r--r-- roles/monitoring/prometheus/server/templates/job-snippets/blackbox-https.j2 14
-rw-r--r-- roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ping.j2 14
-rw-r--r-- roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ssh.j2 14
-rw-r--r-- roles/monitoring/prometheus/server/templates/jobs/generic.j2 (renamed from roles/monitoring/prometheus/server/templates/job-snippets/generic.j2) 7
-rw-r--r-- roles/monitoring/prometheus/server/templates/jobs/node.j2 17
-rw-r--r-- roles/monitoring/prometheus/server/templates/jobs/openwrt.j2 (renamed from roles/monitoring/prometheus/server/templates/job-snippets/openwrt.j2) 2
-rw-r--r-- roles/monitoring/prometheus/server/templates/prometheus.yml.j2 9
-rw-r--r-- roles/monitoring/prometheus/server/templates/targets/blackbox-https.yml.j2 4
-rw-r--r-- roles/monitoring/prometheus/server/templates/targets/blackbox-ping.yml.j2 4
-rw-r--r-- roles/monitoring/prometheus/server/templates/targets/blackbox-ssh.yml.j2 4
-rw-r--r-- roles/monitoring/prometheus/server/templates/targets/blackbox/probe.yml.j2 5
-rw-r--r-- roles/monitoring/prometheus/server/templates/targets/generic.yml.j2 4
-rw-r--r-- roles/monitoring/prometheus/server/templates/targets/ipmi/remote.yml.j2 5
-rw-r--r-- roles/monitoring/prometheus/server/templates/targets/nut.yml.j2 17
-rw-r--r-- roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2 17
28 files changed, 235 insertions, 148 deletions
diff --git a/roles/monitoring/prometheus/server/defaults/main/main.yml b/roles/monitoring/prometheus/server/defaults/main/main.yml
index 95b9da6d..1e0ccf78 100644
--- a/roles/monitoring/prometheus/server/defaults/main/main.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/main.yml
@@ -5,9 +5,8 @@
prometheus_server_retention: "15d"
-prometheus_server_jobs_generic:
+prometheus_server_jobs:
- node
-prometheus_server_jobs_special: []
#prometheus_server_jobs_extra: |
# - job_name: ...
@@ -16,9 +15,11 @@ prometheus_server_rules:
node: "{{ prometheus_server_rules_node + prometheus_server_rules_node_extra }}"
openwrt: "{{ prometheus_server_rules_openwrt + prometheus_server_rules_node_extra }}"
nut: "{{ prometheus_server_rules_nut + prometheus_server_rules_nut_extra }}"
- "blackbox-ping": "{{ prometheus_server_rules_blackbox_ping + prometheus_server_rules_blackbox_ping_extra }}"
- "blackbox-https": "{{ prometheus_server_rules_blackbox_https + prometheus_server_rules_blackbox_https_extra }}"
- "blackbox-ssh": "{{ prometheus_server_rules_blackbox_ssh + prometheus_server_rules_blackbox_ssh_extra }}"
+ nut/ups: "{{ prometheus_server_rules_nut__ups + prometheus_server_rules_nut__ups_extra }}"
+ blackbox: "{{ prometheus_server_rules_blackbox + prometheus_server_rules_blackbox_extra }}"
+ blackbox/probe: "{{ prometheus_server_rules_blackbox__probe + prometheus_server_rules_blackbox__probe_extra }}"
+ ipmi: "{{ prometheus_server_rules_ipmi + prometheus_server_rules_ipmi_extra }}"
+ ipmi/remote: "{{ prometheus_server_rules_ipmi__remote + prometheus_server_rules_ipmi__remote_extra }}"
# prometheus_server_alertmanager:
# url: "127.0.0.1:9093"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-https.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-https.yml
deleted file mode 100644
index bb806075..00000000
--- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-https.yml
+++ /dev/null
@@ -1,3 +0,0 @@
----
-prometheus_server_rules_blackbox_https_extra: []
-prometheus_server_rules_blackbox_https: []
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ping.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ping.yml
deleted file mode 100644
index 56c122f5..00000000
--- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ping.yml
+++ /dev/null
@@ -1,3 +0,0 @@
----
-prometheus_server_rules_blackbox_ping_extra: []
-prometheus_server_rules_blackbox_ping: []
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ssh.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ssh.yml
deleted file mode 100644
index 727d2292..00000000
--- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ssh.yml
+++ /dev/null
@@ -1,3 +0,0 @@
----
-prometheus_server_rules_blackbox_ssh_extra: []
-prometheus_server_rules_blackbox_ssh: []
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml
new file mode 100644
index 00000000..99f2e83c
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml
@@ -0,0 +1,3 @@
+---
+prometheus_server_rules_blackbox_extra: []
+prometheus_server_rules_blackbox: []
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml
new file mode 100644
index 00000000..9f9d2292
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml
@@ -0,0 +1,74 @@
+---
+prometheus_server_rules_blackbox__probe_extra: []
+prometheus_server_rules_blackbox__probe:
+ - alert: BlackboxProbeFailed
+ expr: probe_success == 0
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: Blackbox probe failed (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Probe failed\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxSlowProbe
+ expr: avg_over_time(probe_duration_seconds[1m]) > 1
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: Blackbox slow probe (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Blackbox probe took more than 1s to complete\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxSslCertificateWillExpireSoon
+ expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "SSL certificate expires in 30 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxSslCertificateWillExpireSoon
+ expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "SSL certificate expires in 3 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxSslCertificateExpired
+ expr: probe_ssl_earliest_cert_expiry - time() <= 0
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: Blackbox SSL certificate expired (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "SSL certificate has expired already\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxProbeHttpFailure
+ expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: Blackbox probe HTTP failure (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "HTTP status code is not 200-399\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxProbeSlowHttp
+ expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: Blackbox probe slow HTTP (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "HTTP request took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxProbeSlowPing
+ expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: Blackbox probe slow ping (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Blackbox ping took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml
new file mode 100644
index 00000000..41dcd7e9
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml
@@ -0,0 +1,4 @@
+---
+prometheus_server_rules_ipmi_extra: []
+prometheus_server_rules_ipmi: []
+## TODO: add common IPMI alert rules
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml
new file mode 100644
index 00000000..1f9338ea
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml
@@ -0,0 +1,4 @@
+---
+prometheus_server_rules_ipmi__remote_extra: []
+prometheus_server_rules_ipmi__remote: []
+## TODO: add remote-IPMI specific alert rules
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
index ab7317ac..55641534 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
@@ -92,6 +92,15 @@ prometheus_server_rules_node:
summary: Host CPU steal noisy neighbor (instance {{ '{{' }} $labels.instance {{ '}}' }})
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+ - alert: HostSystemdNotRunning
+ expr: node_systemd_system_running == 0
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host systemd is not in running state (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "systemd is not in running state.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
- alert: HostSystemdServiceCrashed
expr: node_systemd_unit_state{state="failed"} == 1
for: 0m
@@ -99,7 +108,7 @@ prometheus_server_rules_node:
severity: warning
annotations:
summary: Host systemd service crashed (instance {{ '{{' }} $labels.instance {{ '}}' }})
- description: "systemd service crashed\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+ description: "The systemd service unit {{ '{{' }} $labels.name {{ '}}' }} is in failed state.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
- alert: HostPhysicalComponentTooHot
expr: node_hwmon_temp_celsius > 75
@@ -217,3 +226,30 @@ prometheus_server_rules_node:
annotations:
summary: Host clock not synchronising (instance {{ '{{' }} $labels.instance {{ '}}' }})
description: "Clock not synchronising.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: AptUpgradesPending
+ expr: sum by (instance) (apt_upgrades_pending) > 0
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host has upgradeable packages (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Host {{ '{{' }} $labels.instance {{ '}}' }} has {{ '{{' }} $value {{ '}}' }} upgradable packages.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: AptAutoremovePending
+ expr: sum by (instance) (apt_autoremove_pending) > 0
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host has packages that can be autoremoved (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Host {{ '{{' }} $labels.instance {{ '}}' }} has {{ '{{' }} $value {{ '}}' }} packages that can be autoremoved.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: HostNeedsRebooting
+ expr: node_reboot_required > 0
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host must be rebooted (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Host {{ '{{' }} $labels.instance {{ '}}' }} must be rebooted for security updates to take effect.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml b/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml
new file mode 100644
index 00000000..150a507e
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml
@@ -0,0 +1,4 @@
+---
+prometheus_server_rules_nut__ups_extra: []
+prometheus_server_rules_nut__ups: []
+## TODO: add NUT/UPS alert rules
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml b/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml
index 88d84f31..04b178f1 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml
@@ -1,3 +1,4 @@
---
prometheus_server_rules_openwrt_extra: []
prometheus_server_rules_openwrt: []
+## TODO: add openwrt specific alert rules
diff --git a/roles/monitoring/prometheus/server/filter_plugins/prometheus.py b/roles/monitoring/prometheus/server/filter_plugins/prometheus.py
index 5a8722c2..d91ef619 100644
--- a/roles/monitoring/prometheus/server/filter_plugins/prometheus.py
+++ b/roles/monitoring/prometheus/server/filter_plugins/prometheus.py
@@ -6,38 +6,31 @@ from functools import partial
from ansible import errors
-def prometheus_generic_job_targets(hostvars, jobs, targets):
+def prometheus_job_targets(hostvars, jobs, targets):
try:
result = []
for job in jobs:
for target in targets:
- enabled = job in hostvars[target]['prometheus_exporters_default'] or job in hostvars[target]['prometheus_exporters_extra']
- result.append({'job': job, 'instance': target, 'enabled': enabled})
+ multitarget_config_varname = 'prometheus_job_multitarget_' + job.replace('-', '_').replace('/', '__')
+ if multitarget_config_varname in hostvars[target]:
+ for exporter_hostname, configs in hostvars[target][multitarget_config_varname].items():
+ for config in configs:
+ result.append({'job': job, 'instance': config['instance'], 'enabled': True,
+ 'exporter_hostname': exporter_hostname, 'config': config})
+
+ else:
+ enabled = job in hostvars[target]['prometheus_exporters_default'] or job in hostvars[target]['prometheus_exporters_extra']
+ result.append({'job': job, 'instance': target, 'enabled': enabled})
return result
except Exception as e:
- raise errors.AnsibleFilterError("prometheus_generic_job_targets(): %s" % str(e))
-
-
-def prometheus_special_job_targets(hostvars, jobs, targets):
- try:
- result = []
- for job in jobs:
- for target in targets:
- config_varname = 'prometheus_special_job_' + job.replace('-', '_')
- if config_varname in hostvars[target]:
- for config in hostvars[target][config_varname]:
- result.append({'job': job, 'instance': config['instance'], 'config': config})
- return result
- except Exception as e:
- raise errors.AnsibleFilterError("prometheus_special_job_targets(): %s" % str(e))
+ raise errors.AnsibleFilterError("prometheus_job_targets(): %s" % str(e))
class FilterModule(object):
''' prometheus filters '''
filter_map = {
- 'prometheus_generic_job_targets': prometheus_generic_job_targets,
- 'prometheus_special_job_targets': prometheus_special_job_targets,
+ 'prometheus_job_targets': prometheus_job_targets,
}
def filters(self):
diff --git a/roles/monitoring/prometheus/server/tasks/main.yml b/roles/monitoring/prometheus/server/tasks/main.yml
index d0ccd8af..16167c9c 100644
--- a/roles/monitoring/prometheus/server/tasks/main.yml
+++ b/roles/monitoring/prometheus/server/tasks/main.yml
@@ -13,6 +13,7 @@
include_role:
name: "storage/{{ prometheus_server_storage.type }}/volume"
+ ## TODO: pin version
- name: install apt packages
apt:
name: prom-server
@@ -37,50 +38,52 @@
- name: create configuration directories
loop:
- - jobs
- rules
- targets
file:
path: "/etc/prometheus/{{ item }}"
state: directory
-- name: create sub-directroy for all exporter types in jobs directory
- loop: "{{ prometheus_server_jobs_generic + prometheus_server_jobs_special }}"
+- name: create sub-directories for all jobs in targets directory
+ loop: "{{ prometheus_server_jobs }}"
file:
- path: "/etc/prometheus/jobs/{{ item }}"
+ path: "/etc/prometheus/targets/{{ item }}"
state: directory
-- name: generate generic targets config
- loop: "{{ prometheus_zone_targets }}"
- loop_control:
- loop_var: target
- template:
- src: targets/generic.yml.j2
- dest: "/etc/prometheus/targets/{{ target }}.yml"
- notify: reload prometheus
+- name: enable/disable job targets
+ vars:
+ job_targets: "{{ hostvars | prometheus_job_targets(prometheus_server_jobs, prometheus_zone_targets) }}"
+ block:
+ - name: install files for enabled targets
+ loop: "{{ job_targets }}"
+ loop_control:
+ loop_var: target
+ label: "{{ target.job }} -> {{ target.instance }}"
+ when: target.enabled
+ template:
+ src: "{{ lookup('first_found', {'paths': ['templates/targets'], 'files': [target.job + '.yml.j2', 'generic.yml.j2']}) }}"
+ dest: "/etc/prometheus/targets/{{ target.job }}/{{ target.instance }}.yml"
+ notify: reload prometheus
-- name: enable targets for generic jobs
- loop: "{{ hostvars | prometheus_generic_job_targets(prometheus_server_jobs_generic, prometheus_zone_targets) }}"
- loop_control:
- label: "{{ item.job }} -> {{ item.instance }}"
- file:
- src: "{{ item.enabled | ternary('/etc/prometheus/targets/' + item.instance + '.yml', omit) }}"
- path: "/etc/prometheus/jobs/{{ item.job }}/{{ item.instance }}.yml"
- state: "{{ item.enabled | ternary('link', 'absent') }}"
- notify: reload prometheus
+ - name: remove files for disabled targets
+ loop: "{{ job_targets }}"
+ loop_control:
+ loop_var: target
+ label: "{{ target.job }} -> {{ target.instance }}"
+ when: not target.enabled
+ file:
+ path: "/etc/prometheus/targets/{{ target.job }}/{{ target.instance }}.yml"
+ state: absent
+ notify: reload prometheus
-- name: enable targets for special jobs
- loop: "{{ hostvars | prometheus_special_job_targets(prometheus_server_jobs_special, prometheus_zone_targets) }}"
- loop_control:
- loop_var: target
- label: "{{ target.job }} -> {{ target.instance }}"
- template:
- src: "targets/{{ target.job }}.yml.j2"
- dest: "/etc/prometheus/jobs/{{ target.job }}/{{ target.instance }}.yml"
- notify: reload prometheus
+- name: create sub-directories for all jobs in rules directory
+ loop: "{{ prometheus_server_jobs | select('match', '.*/.*') | map('dirname') | unique }}"
+ file:
+ path: "/etc/prometheus/rules/{{ item }}"
+ state: directory
- name: generate rules files for all jobs
- loop: "{{ (prometheus_server_jobs_generic + prometheus_server_jobs_special) | union(['prometheus']) }}"
+ loop: "{{ prometheus_server_jobs | union(['prometheus']) }}"
template:
src: rules.yml.j2
dest: "/etc/prometheus/rules/{{ item }}.yml"
diff --git a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-https.j2 b/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-https.j2
deleted file mode 100644
index 0a6d2dfa..00000000
--- a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-https.j2
+++ /dev/null
@@ -1,14 +0,0 @@
- - job_name: '{{ job }}'
- metrics_path: /proxy
- params:
- module:
- - blackbox
- - http_tls_2xx
- scheme: https
- tls_config:
- ca_file: /etc/ssl/prometheus/ca-crt.pem
- cert_file: /etc/ssl/prometheus/server/scrape-crt.pem
- key_file: /etc/ssl/prometheus/server/scrape-key.pem
- file_sd_configs:
- - files:
- - "/etc/prometheus/jobs/{{ job }}/*.yml"
diff --git a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ping.j2 b/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ping.j2
deleted file mode 100644
index 7f4f12df..00000000
--- a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ping.j2
+++ /dev/null
@@ -1,14 +0,0 @@
- - job_name: '{{ job }}'
- metrics_path: /proxy
- params:
- module:
- - blackbox
- - icmp
- scheme: https
- tls_config:
- ca_file: /etc/ssl/prometheus/ca-crt.pem
- cert_file: /etc/ssl/prometheus/server/scrape-crt.pem
- key_file: /etc/ssl/prometheus/server/scrape-key.pem
- file_sd_configs:
- - files:
- - "/etc/prometheus/jobs/{{ job }}/*.yml"
diff --git a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ssh.j2 b/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ssh.j2
deleted file mode 100644
index 18381e32..00000000
--- a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ssh.j2
+++ /dev/null
@@ -1,14 +0,0 @@
- - job_name: '{{ job }}'
- metrics_path: /proxy
- params:
- module:
- - blackbox
- - ssh_banner
- scheme: https
- tls_config:
- ca_file: /etc/ssl/prometheus/ca-crt.pem
- cert_file: /etc/ssl/prometheus/server/scrape-crt.pem
- key_file: /etc/ssl/prometheus/server/scrape-key.pem
- file_sd_configs:
- - files:
- - "/etc/prometheus/jobs/{{ job }}/*.yml"
diff --git a/roles/monitoring/prometheus/server/templates/job-snippets/generic.j2 b/roles/monitoring/prometheus/server/templates/jobs/generic.j2
index 87992eeb..65a95007 100644
--- a/roles/monitoring/prometheus/server/templates/job-snippets/generic.j2
+++ b/roles/monitoring/prometheus/server/templates/jobs/generic.j2
@@ -1,8 +1,5 @@
- job_name: '{{ job }}'
- metrics_path: /proxy
- params:
- module:
- - {{ job }}
+ metrics_path: /{{ job }}
scheme: https
tls_config:
ca_file: /etc/ssl/prometheus/ca-crt.pem
@@ -10,4 +7,4 @@
key_file: /etc/ssl/prometheus/server/scrape-key.pem
file_sd_configs:
- files:
- - "/etc/prometheus/jobs/{{ job }}/*.yml"
+ - "/etc/prometheus/targets/{{ job }}/*.yml"
diff --git a/roles/monitoring/prometheus/server/templates/jobs/node.j2 b/roles/monitoring/prometheus/server/templates/jobs/node.j2
new file mode 100644
index 00000000..1b14e1f6
--- /dev/null
+++ b/roles/monitoring/prometheus/server/templates/jobs/node.j2
@@ -0,0 +1,17 @@
+ - job_name: '{{ job }}'
+ metrics_path: /{{ job }}
+ scheme: https
+ tls_config:
+ ca_file: /etc/ssl/prometheus/ca-crt.pem
+ cert_file: /etc/ssl/prometheus/server/scrape-crt.pem
+ key_file: /etc/ssl/prometheus/server/scrape-key.pem
+ file_sd_configs:
+ - files:
+ - "/etc/prometheus/targets/{{ job }}/*.yml"
+ metric_relabel_configs:
+ - source_labels: [ "mountpoint" ]
+ regex: ".*/\\.snapshot/.*"
+ action: drop
+ - source_labels: [ "__name__", "state" ]
+ regex: "node_systemd_unit_state;(activating|deactivating|inactive)"
+ action: drop
diff --git a/roles/monitoring/prometheus/server/templates/job-snippets/openwrt.j2 b/roles/monitoring/prometheus/server/templates/jobs/openwrt.j2
index 493a4fdb..e93f8be7 100644
--- a/roles/monitoring/prometheus/server/templates/job-snippets/openwrt.j2
+++ b/roles/monitoring/prometheus/server/templates/jobs/openwrt.j2
@@ -2,4 +2,4 @@
scheme: http
file_sd_configs:
- files:
- - "/etc/prometheus/jobs/{{ job }}/*.yml"
+ - "/etc/prometheus/targets/{{ job }}/*.yml"
diff --git a/roles/monitoring/prometheus/server/templates/prometheus.yml.j2 b/roles/monitoring/prometheus/server/templates/prometheus.yml.j2
index 4cfcc498..e73ca354 100644
--- a/roles/monitoring/prometheus/server/templates/prometheus.yml.j2
+++ b/roles/monitoring/prometheus/server/templates/prometheus.yml.j2
@@ -6,6 +6,9 @@ global:
rule_files:
- /etc/prometheus/rules/*.yml
+{% for subdir in (prometheus_server_jobs | select('match', '.*/.*') | map('dirname') | unique) %}
+ - /etc/prometheus/rules/{{ subdir }}/*.yml
+{% endfor %}
{% if prometheus_server_alertmanager is defined %}
alerting:
@@ -25,7 +28,7 @@ scrape_configs:
static_configs:
- targets: ['localhost:9090']
labels:
- instance: "{{ inventory_hostname }}"
+ instance: '{{ inventory_hostname }}'
{% if prometheus_server_alertmanager is defined %}
- job_name: 'alertmanager'
@@ -35,9 +38,9 @@ scrape_configs:
static_configs:
- targets: ['{{ prometheus_server_alertmanager.url }}']
{% endif %}
-{% for job in (prometheus_server_jobs_generic + prometheus_server_jobs_special) %}
+{% for job in (prometheus_server_jobs) %}
-{% include 'job-snippets/' + (lookup('first_found', {'paths': ['templates/job-snippets'], 'files': [job + '.j2', 'generic.j2']}) | basename) %}{{ '' }}
+{% include lookup('first_found', {'paths': ['templates/jobs'], 'files': [job + '.j2', 'generic.j2']}) | relpath(template_fullpath | dirname) %}{{ '' }}
{% endfor %}
{% if prometheus_server_jobs_extra is defined %}
diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox-https.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox-https.yml.j2
deleted file mode 100644
index e843de36..00000000
--- a/roles/monitoring/prometheus/server/templates/targets/blackbox-https.yml.j2
+++ /dev/null
@@ -1,4 +0,0 @@
-- targets: [ "{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}" ]
- labels:
- instance: "{{ target.instance }}"
- __param_target: {{ target.config.address }}
diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox-ping.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox-ping.yml.j2
deleted file mode 100644
index e843de36..00000000
--- a/roles/monitoring/prometheus/server/templates/targets/blackbox-ping.yml.j2
+++ /dev/null
@@ -1,4 +0,0 @@
-- targets: [ "{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}" ]
- labels:
- instance: "{{ target.instance }}"
- __param_target: {{ target.config.address }}
diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox-ssh.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox-ssh.yml.j2
deleted file mode 100644
index e843de36..00000000
--- a/roles/monitoring/prometheus/server/templates/targets/blackbox-ssh.yml.j2
+++ /dev/null
@@ -1,4 +0,0 @@
-- targets: [ "{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}" ]
- labels:
- instance: "{{ target.instance }}"
- __param_target: {{ target.config.address }}
diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox/probe.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox/probe.yml.j2
new file mode 100644
index 00000000..4e336873
--- /dev/null
+++ b/roles/monitoring/prometheus/server/templates/targets/blackbox/probe.yml.j2
@@ -0,0 +1,5 @@
+- targets: [ '{{ hostvars[target.exporter_hostname].prometheus_scrape_endpoint }}' ]
+ labels:
+ instance: '{{ target.instance }}'
+ __param_target: '{{ target.config.target }}'
+ __param_module: '{{ target.config.module }}'
diff --git a/roles/monitoring/prometheus/server/templates/targets/generic.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/generic.yml.j2
index e83b6bf4..6591362b 100644
--- a/roles/monitoring/prometheus/server/templates/targets/generic.yml.j2
+++ b/roles/monitoring/prometheus/server/templates/targets/generic.yml.j2
@@ -1,3 +1,3 @@
-- targets: [ "{{ hostvars[target].prometheus_scrape_endpoint }}" ]
+- targets: [ '{{ hostvars[target.instance].prometheus_scrape_endpoint }}' ]
labels:
- instance: "{{ target }}"
+ instance: '{{ target.instance }}'
diff --git a/roles/monitoring/prometheus/server/templates/targets/ipmi/remote.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/ipmi/remote.yml.j2
new file mode 100644
index 00000000..4e336873
--- /dev/null
+++ b/roles/monitoring/prometheus/server/templates/targets/ipmi/remote.yml.j2
@@ -0,0 +1,5 @@
+- targets: [ '{{ hostvars[target.exporter_hostname].prometheus_scrape_endpoint }}' ]
+ labels:
+ instance: '{{ target.instance }}'
+ __param_target: '{{ target.config.target }}'
+ __param_module: '{{ target.config.module }}'
diff --git a/roles/monitoring/prometheus/server/templates/targets/nut.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/nut.yml.j2
deleted file mode 100644
index da3de3d7..00000000
--- a/roles/monitoring/prometheus/server/templates/targets/nut.yml.j2
+++ /dev/null
@@ -1,17 +0,0 @@
-- targets: [ "{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}" ]
- labels:
- instance: "{{ target.instance }}"
- __param_ups: {{ target.config.ups }}
- __param_server: {{ target.config.server | default('127.0.0.1') }}
-{% if 'username' in target.config %}
- __param_username: {{ target.config.username }}
-{% endif %}
-{% if 'password' in target.config %}
- __param_password: {{ target.config.password }}
-{% endif %}
-{% if 'variables' in target.config %}
- __param_variables: {{ target.config.variables }}
-{% endif %}
-{% if 'statuses' in target.config %}
- __param_statuses: {{ target.config.statuses }}
-{% endif %}
diff --git a/roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2
new file mode 100644
index 00000000..c60077c7
--- /dev/null
+++ b/roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2
@@ -0,0 +1,17 @@
+- targets: [ '{{ hostvars[target.exporter_hostname].prometheus_scrape_endpoint }}' ]
+ labels:
+ instance: '{{ target.instance }}'
+ __param_ups: '{{ target.config.ups }}'
+ __param_server: '{{ target.config.server | default('127.0.0.1') }}'
+{% if 'username' in target.config %}
+ __param_username: '{{ target.config.username }}'
+{% endif %}
+{% if 'password' in target.config %}
+ __param_password: '{{ target.config.password }}'
+{% endif %}
+{% if 'variables' in target.config %}
+ __param_variables: '{{ target.config.variables }}'
+{% endif %}
+{% if 'statuses' in target.config %}
+ __param_statuses: '{{ target.config.statuses }}'
+{% endif %}