summary refs log tree commit diff
path: root/roles/monitoring
diff options
context:
space:
mode:
author Christian Pointner <equinox@spreadspace.org> 2021-09-26 03:32:47 +0200
committer Christian Pointner <equinox@spreadspace.org> 2021-09-26 03:32:47 +0200
commit cc89d6d4211aa5aec8e5bef8c854d4929c337887 (patch)
tree cfad00b79ed9b475cf50ec85ea18fded07efb99e /roles/monitoring
parent move away from exporter-exporter in favor of nginx (diff)
improved prometheus multitarget support
Diffstat (limited to 'roles/monitoring')
-rw-r--r-- roles/monitoring/prometheus/exporter/blackbox/tasks/main.yml | 3
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/main.yml | 4
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml | 46
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml | 20
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml | 11
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml | 74
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ssh.yml | 3
-rw-r--r-- roles/monitoring/prometheus/server/filter_plugins/prometheus.py | 10
-rw-r--r-- roles/monitoring/prometheus/server/tasks/main.yml | 2
-rw-r--r-- roles/monitoring/prometheus/server/templates/jobs/blackbox/https.j2 | 13
-rw-r--r-- roles/monitoring/prometheus/server/templates/jobs/blackbox/ping.j2 | 13
-rw-r--r-- roles/monitoring/prometheus/server/templates/jobs/blackbox/ssh.j2 | 13
-rw-r--r-- roles/monitoring/prometheus/server/templates/jobs/nut/ups.j2 | 10
-rw-r--r-- roles/monitoring/prometheus/server/templates/targets/blackbox/https.yml.j2 | 4
-rw-r--r-- roles/monitoring/prometheus/server/templates/targets/blackbox/ping.yml.j2 | 4
-rw-r--r-- roles/monitoring/prometheus/server/templates/targets/blackbox/probe.yml.j2 | 5
-rw-r--r-- roles/monitoring/prometheus/server/templates/targets/blackbox/ssh.yml.j2 | 4
-rw-r--r-- roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2 | 2
18 files changed, 92 insertions, 149 deletions
diff --git a/roles/monitoring/prometheus/exporter/blackbox/tasks/main.yml b/roles/monitoring/prometheus/exporter/blackbox/tasks/main.yml
index f9793df6..c4cabfce 100644
--- a/roles/monitoring/prometheus/exporter/blackbox/tasks/main.yml
+++ b/roles/monitoring/prometheus/exporter/blackbox/tasks/main.yml
@@ -33,6 +33,9 @@
copy:
content: |
location = /blackbox {
+ proxy_pass http://127.0.0.1:9115/metrics;
+ }
+ location = /blackbox/probe {
proxy_pass http://127.0.0.1:9115/probe;
}
dest: /etc/prometheus/exporter/blackbox.locations
diff --git a/roles/monitoring/prometheus/server/defaults/main/main.yml b/roles/monitoring/prometheus/server/defaults/main/main.yml
index 09cd150c..7781fd69 100644
--- a/roles/monitoring/prometheus/server/defaults/main/main.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/main.yml
@@ -17,9 +17,7 @@ prometheus_server_rules:
nut: "{{ prometheus_server_rules_nut + prometheus_server_rules_nut_extra }}"
nut/ups: "{{ prometheus_server_rules_nut__ups + prometheus_server_rules_nut__ups_extra }}"
blackbox: "{{ prometheus_server_rules_blackbox + prometheus_server_rules_blackbox_extra }}"
- blackbox/ping: "{{ prometheus_server_rules_blackbox__ping + prometheus_server_rules_blackbox__ping_extra }}"
- blackbox/https: "{{ prometheus_server_rules_blackbox__https + prometheus_server_rules_blackbox__https_extra }}"
- blackbox/ssh: "{{ prometheus_server_rules_blackbox__ssh + prometheus_server_rules_blackbox__ssh_extra }}"
+ blackbox/probe: "{{ prometheus_server_rules_blackbox__probe + prometheus_server_rules_blackbox__probe_extra }}"
# prometheus_server_alertmanager:
# url: "127.0.0.1:9093"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml
index d5c1fd42..99f2e83c 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml
@@ -1,47 +1,3 @@
---
prometheus_server_rules_blackbox_extra: []
-prometheus_server_rules_blackbox:
- - alert: BlackboxProbeFailed
- expr: probe_success == 0
- for: 0m
- labels:
- severity: critical
- annotations:
- summary: Blackbox probe failed (instance {{ '{{' }} $labels.instance {{ '}}' }})
- description: "Probe failed\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
-
- - alert: BlackboxSlowProbe
- expr: avg_over_time(probe_duration_seconds[1m]) > 1
- for: 1m
- labels:
- severity: warning
- annotations:
- summary: Blackbox slow probe (instance {{ '{{' }} $labels.instance {{ '}}' }})
- description: "Blackbox probe took more than 1s to complete\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
-
- - alert: BlackboxSslCertificateWillExpireSoon
- expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
- for: 0m
- labels:
- severity: warning
- annotations:
- summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }})
- description: "SSL certificate expires in 30 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
-
- - alert: BlackboxSslCertificateWillExpireSoon
- expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
- for: 0m
- labels:
- severity: critical
- annotations:
- summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }})
- description: "SSL certificate expires in 3 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
-
- - alert: BlackboxSslCertificateExpired
- expr: probe_ssl_earliest_cert_expiry - time() <= 0
- for: 0m
- labels:
- severity: critical
- annotations:
- summary: Blackbox SSL certificate expired (instance {{ '{{' }} $labels.instance {{ '}}' }})
- description: "SSL certificate has expired already\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+prometheus_server_rules_blackbox: []
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml
deleted file mode 100644
index 140e3b4f..00000000
--- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml
+++ /dev/null
@@ -1,20 +0,0 @@
----
-prometheus_server_rules_blackbox__https_extra: []
-prometheus_server_rules_blackbox__https:
- - alert: BlackboxProbeHttpFailure
- expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
- for: 0m
- labels:
- severity: critical
- annotations:
- summary: Blackbox probe HTTP failure (instance {{ '{{' }} $labels.instance {{ '}}' }})
- description: "HTTP status code is not 200-399\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
-
- - alert: BlackboxProbeSlowHttp
- expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
- for: 1m
- labels:
- severity: warning
- annotations:
- summary: Blackbox probe slow HTTP (instance {{ '{{' }} $labels.instance {{ '}}' }})
- description: "HTTP request took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml
deleted file mode 100644
index cc87b6b1..00000000
--- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml
+++ /dev/null
@@ -1,11 +0,0 @@
----
-prometheus_server_rules_blackbox__ping_extra: []
-prometheus_server_rules_blackbox__ping:
- - alert: BlackboxProbeSlowPing
- expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
- for: 1m
- labels:
- severity: warning
- annotations:
- summary: Blackbox probe slow ping (instance {{ '{{' }} $labels.instance {{ '}}' }})
- description: "Blackbox ping took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml
new file mode 100644
index 00000000..9f9d2292
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml
@@ -0,0 +1,74 @@
+---
+prometheus_server_rules_blackbox__probe_extra: []
+prometheus_server_rules_blackbox__probe:
+ - alert: BlackboxProbeFailed
+ expr: probe_success == 0
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: Blackbox probe failed (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Probe failed\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxSlowProbe
+ expr: avg_over_time(probe_duration_seconds[1m]) > 1
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: Blackbox slow probe (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Blackbox probe took more than 1s to complete\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxSslCertificateWillExpireSoon
+ expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "SSL certificate expires in 30 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxSslCertificateWillExpireSoon
+ expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "SSL certificate expires in 3 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxSslCertificateExpired
+ expr: probe_ssl_earliest_cert_expiry - time() <= 0
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: Blackbox SSL certificate expired (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "SSL certificate has expired already\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxProbeHttpFailure
+ expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: Blackbox probe HTTP failure (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "HTTP status code is not 200-399\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxProbeSlowHttp
+ expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: Blackbox probe slow HTTP (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "HTTP request took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: BlackboxProbeSlowPing
+ expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: Blackbox probe slow ping (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Blackbox ping took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ssh.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ssh.yml
deleted file mode 100644
index 8e717c41..00000000
--- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ssh.yml
+++ /dev/null
@@ -1,3 +0,0 @@
----
-prometheus_server_rules_blackbox__ssh_extra: []
-prometheus_server_rules_blackbox__ssh: []
diff --git a/roles/monitoring/prometheus/server/filter_plugins/prometheus.py b/roles/monitoring/prometheus/server/filter_plugins/prometheus.py
index 1443e837..d91ef619 100644
--- a/roles/monitoring/prometheus/server/filter_plugins/prometheus.py
+++ b/roles/monitoring/prometheus/server/filter_plugins/prometheus.py
@@ -11,10 +11,12 @@ def prometheus_job_targets(hostvars, jobs, targets):
result = []
for job in jobs:
for target in targets:
- special_config_varname = 'prometheus_job_' + job.replace('-', '_').replace('/', '__')
- if special_config_varname in hostvars[target]:
- for config in hostvars[target][special_config_varname]:
- result.append({'job': job, 'instance': config['instance'], 'config': config, 'enabled': True})
+ multitarget_config_varname = 'prometheus_job_multitarget_' + job.replace('-', '_').replace('/', '__')
+ if multitarget_config_varname in hostvars[target]:
+ for exporter_hostname, configs in hostvars[target][multitarget_config_varname].items():
+ for config in configs:
+ result.append({'job': job, 'instance': config['instance'], 'enabled': True,
+ 'exporter_hostname': exporter_hostname, 'config': config})
else:
enabled = job in hostvars[target]['prometheus_exporters_default'] or job in hostvars[target]['prometheus_exporters_extra']
diff --git a/roles/monitoring/prometheus/server/tasks/main.yml b/roles/monitoring/prometheus/server/tasks/main.yml
index c0928cc3..16167c9c 100644
--- a/roles/monitoring/prometheus/server/tasks/main.yml
+++ b/roles/monitoring/prometheus/server/tasks/main.yml
@@ -83,7 +83,7 @@
state: directory
- name: generate rules files for all jobs
- loop: "{{ prometheus_server_jobs | union(['prometheus']) | union(prometheus_server_jobs | select('match', '.*/.*') | map('dirname') | unique) }}"
+ loop: "{{ prometheus_server_jobs | union(['prometheus']) }}"
template:
src: rules.yml.j2
dest: "/etc/prometheus/rules/{{ item }}.yml"
diff --git a/roles/monitoring/prometheus/server/templates/jobs/blackbox/https.j2 b/roles/monitoring/prometheus/server/templates/jobs/blackbox/https.j2
deleted file mode 100644
index 86ff88dd..00000000
--- a/roles/monitoring/prometheus/server/templates/jobs/blackbox/https.j2
+++ /dev/null
@@ -1,13 +0,0 @@
- - job_name: '{{ job }}'
- metrics_path: /blackbox
- params:
- module:
- - http_tls_2xx
- scheme: https
- tls_config:
- ca_file: /etc/ssl/prometheus/ca-crt.pem
- cert_file: /etc/ssl/prometheus/server/scrape-crt.pem
- key_file: /etc/ssl/prometheus/server/scrape-key.pem
- file_sd_configs:
- - files:
- - "/etc/prometheus/targets/{{ job }}/*.yml"
diff --git a/roles/monitoring/prometheus/server/templates/jobs/blackbox/ping.j2 b/roles/monitoring/prometheus/server/templates/jobs/blackbox/ping.j2
deleted file mode 100644
index 2d3889d2..00000000
--- a/roles/monitoring/prometheus/server/templates/jobs/blackbox/ping.j2
+++ /dev/null
@@ -1,13 +0,0 @@
- - job_name: '{{ job }}'
- metrics_path: /blackbox
- params:
- module:
- - icmp
- scheme: https
- tls_config:
- ca_file: /etc/ssl/prometheus/ca-crt.pem
- cert_file: /etc/ssl/prometheus/server/scrape-crt.pem
- key_file: /etc/ssl/prometheus/server/scrape-key.pem
- file_sd_configs:
- - files:
- - "/etc/prometheus/targets/{{ job }}/*.yml"
diff --git a/roles/monitoring/prometheus/server/templates/jobs/blackbox/ssh.j2 b/roles/monitoring/prometheus/server/templates/jobs/blackbox/ssh.j2
deleted file mode 100644
index 97565673..00000000
--- a/roles/monitoring/prometheus/server/templates/jobs/blackbox/ssh.j2
+++ /dev/null
@@ -1,13 +0,0 @@
- - job_name: '{{ job }}'
- metrics_path: /blackbox
- params:
- module:
- - ssh_banner
- scheme: https
- tls_config:
- ca_file: /etc/ssl/prometheus/ca-crt.pem
- cert_file: /etc/ssl/prometheus/server/scrape-crt.pem
- key_file: /etc/ssl/prometheus/server/scrape-key.pem
- file_sd_configs:
- - files:
- - "/etc/prometheus/targets/{{ job }}/*.yml"
diff --git a/roles/monitoring/prometheus/server/templates/jobs/nut/ups.j2 b/roles/monitoring/prometheus/server/templates/jobs/nut/ups.j2
deleted file mode 100644
index 0cf4ae4e..00000000
--- a/roles/monitoring/prometheus/server/templates/jobs/nut/ups.j2
+++ /dev/null
@@ -1,10 +0,0 @@
- - job_name: '{{ job }}'
- metrics_path: /nut/ups
- scheme: https
- tls_config:
- ca_file: /etc/ssl/prometheus/ca-crt.pem
- cert_file: /etc/ssl/prometheus/server/scrape-crt.pem
- key_file: /etc/ssl/prometheus/server/scrape-key.pem
- file_sd_configs:
- - files:
- - "/etc/prometheus/targets/{{ job }}/*.yml"
diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox/https.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox/https.yml.j2
deleted file mode 100644
index 29c89590..00000000
--- a/roles/monitoring/prometheus/server/templates/targets/blackbox/https.yml.j2
+++ /dev/null
@@ -1,4 +0,0 @@
-- targets: [ '{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}' ]
- labels:
- instance: '{{ target.instance }}'
- __param_target: '{{ target.config.address }}'
diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox/ping.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox/ping.yml.j2
deleted file mode 100644
index 29c89590..00000000
--- a/roles/monitoring/prometheus/server/templates/targets/blackbox/ping.yml.j2
+++ /dev/null
@@ -1,4 +0,0 @@
-- targets: [ '{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}' ]
- labels:
- instance: '{{ target.instance }}'
- __param_target: '{{ target.config.address }}'
diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox/probe.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox/probe.yml.j2
new file mode 100644
index 00000000..4e336873
--- /dev/null
+++ b/roles/monitoring/prometheus/server/templates/targets/blackbox/probe.yml.j2
@@ -0,0 +1,5 @@
+- targets: [ '{{ hostvars[target.exporter_hostname].prometheus_scrape_endpoint }}' ]
+ labels:
+ instance: '{{ target.instance }}'
+ __param_target: '{{ target.config.target }}'
+ __param_module: '{{ target.config.module }}'
diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox/ssh.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox/ssh.yml.j2
deleted file mode 100644
index 29c89590..00000000
--- a/roles/monitoring/prometheus/server/templates/targets/blackbox/ssh.yml.j2
+++ /dev/null
@@ -1,4 +0,0 @@
-- targets: [ '{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}' ]
- labels:
- instance: '{{ target.instance }}'
- __param_target: '{{ target.config.address }}'
diff --git a/roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2
index 6003cd46..c60077c7 100644
--- a/roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2
+++ b/roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2
@@ -1,4 +1,4 @@
-- targets: [ '{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}' ]
+- targets: [ '{{ hostvars[target.exporter_hostname].prometheus_scrape_endpoint }}' ]
labels:
instance: '{{ target.instance }}'
__param_ups: '{{ target.config.ups }}'