From cc89d6d4211aa5aec8e5bef8c854d4929c337887 Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Sun, 26 Sep 2021 03:32:47 +0200 Subject: improved promethues multitarget support --- .../prometheus/exporter/blackbox/tasks/main.yml | 3 + .../prometheus/server/defaults/main/main.yml | 4 +- .../server/defaults/main/rules_blackbox.yml | 46 +------------- .../server/defaults/main/rules_blackbox__https.yml | 20 ------ .../server/defaults/main/rules_blackbox__ping.yml | 11 ---- .../server/defaults/main/rules_blackbox__probe.yml | 74 ++++++++++++++++++++++ .../server/defaults/main/rules_blackbox__ssh.yml | 3 - .../prometheus/server/filter_plugins/prometheus.py | 10 +-- roles/monitoring/prometheus/server/tasks/main.yml | 2 +- .../server/templates/jobs/blackbox/https.j2 | 13 ---- .../server/templates/jobs/blackbox/ping.j2 | 13 ---- .../server/templates/jobs/blackbox/ssh.j2 | 13 ---- .../prometheus/server/templates/jobs/nut/ups.j2 | 10 --- .../server/templates/targets/blackbox/https.yml.j2 | 4 -- .../server/templates/targets/blackbox/ping.yml.j2 | 4 -- .../server/templates/targets/blackbox/probe.yml.j2 | 5 ++ .../server/templates/targets/blackbox/ssh.yml.j2 | 4 -- .../server/templates/targets/nut/ups.yml.j2 | 2 +- 18 files changed, 92 insertions(+), 149 deletions(-) delete mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml delete mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml create mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml delete mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ssh.yml delete mode 100644 roles/monitoring/prometheus/server/templates/jobs/blackbox/https.j2 delete mode 100644 roles/monitoring/prometheus/server/templates/jobs/blackbox/ping.j2 delete mode 100644 roles/monitoring/prometheus/server/templates/jobs/blackbox/ssh.j2 delete mode 100644 roles/monitoring/prometheus/server/templates/jobs/nut/ups.j2 delete mode 100644 roles/monitoring/prometheus/server/templates/targets/blackbox/https.yml.j2 delete mode 100644 roles/monitoring/prometheus/server/templates/targets/blackbox/ping.yml.j2 create mode 100644 roles/monitoring/prometheus/server/templates/targets/blackbox/probe.yml.j2 delete mode 100644 roles/monitoring/prometheus/server/templates/targets/blackbox/ssh.yml.j2 (limited to 'roles/monitoring') diff --git a/roles/monitoring/prometheus/exporter/blackbox/tasks/main.yml b/roles/monitoring/prometheus/exporter/blackbox/tasks/main.yml index f9793df6..c4cabfce 100644 --- a/roles/monitoring/prometheus/exporter/blackbox/tasks/main.yml +++ b/roles/monitoring/prometheus/exporter/blackbox/tasks/main.yml @@ -33,6 +33,9 @@ copy: content: | location = /blackbox { + proxy_pass http://127.0.0.1:9115/metrics; + } + location = /blackbox/probe { proxy_pass http://127.0.0.1:9115/probe; } dest: /etc/prometheus/exporter/blackbox.locations diff --git a/roles/monitoring/prometheus/server/defaults/main/main.yml b/roles/monitoring/prometheus/server/defaults/main/main.yml index 09cd150c..7781fd69 100644 --- a/roles/monitoring/prometheus/server/defaults/main/main.yml +++ b/roles/monitoring/prometheus/server/defaults/main/main.yml @@ -17,9 +17,7 @@ prometheus_server_rules: nut: "{{ prometheus_server_rules_nut + prometheus_server_rules_nut_extra }}" nut/ups: "{{ prometheus_server_rules_nut__ups + prometheus_server_rules_nut__ups_extra }}" blackbox: "{{ prometheus_server_rules_blackbox + prometheus_server_rules_blackbox_extra }}" - blackbox/ping: "{{ prometheus_server_rules_blackbox__ping + prometheus_server_rules_blackbox__ping_extra }}" - blackbox/https: "{{ prometheus_server_rules_blackbox__https + prometheus_server_rules_blackbox__https_extra }}" - blackbox/ssh: "{{ prometheus_server_rules_blackbox__ssh + prometheus_server_rules_blackbox__ssh_extra }}" + blackbox/probe: "{{ prometheus_server_rules_blackbox__probe + prometheus_server_rules_blackbox__probe_extra }}" # prometheus_server_alertmanager: # url: "127.0.0.1:9093" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml index d5c1fd42..99f2e83c 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml @@ -1,47 +1,3 @@ --- prometheus_server_rules_blackbox_extra: [] -prometheus_server_rules_blackbox: - - alert: BlackboxProbeFailed - expr: probe_success == 0 - for: 0m - labels: - severity: critical - annotations: - summary: Blackbox probe failed (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "Probe failed\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - - - alert: BlackboxSlowProbe - expr: avg_over_time(probe_duration_seconds[1m]) > 1 - for: 1m - labels: - severity: warning - annotations: - summary: Blackbox slow probe (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "Blackbox probe took more than 1s to complete\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - - - alert: BlackboxSslCertificateWillExpireSoon - expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30 - for: 0m - labels: - severity: warning - annotations: - summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "SSL certificate expires in 30 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - - - alert: BlackboxSslCertificateWillExpireSoon - expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3 - for: 0m - labels: - severity: critical - annotations: - summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "SSL certificate expires in 3 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - - - alert: BlackboxSslCertificateExpired - expr: probe_ssl_earliest_cert_expiry - time() <= 0 - for: 0m - labels: - severity: critical - annotations: - summary: Blackbox SSL certificate expired (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "SSL certificate has expired already\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" +prometheus_server_rules_blackbox: [] diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml deleted file mode 100644 index 140e3b4f..00000000 --- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml +++ /dev/null @@ -1,20 +0,0 @@ ---- -prometheus_server_rules_blackbox__https_extra: [] -prometheus_server_rules_blackbox__https: - - alert: BlackboxProbeHttpFailure - expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400 - for: 0m - labels: - severity: critical - annotations: - summary: Blackbox probe HTTP failure (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "HTTP status code is not 200-399\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - - - alert: BlackboxProbeSlowHttp - expr: avg_over_time(probe_http_duration_seconds[1m]) > 1 - for: 1m - labels: - severity: warning - annotations: - summary: Blackbox probe slow HTTP (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "HTTP request took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml deleted file mode 100644 index cc87b6b1..00000000 --- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml +++ /dev/null @@ -1,11 +0,0 @@ ---- -prometheus_server_rules_blackbox__ping_extra: [] -prometheus_server_rules_blackbox__ping: - - alert: BlackboxProbeSlowPing - expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1 - for: 1m - labels: - severity: warning - annotations: - summary: Blackbox probe slow ping (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "Blackbox ping took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml new file mode 100644 index 00000000..9f9d2292 --- /dev/null +++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml @@ -0,0 +1,74 @@ +--- +prometheus_server_rules_blackbox__probe_extra: [] +prometheus_server_rules_blackbox__probe: + - alert: BlackboxProbeFailed + expr: probe_success == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox probe failed (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Probe failed\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxSlowProbe + expr: avg_over_time(probe_duration_seconds[1m]) > 1 + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox slow probe (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Blackbox probe took more than 1s to complete\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxSslCertificateWillExpireSoon + expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30 + for: 0m + labels: + severity: warning + annotations: + summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "SSL certificate expires in 30 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxSslCertificateWillExpireSoon + expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "SSL certificate expires in 3 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxSslCertificateExpired + expr: probe_ssl_earliest_cert_expiry - time() <= 0 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox SSL certificate expired (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "SSL certificate has expired already\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxProbeHttpFailure + expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox probe HTTP failure (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "HTTP status code is not 200-399\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxProbeSlowHttp + expr: avg_over_time(probe_http_duration_seconds[1m]) > 1 + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox probe slow HTTP (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "HTTP request took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxProbeSlowPing + expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1 + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox probe slow ping (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Blackbox ping took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ssh.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ssh.yml deleted file mode 100644 index 8e717c41..00000000 --- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ssh.yml +++ /dev/null @@ -1,3 +0,0 @@ ---- -prometheus_server_rules_blackbox__ssh_extra: [] -prometheus_server_rules_blackbox__ssh: [] diff --git a/roles/monitoring/prometheus/server/filter_plugins/prometheus.py b/roles/monitoring/prometheus/server/filter_plugins/prometheus.py index 1443e837..d91ef619 100644 --- a/roles/monitoring/prometheus/server/filter_plugins/prometheus.py +++ b/roles/monitoring/prometheus/server/filter_plugins/prometheus.py @@ -11,10 +11,12 @@ def prometheus_job_targets(hostvars, jobs, targets): result = [] for job in jobs: for target in targets: - special_config_varname = 'prometheus_job_' + job.replace('-', '_').replace('/', '__') - if special_config_varname in hostvars[target]: - for config in hostvars[target][special_config_varname]: - result.append({'job': job, 'instance': config['instance'], 'config': config, 'enabled': True}) + multitarget_config_varname = 'prometheus_job_multitarget_' + job.replace('-', '_').replace('/', '__') + if multitarget_config_varname in hostvars[target]: + for exporter_hostname, configs in hostvars[target][multitarget_config_varname].items(): + for config in configs: + result.append({'job': job, 'instance': config['instance'], 'enabled': True, + 'exporter_hostname': exporter_hostname, 'config': config}) else: enabled = job in hostvars[target]['prometheus_exporters_default'] or job in hostvars[target]['prometheus_exporters_extra'] diff --git a/roles/monitoring/prometheus/server/tasks/main.yml b/roles/monitoring/prometheus/server/tasks/main.yml index c0928cc3..16167c9c 100644 --- a/roles/monitoring/prometheus/server/tasks/main.yml +++ b/roles/monitoring/prometheus/server/tasks/main.yml @@ -83,7 +83,7 @@ state: directory - name: generate rules files for all jobs - loop: "{{ prometheus_server_jobs | union(['prometheus']) | union(prometheus_server_jobs | select('match', '.*/.*') | map('dirname') | unique) }}" + loop: "{{ prometheus_server_jobs | union(['prometheus']) }}" template: src: rules.yml.j2 dest: "/etc/prometheus/rules/{{ item }}.yml" diff --git a/roles/monitoring/prometheus/server/templates/jobs/blackbox/https.j2 b/roles/monitoring/prometheus/server/templates/jobs/blackbox/https.j2 deleted file mode 100644 index 86ff88dd..00000000 --- a/roles/monitoring/prometheus/server/templates/jobs/blackbox/https.j2 +++ /dev/null @@ -1,13 +0,0 @@ - - job_name: '{{ job }}' - metrics_path: /blackbox - params: - module: - - http_tls_2xx - scheme: https - tls_config: - ca_file: /etc/ssl/prometheus/ca-crt.pem - cert_file: /etc/ssl/prometheus/server/scrape-crt.pem - key_file: /etc/ssl/prometheus/server/scrape-key.pem - file_sd_configs: - - files: - - "/etc/prometheus/targets/{{ job }}/*.yml" diff --git a/roles/monitoring/prometheus/server/templates/jobs/blackbox/ping.j2 b/roles/monitoring/prometheus/server/templates/jobs/blackbox/ping.j2 deleted file mode 100644 index 2d3889d2..00000000 --- a/roles/monitoring/prometheus/server/templates/jobs/blackbox/ping.j2 +++ /dev/null @@ -1,13 +0,0 @@ - - job_name: '{{ job }}' - metrics_path: /blackbox - params: - module: - - icmp - scheme: https - tls_config: - ca_file: /etc/ssl/prometheus/ca-crt.pem - cert_file: /etc/ssl/prometheus/server/scrape-crt.pem - key_file: /etc/ssl/prometheus/server/scrape-key.pem - file_sd_configs: - - files: - - "/etc/prometheus/targets/{{ job }}/*.yml" diff --git a/roles/monitoring/prometheus/server/templates/jobs/blackbox/ssh.j2 b/roles/monitoring/prometheus/server/templates/jobs/blackbox/ssh.j2 deleted file mode 100644 index 97565673..00000000 --- a/roles/monitoring/prometheus/server/templates/jobs/blackbox/ssh.j2 +++ /dev/null @@ -1,13 +0,0 @@ - - job_name: '{{ job }}' - metrics_path: /blackbox - params: - module: - - ssh_banner - scheme: https - tls_config: - ca_file: /etc/ssl/prometheus/ca-crt.pem - cert_file: /etc/ssl/prometheus/server/scrape-crt.pem - key_file: /etc/ssl/prometheus/server/scrape-key.pem - file_sd_configs: - - files: - - "/etc/prometheus/targets/{{ job }}/*.yml" diff --git a/roles/monitoring/prometheus/server/templates/jobs/nut/ups.j2 b/roles/monitoring/prometheus/server/templates/jobs/nut/ups.j2 deleted file mode 100644 index 0cf4ae4e..00000000 --- a/roles/monitoring/prometheus/server/templates/jobs/nut/ups.j2 +++ /dev/null @@ -1,10 +0,0 @@ - - job_name: '{{ job }}' - metrics_path: /nut/ups - scheme: https - tls_config: - ca_file: /etc/ssl/prometheus/ca-crt.pem - cert_file: /etc/ssl/prometheus/server/scrape-crt.pem - key_file: /etc/ssl/prometheus/server/scrape-key.pem - file_sd_configs: - - files: - - "/etc/prometheus/targets/{{ job }}/*.yml" diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox/https.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox/https.yml.j2 deleted file mode 100644 index 29c89590..00000000 --- a/roles/monitoring/prometheus/server/templates/targets/blackbox/https.yml.j2 +++ /dev/null @@ -1,4 +0,0 @@ -- targets: [ '{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}' ] - labels: - instance: '{{ target.instance }}' - __param_target: '{{ target.config.address }}' diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox/ping.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox/ping.yml.j2 deleted file mode 100644 index 29c89590..00000000 --- a/roles/monitoring/prometheus/server/templates/targets/blackbox/ping.yml.j2 +++ /dev/null @@ -1,4 +0,0 @@ -- targets: [ '{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}' ] - labels: - instance: '{{ target.instance }}' - __param_target: '{{ target.config.address }}' diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox/probe.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox/probe.yml.j2 new file mode 100644 index 00000000..4e336873 --- /dev/null +++ b/roles/monitoring/prometheus/server/templates/targets/blackbox/probe.yml.j2 @@ -0,0 +1,5 @@ +- targets: [ '{{ hostvars[target.exporter_hostname].prometheus_scrape_endpoint }}' ] + labels: + instance: '{{ target.instance }}' + __param_target: '{{ target.config.target }}' + __param_module: '{{ target.config.module }}' diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox/ssh.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox/ssh.yml.j2 deleted file mode 100644 index 29c89590..00000000 --- a/roles/monitoring/prometheus/server/templates/targets/blackbox/ssh.yml.j2 +++ /dev/null @@ -1,4 +0,0 @@ -- targets: [ '{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}' ] - labels: - instance: '{{ target.instance }}' - __param_target: '{{ target.config.address }}' diff --git a/roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2 index 6003cd46..c60077c7 100644 --- a/roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2 +++ b/roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2 @@ -1,4 +1,4 @@ -- targets: [ '{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}' ] +- targets: [ '{{ hostvars[target.exporter_hostname].prometheus_scrape_endpoint }}' ] labels: instance: '{{ target.instance }}' __param_ups: '{{ target.config.ups }}' -- cgit v1.2.3