From a8e8cb2ed3d5e68d89edd8785ed59f0ee45f81bf Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Tue, 21 Sep 2021 19:34:25 +0200 Subject: prometheus: simplify job config --- roles/monitoring/prometheus/server/defaults/main/main.yml | 5 ++--- roles/monitoring/prometheus/server/defaults/main/rules_nut-ups.yml | 3 +++ roles/monitoring/prometheus/server/defaults/main/rules_nut.yml | 3 --- 3 files changed, 5 insertions(+), 6 deletions(-) create mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_nut-ups.yml delete mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_nut.yml (limited to 'roles/monitoring/prometheus/server/defaults') diff --git a/roles/monitoring/prometheus/server/defaults/main/main.yml b/roles/monitoring/prometheus/server/defaults/main/main.yml index 95b9da6d..5be3ecd3 100644 --- a/roles/monitoring/prometheus/server/defaults/main/main.yml +++ b/roles/monitoring/prometheus/server/defaults/main/main.yml @@ -5,9 +5,8 @@ prometheus_server_retention: "15d" -prometheus_server_jobs_generic: +prometheus_server_jobs: - node -prometheus_server_jobs_special: [] #prometheus_server_jobs_extra: | # - job_name: ... @@ -15,7 +14,7 @@ prometheus_server_rules: prometheus: "{{ prometheus_server_rules_prometheus + ((prometheus_server_alertmanager is defined) | ternary(prometheus_server_rules_prometheus_alertmanager, [])) + prometheus_server_rules_prometheus_extra }}" node: "{{ prometheus_server_rules_node + prometheus_server_rules_node_extra }}" openwrt: "{{ prometheus_server_rules_openwrt + prometheus_server_rules_node_extra }}" - nut: "{{ prometheus_server_rules_nut + prometheus_server_rules_nut_extra }}" + "nut-ups": "{{ prometheus_server_rules_nut_ups + prometheus_server_rules_nut_ups_extra }}" "blackbox-ping": "{{ prometheus_server_rules_blackbox_ping + prometheus_server_rules_blackbox_ping_extra }}" "blackbox-https": "{{ prometheus_server_rules_blackbox_https + prometheus_server_rules_blackbox_https_extra }}" "blackbox-ssh": "{{ prometheus_server_rules_blackbox_ssh + prometheus_server_rules_blackbox_ssh_extra }}" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_nut-ups.yml b/roles/monitoring/prometheus/server/defaults/main/rules_nut-ups.yml new file mode 100644 index 00000000..842007b4 --- /dev/null +++ b/roles/monitoring/prometheus/server/defaults/main/rules_nut-ups.yml @@ -0,0 +1,3 @@ +--- +prometheus_server_rules_nut_ups_extra: [] +prometheus_server_rules_nut_ups: [] diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_nut.yml b/roles/monitoring/prometheus/server/defaults/main/rules_nut.yml deleted file mode 100644 index d8d64f64..00000000 --- a/roles/monitoring/prometheus/server/defaults/main/rules_nut.yml +++ /dev/null @@ -1,3 +0,0 @@ ---- -prometheus_server_rules_nut_extra: [] -prometheus_server_rules_nut: [] -- cgit v1.2.3 From ef4432c51bacb5b92c03a42cb1ea7f9d837ec8b6 Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Thu, 23 Sep 2021 16:50:24 +0200 Subject: use / as spereator for jobs formerly known as special --- inventory/group_vars/chaos-at-home-ups/vars.yml | 2 +- inventory/group_vars/ele-ups/vars.yml | 2 +- inventory/group_vars/promzone-chaos-at-home/vars.yml | 8 ++++---- inventory/group_vars/promzone-elevate-festival/vars.yml | 2 +- inventory/host_vars/ch-mon.yml | 6 +++--- .../monitoring/prometheus/server/defaults/main/main.yml | 8 ++++---- .../server/defaults/main/rules_blackbox-https.yml | 3 --- .../server/defaults/main/rules_blackbox-ping.yml | 3 --- .../server/defaults/main/rules_blackbox-ssh.yml | 3 --- .../server/defaults/main/rules_blackbox__https.yml | 3 +++ .../server/defaults/main/rules_blackbox__ping.yml | 3 +++ .../server/defaults/main/rules_blackbox__ssh.yml | 3 +++ .../prometheus/server/defaults/main/rules_nut-ups.yml | 3 --- .../prometheus/server/defaults/main/rules_nut__ups.yml | 3 +++ .../prometheus/server/filter_plugins/prometheus.py | 2 +- roles/monitoring/prometheus/server/tasks/main.yml | 8 +++++++- .../server/templates/job-snippets/blackbox-https.j2 | 14 -------------- .../server/templates/job-snippets/blackbox-ping.j2 | 14 -------------- .../server/templates/job-snippets/blackbox-ssh.j2 | 14 -------------- .../server/templates/job-snippets/blackbox/https.j2 | 14 ++++++++++++++ .../server/templates/job-snippets/blackbox/ping.j2 | 14 ++++++++++++++ .../server/templates/job-snippets/blackbox/ssh.j2 | 14 ++++++++++++++ .../prometheus/server/templates/job-snippets/nut-ups.j2 | 13 ------------- .../prometheus/server/templates/job-snippets/nut/ups.j2 | 13 +++++++++++++ .../prometheus/server/templates/prometheus.yml.j2 | 5 ++++- .../server/templates/targets/blackbox-https.yml.j2 | 4 ---- .../server/templates/targets/blackbox-ping.yml.j2 | 4 ---- .../server/templates/targets/blackbox-ssh.yml.j2 | 4 ---- .../server/templates/targets/blackbox/https.yml.j2 | 4 ++++ .../server/templates/targets/blackbox/ping.yml.j2 | 4 ++++ .../server/templates/targets/blackbox/ssh.yml.j2 | 4 ++++ .../prometheus/server/templates/targets/nut-ups.yml.j2 | 17 ----------------- .../prometheus/server/templates/targets/nut/ups.yml.j2 | 17 +++++++++++++++++ 33 files changed, 122 insertions(+), 113 deletions(-) delete mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_blackbox-https.yml delete mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ping.yml delete mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ssh.yml create mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml create mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml create mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ssh.yml delete mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_nut-ups.yml create mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml delete mode 100644 roles/monitoring/prometheus/server/templates/job-snippets/blackbox-https.j2 delete mode 100644 roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ping.j2 delete mode 100644 roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ssh.j2 create mode 100644 roles/monitoring/prometheus/server/templates/job-snippets/blackbox/https.j2 create mode 100644 roles/monitoring/prometheus/server/templates/job-snippets/blackbox/ping.j2 create mode 100644 roles/monitoring/prometheus/server/templates/job-snippets/blackbox/ssh.j2 delete mode 100644 roles/monitoring/prometheus/server/templates/job-snippets/nut-ups.j2 create mode 100644 roles/monitoring/prometheus/server/templates/job-snippets/nut/ups.j2 delete mode 100644 roles/monitoring/prometheus/server/templates/targets/blackbox-https.yml.j2 delete mode 100644 roles/monitoring/prometheus/server/templates/targets/blackbox-ping.yml.j2 delete mode 100644 roles/monitoring/prometheus/server/templates/targets/blackbox-ssh.yml.j2 create mode 100644 roles/monitoring/prometheus/server/templates/targets/blackbox/https.yml.j2 create mode 100644 roles/monitoring/prometheus/server/templates/targets/blackbox/ping.yml.j2 create mode 100644 roles/monitoring/prometheus/server/templates/targets/blackbox/ssh.yml.j2 delete mode 100644 roles/monitoring/prometheus/server/templates/targets/nut-ups.yml.j2 create mode 100644 roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2 (limited to 'roles/monitoring/prometheus/server/defaults') diff --git a/inventory/group_vars/chaos-at-home-ups/vars.yml b/inventory/group_vars/chaos-at-home-ups/vars.yml index 99868165..f8c1bdf1 100644 --- a/inventory/group_vars/chaos-at-home-ups/vars.yml +++ b/inventory/group_vars/chaos-at-home-ups/vars.yml @@ -11,7 +11,7 @@ prometheus_scrape_endpoint: "{{ network_mgmt_zone.prefix | ipaddr(network_mgmt_z prometheus_exporters_default: - openwrt -prometheus_job_nut_ups: +prometheus_job_nut__ups: - exporter_hostname: ch-mon instance: "ups-{{ ups_name }}" ups: "{{ ups_name }}" diff --git a/inventory/group_vars/ele-ups/vars.yml b/inventory/group_vars/ele-ups/vars.yml index 0d22f770..1c4613a3 100644 --- a/inventory/group_vars/ele-ups/vars.yml +++ b/inventory/group_vars/ele-ups/vars.yml @@ -14,7 +14,7 @@ prometheus_scrape_endpoint: "{{ network_mgmt_zone.prefix | ipaddr(network_mgmt_z prometheus_exporters_default: - openwrt -prometheus_job_nut_ups: +prometheus_job_nut__ups: exporter_hostname: ele-mon instance: "ups-{{ ups_name }}" ups: "{{ ups_name }}" diff --git a/inventory/group_vars/promzone-chaos-at-home/vars.yml b/inventory/group_vars/promzone-chaos-at-home/vars.yml index d1958d47..84ed1263 100644 --- a/inventory/group_vars/promzone-chaos-at-home/vars.yml +++ b/inventory/group_vars/promzone-chaos-at-home/vars.yml @@ -9,10 +9,10 @@ prometheus_server: ch-mon prometheus_server_jobs: - node - openwrt - - nut-ups - - blackbox-ping - - blackbox-https - - blackbox-ssh + - nut/ups + - blackbox/ping + - blackbox/https + - blackbox/ssh prometheus_zone_name: chaos@home prometheus_zone_targets: "{{ groups['promzone-chaos-at-home'] }}" diff --git a/inventory/group_vars/promzone-elevate-festival/vars.yml b/inventory/group_vars/promzone-elevate-festival/vars.yml index a65a0cb7..43115dc4 100644 --- a/inventory/group_vars/promzone-elevate-festival/vars.yml +++ b/inventory/group_vars/promzone-elevate-festival/vars.yml @@ -9,7 +9,7 @@ prometheus_server: ele-mon prometheus_server_jobs: - node - openwrt - - nut-ups + - nut/ups prometheus_zone_name: Elevate Festival prometheus_zone_targets: "{{ groups['promzone-elevate-festival'] }}" diff --git a/inventory/host_vars/ch-mon.yml b/inventory/host_vars/ch-mon.yml index a211d4bb..242c4835 100644 --- a/inventory/host_vars/ch-mon.yml +++ b/inventory/host_vars/ch-mon.yml @@ -76,7 +76,7 @@ prometheus_exporter_blackbox_modules_extra: icmp: prober: icmp -prometheus_job_blackbox_ping: +prometheus_job_blackbox__ping: - exporter_hostname: ch-mon instance: "ping-magentagw" address: 62.99.185.129 @@ -84,12 +84,12 @@ prometheus_job_blackbox_ping: instance: "ping-quad9" address: 9.9.9.9 -prometheus_job_blackbox_https: +prometheus_job_blackbox__https: - exporter_hostname: ch-mon instance: "https-web.chaos-at-home.org" address: web.chaos-at-home.org -prometheus_job_blackbox_ssh: +prometheus_job_blackbox__ssh: - exporter_hostname: ch-mon instance: "ssh-{{ inventory_hostname }}" address: "{{ network_zones.svc.prefix | ipaddr(network_zones.svc.offsets[inventory_hostname]) | ipaddr('address') }}:{{ ansible_port | default(22) }}" diff --git a/roles/monitoring/prometheus/server/defaults/main/main.yml b/roles/monitoring/prometheus/server/defaults/main/main.yml index 5be3ecd3..bae0cdba 100644 --- a/roles/monitoring/prometheus/server/defaults/main/main.yml +++ b/roles/monitoring/prometheus/server/defaults/main/main.yml @@ -14,10 +14,10 @@ prometheus_server_rules: prometheus: "{{ prometheus_server_rules_prometheus + ((prometheus_server_alertmanager is defined) | ternary(prometheus_server_rules_prometheus_alertmanager, [])) + prometheus_server_rules_prometheus_extra }}" node: "{{ prometheus_server_rules_node + prometheus_server_rules_node_extra }}" openwrt: "{{ prometheus_server_rules_openwrt + prometheus_server_rules_node_extra }}" - "nut-ups": "{{ prometheus_server_rules_nut_ups + prometheus_server_rules_nut_ups_extra }}" - "blackbox-ping": "{{ prometheus_server_rules_blackbox_ping + prometheus_server_rules_blackbox_ping_extra }}" - "blackbox-https": "{{ prometheus_server_rules_blackbox_https + prometheus_server_rules_blackbox_https_extra }}" - "blackbox-ssh": "{{ prometheus_server_rules_blackbox_ssh + prometheus_server_rules_blackbox_ssh_extra }}" + nut/ups: "{{ prometheus_server_rules_nut__ups + prometheus_server_rules_nut__ups_extra }}" + blackbox/ping: "{{ prometheus_server_rules_blackbox__ping + prometheus_server_rules_blackbox__ping_extra }}" + blackbox/https: "{{ prometheus_server_rules_blackbox__https + prometheus_server_rules_blackbox__https_extra }}" + blackbox/ssh: "{{ prometheus_server_rules_blackbox__ssh + prometheus_server_rules_blackbox__ssh_extra }}" # prometheus_server_alertmanager: # url: "127.0.0.1:9093" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-https.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-https.yml deleted file mode 100644 index bb806075..00000000 --- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-https.yml +++ /dev/null @@ -1,3 +0,0 @@ ---- -prometheus_server_rules_blackbox_https_extra: [] -prometheus_server_rules_blackbox_https: [] diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ping.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ping.yml deleted file mode 100644 index 56c122f5..00000000 --- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ping.yml +++ /dev/null @@ -1,3 +0,0 @@ ---- -prometheus_server_rules_blackbox_ping_extra: [] -prometheus_server_rules_blackbox_ping: [] diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ssh.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ssh.yml deleted file mode 100644 index 727d2292..00000000 --- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ssh.yml +++ /dev/null @@ -1,3 +0,0 @@ ---- -prometheus_server_rules_blackbox_ssh_extra: [] -prometheus_server_rules_blackbox_ssh: [] diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml new file mode 100644 index 00000000..cfdc10bd --- /dev/null +++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml @@ -0,0 +1,3 @@ +--- +prometheus_server_rules_blackbox__https_extra: [] +prometheus_server_rules_blackbox__https: [] diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml new file mode 100644 index 00000000..06ce8607 --- /dev/null +++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml @@ -0,0 +1,3 @@ +--- +prometheus_server_rules_blackbox__ping_extra: [] +prometheus_server_rules_blackbox__ping: [] diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ssh.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ssh.yml new file mode 100644 index 00000000..8e717c41 --- /dev/null +++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ssh.yml @@ -0,0 +1,3 @@ +--- +prometheus_server_rules_blackbox__ssh_extra: [] +prometheus_server_rules_blackbox__ssh: [] diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_nut-ups.yml b/roles/monitoring/prometheus/server/defaults/main/rules_nut-ups.yml deleted file mode 100644 index 842007b4..00000000 --- a/roles/monitoring/prometheus/server/defaults/main/rules_nut-ups.yml +++ /dev/null @@ -1,3 +0,0 @@ ---- -prometheus_server_rules_nut_ups_extra: [] -prometheus_server_rules_nut_ups: [] diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml b/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml new file mode 100644 index 00000000..bccb0ca8 --- /dev/null +++ b/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml @@ -0,0 +1,3 @@ +--- +prometheus_server_rules_nut__ups_extra: [] +prometheus_server_rules_nut__ups: [] diff --git a/roles/monitoring/prometheus/server/filter_plugins/prometheus.py b/roles/monitoring/prometheus/server/filter_plugins/prometheus.py index 056d216f..1443e837 100644 --- a/roles/monitoring/prometheus/server/filter_plugins/prometheus.py +++ b/roles/monitoring/prometheus/server/filter_plugins/prometheus.py @@ -11,7 +11,7 @@ def prometheus_job_targets(hostvars, jobs, targets): result = [] for job in jobs: for target in targets: - special_config_varname = 'prometheus_job_' + job.replace('-', '_') + special_config_varname = 'prometheus_job_' + job.replace('-', '_').replace('/', '__') if special_config_varname in hostvars[target]: for config in hostvars[target][special_config_varname]: result.append({'job': job, 'instance': config['instance'], 'config': config, 'enabled': True}) diff --git a/roles/monitoring/prometheus/server/tasks/main.yml b/roles/monitoring/prometheus/server/tasks/main.yml index 4bcaa2d5..16167c9c 100644 --- a/roles/monitoring/prometheus/server/tasks/main.yml +++ b/roles/monitoring/prometheus/server/tasks/main.yml @@ -44,7 +44,7 @@ path: "/etc/prometheus/{{ item }}" state: directory -- name: create sub-directroy for all jobs in targets directory +- name: create sub-directories for all jobs in targets directory loop: "{{ prometheus_server_jobs }}" file: path: "/etc/prometheus/targets/{{ item }}" @@ -76,6 +76,12 @@ state: absent notify: reload prometheus +- name: create sub-directories for all jobs in rules directory + loop: "{{ prometheus_server_jobs | select('match', '.*/.*') | map('dirname') | unique }}" + file: + path: "/etc/prometheus/rules/{{ item }}" + state: directory + - name: generate rules files for all jobs loop: "{{ prometheus_server_jobs | union(['prometheus']) }}" template: diff --git a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-https.j2 b/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-https.j2 deleted file mode 100644 index 98a64121..00000000 --- a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-https.j2 +++ /dev/null @@ -1,14 +0,0 @@ - - job_name: '{{ job }}' - metrics_path: /proxy - params: - module: - - blackbox - - http_tls_2xx - scheme: https - tls_config: - ca_file: /etc/ssl/prometheus/ca-crt.pem - cert_file: /etc/ssl/prometheus/server/scrape-crt.pem - key_file: /etc/ssl/prometheus/server/scrape-key.pem - file_sd_configs: - - files: - - "/etc/prometheus/targets/{{ job }}/*.yml" diff --git a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ping.j2 b/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ping.j2 deleted file mode 100644 index 736ffec1..00000000 --- a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ping.j2 +++ /dev/null @@ -1,14 +0,0 @@ - - job_name: '{{ job }}' - metrics_path: /proxy - params: - module: - - blackbox - - icmp - scheme: https - tls_config: - ca_file: /etc/ssl/prometheus/ca-crt.pem - cert_file: /etc/ssl/prometheus/server/scrape-crt.pem - key_file: /etc/ssl/prometheus/server/scrape-key.pem - file_sd_configs: - - files: - - "/etc/prometheus/targets/{{ job }}/*.yml" diff --git a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ssh.j2 b/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ssh.j2 deleted file mode 100644 index 166f37ad..00000000 --- a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ssh.j2 +++ /dev/null @@ -1,14 +0,0 @@ - - job_name: '{{ job }}' - metrics_path: /proxy - params: - module: - - blackbox - - ssh_banner - scheme: https - tls_config: - ca_file: /etc/ssl/prometheus/ca-crt.pem - cert_file: /etc/ssl/prometheus/server/scrape-crt.pem - key_file: /etc/ssl/prometheus/server/scrape-key.pem - file_sd_configs: - - files: - - "/etc/prometheus/targets/{{ job }}/*.yml" diff --git a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox/https.j2 b/roles/monitoring/prometheus/server/templates/job-snippets/blackbox/https.j2 new file mode 100644 index 00000000..98a64121 --- /dev/null +++ b/roles/monitoring/prometheus/server/templates/job-snippets/blackbox/https.j2 @@ -0,0 +1,14 @@ + - job_name: '{{ job }}' + metrics_path: /proxy + params: + module: + - blackbox + - http_tls_2xx + scheme: https + tls_config: + ca_file: /etc/ssl/prometheus/ca-crt.pem + cert_file: /etc/ssl/prometheus/server/scrape-crt.pem + key_file: /etc/ssl/prometheus/server/scrape-key.pem + file_sd_configs: + - files: + - "/etc/prometheus/targets/{{ job }}/*.yml" diff --git a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox/ping.j2 b/roles/monitoring/prometheus/server/templates/job-snippets/blackbox/ping.j2 new file mode 100644 index 00000000..736ffec1 --- /dev/null +++ b/roles/monitoring/prometheus/server/templates/job-snippets/blackbox/ping.j2 @@ -0,0 +1,14 @@ + - job_name: '{{ job }}' + metrics_path: /proxy + params: + module: + - blackbox + - icmp + scheme: https + tls_config: + ca_file: /etc/ssl/prometheus/ca-crt.pem + cert_file: /etc/ssl/prometheus/server/scrape-crt.pem + key_file: /etc/ssl/prometheus/server/scrape-key.pem + file_sd_configs: + - files: + - "/etc/prometheus/targets/{{ job }}/*.yml" diff --git a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox/ssh.j2 b/roles/monitoring/prometheus/server/templates/job-snippets/blackbox/ssh.j2 new file mode 100644 index 00000000..166f37ad --- /dev/null +++ b/roles/monitoring/prometheus/server/templates/job-snippets/blackbox/ssh.j2 @@ -0,0 +1,14 @@ + - job_name: '{{ job }}' + metrics_path: /proxy + params: + module: + - blackbox + - ssh_banner + scheme: https + tls_config: + ca_file: /etc/ssl/prometheus/ca-crt.pem + cert_file: /etc/ssl/prometheus/server/scrape-crt.pem + key_file: /etc/ssl/prometheus/server/scrape-key.pem + file_sd_configs: + - files: + - "/etc/prometheus/targets/{{ job }}/*.yml" diff --git a/roles/monitoring/prometheus/server/templates/job-snippets/nut-ups.j2 b/roles/monitoring/prometheus/server/templates/job-snippets/nut-ups.j2 deleted file mode 100644 index 3a2c5c62..00000000 --- a/roles/monitoring/prometheus/server/templates/job-snippets/nut-ups.j2 +++ /dev/null @@ -1,13 +0,0 @@ - - job_name: '{{ job }}' - metrics_path: /proxy - params: - module: - - nut - scheme: https - tls_config: - ca_file: /etc/ssl/prometheus/ca-crt.pem - cert_file: /etc/ssl/prometheus/server/scrape-crt.pem - key_file: /etc/ssl/prometheus/server/scrape-key.pem - file_sd_configs: - - files: - - "/etc/prometheus/targets/{{ job }}/*.yml" diff --git a/roles/monitoring/prometheus/server/templates/job-snippets/nut/ups.j2 b/roles/monitoring/prometheus/server/templates/job-snippets/nut/ups.j2 new file mode 100644 index 00000000..3a2c5c62 --- /dev/null +++ b/roles/monitoring/prometheus/server/templates/job-snippets/nut/ups.j2 @@ -0,0 +1,13 @@ + - job_name: '{{ job }}' + metrics_path: /proxy + params: + module: + - nut + scheme: https + tls_config: + ca_file: /etc/ssl/prometheus/ca-crt.pem + cert_file: /etc/ssl/prometheus/server/scrape-crt.pem + key_file: /etc/ssl/prometheus/server/scrape-key.pem + file_sd_configs: + - files: + - "/etc/prometheus/targets/{{ job }}/*.yml" diff --git a/roles/monitoring/prometheus/server/templates/prometheus.yml.j2 b/roles/monitoring/prometheus/server/templates/prometheus.yml.j2 index 4a079896..8156341d 100644 --- a/roles/monitoring/prometheus/server/templates/prometheus.yml.j2 +++ b/roles/monitoring/prometheus/server/templates/prometheus.yml.j2 @@ -6,6 +6,9 @@ global: rule_files: - /etc/prometheus/rules/*.yml +{% for subdir in (prometheus_server_jobs | select('match', '.*/.*') | map('dirname') | unique) %} + - /etc/prometheus/rules/{{ subdir }}/*.yml +{% endfor %} {% if prometheus_server_alertmanager is defined %} alerting: @@ -37,7 +40,7 @@ scrape_configs: {% endif %} {% for job in (prometheus_server_jobs) %} -{% include 'job-snippets/' + (lookup('first_found', {'paths': ['templates/job-snippets'], 'files': [job + '.j2', 'generic.j2']}) | basename) %}{{ '' }} +{% include lookup('first_found', {'paths': ['templates/job-snippets'], 'files': [job + '.j2', 'generic.j2']}) | relpath(template_fullpath | dirname) %}{{ '' }} {% endfor %} {% if prometheus_server_jobs_extra is defined %} diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox-https.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox-https.yml.j2 deleted file mode 100644 index b1a33df3..00000000 --- a/roles/monitoring/prometheus/server/templates/targets/blackbox-https.yml.j2 +++ /dev/null @@ -1,4 +0,0 @@ -- targets: [ '{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}' ] - labels: - instance: '{{ target.instance }}' - __param_target: {{ target.config.address }} diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox-ping.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox-ping.yml.j2 deleted file mode 100644 index b1a33df3..00000000 --- a/roles/monitoring/prometheus/server/templates/targets/blackbox-ping.yml.j2 +++ /dev/null @@ -1,4 +0,0 @@ -- targets: [ '{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}' ] - labels: - instance: '{{ target.instance }}' - __param_target: {{ target.config.address }} diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox-ssh.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox-ssh.yml.j2 deleted file mode 100644 index b1a33df3..00000000 --- a/roles/monitoring/prometheus/server/templates/targets/blackbox-ssh.yml.j2 +++ /dev/null @@ -1,4 +0,0 @@ -- targets: [ '{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}' ] - labels: - instance: '{{ target.instance }}' - __param_target: {{ target.config.address }} diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox/https.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox/https.yml.j2 new file mode 100644 index 00000000..29c89590 --- /dev/null +++ b/roles/monitoring/prometheus/server/templates/targets/blackbox/https.yml.j2 @@ -0,0 +1,4 @@ +- targets: [ '{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}' ] + labels: + instance: '{{ target.instance }}' + __param_target: '{{ target.config.address }}' diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox/ping.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox/ping.yml.j2 new file mode 100644 index 00000000..29c89590 --- /dev/null +++ b/roles/monitoring/prometheus/server/templates/targets/blackbox/ping.yml.j2 @@ -0,0 +1,4 @@ +- targets: [ '{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}' ] + labels: + instance: '{{ target.instance }}' + __param_target: '{{ target.config.address }}' diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox/ssh.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox/ssh.yml.j2 new file mode 100644 index 00000000..29c89590 --- /dev/null +++ b/roles/monitoring/prometheus/server/templates/targets/blackbox/ssh.yml.j2 @@ -0,0 +1,4 @@ +- targets: [ '{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}' ] + labels: + instance: '{{ target.instance }}' + __param_target: '{{ target.config.address }}' diff --git a/roles/monitoring/prometheus/server/templates/targets/nut-ups.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/nut-ups.yml.j2 deleted file mode 100644 index d63d79a7..00000000 --- a/roles/monitoring/prometheus/server/templates/targets/nut-ups.yml.j2 +++ /dev/null @@ -1,17 +0,0 @@ -- targets: [ '{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}' ] - labels: - instance: '{{ target.instance }}' - __param_ups: {{ target.config.ups }} - __param_server: {{ target.config.server | default('127.0.0.1') }} -{% if 'username' in target.config %} - __param_username: {{ target.config.username }} -{% endif %} -{% if 'password' in target.config %} - __param_password: {{ target.config.password }} -{% endif %} -{% if 'variables' in target.config %} - __param_variables: {{ target.config.variables }} -{% endif %} -{% if 'statuses' in target.config %} - __param_statuses: {{ target.config.statuses }} -{% endif %} diff --git a/roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2 new file mode 100644 index 00000000..6003cd46 --- /dev/null +++ b/roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2 @@ -0,0 +1,17 @@ +- targets: [ '{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}' ] + labels: + instance: '{{ target.instance }}' + __param_ups: '{{ target.config.ups }}' + __param_server: '{{ target.config.server | default('127.0.0.1') }}' +{% if 'username' in target.config %} + __param_username: '{{ target.config.username }}' +{% endif %} +{% if 'password' in target.config %} + __param_password: '{{ target.config.password }}' +{% endif %} +{% if 'variables' in target.config %} + __param_variables: '{{ target.config.variables }}' +{% endif %} +{% if 'statuses' in target.config %} + __param_statuses: '{{ target.config.statuses }}' +{% endif %} -- cgit v1.2.3 From 486c84d53244e44ff72a3c2db42ee12afdb083e8 Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Thu, 23 Sep 2021 20:37:20 +0200 Subject: add some more prometheus rules for blackbox exporter --- .../prometheus/server/defaults/main/main.yml | 2 + .../server/defaults/main/rules_blackbox.yml | 47 ++++++++++++++++++++++ .../server/defaults/main/rules_blackbox__https.yml | 19 ++++++++- .../server/defaults/main/rules_blackbox__ping.yml | 10 ++++- .../prometheus/server/defaults/main/rules_nut.yml | 3 ++ roles/monitoring/prometheus/server/tasks/main.yml | 2 +- 6 files changed, 80 insertions(+), 3 deletions(-) create mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml create mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_nut.yml (limited to 'roles/monitoring/prometheus/server/defaults') diff --git a/roles/monitoring/prometheus/server/defaults/main/main.yml b/roles/monitoring/prometheus/server/defaults/main/main.yml index bae0cdba..09cd150c 100644 --- a/roles/monitoring/prometheus/server/defaults/main/main.yml +++ b/roles/monitoring/prometheus/server/defaults/main/main.yml @@ -14,7 +14,9 @@ prometheus_server_rules: prometheus: "{{ prometheus_server_rules_prometheus + ((prometheus_server_alertmanager is defined) | ternary(prometheus_server_rules_prometheus_alertmanager, [])) + prometheus_server_rules_prometheus_extra }}" node: "{{ prometheus_server_rules_node + prometheus_server_rules_node_extra }}" openwrt: "{{ prometheus_server_rules_openwrt + prometheus_server_rules_node_extra }}" + nut: "{{ prometheus_server_rules_nut + prometheus_server_rules_nut_extra }}" nut/ups: "{{ prometheus_server_rules_nut__ups + prometheus_server_rules_nut__ups_extra }}" + blackbox: "{{ prometheus_server_rules_blackbox + prometheus_server_rules_blackbox_extra }}" blackbox/ping: "{{ prometheus_server_rules_blackbox__ping + prometheus_server_rules_blackbox__ping_extra }}" blackbox/https: "{{ prometheus_server_rules_blackbox__https + prometheus_server_rules_blackbox__https_extra }}" blackbox/ssh: "{{ prometheus_server_rules_blackbox__ssh + prometheus_server_rules_blackbox__ssh_extra }}" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml new file mode 100644 index 00000000..d5c1fd42 --- /dev/null +++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml @@ -0,0 +1,47 @@ +--- +prometheus_server_rules_blackbox_extra: [] +prometheus_server_rules_blackbox: + - alert: BlackboxProbeFailed + expr: probe_success == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox probe failed (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Probe failed\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxSlowProbe + expr: avg_over_time(probe_duration_seconds[1m]) > 1 + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox slow probe (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Blackbox probe took more than 1s to complete\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxSslCertificateWillExpireSoon + expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30 + for: 0m + labels: + severity: warning + annotations: + summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "SSL certificate expires in 30 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxSslCertificateWillExpireSoon + expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "SSL certificate expires in 3 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxSslCertificateExpired + expr: probe_ssl_earliest_cert_expiry - time() <= 0 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox SSL certificate expired (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "SSL certificate has expired already\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml index cfdc10bd..140e3b4f 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml @@ -1,3 +1,20 @@ --- prometheus_server_rules_blackbox__https_extra: [] -prometheus_server_rules_blackbox__https: [] +prometheus_server_rules_blackbox__https: + - alert: BlackboxProbeHttpFailure + expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox probe HTTP failure (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "HTTP status code is not 200-399\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxProbeSlowHttp + expr: avg_over_time(probe_http_duration_seconds[1m]) > 1 + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox probe slow HTTP (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "HTTP request took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml index 06ce8607..cc87b6b1 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml @@ -1,3 +1,11 @@ --- prometheus_server_rules_blackbox__ping_extra: [] -prometheus_server_rules_blackbox__ping: [] +prometheus_server_rules_blackbox__ping: + - alert: BlackboxProbeSlowPing + expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1 + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox probe slow ping (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Blackbox ping took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_nut.yml b/roles/monitoring/prometheus/server/defaults/main/rules_nut.yml new file mode 100644 index 00000000..d8d64f64 --- /dev/null +++ b/roles/monitoring/prometheus/server/defaults/main/rules_nut.yml @@ -0,0 +1,3 @@ +--- +prometheus_server_rules_nut_extra: [] +prometheus_server_rules_nut: [] diff --git a/roles/monitoring/prometheus/server/tasks/main.yml b/roles/monitoring/prometheus/server/tasks/main.yml index 16167c9c..c0928cc3 100644 --- a/roles/monitoring/prometheus/server/tasks/main.yml +++ b/roles/monitoring/prometheus/server/tasks/main.yml @@ -83,7 +83,7 @@ state: directory - name: generate rules files for all jobs - loop: "{{ prometheus_server_jobs | union(['prometheus']) }}" + loop: "{{ prometheus_server_jobs | union(['prometheus']) | union(prometheus_server_jobs | select('match', '.*/.*') | map('dirname') | unique) }}" template: src: rules.yml.j2 dest: "/etc/prometheus/rules/{{ item }}.yml" -- cgit v1.2.3 From c36e7b7a8f2dfe1c54e537b737340e025fa81467 Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Fri, 24 Sep 2021 02:04:31 +0200 Subject: add some alert rule for newly added node exporter metrics --- .../prometheus/server/defaults/main/rules_node.yml | 27 ++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'roles/monitoring/prometheus/server/defaults') diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml index ab7317ac..2c7f9319 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml @@ -217,3 +217,30 @@ prometheus_server_rules_node: annotations: summary: Host clock not synchronising (instance {{ '{{' }} $labels.instance {{ '}}' }}) description: "Clock not synchronising.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: AptUpgradesPending + expr: sum by (instance) (apt_upgrades_pending) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Host has upgradeable packages (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Host {{ '{{' }} $labels.instance {{ '}}' }} has {{ '{{' }} $value {{ '}}' }} upgradable packages.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: AptAutoremovePending + expr: sum by (instance) (apt_autoremove_pending) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Host has packages that can be autoremoved (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Host {{ '{{' }} $labels.instance {{ '}}' }} has {{ '{{' }} $value {{ '}}' }} that can be autoremoved.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: HostNeedsRebooting + expr: node_reboot_required > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Host must be rebootet (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Host {{ '{{' }} $labels.instance {{ '}}' }} must be rebootet for security uppdates to take effect.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" -- cgit v1.2.3 From 4ec26b272ab6090498a9eefa4a0efb06248b1ef4 Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Fri, 24 Sep 2021 14:11:35 +0200 Subject: prometheus: add monitoring for systemd units --- .../prometheus/exporter/node/defaults/main.yml | 1 + .../prometheus/server/defaults/main/rules_node.yml | 11 ++++++++++- .../prometheus/server/templates/jobs/node.j2 | 20 ++++++++++++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 roles/monitoring/prometheus/server/templates/jobs/node.j2 (limited to 'roles/monitoring/prometheus/server/defaults') diff --git a/roles/monitoring/prometheus/exporter/node/defaults/main.yml b/roles/monitoring/prometheus/exporter/node/defaults/main.yml index 2714a7fe..491e70f6 100644 --- a/roles/monitoring/prometheus/exporter/node/defaults/main.yml +++ b/roles/monitoring/prometheus/exporter/node/defaults/main.yml @@ -12,6 +12,7 @@ prometheus_exporter_node_timesync_collector: "{{ _prometheus_exporter_node_time_ prometheus_exporter_node_disable_collectors: [] prometheus_exporter_node_extra_collectors: - "{{ prometheus_exporter_node_timesync_collector }}" + - systemd prometheus_exporter_node_textfile_collector_scripts: - deleted-libraries diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml index 2c7f9319..64a7d562 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml @@ -92,6 +92,15 @@ prometheus_server_rules_node: summary: Host CPU steal noisy neighbor (instance {{ '{{' }} $labels.instance {{ '}}' }}) description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + - alert: HostSystemdNotRunning + expr: node_systemd_system_running == 0 + for: 0m + labels: + severity: warning + annotations: + summary: Host systemd is not in running state (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "systemd is not in running state.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + - alert: HostSystemdServiceCrashed expr: node_systemd_unit_state{state="failed"} == 1 for: 0m @@ -99,7 +108,7 @@ prometheus_server_rules_node: severity: warning annotations: summary: Host systemd service crashed (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "systemd service crashed\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + description: "The systemd service unit {{ '{{' }} $labels.name {{ '}}' }} is in failed state.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - alert: HostPhysicalComponentTooHot expr: node_hwmon_temp_celsius > 75 diff --git a/roles/monitoring/prometheus/server/templates/jobs/node.j2 b/roles/monitoring/prometheus/server/templates/jobs/node.j2 new file mode 100644 index 00000000..ba9eab31 --- /dev/null +++ b/roles/monitoring/prometheus/server/templates/jobs/node.j2 @@ -0,0 +1,20 @@ + - job_name: '{{ job }}' + metrics_path: /proxy + params: + module: + - {{ job }} + scheme: https + tls_config: + ca_file: /etc/ssl/prometheus/ca-crt.pem + cert_file: /etc/ssl/prometheus/server/scrape-crt.pem + key_file: /etc/ssl/prometheus/server/scrape-key.pem + file_sd_configs: + - files: + - "/etc/prometheus/targets/{{ job }}/*.yml" + metric_relabel_configs: + - source_labels: [ "mountpoint" ] + regex: ".*/\\.snapshot/.*" + action: drop + - source_labels: [ "__name__", "state" ] + regex: "node_systemd_unit_state;(activating|deactivating|inactive)" + action: drop -- cgit v1.2.3 From 9da269b334fc9a1949c787ea37a3d5879bc2b865 Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Fri, 24 Sep 2021 14:13:51 +0200 Subject: fix some todos --- roles/monitoring/prometheus/exporter/node/defaults/main.yml | 2 -- roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml | 1 + roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) (limited to 'roles/monitoring/prometheus/server/defaults') diff --git a/roles/monitoring/prometheus/exporter/node/defaults/main.yml b/roles/monitoring/prometheus/exporter/node/defaults/main.yml index 491e70f6..9309562f 100644 --- a/roles/monitoring/prometheus/exporter/node/defaults/main.yml +++ b/roles/monitoring/prometheus/exporter/node/defaults/main.yml @@ -7,8 +7,6 @@ _prometheus_exporter_node_time_collector_map_: prometheus_exporter_node_timesync_collector: "{{ _prometheus_exporter_node_time_collector_map_[ntp_variant | default('')] }}" -## TODO: systemd state collector??? - prometheus_exporter_node_disable_collectors: [] prometheus_exporter_node_extra_collectors: - "{{ prometheus_exporter_node_timesync_collector }}" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml b/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml index bccb0ca8..150a507e 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml @@ -1,3 +1,4 @@ --- prometheus_server_rules_nut__ups_extra: [] prometheus_server_rules_nut__ups: [] +## TODO: add NUT/UPS alert rules diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml b/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml index 88d84f31..04b178f1 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml @@ -1,3 +1,4 @@ --- prometheus_server_rules_openwrt_extra: [] prometheus_server_rules_openwrt: [] +## TODO: add openwrt specific alert rules -- cgit v1.2.3 From cc89d6d4211aa5aec8e5bef8c854d4929c337887 Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Sun, 26 Sep 2021 03:32:47 +0200 Subject: improved promethues multitarget support --- inventory/group_vars/chaos-at-home-ups/vars.yml | 6 +- inventory/group_vars/ele-ups/vars.yml | 10 +-- .../group_vars/promzone-chaos-at-home/vars.yml | 6 +- .../group_vars/promzone-elevate-festival/vars.yml | 1 + inventory/host_vars/ch-mon.yml | 39 ++++++------ .../prometheus/exporter/blackbox/tasks/main.yml | 3 + .../prometheus/server/defaults/main/main.yml | 4 +- .../server/defaults/main/rules_blackbox.yml | 46 +------------- .../server/defaults/main/rules_blackbox__https.yml | 20 ------ .../server/defaults/main/rules_blackbox__ping.yml | 11 ---- .../server/defaults/main/rules_blackbox__probe.yml | 74 ++++++++++++++++++++++ .../server/defaults/main/rules_blackbox__ssh.yml | 3 - .../prometheus/server/filter_plugins/prometheus.py | 10 +-- roles/monitoring/prometheus/server/tasks/main.yml | 2 +- .../server/templates/jobs/blackbox/https.j2 | 13 ---- .../server/templates/jobs/blackbox/ping.j2 | 13 ---- .../server/templates/jobs/blackbox/ssh.j2 | 13 ---- .../prometheus/server/templates/jobs/nut/ups.j2 | 10 --- .../server/templates/targets/blackbox/https.yml.j2 | 4 -- .../server/templates/targets/blackbox/ping.yml.j2 | 4 -- .../server/templates/targets/blackbox/probe.yml.j2 | 5 ++ .../server/templates/targets/blackbox/ssh.yml.j2 | 4 -- .../server/templates/targets/nut/ups.yml.j2 | 2 +- 23 files changed, 123 insertions(+), 180 deletions(-) delete mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml delete mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml create mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml delete mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ssh.yml delete mode 100644 roles/monitoring/prometheus/server/templates/jobs/blackbox/https.j2 delete mode 100644 roles/monitoring/prometheus/server/templates/jobs/blackbox/ping.j2 delete mode 100644 roles/monitoring/prometheus/server/templates/jobs/blackbox/ssh.j2 delete mode 100644 roles/monitoring/prometheus/server/templates/jobs/nut/ups.j2 delete mode 100644 roles/monitoring/prometheus/server/templates/targets/blackbox/https.yml.j2 delete mode 100644 roles/monitoring/prometheus/server/templates/targets/blackbox/ping.yml.j2 create mode 100644 roles/monitoring/prometheus/server/templates/targets/blackbox/probe.yml.j2 delete mode 100644 roles/monitoring/prometheus/server/templates/targets/blackbox/ssh.yml.j2 (limited to 'roles/monitoring/prometheus/server/defaults') diff --git a/inventory/group_vars/chaos-at-home-ups/vars.yml b/inventory/group_vars/chaos-at-home-ups/vars.yml index f8c1bdf1..7b60e893 100644 --- a/inventory/group_vars/chaos-at-home-ups/vars.yml +++ b/inventory/group_vars/chaos-at-home-ups/vars.yml @@ -11,8 +11,8 @@ prometheus_scrape_endpoint: "{{ network_mgmt_zone.prefix | ipaddr(network_mgmt_z prometheus_exporters_default: - openwrt -prometheus_job_nut__ups: - - exporter_hostname: ch-mon - instance: "ups-{{ ups_name }}" +prometheus_job_multitarget_nut__ups: + ch-mon: + - instance: "ups-{{ ups_name }}" ups: "{{ ups_name }}" server: "{{ network_mgmt_zone.prefix | ipaddr(network_mgmt_zone.offsets[inventory_hostname]) | ipaddr('address') }}" diff --git a/inventory/group_vars/ele-ups/vars.yml b/inventory/group_vars/ele-ups/vars.yml index 1c4613a3..28a5eaff 100644 --- a/inventory/group_vars/ele-ups/vars.yml +++ b/inventory/group_vars/ele-ups/vars.yml @@ -14,8 +14,8 @@ prometheus_scrape_endpoint: "{{ network_mgmt_zone.prefix | ipaddr(network_mgmt_z prometheus_exporters_default: - openwrt -prometheus_job_nut__ups: - exporter_hostname: ele-mon - instance: "ups-{{ ups_name }}" - ups: "{{ ups_name }}" - server: "{{ network_mgmt_zone.prefix | ipaddr(network_mgmt_zone.offsets[inventory_hostname]) | ipaddr('address') }}" +prometheus_job_multitarget_nut__ups: + ele-mon: + - instance: "ups-{{ ups_name }}" + ups: "{{ ups_name }}" + server: "{{ network_mgmt_zone.prefix | ipaddr(network_mgmt_zone.offsets[inventory_hostname]) | ipaddr('address') }}" diff --git a/inventory/group_vars/promzone-chaos-at-home/vars.yml b/inventory/group_vars/promzone-chaos-at-home/vars.yml index 84ed1263..529bf3e7 100644 --- a/inventory/group_vars/promzone-chaos-at-home/vars.yml +++ b/inventory/group_vars/promzone-chaos-at-home/vars.yml @@ -9,10 +9,10 @@ prometheus_server: ch-mon prometheus_server_jobs: - node - openwrt + - nut - nut/ups - - blackbox/ping - - blackbox/https - - blackbox/ssh + - blackbox + - blackbox/probe prometheus_zone_name: chaos@home prometheus_zone_targets: "{{ groups['promzone-chaos-at-home'] }}" diff --git a/inventory/group_vars/promzone-elevate-festival/vars.yml b/inventory/group_vars/promzone-elevate-festival/vars.yml index 43115dc4..b3321614 100644 --- a/inventory/group_vars/promzone-elevate-festival/vars.yml +++ b/inventory/group_vars/promzone-elevate-festival/vars.yml @@ -9,6 +9,7 @@ prometheus_server: ele-mon prometheus_server_jobs: - node - openwrt + - nut - nut/ups prometheus_zone_name: Elevate Festival diff --git a/inventory/host_vars/ch-mon.yml b/inventory/host_vars/ch-mon.yml index d1a710b9..b2402d0c 100644 --- a/inventory/host_vars/ch-mon.yml +++ b/inventory/host_vars/ch-mon.yml @@ -76,26 +76,25 @@ prometheus_exporter_blackbox_modules_extra: icmp: prober: icmp -prometheus_job_blackbox__ping: - - exporter_hostname: ch-mon - instance: "ping-magentagw" - address: 62.99.185.129 - - exporter_hostname: ch-mon - instance: "ping-quad9" - address: 9.9.9.9 - -prometheus_job_blackbox__https: - - exporter_hostname: ch-mon - instance: "https-pan.chaos-at-home.org" - address: "https://pan.chaos-at-home.org" - - exporter_hostname: ch-mon - instance: "https-mimas.chaos-at-home.org" - address: "https://mimas.chaos-at-home.org" - -prometheus_job_blackbox__ssh: - - exporter_hostname: ch-mon - instance: "ssh-{{ inventory_hostname }}" - address: "{{ network_zones.svc.prefix | ipaddr(network_zones.svc.offsets[inventory_hostname]) | ipaddr('address') }}:{{ ansible_port | default(22) }}" +prometheus_job_multitarget_blackbox__probe: + ch-mon: + - instance: "ping-magentagw" + target: 62.99.185.129 + module: icmp + - instance: "ping-quad9" + target: 9.9.9.9 + module: icmp + + - instance: "https-pan.chaos-at-home.org" + target: "https://pan.chaos-at-home.org" + module: http_tls_2xx + - instance: "https-mimas.chaos-at-home.org" + target: "https://mimas.chaos-at-home.org" + module: http_tls_2xx + + - instance: "ssh-{{ inventory_hostname }}" + target: "{{ network_zones.svc.prefix | ipaddr(network_zones.svc.offsets[inventory_hostname]) | ipaddr('address') }}:{{ ansible_port | default(22) }}" + module: ssh_banner promethues_alertmanager_smtp: diff --git a/roles/monitoring/prometheus/exporter/blackbox/tasks/main.yml b/roles/monitoring/prometheus/exporter/blackbox/tasks/main.yml index f9793df6..c4cabfce 100644 --- a/roles/monitoring/prometheus/exporter/blackbox/tasks/main.yml +++ b/roles/monitoring/prometheus/exporter/blackbox/tasks/main.yml @@ -33,6 +33,9 @@ copy: content: | location = /blackbox { + proxy_pass http://127.0.0.1:9115/metrics; + } + location = /blackbox/probe { proxy_pass http://127.0.0.1:9115/probe; } dest: /etc/prometheus/exporter/blackbox.locations diff --git a/roles/monitoring/prometheus/server/defaults/main/main.yml b/roles/monitoring/prometheus/server/defaults/main/main.yml index 09cd150c..7781fd69 100644 --- a/roles/monitoring/prometheus/server/defaults/main/main.yml +++ b/roles/monitoring/prometheus/server/defaults/main/main.yml @@ -17,9 +17,7 @@ prometheus_server_rules: nut: "{{ prometheus_server_rules_nut + prometheus_server_rules_nut_extra }}" nut/ups: "{{ prometheus_server_rules_nut__ups + prometheus_server_rules_nut__ups_extra }}" blackbox: "{{ prometheus_server_rules_blackbox + prometheus_server_rules_blackbox_extra }}" - blackbox/ping: "{{ prometheus_server_rules_blackbox__ping + prometheus_server_rules_blackbox__ping_extra }}" - blackbox/https: "{{ prometheus_server_rules_blackbox__https + prometheus_server_rules_blackbox__https_extra }}" - blackbox/ssh: "{{ prometheus_server_rules_blackbox__ssh + prometheus_server_rules_blackbox__ssh_extra }}" + blackbox/probe: "{{ prometheus_server_rules_blackbox__probe + prometheus_server_rules_blackbox__probe_extra }}" # prometheus_server_alertmanager: # url: "127.0.0.1:9093" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml index d5c1fd42..99f2e83c 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml @@ -1,47 +1,3 @@ --- prometheus_server_rules_blackbox_extra: [] -prometheus_server_rules_blackbox: - - alert: BlackboxProbeFailed - expr: probe_success == 0 - for: 0m - labels: - severity: critical - annotations: - summary: Blackbox probe failed (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "Probe failed\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - - - alert: BlackboxSlowProbe - expr: avg_over_time(probe_duration_seconds[1m]) > 1 - for: 1m - labels: - severity: warning - annotations: - summary: Blackbox slow probe (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "Blackbox probe took more than 1s to complete\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - - - alert: BlackboxSslCertificateWillExpireSoon - expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30 - for: 0m - labels: - severity: warning - annotations: - summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "SSL certificate expires in 30 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - - - alert: BlackboxSslCertificateWillExpireSoon - expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3 - for: 0m - labels: - severity: critical - annotations: - summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "SSL certificate expires in 3 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - - - alert: BlackboxSslCertificateExpired - expr: probe_ssl_earliest_cert_expiry - time() <= 0 - for: 0m - labels: - severity: critical - annotations: - summary: Blackbox SSL certificate expired (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "SSL certificate has expired already\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" +prometheus_server_rules_blackbox: [] diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml deleted file mode 100644 index 140e3b4f..00000000 --- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml +++ /dev/null @@ -1,20 +0,0 @@ ---- -prometheus_server_rules_blackbox__https_extra: [] -prometheus_server_rules_blackbox__https: - - alert: BlackboxProbeHttpFailure - expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400 - for: 0m - labels: - severity: critical - annotations: - summary: Blackbox probe HTTP failure (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "HTTP status code is not 200-399\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - - - alert: BlackboxProbeSlowHttp - expr: avg_over_time(probe_http_duration_seconds[1m]) > 1 - for: 1m - labels: - severity: warning - annotations: - summary: Blackbox probe slow HTTP (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "HTTP request took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml deleted file mode 100644 index cc87b6b1..00000000 --- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml +++ /dev/null @@ -1,11 +0,0 @@ ---- -prometheus_server_rules_blackbox__ping_extra: [] -prometheus_server_rules_blackbox__ping: - - alert: BlackboxProbeSlowPing - expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1 - for: 1m - labels: - severity: warning - annotations: - summary: Blackbox probe slow ping (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "Blackbox ping took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml new file mode 100644 index 00000000..9f9d2292 --- /dev/null +++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml @@ -0,0 +1,74 @@ +--- +prometheus_server_rules_blackbox__probe_extra: [] +prometheus_server_rules_blackbox__probe: + - alert: BlackboxProbeFailed + expr: probe_success == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox probe failed (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Probe failed\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxSlowProbe + expr: avg_over_time(probe_duration_seconds[1m]) > 1 + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox slow probe (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Blackbox probe took more than 1s to complete\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxSslCertificateWillExpireSoon + expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30 + for: 0m + labels: + severity: warning + annotations: + summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "SSL certificate expires in 30 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxSslCertificateWillExpireSoon + expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "SSL certificate expires in 3 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxSslCertificateExpired + expr: probe_ssl_earliest_cert_expiry - time() <= 0 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox SSL certificate expired (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "SSL certificate has expired already\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxProbeHttpFailure + expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox probe HTTP failure (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "HTTP status code is not 200-399\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxProbeSlowHttp + expr: avg_over_time(probe_http_duration_seconds[1m]) > 1 + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox probe slow HTTP (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "HTTP request took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxProbeSlowPing + expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1 + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox probe slow ping (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Blackbox ping took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ssh.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ssh.yml deleted file mode 100644 index 8e717c41..00000000 --- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ssh.yml +++ /dev/null @@ -1,3 +0,0 @@ ---- -prometheus_server_rules_blackbox__ssh_extra: [] -prometheus_server_rules_blackbox__ssh: [] diff --git a/roles/monitoring/prometheus/server/filter_plugins/prometheus.py b/roles/monitoring/prometheus/server/filter_plugins/prometheus.py index 1443e837..d91ef619 100644 --- a/roles/monitoring/prometheus/server/filter_plugins/prometheus.py +++ b/roles/monitoring/prometheus/server/filter_plugins/prometheus.py @@ -11,10 +11,12 @@ def prometheus_job_targets(hostvars, jobs, targets): result = [] for job in jobs: for target in targets: - special_config_varname = 'prometheus_job_' + job.replace('-', '_').replace('/', '__') - if special_config_varname in hostvars[target]: - for config in hostvars[target][special_config_varname]: - result.append({'job': job, 'instance': config['instance'], 'config': config, 'enabled': True}) + multitarget_config_varname = 'prometheus_job_multitarget_' + job.replace('-', '_').replace('/', '__') + if multitarget_config_varname in hostvars[target]: + for exporter_hostname, configs in hostvars[target][multitarget_config_varname].items(): + for config in configs: + result.append({'job': job, 'instance': config['instance'], 'enabled': True, + 'exporter_hostname': exporter_hostname, 'config': config}) else: enabled = job in hostvars[target]['prometheus_exporters_default'] or job in hostvars[target]['prometheus_exporters_extra'] diff --git a/roles/monitoring/prometheus/server/tasks/main.yml b/roles/monitoring/prometheus/server/tasks/main.yml index c0928cc3..16167c9c 100644 --- a/roles/monitoring/prometheus/server/tasks/main.yml +++ b/roles/monitoring/prometheus/server/tasks/main.yml @@ -83,7 +83,7 @@ state: directory - name: generate rules files for all jobs - loop: "{{ prometheus_server_jobs | union(['prometheus']) | union(prometheus_server_jobs | select('match', '.*/.*') | map('dirname') | unique) }}" + loop: "{{ prometheus_server_jobs | union(['prometheus']) }}" template: src: rules.yml.j2 dest: "/etc/prometheus/rules/{{ item }}.yml" diff --git a/roles/monitoring/prometheus/server/templates/jobs/blackbox/https.j2 b/roles/monitoring/prometheus/server/templates/jobs/blackbox/https.j2 deleted file mode 100644 index 86ff88dd..00000000 --- a/roles/monitoring/prometheus/server/templates/jobs/blackbox/https.j2 +++ /dev/null @@ -1,13 +0,0 @@ - - job_name: '{{ job }}' - metrics_path: /blackbox - params: - module: - - http_tls_2xx - scheme: https - tls_config: - ca_file: /etc/ssl/prometheus/ca-crt.pem - cert_file: /etc/ssl/prometheus/server/scrape-crt.pem - key_file: /etc/ssl/prometheus/server/scrape-key.pem - file_sd_configs: - - files: - - "/etc/prometheus/targets/{{ job }}/*.yml" diff --git a/roles/monitoring/prometheus/server/templates/jobs/blackbox/ping.j2 b/roles/monitoring/prometheus/server/templates/jobs/blackbox/ping.j2 deleted file mode 100644 index 2d3889d2..00000000 --- a/roles/monitoring/prometheus/server/templates/jobs/blackbox/ping.j2 +++ /dev/null @@ -1,13 +0,0 @@ - - job_name: '{{ job }}' - metrics_path: /blackbox - params: - module: - - icmp - scheme: https - tls_config: - ca_file: /etc/ssl/prometheus/ca-crt.pem - cert_file: /etc/ssl/prometheus/server/scrape-crt.pem - key_file: /etc/ssl/prometheus/server/scrape-key.pem - file_sd_configs: - - files: - - "/etc/prometheus/targets/{{ job }}/*.yml" diff --git a/roles/monitoring/prometheus/server/templates/jobs/blackbox/ssh.j2 b/roles/monitoring/prometheus/server/templates/jobs/blackbox/ssh.j2 deleted file mode 100644 index 97565673..00000000 --- a/roles/monitoring/prometheus/server/templates/jobs/blackbox/ssh.j2 +++ /dev/null @@ -1,13 +0,0 @@ - - job_name: '{{ job }}' - metrics_path: /blackbox - params: - module: - - ssh_banner - scheme: https - tls_config: - ca_file: /etc/ssl/prometheus/ca-crt.pem - cert_file: /etc/ssl/prometheus/server/scrape-crt.pem - key_file: /etc/ssl/prometheus/server/scrape-key.pem - file_sd_configs: - - files: - - "/etc/prometheus/targets/{{ job }}/*.yml" diff --git a/roles/monitoring/prometheus/server/templates/jobs/nut/ups.j2 b/roles/monitoring/prometheus/server/templates/jobs/nut/ups.j2 deleted file mode 100644 index 0cf4ae4e..00000000 --- a/roles/monitoring/prometheus/server/templates/jobs/nut/ups.j2 +++ /dev/null @@ -1,10 +0,0 @@ - - job_name: '{{ job }}' - metrics_path: /nut/ups - scheme: https - tls_config: - ca_file: /etc/ssl/prometheus/ca-crt.pem - cert_file: /etc/ssl/prometheus/server/scrape-crt.pem - key_file: /etc/ssl/prometheus/server/scrape-key.pem - file_sd_configs: - - files: - - "/etc/prometheus/targets/{{ job }}/*.yml" diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox/https.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox/https.yml.j2 deleted file mode 100644 index 29c89590..00000000 --- a/roles/monitoring/prometheus/server/templates/targets/blackbox/https.yml.j2 +++ /dev/null @@ -1,4 +0,0 @@ -- targets: [ '{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}' ] - labels: - instance: '{{ target.instance }}' - __param_target: '{{ target.config.address }}' diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox/ping.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox/ping.yml.j2 deleted file mode 100644 index 29c89590..00000000 --- a/roles/monitoring/prometheus/server/templates/targets/blackbox/ping.yml.j2 +++ /dev/null @@ -1,4 +0,0 @@ -- targets: [ '{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}' ] - labels: - instance: '{{ target.instance }}' - __param_target: '{{ target.config.address }}' diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox/probe.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox/probe.yml.j2 new file mode 100644 index 00000000..4e336873 --- /dev/null +++ b/roles/monitoring/prometheus/server/templates/targets/blackbox/probe.yml.j2 @@ -0,0 +1,5 @@ +- targets: [ '{{ hostvars[target.exporter_hostname].prometheus_scrape_endpoint }}' ] + labels: + instance: '{{ target.instance }}' + __param_target: '{{ target.config.target }}' + __param_module: '{{ target.config.module }}' diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox/ssh.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox/ssh.yml.j2 deleted file mode 100644 index 29c89590..00000000 --- a/roles/monitoring/prometheus/server/templates/targets/blackbox/ssh.yml.j2 +++ /dev/null @@ -1,4 +0,0 @@ -- targets: [ '{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}' ] - labels: - instance: '{{ target.instance }}' - __param_target: '{{ target.config.address }}' diff --git a/roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2 index 6003cd46..c60077c7 100644 --- a/roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2 +++ b/roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2 @@ -1,4 +1,4 @@ -- targets: [ '{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}' ] +- targets: [ '{{ hostvars[target.exporter_hostname].prometheus_scrape_endpoint }}' ] labels: instance: '{{ target.instance }}' __param_ups: '{{ target.config.ups }}' -- cgit v1.2.3 From 419ede2858769e4414a23a42b57931b83cf70d8c Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Sun, 26 Sep 2021 04:08:58 +0200 Subject: add job configs for ipmi and ipmi/remote --- inventory/host_vars/ch-testvm-prometheus.yml | 3 --- roles/monitoring/prometheus/exporter/ipmi/defaults/main.yml | 3 ++- roles/monitoring/prometheus/server/defaults/main/main.yml | 2 ++ roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml | 4 ++++ .../prometheus/server/defaults/main/rules_ipmi__remote.yml | 4 ++++ .../prometheus/server/templates/targets/ipmi/remote.yml.j2 | 5 +++++ 6 files changed, 17 insertions(+), 4 deletions(-) create mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml create mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml create mode 100644 roles/monitoring/prometheus/server/templates/targets/ipmi/remote.yml.j2 (limited to 'roles/monitoring/prometheus/server/defaults') diff --git a/inventory/host_vars/ch-testvm-prometheus.yml b/inventory/host_vars/ch-testvm-prometheus.yml index 939fa398..e539735f 100644 --- a/inventory/host_vars/ch-testvm-prometheus.yml +++ b/inventory/host_vars/ch-testvm-prometheus.yml @@ -36,9 +36,6 @@ network: spreadspace_apt_repo_components: - prometheus -prometheus_exporters_extra: - - ipmi - containerd_storage: type: lvm diff --git a/roles/monitoring/prometheus/exporter/ipmi/defaults/main.yml b/roles/monitoring/prometheus/exporter/ipmi/defaults/main.yml index 9b99f9a5..6cf14f76 100644 --- a/roles/monitoring/prometheus/exporter/ipmi/defaults/main.yml +++ b/roles/monitoring/prometheus/exporter/ipmi/defaults/main.yml @@ -1,6 +1,7 @@ --- prometheus_exporter_ipmi_modules: - default: {} + default: + collectors: [] # collectors: # - bmc # - ipmi diff --git a/roles/monitoring/prometheus/server/defaults/main/main.yml b/roles/monitoring/prometheus/server/defaults/main/main.yml index 7781fd69..1e0ccf78 100644 --- a/roles/monitoring/prometheus/server/defaults/main/main.yml +++ b/roles/monitoring/prometheus/server/defaults/main/main.yml @@ -18,6 +18,8 @@ prometheus_server_rules: nut/ups: "{{ prometheus_server_rules_nut__ups + prometheus_server_rules_nut__ups_extra }}" blackbox: "{{ prometheus_server_rules_blackbox + prometheus_server_rules_blackbox_extra }}" blackbox/probe: "{{ prometheus_server_rules_blackbox__probe + prometheus_server_rules_blackbox__probe_extra }}" + ipmi: "{{ prometheus_server_rules_ipmi + prometheus_server_rules_ipmi_extra }}" + ipmi/remote: "{{ prometheus_server_rules_ipmi__remote + prometheus_server_rules_ipmi__remote_extra }}" # prometheus_server_alertmanager: # url: "127.0.0.1:9093" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml new file mode 100644 index 00000000..41dcd7e9 --- /dev/null +++ b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml @@ -0,0 +1,4 @@ +--- +prometheus_server_rules_ipmi_extra: [] +prometheus_server_rules_ipmi: [] +## TODO: add common IPMI alert rules diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml new file mode 100644 index 00000000..1f9338ea --- /dev/null +++ b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml @@ -0,0 +1,4 @@ +--- +prometheus_server_rules_ipmi__remote_extra: [] +prometheus_server_rules_ipmi__remote: [] +## TODO: add remote-IPMI specific alert rules diff --git a/roles/monitoring/prometheus/server/templates/targets/ipmi/remote.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/ipmi/remote.yml.j2 new file mode 100644 index 00000000..4e336873 --- /dev/null +++ b/roles/monitoring/prometheus/server/templates/targets/ipmi/remote.yml.j2 @@ -0,0 +1,5 @@ +- targets: [ '{{ hostvars[target.exporter_hostname].prometheus_scrape_endpoint }}' ] + labels: + instance: '{{ target.instance }}' + __param_target: '{{ target.config.target }}' + __param_module: '{{ target.config.module }}' -- cgit v1.2.3 From 93afffd62ab0da48230985440aae11afbe4de79b Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Sun, 26 Sep 2021 04:21:56 +0200 Subject: fix alert wording --- roles/monitoring/prometheus/server/defaults/main/rules_node.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'roles/monitoring/prometheus/server/defaults') diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml index 64a7d562..55641534 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml @@ -243,7 +243,7 @@ prometheus_server_rules_node: severity: warning annotations: summary: Host has packages that can be autoremoved (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "Host {{ '{{' }} $labels.instance {{ '}}' }} has {{ '{{' }} $value {{ '}}' }} that can be autoremoved.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + description: "Host {{ '{{' }} $labels.instance {{ '}}' }} has {{ '{{' }} $value {{ '}}' }} packages that can be autoremoved.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - alert: HostNeedsRebooting expr: node_reboot_required > 0 -- cgit v1.2.3