From 486c84d53244e44ff72a3c2db42ee12afdb083e8 Mon Sep 17 00:00:00 2001
From: Christian Pointner
Date: Thu, 23 Sep 2021 20:37:20 +0200
Subject: add some more prometheus rules for blackbox exporter

---
 .../prometheus/server/defaults/main/main.yml | 2 +
 .../server/defaults/main/rules_blackbox.yml | 47 ++++++++++++++++++++++
 .../server/defaults/main/rules_blackbox__https.yml | 19 ++++++++-
 .../server/defaults/main/rules_blackbox__ping.yml | 10 ++++-
 .../prometheus/server/defaults/main/rules_nut.yml | 3 ++
 roles/monitoring/prometheus/server/tasks/main.yml | 2 +-
 6 files changed, 80 insertions(+), 3 deletions(-)
 create mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml
 create mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_nut.yml

diff --git a/roles/monitoring/prometheus/server/defaults/main/main.yml b/roles/monitoring/prometheus/server/defaults/main/main.yml
index bae0cdba..09cd150c 100644
--- a/roles/monitoring/prometheus/server/defaults/main/main.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/main.yml
@@ -14,7 +14,9 @@ prometheus_server_rules:
   prometheus: "{{ prometheus_server_rules_prometheus + ((prometheus_server_alertmanager is defined) | ternary(prometheus_server_rules_prometheus_alertmanager, [])) + prometheus_server_rules_prometheus_extra }}"
   node: "{{ prometheus_server_rules_node + prometheus_server_rules_node_extra }}"
   openwrt: "{{ prometheus_server_rules_openwrt + prometheus_server_rules_node_extra }}"
+  nut: "{{ prometheus_server_rules_nut + prometheus_server_rules_nut_extra }}"
   nut/ups: "{{ prometheus_server_rules_nut__ups + prometheus_server_rules_nut__ups_extra }}"
+  blackbox: "{{ prometheus_server_rules_blackbox + prometheus_server_rules_blackbox_extra }}"
   blackbox/ping: "{{ prometheus_server_rules_blackbox__ping + prometheus_server_rules_blackbox__ping_extra }}"
   blackbox/https: "{{ prometheus_server_rules_blackbox__https + prometheus_server_rules_blackbox__https_extra }}"
   blackbox/ssh: "{{ prometheus_server_rules_blackbox__ssh + prometheus_server_rules_blackbox__ssh_extra }}"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml
new file mode 100644
index 00000000..d5c1fd42
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml
@@ -0,0 +1,47 @@
+---
+prometheus_server_rules_blackbox_extra: []
+prometheus_server_rules_blackbox:
+  - alert: BlackboxProbeFailed
+    expr: probe_success == 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Blackbox probe failed (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "Probe failed\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: BlackboxSlowProbe
+    expr: avg_over_time(probe_duration_seconds[1m]) > 1
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: Blackbox slow probe (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "Blackbox probe took more than 1s to complete\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: BlackboxSslCertificateWillExpireSoon
+    expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "SSL certificate expires in 30 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: BlackboxSslCertificateWillExpireSoon
+    expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "SSL certificate expires in 3 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: BlackboxSslCertificateExpired
+    expr: probe_ssl_earliest_cert_expiry - time() <= 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Blackbox SSL certificate expired (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "SSL certificate has expired already\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml
index cfdc10bd..140e3b4f 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml
@@ -1,3 +1,20 @@
 ---
 prometheus_server_rules_blackbox__https_extra: []
-prometheus_server_rules_blackbox__https: []
+prometheus_server_rules_blackbox__https:
+  - alert: BlackboxProbeHttpFailure
+    expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Blackbox probe HTTP failure (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "HTTP status code is not 200-399\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+  - alert: BlackboxProbeSlowHttp
+    expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: Blackbox probe slow HTTP (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "HTTP request took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml
index 06ce8607..cc87b6b1 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml
@@ -1,3 +1,11 @@
 ---
 prometheus_server_rules_blackbox__ping_extra: []
-prometheus_server_rules_blackbox__ping: []
+prometheus_server_rules_blackbox__ping:
+  - alert: BlackboxProbeSlowPing
+    expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: Blackbox probe slow ping (instance {{ '{{' }} $labels.instance {{ '}}' }})
+      description: "Blackbox ping took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_nut.yml b/roles/monitoring/prometheus/server/defaults/main/rules_nut.yml
new file mode 100644
index 00000000..d8d64f64
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_nut.yml
@@ -0,0 +1,3 @@
+---
+prometheus_server_rules_nut_extra: []
+prometheus_server_rules_nut: []
diff --git a/roles/monitoring/prometheus/server/tasks/main.yml b/roles/monitoring/prometheus/server/tasks/main.yml
index 16167c9c..c0928cc3 100644
--- a/roles/monitoring/prometheus/server/tasks/main.yml
+++ b/roles/monitoring/prometheus/server/tasks/main.yml
@@ -83,7 +83,7 @@
     state: directory
 
 - name: generate rules files for all jobs
-  loop: "{{ prometheus_server_jobs | union(['prometheus']) }}"
+  loop: "{{ prometheus_server_jobs | union(['prometheus']) | union(prometheus_server_jobs | select('match', '.*/.*') | map('dirname') | unique) }}"
   template:
     src: rules.yml.j2
     dest: "/etc/prometheus/rules/{{ item }}.yml"
-- 
cgit v1.2.3