From cc89d6d4211aa5aec8e5bef8c854d4929c337887 Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Sun, 26 Sep 2021 03:32:47 +0200 Subject: improved promethues multitarget support --- .../prometheus/server/defaults/main/main.yml | 4 +- .../server/defaults/main/rules_blackbox.yml | 46 +------------- .../server/defaults/main/rules_blackbox__https.yml | 20 ------ .../server/defaults/main/rules_blackbox__ping.yml | 11 ---- .../server/defaults/main/rules_blackbox__probe.yml | 74 ++++++++++++++++++++++ .../server/defaults/main/rules_blackbox__ssh.yml | 3 - 6 files changed, 76 insertions(+), 82 deletions(-) delete mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml delete mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml create mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml delete mode 100644 roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ssh.yml (limited to 'roles/monitoring/prometheus/server/defaults/main') diff --git a/roles/monitoring/prometheus/server/defaults/main/main.yml b/roles/monitoring/prometheus/server/defaults/main/main.yml index 09cd150c..7781fd69 100644 --- a/roles/monitoring/prometheus/server/defaults/main/main.yml +++ b/roles/monitoring/prometheus/server/defaults/main/main.yml @@ -17,9 +17,7 @@ prometheus_server_rules: nut: "{{ prometheus_server_rules_nut + prometheus_server_rules_nut_extra }}" nut/ups: "{{ prometheus_server_rules_nut__ups + prometheus_server_rules_nut__ups_extra }}" blackbox: "{{ prometheus_server_rules_blackbox + prometheus_server_rules_blackbox_extra }}" - blackbox/ping: "{{ prometheus_server_rules_blackbox__ping + prometheus_server_rules_blackbox__ping_extra }}" - blackbox/https: "{{ prometheus_server_rules_blackbox__https + prometheus_server_rules_blackbox__https_extra }}" - blackbox/ssh: "{{ prometheus_server_rules_blackbox__ssh + prometheus_server_rules_blackbox__ssh_extra }}" + blackbox/probe: "{{ prometheus_server_rules_blackbox__probe + prometheus_server_rules_blackbox__probe_extra }}" # prometheus_server_alertmanager: # url: "127.0.0.1:9093" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml index d5c1fd42..99f2e83c 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml @@ -1,47 +1,3 @@ --- prometheus_server_rules_blackbox_extra: [] -prometheus_server_rules_blackbox: - - alert: BlackboxProbeFailed - expr: probe_success == 0 - for: 0m - labels: - severity: critical - annotations: - summary: Blackbox probe failed (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "Probe failed\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - - - alert: BlackboxSlowProbe - expr: avg_over_time(probe_duration_seconds[1m]) > 1 - for: 1m - labels: - severity: warning - annotations: - summary: Blackbox slow probe (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "Blackbox probe took more than 1s to complete\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - - - alert: BlackboxSslCertificateWillExpireSoon - expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30 - for: 0m - labels: - severity: warning - annotations: - summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "SSL certificate expires in 30 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - - - alert: BlackboxSslCertificateWillExpireSoon - expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3 - for: 0m - labels: - severity: critical - annotations: - summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "SSL certificate expires in 3 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - - - alert: BlackboxSslCertificateExpired - expr: probe_ssl_earliest_cert_expiry - time() <= 0 - for: 0m - labels: - severity: critical - annotations: - summary: Blackbox SSL certificate expired (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "SSL certificate has expired already\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" +prometheus_server_rules_blackbox: [] diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml deleted file mode 100644 index 140e3b4f..00000000 --- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__https.yml +++ /dev/null @@ -1,20 +0,0 @@ ---- -prometheus_server_rules_blackbox__https_extra: [] -prometheus_server_rules_blackbox__https: - - alert: BlackboxProbeHttpFailure - expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400 - for: 0m - labels: - severity: critical - annotations: - summary: Blackbox probe HTTP failure (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "HTTP status code is not 200-399\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - - - alert: BlackboxProbeSlowHttp - expr: avg_over_time(probe_http_duration_seconds[1m]) > 1 - for: 1m - labels: - severity: warning - annotations: - summary: Blackbox probe slow HTTP (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "HTTP request took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml deleted file mode 100644 index cc87b6b1..00000000 --- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ping.yml +++ /dev/null @@ -1,11 +0,0 @@ ---- -prometheus_server_rules_blackbox__ping_extra: [] -prometheus_server_rules_blackbox__ping: - - alert: BlackboxProbeSlowPing - expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1 - for: 1m - labels: - severity: warning - annotations: - summary: Blackbox probe slow ping (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "Blackbox ping took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml new file mode 100644 index 00000000..9f9d2292 --- /dev/null +++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml @@ -0,0 +1,74 @@ +--- +prometheus_server_rules_blackbox__probe_extra: [] +prometheus_server_rules_blackbox__probe: + - alert: BlackboxProbeFailed + expr: probe_success == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox probe failed (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Probe failed\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxSlowProbe + expr: avg_over_time(probe_duration_seconds[1m]) > 1 + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox slow probe (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Blackbox probe took more than 1s to complete\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxSslCertificateWillExpireSoon + expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30 + for: 0m + labels: + severity: warning + annotations: + summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "SSL certificate expires in 30 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxSslCertificateWillExpireSoon + expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "SSL certificate expires in 3 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxSslCertificateExpired + expr: probe_ssl_earliest_cert_expiry - time() <= 0 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox SSL certificate expired (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "SSL certificate has expired already\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxProbeHttpFailure + expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox probe HTTP failure (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "HTTP status code is not 200-399\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxProbeSlowHttp + expr: avg_over_time(probe_http_duration_seconds[1m]) > 1 + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox probe slow HTTP (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "HTTP request took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxProbeSlowPing + expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1 + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox probe slow ping (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Blackbox ping took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ssh.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ssh.yml deleted file mode 100644 index 8e717c41..00000000 --- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__ssh.yml +++ /dev/null @@ -1,3 +0,0 @@ ---- -prometheus_server_rules_blackbox__ssh_extra: [] -prometheus_server_rules_blackbox__ssh: [] -- cgit v1.2.3