diff options
Diffstat (limited to 'roles/monitoring/prometheus/server/defaults/main')
11 files changed, 133 insertions, 15 deletions
diff --git a/roles/monitoring/prometheus/server/defaults/main/main.yml b/roles/monitoring/prometheus/server/defaults/main/main.yml index 95b9da6d..1e0ccf78 100644 --- a/roles/monitoring/prometheus/server/defaults/main/main.yml +++ b/roles/monitoring/prometheus/server/defaults/main/main.yml @@ -5,9 +5,8 @@ prometheus_server_retention: "15d" -prometheus_server_jobs_generic: +prometheus_server_jobs: - node -prometheus_server_jobs_special: [] #prometheus_server_jobs_extra: | # - job_name: ... @@ -16,9 +15,11 @@ prometheus_server_rules: node: "{{ prometheus_server_rules_node + prometheus_server_rules_node_extra }}" openwrt: "{{ prometheus_server_rules_openwrt + prometheus_server_rules_node_extra }}" nut: "{{ prometheus_server_rules_nut + prometheus_server_rules_nut_extra }}" - "blackbox-ping": "{{ prometheus_server_rules_blackbox_ping + prometheus_server_rules_blackbox_ping_extra }}" - "blackbox-https": "{{ prometheus_server_rules_blackbox_https + prometheus_server_rules_blackbox_https_extra }}" - "blackbox-ssh": "{{ prometheus_server_rules_blackbox_ssh + prometheus_server_rules_blackbox_ssh_extra }}" + nut/ups: "{{ prometheus_server_rules_nut__ups + prometheus_server_rules_nut__ups_extra }}" + blackbox: "{{ prometheus_server_rules_blackbox + prometheus_server_rules_blackbox_extra }}" + blackbox/probe: "{{ prometheus_server_rules_blackbox__probe + prometheus_server_rules_blackbox__probe_extra }}" + ipmi: "{{ prometheus_server_rules_ipmi + prometheus_server_rules_ipmi_extra }}" + ipmi/remote: "{{ prometheus_server_rules_ipmi__remote + prometheus_server_rules_ipmi__remote_extra }}" # prometheus_server_alertmanager: # url: "127.0.0.1:9093" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-https.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-https.yml deleted file mode 100644 index bb806075..00000000 --- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-https.yml +++ /dev/null @@ -1,3 +0,0 @@ ---- -prometheus_server_rules_blackbox_https_extra: [] -prometheus_server_rules_blackbox_https: [] diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ping.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ping.yml deleted file mode 100644 index 56c122f5..00000000 --- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ping.yml +++ /dev/null @@ -1,3 +0,0 @@ ---- -prometheus_server_rules_blackbox_ping_extra: [] -prometheus_server_rules_blackbox_ping: [] diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ssh.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ssh.yml deleted file mode 100644 index 727d2292..00000000 --- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ssh.yml +++ /dev/null @@ -1,3 +0,0 @@ ---- -prometheus_server_rules_blackbox_ssh_extra: [] -prometheus_server_rules_blackbox_ssh: [] diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml new file mode 100644 index 00000000..99f2e83c --- /dev/null +++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml @@ -0,0 +1,3 @@ +--- +prometheus_server_rules_blackbox_extra: [] +prometheus_server_rules_blackbox: [] diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml new file mode 100644 index 00000000..9f9d2292 --- /dev/null +++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml @@ -0,0 +1,74 @@ +--- +prometheus_server_rules_blackbox__probe_extra: [] +prometheus_server_rules_blackbox__probe: + - alert: BlackboxProbeFailed + expr: probe_success == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox probe failed (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Probe failed\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxSlowProbe + expr: avg_over_time(probe_duration_seconds[1m]) > 1 + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox slow probe (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Blackbox probe took more than 1s to complete\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxSslCertificateWillExpireSoon + expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30 + for: 0m + labels: + severity: warning + annotations: + summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "SSL certificate expires in 30 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxSslCertificateWillExpireSoon + expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "SSL certificate expires in 3 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxSslCertificateExpired + expr: probe_ssl_earliest_cert_expiry - time() <= 0 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox SSL certificate expired (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "SSL certificate has expired already\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxProbeHttpFailure + expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox probe HTTP failure (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "HTTP status code is not 200-399\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxProbeSlowHttp + expr: avg_over_time(probe_http_duration_seconds[1m]) > 1 + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox probe slow HTTP (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "HTTP request took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxProbeSlowPing + expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1 + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox probe slow ping (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Blackbox ping took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml new file mode 100644 index 00000000..41dcd7e9 --- /dev/null +++ b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml @@ -0,0 +1,4 @@ +--- +prometheus_server_rules_ipmi_extra: [] +prometheus_server_rules_ipmi: [] +## TODO: add common IPMI alert rules diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml new file mode 100644 index 00000000..1f9338ea --- /dev/null +++ b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml @@ -0,0 +1,4 @@ +--- +prometheus_server_rules_ipmi__remote_extra: [] +prometheus_server_rules_ipmi__remote: [] +## TODO: add remote-IPMI specific alert rules diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml index ab7317ac..55641534 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml @@ -92,6 +92,15 @@ prometheus_server_rules_node: summary: Host CPU steal noisy neighbor (instance {{ '{{' }} $labels.instance {{ '}}' }}) description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + - alert: HostSystemdNotRunning + expr: node_systemd_system_running == 0 + for: 0m + labels: + severity: warning + annotations: + summary: Host systemd is not in running state (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "systemd is not in running state.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + - alert: HostSystemdServiceCrashed expr: node_systemd_unit_state{state="failed"} == 1 for: 0m @@ -99,7 +108,7 @@ prometheus_server_rules_node: severity: warning annotations: summary: Host systemd service crashed (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "systemd service crashed\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + description: "The systemd service unit {{ '{{' }} $labels.name {{ '}}' }} is in failed state.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - alert: HostPhysicalComponentTooHot expr: node_hwmon_temp_celsius > 75 @@ -217,3 +226,30 @@ prometheus_server_rules_node: annotations: summary: Host clock not synchronising (instance {{ '{{' }} $labels.instance {{ '}}' }}) description: "Clock not synchronising.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: AptUpgradesPending + expr: sum by (instance) (apt_upgrades_pending) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Host has upgradeable packages (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Host {{ '{{' }} $labels.instance {{ '}}' }} has {{ '{{' }} $value {{ '}}' }} upgradable packages.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: AptAutoremovePending + expr: sum by (instance) (apt_autoremove_pending) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Host has packages that can be autoremoved (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Host {{ '{{' }} $labels.instance {{ '}}' }} has {{ '{{' }} $value {{ '}}' }} packages that can be autoremoved.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: HostNeedsRebooting + expr: node_reboot_required > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Host must be rebootet (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Host {{ '{{' }} $labels.instance {{ '}}' }} must be rebootet for security uppdates to take effect.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml b/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml new file mode 100644 index 00000000..150a507e --- /dev/null +++ b/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml @@ -0,0 +1,4 @@ +--- +prometheus_server_rules_nut__ups_extra: [] +prometheus_server_rules_nut__ups: [] +## TODO: add NUT/UPS alert rules diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml b/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml index 88d84f31..04b178f1 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml @@ -1,3 +1,4 @@ --- prometheus_server_rules_openwrt_extra: [] prometheus_server_rules_openwrt: [] +## TODO: add openwrt specific alert rules |