diff options
Diffstat (limited to 'roles/monitoring/prometheus/server/defaults')
3 files changed, 56 insertions, 3 deletions
diff --git a/roles/monitoring/prometheus/server/defaults/main/main.yml b/roles/monitoring/prometheus/server/defaults/main/main.yml index 9f867568..dd290e9e 100644 --- a/roles/monitoring/prometheus/server/defaults/main/main.yml +++ b/roles/monitoring/prometheus/server/defaults/main/main.yml @@ -15,7 +15,7 @@ prometheus_server_evaluation_interval: "15s" prometheus_server_jobs: - node -#prometheus_server_jobs_extra: | +#prometheus_server_jobs_extra: |- # - job_name: ... prometheus_server_rules: @@ -41,6 +41,7 @@ prometheus_server_rules: nftables: "{{ prometheus_server_rules_nftables + prometheus_server_rules_nftables_extra }}" whawty-nginx-sso: "{{ prometheus_server_rules_whawty_nginx_sso + prometheus_server_rules_whawty_nginx_sso_extra }}" mosquitto: "{{ prometheus_server_rules_mosquitto + prometheus_server_rules_mosquitto_extra }}" + coredns: "{{ prometheus_server_rules_coredns + prometheus_server_rules_coredns_extra }}" # prometheus_server_alertmanager: # url: "127.0.0.1:9093" @@ -48,6 +49,7 @@ prometheus_server_rules: # basic_auth: # username: server # password: geheim +# scrape_instance: <inventory-hostname> prometheus_server_web_listen_address: 127.0.0.1:9090 # prometheus_server_web_route_prefix: /prometheus/ @@ -73,5 +75,27 @@ prometheus_server_web_listen_address: 127.0.0.1:9090 # - node # - blackbox +prometheus_server_remote_write_receiver: no + +# prometheus_server_remote_write_destinations: +# example: +# url: "https://mon.example.com/prometheus/api/v1/write" +# basic_auth: +# username: remote +# password_file: /etc/prometheus/prometheus-remote.secret +# tls_config: +# ca: | +# -----BEGIN CERTIFICATE----- +# ... +# -----END CERTIFICATE----- +# write_relabel_configs: +# - source_labels: ['__name__'] +# regex: 'go_gc_.*' +# action: 'drop' +# - source_labels: ['job'] +# regex: 'alertmanager' +# action: 'drop' + # prometheus_server_secret_files: # user: secret +# remote: othersecret diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_coredns.yml b/roles/monitoring/prometheus/server/defaults/main/rules_coredns.yml new file mode 100644 index 00000000..126a7ba4 --- /dev/null +++ b/roles/monitoring/prometheus/server/defaults/main/rules_coredns.yml @@ -0,0 +1,29 @@ +--- +prometheus_server_rules_coredns_extra: [] +prometheus_server_rules_coredns: + - alert: CorednsPanicCount + expr: increase(coredns_panics_total[15m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: CoreDNS Panic (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Number of CoreDNS panics encountered has been increasing in the last 15 minutes\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: CorednsFailedReloadCount + expr: increase(coredns_reload_failed_total[15m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: CoreDNS reload failed (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Number of CoreDNS failed reloads has been increasing in the last 15 minutes\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: CorednsBrokenForwardHealthchecks + expr: increase(coredns_forward_healthcheck_broken_total[15m]) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: CoreDNS broken forward healthchecks (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Number of CoreDNS broken forward healthchecks has been increasing in the last 15 minutes\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_prometheus.yml b/roles/monitoring/prometheus/server/defaults/main/rules_prometheus.yml index 4db6cd17..5cb27264 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_prometheus.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_prometheus.yml @@ -93,13 +93,13 @@ prometheus_server_rules_prometheus: description: "Prometheus has no target in service discovery\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - alert: PrometheusTargetScrapingSlow - expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60 + expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05 for: 5m labels: severity: warning annotations: summary: Prometheus target scraping slow (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "Prometheus is scraping exporters slowly\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - alert: PrometheusLargeScrape expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10 |