summaryrefslogtreecommitdiff
path: root/roles/monitoring/prometheus/server/defaults/main
diff options
context:
space:
mode:
Diffstat (limited to 'roles/monitoring/prometheus/server/defaults/main')
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/main.yml26
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_coredns.yml29
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_prometheus.yml4
3 files changed, 56 insertions, 3 deletions
diff --git a/roles/monitoring/prometheus/server/defaults/main/main.yml b/roles/monitoring/prometheus/server/defaults/main/main.yml
index 9f867568..dd290e9e 100644
--- a/roles/monitoring/prometheus/server/defaults/main/main.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/main.yml
@@ -15,7 +15,7 @@ prometheus_server_evaluation_interval: "15s"
prometheus_server_jobs:
- node
-#prometheus_server_jobs_extra: |
+#prometheus_server_jobs_extra: |-
# - job_name: ...
prometheus_server_rules:
@@ -41,6 +41,7 @@ prometheus_server_rules:
nftables: "{{ prometheus_server_rules_nftables + prometheus_server_rules_nftables_extra }}"
whawty-nginx-sso: "{{ prometheus_server_rules_whawty_nginx_sso + prometheus_server_rules_whawty_nginx_sso_extra }}"
mosquitto: "{{ prometheus_server_rules_mosquitto + prometheus_server_rules_mosquitto_extra }}"
+ coredns: "{{ prometheus_server_rules_coredns + prometheus_server_rules_coredns_extra }}"
# prometheus_server_alertmanager:
# url: "127.0.0.1:9093"
@@ -48,6 +49,7 @@ prometheus_server_rules:
# basic_auth:
# username: server
# password: geheim
+# scrape_instance: <inventory-hostname>
prometheus_server_web_listen_address: 127.0.0.1:9090
# prometheus_server_web_route_prefix: /prometheus/
@@ -73,5 +75,27 @@ prometheus_server_web_listen_address: 127.0.0.1:9090
# - node
# - blackbox
+prometheus_server_remote_write_receiver: no
+
+# prometheus_server_remote_write_destinations:
+# example:
+# url: "https://mon.example.com/prometheus/api/v1/write"
+# basic_auth:
+# username: remote
+# password_file: /etc/prometheus/prometheus-remote.secret
+# tls_config:
+# ca: |
+# -----BEGIN CERTIFICATE-----
+# ...
+# -----END CERTIFICATE-----
+# write_relabel_configs:
+# - source_labels: ['__name__']
+# regex: 'go_gc_.*'
+# action: 'drop'
+# - source_labels: ['job']
+# regex: 'alertmanager'
+# action: 'drop'
+
# prometheus_server_secret_files:
# user: secret
+# remote: othersecret
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_coredns.yml b/roles/monitoring/prometheus/server/defaults/main/rules_coredns.yml
new file mode 100644
index 00000000..126a7ba4
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_coredns.yml
@@ -0,0 +1,29 @@
+---
+prometheus_server_rules_coredns_extra: []
+prometheus_server_rules_coredns:
+ - alert: CorednsPanicCount
+ expr: increase(coredns_panics_total[15m]) > 0
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: CoreDNS Panic (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Number of CoreDNS panics encountered has been increasing in the last 15 minutes\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: CorednsFailedReloadCount
+ expr: increase(coredns_reload_failed_total[15m]) > 0
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: CoreDNS reload failed (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Number of CoreDNS failed reloads has been increasing in the last 15 minutes\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: CorednsBrokenForwardHealthchecks
+ expr: increase(coredns_forward_healthcheck_broken_total[15m]) > 0
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: CoreDNS broken forward healthchecks (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Number of CoreDNS broken forward healthchecks has been increasing in the last 15 minutes\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_prometheus.yml b/roles/monitoring/prometheus/server/defaults/main/rules_prometheus.yml
index 4db6cd17..5cb27264 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_prometheus.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_prometheus.yml
@@ -93,13 +93,13 @@ prometheus_server_rules_prometheus:
description: "Prometheus has no target in service discovery\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
- alert: PrometheusTargetScrapingSlow
- expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
+ expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05
for: 5m
labels:
severity: warning
annotations:
summary: Prometheus target scraping slow (instance {{ '{{' }} $labels.instance {{ '}}' }})
- description: "Prometheus is scraping exporters slowly\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+ description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
- alert: PrometheusLargeScrape
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10