summaryrefslogtreecommitdiff
path: root/roles/monitoring/prometheus
diff options
context:
space:
mode:
Diffstat (limited to 'roles/monitoring/prometheus')
-rw-r--r--roles/monitoring/prometheus/exporter/coredns/handlers/main.yml5
-rw-r--r--roles/monitoring/prometheus/exporter/coredns/tasks/main.yml9
-rw-r--r--roles/monitoring/prometheus/exporter/meta/main.yml2
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/main.yml4
-rw-r--r--roles/monitoring/prometheus/server/defaults/main/rules_coredns.yml29
-rw-r--r--roles/monitoring/prometheus/server/templates/prometheus.yml.j24
6 files changed, 51 insertions, 2 deletions
diff --git a/roles/monitoring/prometheus/exporter/coredns/handlers/main.yml b/roles/monitoring/prometheus/exporter/coredns/handlers/main.yml
new file mode 100644
index 00000000..d4e42ca0
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/coredns/handlers/main.yml
@@ -0,0 +1,5 @@
+---
+- name: reload nginx
+ service:
+ name: nginx
+ state: reloaded
diff --git a/roles/monitoring/prometheus/exporter/coredns/tasks/main.yml b/roles/monitoring/prometheus/exporter/coredns/tasks/main.yml
new file mode 100644
index 00000000..884eb4bf
--- /dev/null
+++ b/roles/monitoring/prometheus/exporter/coredns/tasks/main.yml
@@ -0,0 +1,9 @@
+---
+- name: register exporter
+ copy:
+ content: |
+ location = /coredns {
+ proxy_pass http://127.0.0.1:9153/metrics;
+ }
+ dest: /etc/prometheus/exporter/coredns.locations
+ notify: reload nginx
diff --git a/roles/monitoring/prometheus/exporter/meta/main.yml b/roles/monitoring/prometheus/exporter/meta/main.yml
index b60d0dbc..0580861a 100644
--- a/roles/monitoring/prometheus/exporter/meta/main.yml
+++ b/roles/monitoring/prometheus/exporter/meta/main.yml
@@ -29,4 +29,6 @@ dependencies:
when: "'nftables' in (prometheus_exporters_default | union(prometheus_exporters_extra))"
- role: monitoring/prometheus/exporter/mosquitto
when: "'mosquitto' in (prometheus_exporters_default | union(prometheus_exporters_extra))"
+ - role: monitoring/prometheus/exporter/coredns
+ when: "'coredns' in (prometheus_exporters_default | union(prometheus_exporters_extra))"
- role: monitoring/prometheus/exporter/register
diff --git a/roles/monitoring/prometheus/server/defaults/main/main.yml b/roles/monitoring/prometheus/server/defaults/main/main.yml
index 9f867568..25cffa5b 100644
--- a/roles/monitoring/prometheus/server/defaults/main/main.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/main.yml
@@ -15,7 +15,7 @@ prometheus_server_evaluation_interval: "15s"
prometheus_server_jobs:
- node
-#prometheus_server_jobs_extra: |
+#prometheus_server_jobs_extra: |-
# - job_name: ...
prometheus_server_rules:
@@ -41,6 +41,7 @@ prometheus_server_rules:
nftables: "{{ prometheus_server_rules_nftables + prometheus_server_rules_nftables_extra }}"
whawty-nginx-sso: "{{ prometheus_server_rules_whawty_nginx_sso + prometheus_server_rules_whawty_nginx_sso_extra }}"
mosquitto: "{{ prometheus_server_rules_mosquitto + prometheus_server_rules_mosquitto_extra }}"
+ coredns: "{{ prometheus_server_rules_coredns + prometheus_server_rules_coredns_extra }}"
# prometheus_server_alertmanager:
# url: "127.0.0.1:9093"
@@ -48,6 +49,7 @@ prometheus_server_rules:
# basic_auth:
# username: server
# password: geheim
+# scrape_instance: <inventory-hostname>
prometheus_server_web_listen_address: 127.0.0.1:9090
# prometheus_server_web_route_prefix: /prometheus/
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_coredns.yml b/roles/monitoring/prometheus/server/defaults/main/rules_coredns.yml
new file mode 100644
index 00000000..126a7ba4
--- /dev/null
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_coredns.yml
@@ -0,0 +1,29 @@
+---
+prometheus_server_rules_coredns_extra: []
+prometheus_server_rules_coredns:
+ - alert: CorednsPanicCount
+ expr: increase(coredns_panics_total[15m]) > 0
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: CoreDNS Panic (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Number of CoreDNS panics encountered has been increasing in the last 15 minutes\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: CorednsFailedReloadCount
+ expr: increase(coredns_reload_failed_total[15m]) > 0
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: CoreDNS reload failed (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Number of CoreDNS failed reloads has been increasing in the last 15 minutes\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: CorednsBrokenForwardHealthchecks
+ expr: increase(coredns_forward_healthcheck_broken_total[15m]) > 0
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: CoreDNS broken forward healthchecks (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Number of CoreDNS broken forward healthchecks has been increasing in the last 15 minutes\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
diff --git a/roles/monitoring/prometheus/server/templates/prometheus.yml.j2 b/roles/monitoring/prometheus/server/templates/prometheus.yml.j2
index 09d5452d..85adfa52 100644
--- a/roles/monitoring/prometheus/server/templates/prometheus.yml.j2
+++ b/roles/monitoring/prometheus/server/templates/prometheus.yml.j2
@@ -45,7 +45,7 @@ scrape_configs:
- targets: ['localhost:9090']
labels:
instance: '{{ inventory_hostname }}'
-{% if prometheus_server_alertmanager is defined %}
+{% if prometheus_server_alertmanager is defined and 'scrape_instance' in prometheus_server_alertmanager %}
- job_name: 'alertmanager'
{% if 'path_prefix' in prometheus_server_alertmanager %}
@@ -58,6 +58,8 @@ scrape_configs:
{% endif %}
static_configs:
- targets: ['{{ prometheus_server_alertmanager.url }}']
+ labels:
+ instance: '{{ prometheus_server_alertmanager.scrape_instance }}'
{% endif %}
{% for job in (prometheus_server_jobs) %}