summary | refs | log | tree | commit | diff
path: root/roles/monitoring/prometheus
diff options
context:
space:
mode:
author Christian Pointner <equinox@spreadspace.org> 2021-06-23 23:06:40 +0200
committer Christian Pointner <equinox@spreadspace.org> 2021-06-23 23:06:40 +0200
commit 6cf380956bdd31292b4ccf51b1bbc217b93bf45f (patch)
tree f887f4bad36c796b78c8b211ae97bd90efc6819b /roles/monitoring/prometheus
parent add minimalistic role for prometheus/alertmanager (diff)
prometheus: connect server to alertmanager if configured
Diffstat (limited to 'roles/monitoring/prometheus')
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/main.yml 5
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_prometheus.yml 47
-rw-r--r-- roles/monitoring/prometheus/server/templates/prometheus.yml.j2 13
3 files changed, 64 insertions, 1 deletions
diff --git a/roles/monitoring/prometheus/server/defaults/main/main.yml b/roles/monitoring/prometheus/server/defaults/main/main.yml
index b10d6f17..8e7fea4b 100644
--- a/roles/monitoring/prometheus/server/defaults/main/main.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/main.yml
@@ -9,5 +9,8 @@ prometheus_server_jobs:
- node
prometheus_server_rules:
- prometheus: "{{ prometheus_server_rules_prometheus + prometheus_server_rules_prometheus_extra }}"
+ prometheus: "{{ prometheus_server_rules_prometheus + ((prometheus_server_alertmanager is defined) | ternary(prometheus_server_rules_prometheus_alertmanager, [])) + prometheus_server_rules_prometheus_extra }}"
node: "{{ prometheus_server_rules_node + prometheus_server_rules_prometheus_extra }}"
+
+# prometheus_server_alertmanager:
+# url: "127.0.0.1:9093"
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_prometheus.yml b/roles/monitoring/prometheus/server/defaults/main/rules_prometheus.yml
index 6d84efa4..8d4672b1 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_prometheus.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_prometheus.yml
@@ -190,3 +190,50 @@ prometheus_server_rules_prometheus:
annotations:
summary: Prometheus TSDB WAL truncations failed (instance {{ '{{' }} $labels.instance {{ '}}' }})
description: "Prometheus encountered {{ '{{' }} $value {{ '}}' }} TSDB WAL truncation failures\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+
+prometheus_server_rules_prometheus_alertmanager:
+ - alert: PrometheusAlertmanagerConfigurationReloadFailure
+ expr: alertmanager_config_last_reload_successful != 1
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Prometheus AlertManager configuration reload failure (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "AlertManager configuration reload error\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: PrometheusAlertmanagerConfigNotSynced
+ expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Prometheus AlertManager config not synced (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: PrometheusAlertmanagerE2eDeadManSwitch
+ expr: vector(1)
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: Prometheus AlertManager E2E dead man switch (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: PrometheusNotConnectedToAlertmanager
+ expr: prometheus_notifications_alertmanagers_discovered < 1
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: Prometheus not connected to alertmanager (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Prometheus cannot connect the alertmanager\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: PrometheusAlertmanagerNotificationFailing
+ expr: rate(alertmanager_notifications_failed_total[1m]) > 0
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: Prometheus AlertManager notification failing (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "Alertmanager is failing sending notifications\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
diff --git a/roles/monitoring/prometheus/server/templates/prometheus.yml.j2 b/roles/monitoring/prometheus/server/templates/prometheus.yml.j2
index 3975c74d..c76990f4 100644
--- a/roles/monitoring/prometheus/server/templates/prometheus.yml.j2
+++ b/roles/monitoring/prometheus/server/templates/prometheus.yml.j2
@@ -6,6 +6,13 @@ global:
rule_files:
- /etc/prometheus/rules/*.yml
+{% if prometheus_server_alertmanager is defined %}
+
+alerting:
+ alertmanagers:
+ - static_configs:
+ - targets: ['{{ prometheus_server_alertmanager.url }}']
+{% endif %}
scrape_configs:
- job_name: 'prometheus'
@@ -13,6 +20,12 @@ scrape_configs:
- targets: ['localhost:9090']
labels:
instance: "{{ inventory_hostname }}"
+{% if prometheus_server_alertmanager is defined %}
+
+ - job_name: 'alertmanager'
+ static_configs:
+ - targets: ['{{ prometheus_server_alertmanager.url }}']
+{% endif %}
{% for job in prometheus_server_jobs %}
- job_name: '{{ job }}'