From c92175810eb36514f6aa16e641551ef4d4f6b776 Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Sat, 5 Mar 2022 03:59:41 +0100 Subject: prometheus: cleanups and fixes in prep for alerta --- roles/monitoring/prometheus/server/defaults/main/main.yml | 6 +++++- .../monitoring/prometheus/server/defaults/main/rules_prometheus.yml | 4 +++- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'roles/monitoring/prometheus/server/defaults/main') diff --git a/roles/monitoring/prometheus/server/defaults/main/main.yml b/roles/monitoring/prometheus/server/defaults/main/main.yml index f74a6f30..99f93e6e 100644 --- a/roles/monitoring/prometheus/server/defaults/main/main.yml +++ b/roles/monitoring/prometheus/server/defaults/main/main.yml @@ -4,6 +4,9 @@ # ... prometheus_server_retention: "15d" +# prometheus_server_external_labels: +# environment: foo +# monitor: "{{ inventory_hostname }}" prometheus_server_jobs: - node @@ -31,7 +34,8 @@ prometheus_server_rules: # password: geheim prometheus_server_web_listen_address: 127.0.0.1:9090 -# prometheus_server_web_external_url: /prometheus/ +# prometheus_server_web_route_prefix: /prometheus/ +# prometheus_server_web_external_url: https://mon.example.com/prometheus/ # prometheus_server_auth_users: # server: changeme diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_prometheus.yml b/roles/monitoring/prometheus/server/defaults/main/rules_prometheus.yml index 8d4672b1..422f84cb 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_prometheus.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_prometheus.yml @@ -215,7 +215,9 @@ prometheus_server_rules_prometheus_alertmanager: expr: vector(1) for: 0m labels: - severity: critical + severity: informational + instance: prometheus + timeout: 7200 annotations: summary: Prometheus AlertManager E2E dead man switch (instance {{ '{{' }} $labels.instance {{ '}}' }}) description: "Prometheus DeadManSwitch is an always-firing alert. 
It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" -- cgit v1.2.3 From 1bbe8582d9650eda816f9c596d8cd3b3fe9998e7 Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Sun, 6 Mar 2022 00:31:22 +0100 Subject: alerta: test alertmanager silence integration --- inventory/host_vars/ch-mon.yml | 3 ++- roles/monitoring/alerta/tasks/main.yml | 6 +++++- roles/monitoring/prometheus/server/defaults/main/rules_node.yml | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'roles/monitoring/prometheus/server/defaults/main') diff --git a/inventory/host_vars/ch-mon.yml b/inventory/host_vars/ch-mon.yml index bd952fc8..684cc722 100644 --- a/inventory/host_vars/ch-mon.yml +++ b/inventory/host_vars/ch-mon.yml @@ -145,6 +145,7 @@ prometheus_alertmanager_web_external_url: "http://{{ network.primary.address | i prometheus_alertmanager_auth_users: server: "{{ vault_prometheus_alertmanager_auth_user_passwords['server'] }}" admin: "{{ vault_prometheus_alertmanager_auth_user_passwords['admin'] }}" + alerta: "alerta" ## TODO: move this to vault prometheus_alertmanager_route: receiver: alerta @@ -161,7 +162,7 @@ prometheus_alertmanager_receivers: http_config: basic_auth: username: admin - password: alerta + password: alerta ## TODO: move this to vault grafana_secret_key: "{{ vault_grafana_secret_key }}" diff --git a/roles/monitoring/alerta/tasks/main.yml b/roles/monitoring/alerta/tasks/main.yml index af7530d8..a4fd3df9 100644 --- a/roles/monitoring/alerta/tasks/main.yml +++ b/roles/monitoring/alerta/tasks/main.yml @@ -12,10 +12,14 @@ content: | DEBUG = {{ '{{' }} 'True' if env.DEBUG else 'False' {{ '}}' }} SECRET = "{{ '{{' }} env.SECRET_KEY {{ '}}' }}" - PLUGINS = ['reject', 'blackout', 'heartbeat'] + PLUGINS = ['reject', 'blackout', 'heartbeat', 'prometheus'] DEFAULT_ENVIRONMENT = 'unknown' ALLOWED_ENVIRONMENTS = ['unknown', 'chaos-at-.*'] HEARTBEAT_EVENTS = 
['PrometheusAlertmanagerE2eDeadManSwitch'] + ALERTMANAGER_USERNAME = 'alerta' + ALERTMANAGER_PASSWORD = 'alerta' + ALERTMANAGER_SILENCE_FROM_ACK = True + ALERTMANAGER_USE_EXTERNALURL_FOR_SILENCES = True dest: "{{ alerta_base_path }}/config/alertad.conf.j2" - name: install pod manifest diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml index 525355d5..75e96bca 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml @@ -206,7 +206,7 @@ prometheus_server_rules_node: severity: warning annotations: summary: Host Network Interface Saturated (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "The network interface \"{{ '{{' }} $labels.interface {{ '}}' }}\" on \"{{ '{{' }} $labels.instance {{ '}}' }}\" is getting overloaded.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + description: "The network interface \"{{ '{{' }} $labels.device {{ '}}' }}\" on \"{{ '{{' }} $labels.instance {{ '}}' }}\" is getting overloaded.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - alert: HostConntrackLimit expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8 -- cgit v1.2.3 From 48f47342d85692e4d342c490085c13518be7a07e Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Tue, 8 Mar 2022 22:39:30 +0100 Subject: prometheus/alerta: tune some severities --- roles/monitoring/alerta/tasks/main.yml | 1 + roles/monitoring/prometheus/server/defaults/main/rules_node.yml | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'roles/monitoring/prometheus/server/defaults/main') diff --git a/roles/monitoring/alerta/tasks/main.yml b/roles/monitoring/alerta/tasks/main.yml index 47df5308..490f5e3d 100644 --- a/roles/monitoring/alerta/tasks/main.yml +++ b/roles/monitoring/alerta/tasks/main.yml @@ -58,6 +58,7 @@ 
key = aNqBsEyG0ynIKcc3e7acaBVBk5B793o_z7tvlsht endpoint = http://localhost:8080/api amqp_url = redis://localhost:6379 + severities = critical, warning smtp_host = 192.168.28.250 smtp_port = 25 smtp_starttls = False diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml index 75e96bca..d211731a 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml @@ -167,7 +167,7 @@ prometheus_server_rules_node: expr: increase(node_edac_correctable_errors_total[1m]) > 0 for: 0m labels: - severity: info + severity: warning annotations: summary: Host EDAC Correctable Errors detected (instance {{ '{{' }} $labels.instance {{ '}}' }}) description: "Host {{ '{{' }} $labels.instance {{ '}}' }} has had {{ '{{' }} printf \"%.0f\" $value {{ '}}' }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" @@ -176,7 +176,7 @@ prometheus_server_rules_node: expr: node_edac_uncorrectable_errors_total > 0 for: 0m labels: - severity: warning + severity: critical annotations: summary: Host EDAC Uncorrectable Errors detected (instance {{ '{{' }} $labels.instance {{ '}}' }}) description: "Host {{ '{{' }} $labels.instance {{ '}}' }} has had {{ '{{' }} printf \"%.0f\" $value {{ '}}' }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" -- cgit v1.2.3