From c92175810eb36514f6aa16e641551ef4d4f6b776 Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Sat, 5 Mar 2022 03:59:41 +0100 Subject: prometheus: cleanups and fixes in prep for alerta --- roles/monitoring/prometheus/alertmanager/defaults/main.yml | 1 + .../alertmanager/templates/prometheus-alertmanager.service.j2 | 2 +- roles/monitoring/prometheus/server/defaults/main/main.yml | 6 +++++- .../prometheus/server/defaults/main/rules_prometheus.yml | 4 +++- .../prometheus/server/templates/prometheus.service.j2 | 2 +- roles/monitoring/prometheus/server/templates/prometheus.yml.j2 | 10 ++++++++-- 6 files changed, 19 insertions(+), 6 deletions(-) (limited to 'roles') diff --git a/roles/monitoring/prometheus/alertmanager/defaults/main.yml b/roles/monitoring/prometheus/alertmanager/defaults/main.yml index 47e0ae54..86cd9aa5 100644 --- a/roles/monitoring/prometheus/alertmanager/defaults/main.yml +++ b/roles/monitoring/prometheus/alertmanager/defaults/main.yml @@ -1,6 +1,7 @@ --- prometheus_alertmanager_web_listen_address: 127.0.0.1:9093 # prometheus_alertmanager_web_route_prefix: /alertmanager/ +# prometheus_alertmanager_web_external_url: https://mon.example.com/alertmanager/ prometheus_alertmanager_smtp: smarthost: "127.0.0.1:25" diff --git a/roles/monitoring/prometheus/alertmanager/templates/prometheus-alertmanager.service.j2 b/roles/monitoring/prometheus/alertmanager/templates/prometheus-alertmanager.service.j2 index 5e0e3008..d22d9e01 100644 --- a/roles/monitoring/prometheus/alertmanager/templates/prometheus-alertmanager.service.j2 +++ b/roles/monitoring/prometheus/alertmanager/templates/prometheus-alertmanager.service.j2 @@ -5,7 +5,7 @@ Documentation=https://prometheus.io/docs/alerting/alertmanager/ [Service] Restart=on-failure User=prometheus-alertmanager -ExecStart=/usr/bin/prometheus-alertmanager --config.file=/etc/prometheus/alertmanager.yml --cluster.listen-address= --storage.path="/var/lib/prometheus/alertmanager"{% if prometheus_alertmanager_web_route_prefix is defined %} --web.route-prefix={{ prometheus_alertmanager_web_route_prefix }}{% endif %}{% if prometheus_alertmanager_auth_users is defined %} --web.config.file=/etc/prometheus/alertmanager-web.yml{% endif %} --web.listen-address={{ prometheus_alertmanager_web_listen_address }} +ExecStart=/usr/bin/prometheus-alertmanager --config.file=/etc/prometheus/alertmanager.yml --cluster.listen-address= --storage.path="/var/lib/prometheus/alertmanager"{% if prometheus_alertmanager_web_external_url is defined %} --web.external-url={{ prometheus_alertmanager_web_external_url }}{% endif %}{% if prometheus_alertmanager_web_route_prefix is defined %} --web.route-prefix={{ prometheus_alertmanager_web_route_prefix }}{% endif %}{% if prometheus_alertmanager_auth_users is defined %} --web.config.file=/etc/prometheus/alertmanager-web.yml{% endif %} --web.listen-address={{ prometheus_alertmanager_web_listen_address }} ExecReload=/bin/kill -HUP $MAINPID TimeoutStopSec=20s SendSIGKILL=no diff --git a/roles/monitoring/prometheus/server/defaults/main/main.yml b/roles/monitoring/prometheus/server/defaults/main/main.yml index f74a6f30..99f93e6e 100644 --- a/roles/monitoring/prometheus/server/defaults/main/main.yml +++ b/roles/monitoring/prometheus/server/defaults/main/main.yml @@ -4,6 +4,9 @@ # ... prometheus_server_retention: "15d" +# prometheus_server_external_labels: +# environment: foo +# monitor: {{ inventory_hostname }} prometheus_server_jobs: - node @@ -31,7 +34,8 @@ prometheus_server_rules: # password: geheim prometheus_server_web_listen_address: 127.0.0.1:9090 -# prometheus_server_web_external_url: /prometheus/ +# prometheus_server_web_route_prefix: /prometheus/ +# prometheus_server_web_external_url: https://mon.example.com/prometheus/ # prometheus_server_auth_users: # server: changeme diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_prometheus.yml b/roles/monitoring/prometheus/server/defaults/main/rules_prometheus.yml index 8d4672b1..422f84cb 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_prometheus.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_prometheus.yml @@ -215,7 +215,9 @@ prometheus_server_rules_prometheus_alertmanager: expr: vector(1) for: 0m labels: - severity: critical + severity: informational + instance: prometheus + timeout: 7200 annotations: summary: Prometheus AlertManager E2E dead man switch (instance {{ '{{' }} $labels.instance {{ '}}' }}) description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" diff --git a/roles/monitoring/prometheus/server/templates/prometheus.service.j2 b/roles/monitoring/prometheus/server/templates/prometheus.service.j2 index 77a3b02a..e65e9425 100644 --- a/roles/monitoring/prometheus/server/templates/prometheus.service.j2 +++ b/roles/monitoring/prometheus/server/templates/prometheus.service.j2 @@ -6,7 +6,7 @@ After=time-sync.target [Service] Restart=on-failure User=prometheus -ExecStart=/usr/bin/prometheus --config.file=/etc/prometheus/prometheus.yml --storage.tsdb.path=/var/lib/prometheus/metrics2/ --storage.tsdb.retention.time={{ prometheus_server_retention }}{% if prometheus_server_web_external_url is defined %} --web.external-url={{ prometheus_server_web_external_url }}{% endif %}{% if prometheus_server_auth_users is defined %} --web.config.file=/etc/prometheus/prometheus-web.yml{% endif %} --web.listen-address={{ prometheus_server_web_listen_address }} +ExecStart=/usr/bin/prometheus --config.file=/etc/prometheus/prometheus.yml --storage.tsdb.path=/var/lib/prometheus/metrics2/ --storage.tsdb.retention.time={{ prometheus_server_retention }}{% if prometheus_server_web_external_url is defined %} --web.external-url={{ prometheus_server_web_external_url }}{% endif %}{% if prometheus_server_web_route_prefix is defined %} --web.route-prefix={{ prometheus_server_web_route_prefix }}{% endif %}{% if prometheus_server_auth_users is defined %} --web.config.file=/etc/prometheus/prometheus-web.yml{% endif %} --web.listen-address={{ prometheus_server_web_listen_address }} ExecReload=/bin/kill -HUP $MAINPID TimeoutStopSec=20s SendSIGKILL=no diff --git a/roles/monitoring/prometheus/server/templates/prometheus.yml.j2 b/roles/monitoring/prometheus/server/templates/prometheus.yml.j2 index 883aa223..aed69de5 100644 --- a/roles/monitoring/prometheus/server/templates/prometheus.yml.j2 +++ b/roles/monitoring/prometheus/server/templates/prometheus.yml.j2 @@ -3,6 +3,12 @@ global: scrape_interval: 15s evaluation_interval: 15s +{% if prometheus_server_external_labels is defined %} + external_labels: +{% for name, value in prometheus_server_external_labels.items() %} + {{ name }}: {{ value }} +{% endfor %} +{% endif %} rule_files: - /etc/prometheus/rules/*.yml @@ -27,8 +33,8 @@ alerting: scrape_configs: - job_name: 'prometheus' -{% if prometheus_server_web_external_url is defined %} - metrics_path: '{{ (prometheus_server_web_external_url | urlsplit('path'), 'metrics') | path_join }}' +{% if prometheus_server_web_route_prefix is defined or prometheus_server_web_external_url is defined %} + metrics_path: '{{ (prometheus_server_web_route_prefix | default(prometheus_server_web_external_url | default('') | urlsplit('path')), 'metrics') | path_join }}' {% endif %} {% if prometheus_server_selfscraping_auth is defined %} basic_auth: -- cgit v1.2.3 From 9117d620ff6908ce92318db216403dab68c496ed Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Sat, 5 Mar 2022 21:55:14 +0100 Subject: initial commit for alerta role --- chaos-at-home/ch-apps.yml | 1 + inventory/host_vars/ch-apps.yml | 3 ++ inventory/host_vars/ch-mon.yml | 17 ++++++++ roles/monitoring/alerta/defaults/main.yml | 2 + roles/monitoring/alerta/tasks/main.yml | 32 +++++++++++++++ roles/monitoring/alerta/templates/pod-spec.yml.j2 | 48 +++++++++++++++++++++++ 6 files changed, 103 insertions(+) create mode 100644 roles/monitoring/alerta/defaults/main.yml create mode 100644 roles/monitoring/alerta/tasks/main.yml create mode 100644 roles/monitoring/alerta/templates/pod-spec.yml.j2 (limited to 'roles') diff --git a/chaos-at-home/ch-apps.yml b/chaos-at-home/ch-apps.yml index d264ffc2..f0347216 100644 --- a/chaos-at-home/ch-apps.yml +++ b/chaos-at-home/ch-apps.yml @@ -12,3 +12,4 @@ - role: apt-repo/spreadspace - role: kubernetes/base - role: kubernetes/standalone/base + - role: monitoring/alerta diff --git a/inventory/host_vars/ch-apps.yml b/inventory/host_vars/ch-apps.yml index bdbac832..ed2aeb70 100644 --- a/inventory/host_vars/ch-apps.yml +++ b/inventory/host_vars/ch-apps.yml @@ -113,3 +113,6 @@ kubernetes_version: 1.23.2 kubernetes_container_runtime: docker kubernetes_standalone_max_pods: 42 kubernetes_standalone_cni_variant: with-portmap + + +alerta_base_path: /srv/storage/alerta diff --git a/inventory/host_vars/ch-mon.yml b/inventory/host_vars/ch-mon.yml index c16398bc..bd952fc8 100644 --- a/inventory/host_vars/ch-mon.yml +++ b/inventory/host_vars/ch-mon.yml @@ -146,6 +146,23 @@ prometheus_alertmanager_auth_users: server: "{{ vault_prometheus_alertmanager_auth_user_passwords['server'] }}" admin: "{{ vault_prometheus_alertmanager_auth_user_passwords['admin'] }}" +prometheus_alertmanager_route: + receiver: alerta + group_by: ['...'] + group_wait: 0 + group_interval: 10s + repeat_interval: 5m + +prometheus_alertmanager_receivers: + - name: alerta + webhook_configs: + - url: http://192.168.32.1:8080/api/webhooks/prometheus + send_resolved: true + http_config: + basic_auth: + username: admin + password: alerta + grafana_secret_key: "{{ vault_grafana_secret_key }}" diff --git a/roles/monitoring/alerta/defaults/main.yml b/roles/monitoring/alerta/defaults/main.yml new file mode 100644 index 00000000..034c8268 --- /dev/null +++ b/roles/monitoring/alerta/defaults/main.yml @@ -0,0 +1,2 @@ +--- +# alerta_base_path: /srv/alerta diff --git a/roles/monitoring/alerta/tasks/main.yml b/roles/monitoring/alerta/tasks/main.yml new file mode 100644 index 00000000..af7530d8 --- /dev/null +++ b/roles/monitoring/alerta/tasks/main.yml @@ -0,0 +1,32 @@ +--- +- name: create alerta subdirectories + loop: + - config + - postgres + file: + path: "{{ alerta_base_path }}/{{ item }}" + state: directory + +- name: install alertad config template + copy: + content: | + DEBUG = {{ '{{' }} 'True' if env.DEBUG else 'False' {{ '}}' }} + SECRET = "{{ '{{' }} env.SECRET_KEY {{ '}}' }}" + PLUGINS = ['reject', 'blackout', 'heartbeat'] + DEFAULT_ENVIRONMENT = 'unknown' + ALLOWED_ENVIRONMENTS = ['unknown', 'chaos-at-.*'] + HEARTBEAT_EVENTS = ['PrometheusAlertmanagerE2eDeadManSwitch'] + dest: "{{ alerta_base_path }}/config/alertad.conf.j2" + +- name: install pod manifest + vars: + kubernetes_standalone_pod: + name: "alerta" + spec: "{{ lookup('template', 'pod-spec.yml.j2') }}" + mode: "0600" + config_hash_items: + - path: "{{ alerta_base_path }}/config/alertad.conf.j2" + properties: + - checksum + include_role: + name: kubernetes/standalone/pod diff --git a/roles/monitoring/alerta/templates/pod-spec.yml.j2 b/roles/monitoring/alerta/templates/pod-spec.yml.j2 new file mode 100644 index 00000000..e31686e3 --- /dev/null +++ b/roles/monitoring/alerta/templates/pod-spec.yml.j2 @@ -0,0 +1,48 @@ +containers: +- name: alerta + image: "alerta/alerta-web:8.7.0" + resources: + limits: + memory: "1Gi" + env: + - name: "DATABASE_URL" + value: "postgres://alerta:secret@127.0.0.1:5432/alerta" + - name: "AUTH_REQUIRED" + value: "True" + - name: "ADMIN_USERS" + value: "admin" + volumeMounts: + - name: config + mountPath: /app/alertad.conf.j2 + subPath: alertad.conf.j2 + readOnly: true + ports: + - containerPort: 8080 + hostPort: 8080 + +- name: postgresql + image: "postgres:14.2" + args: + - postgres + - -c + - listen_addresses=127.0.0.1 + env: + - name: "POSTGRES_DB" + value: "alerta" + - name: "POSTGRES_USER" + value: "alerta" + - name: "POSTGRES_PASSWORD" + value: "secret" + volumeMounts: + - name: postgres + mountPath: /var/lib/postgresql/data + +volumes: +- name: config + hostPath: + path: "{{ alerta_base_path }}/config" + type: Directory +- name: postgres + hostPath: + path: "{{ alerta_base_path }}/postgres" + type: Directory -- cgit v1.2.3 From 1bbe8582d9650eda816f9c596d8cd3b3fe9998e7 Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Sun, 6 Mar 2022 00:31:22 +0100 Subject: alerta: test alertmanager silence integration --- inventory/host_vars/ch-mon.yml | 3 ++- roles/monitoring/alerta/tasks/main.yml | 6 +++++- roles/monitoring/prometheus/server/defaults/main/rules_node.yml | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'roles') diff --git a/inventory/host_vars/ch-mon.yml b/inventory/host_vars/ch-mon.yml index bd952fc8..684cc722 100644 --- a/inventory/host_vars/ch-mon.yml +++ b/inventory/host_vars/ch-mon.yml @@ -145,6 +145,7 @@ prometheus_alertmanager_web_external_url: "http://{{ network.primary.address | i prometheus_alertmanager_auth_users: server: "{{ vault_prometheus_alertmanager_auth_user_passwords['server'] }}" admin: "{{ vault_prometheus_alertmanager_auth_user_passwords['admin'] }}" + alerta: "alerta" ## TODO: move this to vault prometheus_alertmanager_route: receiver: alerta @@ -161,7 +162,7 @@ prometheus_alertmanager_receivers: http_config: basic_auth: username: admin - password: alerta + password: alerta ## TODO: move this to vault grafana_secret_key: "{{ vault_grafana_secret_key }}" diff --git a/roles/monitoring/alerta/tasks/main.yml b/roles/monitoring/alerta/tasks/main.yml index af7530d8..a4fd3df9 100644 --- a/roles/monitoring/alerta/tasks/main.yml +++ b/roles/monitoring/alerta/tasks/main.yml @@ -12,10 +12,14 @@ content: | DEBUG = {{ '{{' }} 'True' if env.DEBUG else 'False' {{ '}}' }} SECRET = "{{ '{{' }} env.SECRET_KEY {{ '}}' }}" - PLUGINS = ['reject', 'blackout', 'heartbeat'] + PLUGINS = ['reject', 'blackout', 'heartbeat', 'prometheus'] DEFAULT_ENVIRONMENT = 'unknown' ALLOWED_ENVIRONMENTS = ['unknown', 'chaos-at-.*'] HEARTBEAT_EVENTS = ['PrometheusAlertmanagerE2eDeadManSwitch'] + ALERTMANAGER_USERNAME = 'alerta' + ALERTMANAGER_PASSWORD = 'alerta' + ALERTMANAGER_SILENCE_FROM_ACK = True + ALERTMANAGER_USE_EXTERNALURL_FOR_SILENCES = True dest: "{{ alerta_base_path }}/config/alertad.conf.j2" - name: install pod manifest diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml index 525355d5..75e96bca 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml @@ -206,7 +206,7 @@ prometheus_server_rules_node: severity: warning annotations: summary: Host Network Interface Saturated (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "The network interface \"{{ '{{' }} $labels.interface {{ '}}' }}\" on \"{{ '{{' }} $labels.instance {{ '}}' }}\" is getting overloaded.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + description: "The network interface \"{{ '{{' }} $labels.device {{ '}}' }}\" on \"{{ '{{' }} $labels.instance {{ '}}' }}\" is getting overloaded.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - alert: HostConntrackLimit expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8 -- cgit v1.2.3 From c5fad84c1abe4bd208a75e34a92c242b8bbaf9ff Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Sun, 6 Mar 2022 19:02:27 +0100 Subject: alerta: add mailer --- roles/monitoring/alerta/files/email.tmpl | 32 ++++++++++++ roles/monitoring/alerta/tasks/main.yml | 64 ++++++++++++++++++++++- roles/monitoring/alerta/templates/pod-spec.yml.j2 | 33 ++++++++++-- 3 files changed, 124 insertions(+), 5 deletions(-) create mode 100644 roles/monitoring/alerta/files/email.tmpl (limited to 'roles') diff --git a/roles/monitoring/alerta/files/email.tmpl b/roles/monitoring/alerta/files/email.tmpl new file mode 100644 index 00000000..939e0038 --- /dev/null +++ b/roles/monitoring/alerta/files/email.tmpl @@ -0,0 +1,32 @@ + +------------------------------------------------------------ +[{{ alert.status|title }}] {{ alert.environment }}: {{ alert.severity|title }} {{ alert.event }} on {{ alert.service|join(', ') }} {{ alert.resource }} +------------------------------------------------------------ + +Alert ID: {{ alert.id }} +Create Time: {{ alert.create_time }} +Environment: {{ alert.environment }} +Services: {{ alert.service|join(', ') }} +Resource: {{ alert.resource }} +Event: {{ alert.event }} +Group: {{ alert.group }} +Value: {{ alert.value }} +Severity: {{ alert.previous_severity|title}} -> {{ alert.severity|title }} +Status: {{ alert.status|title }} +Text: {{ alert.text }} +Duplicate Count: {{ alert.duplicate_count }} +Origin: {{ alert.origin }} +Tags: {{ alert.tags|join(', ') }} +{% for key,value in alert.attributes.items() -%} +{{ key|title }}: {{ value | safe }} +{% endfor -%} + +{% if alert.raw_data %} +Raw Data +{{ alert.raw_data | safe }} +{% endif %} + +To acknowledge this alert visit this URL: +{{ dashboard_url | safe }}/#/alert/{{ alert.id }} + +Generated by {{ program }} on {{ hostname }} at {{ now }} diff --git a/roles/monitoring/alerta/tasks/main.yml b/roles/monitoring/alerta/tasks/main.yml index a4fd3df9..47df5308 100644 --- a/roles/monitoring/alerta/tasks/main.yml +++ b/roles/monitoring/alerta/tasks/main.yml @@ -3,16 +3,42 @@ loop: - config - postgres + - build file: path: "{{ alerta_base_path }}/{{ item }}" state: directory +- name: generate Dockerfile for custom image + copy: + content: | + FROM alerta/alerta-web:8.7.0 + + RUN set -x \ + && sed 's/USE_AM_EXTERNALURL_FOR_SILENCES/ALERTMANAGER_USE_EXTERNALURL_FOR_SILENCES/' -i /venv/lib/python3.8/site-packages/alerta_prometheus.py \ + && /venv/bin/pip install redis==4.1.4 \ + && /venv/bin/pip install git+https://github.com/alerta/alerta-contrib.git@69d271ef9fe6542727ec4aa39fc8e0f797f1e8b1#subdirectory=integrations/mailer + dest: "{{ alerta_base_path }}/build/Dockerfile" + register: alerta_custom_image_docker + +- name: build custom image + docker_image: + name: "alerta-web-with-mailer:8.7.0" + state: present + force_source: "{{ alerta_custom_image_docker is changed }}" + source: build + build: + path: "{{ alerta_base_path }}/build" + network: host + pull: yes + - name: install alertad config template copy: content: | DEBUG = {{ '{{' }} 'True' if env.DEBUG else 'False' {{ '}}' }} SECRET = "{{ '{{' }} env.SECRET_KEY {{ '}}' }}" - PLUGINS = ['reject', 'blackout', 'heartbeat', 'prometheus'] + ALERT_TIMEOUT = 86400 + HEARTBEAT_TIMEOUT = 7200 + PLUGINS = ['reject', 'blackout', 'heartbeat', 'prometheus', 'amqp'] DEFAULT_ENVIRONMENT = 'unknown' ALLOWED_ENVIRONMENTS = ['unknown', 'chaos-at-.*'] HEARTBEAT_EVENTS = ['PrometheusAlertmanagerE2eDeadManSwitch'] @@ -20,8 +46,35 @@ ALERTMANAGER_PASSWORD = 'alerta' ALERTMANAGER_SILENCE_FROM_ACK = True ALERTMANAGER_USE_EXTERNALURL_FOR_SILENCES = True + AMQP_URL = 'redis://localhost:6379/' dest: "{{ alerta_base_path }}/config/alertad.conf.j2" + ## TODO: add key handling... +- name: install alerta-mailer config file + copy: + content: | + [alerta-mailer] + debug = True + key = aNqBsEyG0ynIKcc3e7acaBVBk5B793o_z7tvlsht + endpoint = http://localhost:8080/api + amqp_url = redis://localhost:6379 + smtp_host = 192.168.28.250 + smtp_port = 25 + smtp_starttls = False + skip_mta = False + mail_to = equinox@chaos-at-home.org + mail_from = noreply@chaos-at-home.org + email_type = text + mail_template = /app/email.tmpl + dashboard_url = http://192.168.32.1:8080 + dest: "{{ alerta_base_path }}/config/alerta-mailer.conf" + mode: 0640 + +- name: install e-mail template + copy: + src: email.tmpl + dest: "{{ alerta_base_path }}/config/email.tmpl" + - name: install pod manifest vars: kubernetes_standalone_pod: @@ -32,5 +85,14 @@ - path: "{{ alerta_base_path }}/config/alertad.conf.j2" properties: - checksum + - path: "{{ alerta_base_path }}/config/alerta-mailer.conf" + properties: + - checksum + - path: "{{ alerta_base_path }}/config/email.tmpl" + properties: + - checksum + - path: "{{ alerta_base_path }}/build/Dockerfile" + properties: + - checksum include_role: name: kubernetes/standalone/pod diff --git a/roles/monitoring/alerta/templates/pod-spec.yml.j2 b/roles/monitoring/alerta/templates/pod-spec.yml.j2 index e31686e3..6edabae5 100644 --- a/roles/monitoring/alerta/templates/pod-spec.yml.j2 +++ b/roles/monitoring/alerta/templates/pod-spec.yml.j2 @@ -1,9 +1,6 @@ containers: - name: alerta - image: "alerta/alerta-web:8.7.0" - resources: - limits: - memory: "1Gi" + image: "alerta-web-with-mailer:8.7.0" env: - name: "DATABASE_URL" value: "postgres://alerta:secret@127.0.0.1:5432/alerta" @@ -11,6 +8,10 @@ containers: value: "True" - name: "ADMIN_USERS" value: "admin" + - name: "DEBUG" + value: "1" + - name: "SUPERVISORD_LOG_LEVEL" + value: "warn" volumeMounts: - name: config mountPath: /app/alertad.conf.j2 @@ -37,6 +38,30 @@ containers: - name: postgres mountPath: /var/lib/postgresql/data +- name: redis + image: "redis:6.2.6" + args: + - redis-server + - --bind + - 127.0.0.1 + +- name: mailer + image: "alerta-web-with-mailer:8.7.0" + command: + - alerta-mailer + env: + - name: "ALERTA_CONF_FILE" + value: "/app/alerta-mailer.conf" + volumeMounts: + - name: config + mountPath: /app/alerta-mailer.conf + subPath: alerta-mailer.conf + readOnly: true + - name: config + mountPath: /app/email.tmpl + subPath: email.tmpl + readOnly: true + volumes: - name: config hostPath: -- cgit v1.2.3 From 48f47342d85692e4d342c490085c13518be7a07e Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Tue, 8 Mar 2022 22:39:30 +0100 Subject: prometheus/alerta: tune some severities --- roles/monitoring/alerta/tasks/main.yml | 1 + roles/monitoring/prometheus/server/defaults/main/rules_node.yml | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'roles') diff --git a/roles/monitoring/alerta/tasks/main.yml b/roles/monitoring/alerta/tasks/main.yml index 47df5308..490f5e3d 100644 --- a/roles/monitoring/alerta/tasks/main.yml +++ b/roles/monitoring/alerta/tasks/main.yml @@ -58,6 +58,7 @@ key = aNqBsEyG0ynIKcc3e7acaBVBk5B793o_z7tvlsht endpoint = http://localhost:8080/api amqp_url = redis://localhost:6379 + severities = critical, warning smtp_host = 192.168.28.250 smtp_port = 25 smtp_starttls = False diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml index 75e96bca..d211731a 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml @@ -167,7 +167,7 @@ prometheus_server_rules_node: expr: increase(node_edac_correctable_errors_total[1m]) > 0 for: 0m labels: - severity: info + severity: warning annotations: summary: Host EDAC Correctable Errors detected (instance {{ '{{' }} $labels.instance {{ '}}' }}) description: "Host {{ '{{' }} $labels.instance {{ '}}' }} has had {{ '{{' }} printf \"%.0f\" $value {{ '}}' }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" @@ -176,7 +176,7 @@ prometheus_server_rules_node: expr: node_edac_uncorrectable_errors_total > 0 for: 0m labels: - severity: warning + severity: critical annotations: summary: Host EDAC Uncorrectable Errors detected (instance {{ '{{' }} $labels.instance {{ '}}' }}) description: "Host {{ '{{' }} $labels.instance {{ '}}' }} has had {{ '{{' }} printf \"%.0f\" $value {{ '}}' }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" -- cgit v1.2.3