From 9117d620ff6908ce92318db216403dab68c496ed Mon Sep 17 00:00:00 2001
From: Christian Pointner
Date: Sat, 5 Mar 2022 21:55:14 +0100
Subject: initial commit for alerta role

---
 roles/monitoring/alerta/defaults/main.yml         |  2 +
 roles/monitoring/alerta/tasks/main.yml            | 32 +++++++++++++++
 roles/monitoring/alerta/templates/pod-spec.yml.j2 | 48 +++++++++++++++++++++++
 3 files changed, 82 insertions(+)
 create mode 100644 roles/monitoring/alerta/defaults/main.yml
 create mode 100644 roles/monitoring/alerta/tasks/main.yml
 create mode 100644 roles/monitoring/alerta/templates/pod-spec.yml.j2

(limited to 'roles/monitoring/alerta')

diff --git a/roles/monitoring/alerta/defaults/main.yml b/roles/monitoring/alerta/defaults/main.yml
new file mode 100644
index 00000000..034c8268
--- /dev/null
+++ b/roles/monitoring/alerta/defaults/main.yml
@@ -0,0 +1,2 @@
+---
+# alerta_base_path: /srv/alerta
diff --git a/roles/monitoring/alerta/tasks/main.yml b/roles/monitoring/alerta/tasks/main.yml
new file mode 100644
index 00000000..af7530d8
--- /dev/null
+++ b/roles/monitoring/alerta/tasks/main.yml
@@ -0,0 +1,32 @@
+---
+- name: create alerta subdirectories
+  loop:
+    - config
+    - postgres
+  file:
+    path: "{{ alerta_base_path }}/{{ item }}"
+    state: directory
+
+- name: install alertad config template
+  copy:
+    content: |
+      DEBUG = {{ '{{' }} 'True' if env.DEBUG else 'False' {{ '}}' }}
+      SECRET = "{{ '{{' }} env.SECRET_KEY {{ '}}' }}"
+      PLUGINS = ['reject', 'blackout', 'heartbeat']
+      DEFAULT_ENVIRONMENT = 'unknown'
+      ALLOWED_ENVIRONMENTS = ['unknown', 'chaos-at-.*']
+      HEARTBEAT_EVENTS = ['PrometheusAlertmanagerE2eDeadManSwitch']
+    dest: "{{ alerta_base_path }}/config/alertad.conf.j2"
+
+- name: install pod manifest
+  vars:
+    kubernetes_standalone_pod:
+      name: "alerta"
+      spec: "{{ lookup('template', 'pod-spec.yml.j2') }}"
+      mode: "0600"
+      config_hash_items:
+        - path: "{{ alerta_base_path }}/config/alertad.conf.j2"
+          properties:
+            - checksum
+  include_role:
+    name: kubernetes/standalone/pod
diff --git a/roles/monitoring/alerta/templates/pod-spec.yml.j2 b/roles/monitoring/alerta/templates/pod-spec.yml.j2
new file mode 100644
index 00000000..e31686e3
--- /dev/null
+++ b/roles/monitoring/alerta/templates/pod-spec.yml.j2
@@ -0,0 +1,48 @@
+containers:
+- name: alerta
+  image: "alerta/alerta-web:8.7.0"
+  resources:
+    limits:
+      memory: "1Gi"
+  env:
+  - name: "DATABASE_URL"
+    value: "postgres://alerta:secret@127.0.0.1:5432/alerta"
+  - name: "AUTH_REQUIRED"
+    value: "True"
+  - name: "ADMIN_USERS"
+    value: "admin"
+  volumeMounts:
+  - name: config
+    mountPath: /app/alertad.conf.j2
+    subPath: alertad.conf.j2
+    readOnly: true
+  ports:
+  - containerPort: 8080
+    hostPort: 8080
+
+- name: postgresql
+  image: "postgres:14.2"
+  args:
+  - postgres
+  - -c
+  - listen_addresses=127.0.0.1
+  env:
+  - name: "POSTGRES_DB"
+    value: "alerta"
+  - name: "POSTGRES_USER"
+    value: "alerta"
+  - name: "POSTGRES_PASSWORD"
+    value: "secret"
+  volumeMounts:
+  - name: postgres
+    mountPath: /var/lib/postgresql/data
+
+volumes:
+- name: config
+  hostPath:
+    path: "{{ alerta_base_path }}/config"
+    type: Directory
+- name: postgres
+  hostPath:
+    path: "{{ alerta_base_path }}/postgres"
+    type: Directory
-- 
cgit v1.2.3

From 1bbe8582d9650eda816f9c596d8cd3b3fe9998e7 Mon Sep 17 00:00:00 2001
From: Christian Pointner
Date: Sun, 6 Mar 2022 00:31:22 +0100
Subject: alerta: test alertmanager silence integration

---
 inventory/host_vars/ch-mon.yml                                  | 3 ++-
 roles/monitoring/alerta/tasks/main.yml                          | 6 +++++-
 roles/monitoring/prometheus/server/defaults/main/rules_node.yml | 2 +-
 3 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'roles/monitoring/alerta')

diff --git a/inventory/host_vars/ch-mon.yml b/inventory/host_vars/ch-mon.yml
index bd952fc8..684cc722 100644
--- a/inventory/host_vars/ch-mon.yml
+++ b/inventory/host_vars/ch-mon.yml
@@ -145,6 +145,7 @@ prometheus_alertmanager_web_external_url: "http://{{ network.primary.address | i
 prometheus_alertmanager_auth_users:
   server: "{{ vault_prometheus_alertmanager_auth_user_passwords['server'] }}"
   admin: "{{ vault_prometheus_alertmanager_auth_user_passwords['admin'] }}"
+  alerta: "alerta" ## TODO: move this to vault

 prometheus_alertmanager_route:
   receiver: alerta
@@ -161,7 +162,7 @@ prometheus_alertmanager_receivers:
         http_config:
           basic_auth:
             username: admin
-            password: alerta
+            password: alerta ## TODO: move this to vault

 grafana_secret_key: "{{ vault_grafana_secret_key }}"

diff --git a/roles/monitoring/alerta/tasks/main.yml b/roles/monitoring/alerta/tasks/main.yml
index af7530d8..a4fd3df9 100644
--- a/roles/monitoring/alerta/tasks/main.yml
+++ b/roles/monitoring/alerta/tasks/main.yml
@@ -12,10 +12,14 @@
     content: |
       DEBUG = {{ '{{' }} 'True' if env.DEBUG else 'False' {{ '}}' }}
       SECRET = "{{ '{{' }} env.SECRET_KEY {{ '}}' }}"
-      PLUGINS = ['reject', 'blackout', 'heartbeat']
+      PLUGINS = ['reject', 'blackout', 'heartbeat', 'prometheus']
       DEFAULT_ENVIRONMENT = 'unknown'
       ALLOWED_ENVIRONMENTS = ['unknown', 'chaos-at-.*']
       HEARTBEAT_EVENTS = ['PrometheusAlertmanagerE2eDeadManSwitch']
+      ALERTMANAGER_USERNAME = 'alerta'
+      ALERTMANAGER_PASSWORD = 'alerta'
+      ALERTMANAGER_SILENCE_FROM_ACK = True
+      ALERTMANAGER_USE_EXTERNALURL_FOR_SILENCES = True
     dest: "{{ alerta_base_path }}/config/alertad.conf.j2"

 - name: install pod manifest
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
index 525355d5..75e96bca 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
@@ -206,7 +206,7 @@ prometheus_server_rules_node:
       severity: warning
     annotations:
       summary: Host Network Interface Saturated (instance {{ '{{' }} $labels.instance {{ '}}' }})
-      description: "The network interface \"{{ '{{' }} $labels.interface {{ '}}' }}\" on \"{{ '{{' }} $labels.instance {{ '}}' }}\" is getting overloaded.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+      description: "The network interface \"{{ '{{' }} $labels.device {{ '}}' }}\" on \"{{ '{{' }} $labels.instance {{ '}}' }}\" is getting overloaded.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"

   - alert: HostConntrackLimit
     expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
-- 
cgit v1.2.3

From c5fad84c1abe4bd208a75e34a92c242b8bbaf9ff Mon Sep 17 00:00:00 2001
From: Christian Pointner
Date: Sun, 6 Mar 2022 19:02:27 +0100
Subject: alerta: add mailer

---
Review notes (not part of the original commit):
 - tasks/main.yml: the added lines `mode: 0640` and `pull: yes` are spelled
   `mode: "0640"` and `pull: true` below; the blob ids on the index lines
   still refer to the original content.
 - The alerta-mailer API key and the Alertmanager/PostgreSQL passwords are
   still hard-coded in this series; the TODO markers next to them already ask
   for vault / key handling.

 roles/monitoring/alerta/files/email.tmpl          | 32 ++++++++++++
 roles/monitoring/alerta/tasks/main.yml            | 64 ++++++++++++++++++++++-
 roles/monitoring/alerta/templates/pod-spec.yml.j2 | 33 ++++++++++--
 3 files changed, 124 insertions(+), 5 deletions(-)
 create mode 100644 roles/monitoring/alerta/files/email.tmpl

(limited to 'roles/monitoring/alerta')

diff --git a/roles/monitoring/alerta/files/email.tmpl b/roles/monitoring/alerta/files/email.tmpl
new file mode 100644
index 00000000..939e0038
--- /dev/null
+++ b/roles/monitoring/alerta/files/email.tmpl
@@ -0,0 +1,32 @@
+
+------------------------------------------------------------
+[{{ alert.status|title }}] {{ alert.environment }}: {{ alert.severity|title }} {{ alert.event }} on {{ alert.service|join(', ') }} {{ alert.resource }}
+------------------------------------------------------------
+
+Alert ID: {{ alert.id }}
+Create Time: {{ alert.create_time }}
+Environment: {{ alert.environment }}
+Services: {{ alert.service|join(', ') }}
+Resource: {{ alert.resource }}
+Event: {{ alert.event }}
+Group: {{ alert.group }}
+Value: {{ alert.value }}
+Severity: {{ alert.previous_severity|title}} -> {{ alert.severity|title }}
+Status: {{ alert.status|title }}
+Text: {{ alert.text }}
+Duplicate Count: {{ alert.duplicate_count }}
+Origin: {{ alert.origin }}
+Tags: {{ alert.tags|join(', ') }}
+{% for key,value in alert.attributes.items() -%}
+{{ key|title }}: {{ value | safe }}
+{% endfor -%}
+
+{% if alert.raw_data %}
+Raw Data
+{{ alert.raw_data | safe }}
+{% endif %}
+
+To acknowledge this alert visit this URL:
+{{ dashboard_url | safe }}/#/alert/{{ alert.id }}
+
+Generated by {{ program }} on {{ hostname }} at {{ now }}
diff --git a/roles/monitoring/alerta/tasks/main.yml b/roles/monitoring/alerta/tasks/main.yml
index a4fd3df9..47df5308 100644
--- a/roles/monitoring/alerta/tasks/main.yml
+++ b/roles/monitoring/alerta/tasks/main.yml
@@ -3,16 +3,42 @@
   loop:
     - config
     - postgres
+    - build
   file:
     path: "{{ alerta_base_path }}/{{ item }}"
     state: directory

+- name: generate Dockerfile for custom image
+  copy:
+    content: |
+      FROM alerta/alerta-web:8.7.0
+
+      RUN set -x \
+          && sed 's/USE_AM_EXTERNALURL_FOR_SILENCES/ALERTMANAGER_USE_EXTERNALURL_FOR_SILENCES/' -i /venv/lib/python3.8/site-packages/alerta_prometheus.py \
+          && /venv/bin/pip install redis==4.1.4 \
+          && /venv/bin/pip install git+https://github.com/alerta/alerta-contrib.git@69d271ef9fe6542727ec4aa39fc8e0f797f1e8b1#subdirectory=integrations/mailer
+    dest: "{{ alerta_base_path }}/build/Dockerfile"
+  register: alerta_custom_image_docker
+
+- name: build custom image
+  docker_image:
+    name: "alerta-web-with-mailer:8.7.0"
+    state: present
+    force_source: "{{ alerta_custom_image_docker is changed }}"
+    source: build
+    build:
+      path: "{{ alerta_base_path }}/build"
+      network: host
+      pull: true
+
 - name: install alertad config template
   copy:
     content: |
       DEBUG = {{ '{{' }} 'True' if env.DEBUG else 'False' {{ '}}' }}
       SECRET = "{{ '{{' }} env.SECRET_KEY {{ '}}' }}"
-      PLUGINS = ['reject', 'blackout', 'heartbeat', 'prometheus']
+      ALERT_TIMEOUT = 86400
+      HEARTBEAT_TIMEOUT = 7200
+      PLUGINS = ['reject', 'blackout', 'heartbeat', 'prometheus', 'amqp']
       DEFAULT_ENVIRONMENT = 'unknown'
       ALLOWED_ENVIRONMENTS = ['unknown', 'chaos-at-.*']
       HEARTBEAT_EVENTS = ['PrometheusAlertmanagerE2eDeadManSwitch']
@@ -20,8 +46,35 @@
       ALERTMANAGER_PASSWORD = 'alerta'
       ALERTMANAGER_SILENCE_FROM_ACK = True
       ALERTMANAGER_USE_EXTERNALURL_FOR_SILENCES = True
+      AMQP_URL = 'redis://localhost:6379/'
     dest: "{{ alerta_base_path }}/config/alertad.conf.j2"

+  ## TODO: add key handling...
+- name: install alerta-mailer config file
+  copy:
+    content: |
+      [alerta-mailer]
+      debug = True
+      key = aNqBsEyG0ynIKcc3e7acaBVBk5B793o_z7tvlsht
+      endpoint = http://localhost:8080/api
+      amqp_url = redis://localhost:6379
+      smtp_host = 192.168.28.250
+      smtp_port = 25
+      smtp_starttls = False
+      skip_mta = False
+      mail_to = equinox@chaos-at-home.org
+      mail_from = noreply@chaos-at-home.org
+      email_type = text
+      mail_template = /app/email.tmpl
+      dashboard_url = http://192.168.32.1:8080
+    dest: "{{ alerta_base_path }}/config/alerta-mailer.conf"
+    mode: "0640"
+
+- name: install e-mail template
+  copy:
+    src: email.tmpl
+    dest: "{{ alerta_base_path }}/config/email.tmpl"
+
 - name: install pod manifest
   vars:
     kubernetes_standalone_pod:
@@ -32,5 +85,14 @@
         - path: "{{ alerta_base_path }}/config/alertad.conf.j2"
           properties:
             - checksum
+        - path: "{{ alerta_base_path }}/config/alerta-mailer.conf"
+          properties:
+            - checksum
+        - path: "{{ alerta_base_path }}/config/email.tmpl"
+          properties:
+            - checksum
+        - path: "{{ alerta_base_path }}/build/Dockerfile"
+          properties:
+            - checksum
   include_role:
     name: kubernetes/standalone/pod
diff --git a/roles/monitoring/alerta/templates/pod-spec.yml.j2 b/roles/monitoring/alerta/templates/pod-spec.yml.j2
index e31686e3..6edabae5 100644
--- a/roles/monitoring/alerta/templates/pod-spec.yml.j2
+++ b/roles/monitoring/alerta/templates/pod-spec.yml.j2
@@ -1,9 +1,6 @@
 containers:
 - name: alerta
-  image: "alerta/alerta-web:8.7.0"
-  resources:
-    limits:
-      memory: "1Gi"
+  image: "alerta-web-with-mailer:8.7.0"
   env:
   - name: "DATABASE_URL"
     value: "postgres://alerta:secret@127.0.0.1:5432/alerta"
@@ -11,6 +8,10 @@ containers:
     value: "True"
   - name: "ADMIN_USERS"
     value: "admin"
+  - name: "DEBUG"
+    value: "1"
+  - name: "SUPERVISORD_LOG_LEVEL"
+    value: "warn"
   volumeMounts:
   - name: config
     mountPath: /app/alertad.conf.j2
@@ -37,6 +38,30 @@ containers:
   - name: postgres
     mountPath: /var/lib/postgresql/data

+- name: redis
+  image: "redis:6.2.6"
+  args:
+  - redis-server
+  - --bind
+  - 127.0.0.1
+
+- name: mailer
+  image: "alerta-web-with-mailer:8.7.0"
+  command:
+  - alerta-mailer
+  env:
+  - name: "ALERTA_CONF_FILE"
+    value: "/app/alerta-mailer.conf"
+  volumeMounts:
+  - name: config
+    mountPath: /app/alerta-mailer.conf
+    subPath: alerta-mailer.conf
+    readOnly: true
+  - name: config
+    mountPath: /app/email.tmpl
+    subPath: email.tmpl
+    readOnly: true
+
 volumes:
 - name: config
   hostPath:
-- 
cgit v1.2.3

From 48f47342d85692e4d342c490085c13518be7a07e Mon Sep 17 00:00:00 2001
From: Christian Pointner
Date: Tue, 8 Mar 2022 22:39:30 +0100
Subject: prometheus/alerta: tune some severities

---
 roles/monitoring/alerta/tasks/main.yml                          | 1 +
 roles/monitoring/prometheus/server/defaults/main/rules_node.yml | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'roles/monitoring/alerta')

diff --git a/roles/monitoring/alerta/tasks/main.yml b/roles/monitoring/alerta/tasks/main.yml
index 47df5308..490f5e3d 100644
--- a/roles/monitoring/alerta/tasks/main.yml
+++ b/roles/monitoring/alerta/tasks/main.yml
@@ -58,6 +58,7 @@
       key = aNqBsEyG0ynIKcc3e7acaBVBk5B793o_z7tvlsht
       endpoint = http://localhost:8080/api
       amqp_url = redis://localhost:6379
+      severities = critical, warning
       smtp_host = 192.168.28.250
       smtp_port = 25
       smtp_starttls = False
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
index 75e96bca..d211731a 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml
@@ -167,7 +167,7 @@ prometheus_server_rules_node:
     expr: increase(node_edac_correctable_errors_total[1m]) > 0
     for: 0m
     labels:
-      severity: info
+      severity: warning
     annotations:
       summary: Host EDAC Correctable Errors detected (instance {{ '{{' }} $labels.instance {{ '}}' }})
       description: "Host {{ '{{' }} $labels.instance {{ '}}' }} has had {{ '{{' }} printf \"%.0f\" $value {{ '}}' }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
@@ -176,7 +176,7 @@ prometheus_server_rules_node:
     expr: node_edac_uncorrectable_errors_total > 0
     for: 0m
     labels:
-      severity: warning
+      severity: critical
     annotations:
       summary: Host EDAC Uncorrectable Errors detected (instance {{ '{{' }} $labels.instance {{ '}}' }})
       description: "Host {{ '{{' }} $labels.instance {{ '}}' }} has had {{ '{{' }} printf \"%.0f\" $value {{ '}}' }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
-- 
cgit v1.2.3