summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- chaos-at-home/ch-prometheus.yml 4
-rw-r--r-- inventory/group_vars/promzone-chaos-at-home/vars.yml 1
-rw-r--r-- inventory/host_vars/ch-prometheus.yml 17
-rw-r--r-- inventory/hosts.ini 1
-rw-r--r-- roles/monitoring/prometheus/exporter/base/tasks/main.yml 5
-rw-r--r-- roles/monitoring/prometheus/exporter/base/tasks/tls.yml 4
-rw-r--r-- roles/monitoring/prometheus/exporter/ipmi/templates/service.j2 5
-rw-r--r-- roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml 137
8 files changed, 166 insertions, 8 deletions
diff --git a/chaos-at-home/ch-prometheus.yml b/chaos-at-home/ch-prometheus.yml
index d7948562..0206228a 100644
--- a/chaos-at-home/ch-prometheus.yml
+++ b/chaos-at-home/ch-prometheus.yml
@@ -7,9 +7,11 @@
- role: core/sshd/base
- role: core/zsh
- role: core/cpu-microcode
+ - role: apt-repo/spreadspace
+ - role: nginx/base
+ - role: monitoring/prometheus/exporter
- role: storage/luks/volumes
- role: storage/zfs/pools
- - role: apt-repo/spreadspace
- role: storage/zfs/sanoid
- role: chaos-at-home/fileserver
- role: vm/host/base
diff --git a/inventory/group_vars/promzone-chaos-at-home/vars.yml b/inventory/group_vars/promzone-chaos-at-home/vars.yml
index 529bf3e7..a77fc0cb 100644
--- a/inventory/group_vars/promzone-chaos-at-home/vars.yml
+++ b/inventory/group_vars/promzone-chaos-at-home/vars.yml
@@ -13,6 +13,7 @@ prometheus_server_jobs:
- nut/ups
- blackbox
- blackbox/probe
+ - ipmi
prometheus_zone_name: chaos@home
prometheus_zone_targets: "{{ groups['promzone-chaos-at-home'] }}"
diff --git a/inventory/host_vars/ch-prometheus.yml b/inventory/host_vars/ch-prometheus.yml
index 9381f0c2..1779c602 100644
--- a/inventory/host_vars/ch-prometheus.yml
+++ b/inventory/host_vars/ch-prometheus.yml
@@ -44,6 +44,23 @@ ssh_keys_root_extra:
- ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC9AkOBxvf1wZ0B3wEyf7O3GbaIGx5o2f6cVuQIrOjeFfgMSAr1LwiB/gmHhMSEq6OSauD37TA5yDIrzk6NPPjVs/wiklsHgYtTqIxSPItTZFPX4gLvNwwGuRvEW9bTEiHd+bVPIiIT7HOje0kgacjan44rdgppX9DgcUp2j7uSZZabsxDCS/ms0slhwBNU1gtR31PoQ56vIya23D2uMauNAbRJzDEOfAjy4pHF8njYcXPas/yrbLi8PUZ1YO1u/AZto96EIYfHaCLWlstqeCX+R2JrTunvfTr8TF3AkFw8lHMzk3neUR+tPAAFQaqeTlqGPiSNq1Oyf+52XR16qwhd equinox@mail
- ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIZK9NBainiE0+A8pT8dbwlNZ0k0AZVhLTzUSo3YtKJt ZFS Backup syncoid@epimetheus
+
+spreadspace_apt_repo_components:
+ - main
+ - prometheus
+
+prometheus_exporters_extra:
+ - ipmi
+
+prometheus_exporter_ipmi_modules:
+ default:
+ collectors:
+ - bmc
+ - ipmi
+ - chassis
+ - sel
+
+
installer_storage:
type: lvm
vg: "{{ host_name }}"
diff --git a/inventory/hosts.ini b/inventory/hosts.ini
index 06d360a0..36d2751e 100644
--- a/inventory/hosts.ini
+++ b/inventory/hosts.ini
@@ -409,6 +409,7 @@ ch-mon
[promzone-chaos-at-home]
ch-router
ch-testvm-prometheus
+ch-prometheus
[promzone-chaos-at-home:children]
chaos-at-home-ap
chaos-at-home-ups
diff --git a/roles/monitoring/prometheus/exporter/base/tasks/main.yml b/roles/monitoring/prometheus/exporter/base/tasks/main.yml
index 5f42867d..c69c6e05 100644
--- a/roles/monitoring/prometheus/exporter/base/tasks/main.yml
+++ b/roles/monitoring/prometheus/exporter/base/tasks/main.yml
@@ -13,6 +13,11 @@
home: /nonexistent
create_home: no
+- name: create base directory for exporter configs
+ file:
+ path: /etc/prometheus/exporter
+ state: directory
+
- name: create TLS certificate and key
import_tasks: tls.yml
diff --git a/roles/monitoring/prometheus/exporter/base/tasks/tls.yml b/roles/monitoring/prometheus/exporter/base/tasks/tls.yml
index e34025e4..083ca930 100644
--- a/roles/monitoring/prometheus/exporter/base/tasks/tls.yml
+++ b/roles/monitoring/prometheus/exporter/base/tasks/tls.yml
@@ -25,7 +25,7 @@
owner: prometheus-exporter
group: prometheus-exporter
mode: 0400
- notify: restart prometheus-exporter-exporter
+ notify: reload nginx
- name: create signing request for exporter certificate
openssl_csr:
@@ -86,7 +86,7 @@
copy:
content: "{{ prometheus_exporter_server_cert.certificate }}"
dest: /etc/ssl/prometheus/exporter/crt.pem
- notify: restart prometheus-exporter-exporter
+ notify: reload nginx
- name: slurp CA certificate
delegate_to: "{{ prometheus_server }}"
diff --git a/roles/monitoring/prometheus/exporter/ipmi/templates/service.j2 b/roles/monitoring/prometheus/exporter/ipmi/templates/service.j2
index 465215e8..d862e299 100644
--- a/roles/monitoring/prometheus/exporter/ipmi/templates/service.j2
+++ b/roles/monitoring/prometheus/exporter/ipmi/templates/service.j2
@@ -1,22 +1,21 @@
[Unit]
Description=Prometheus ipmi exporter
+After=systemd-modules-load.service
[Service]
Restart=always
-User=prometheus-exporter
ExecStart=/usr/bin/prometheus-ipmi-exporter --web.listen-address="127.0.0.1:9290" --config.file=/etc/prometheus/exporter/ipmi/config.yml --freeipmi.path="/usr/sbin"
ExecReload=/bin/kill -HUP $MAINPID
-{# TODO: test which hardening options need to be removed for IPMI to work... #}
# systemd hardening-options
AmbientCapabilities=
CapabilityBoundingSet=
DeviceAllow=/dev/null rw
+DeviceAllow=char-ipmidev rw
DevicePolicy=strict
LockPersonality=true
MemoryDenyWriteExecute=true
NoNewPrivileges=true
-PrivateDevices=true
PrivateTmp=true
ProtectControlGroups=true
ProtectHome=true
diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml
index 41dcd7e9..31c15d51 100644
--- a/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml
+++ b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml
@@ -1,4 +1,137 @@
---
prometheus_server_rules_ipmi_extra: []
-prometheus_server_rules_ipmi: []
-## TODO: add common IPMI alert rules
+prometheus_server_rules_ipmi:
+ - alert: IpmiFailedToScrapeCollector
+ expr: ipmi_up == 0
+ for: 2m
+ labels:
+ severity: warning
+ annotations:
+ summary: IPMI collector failed to scrape (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "The IPMI collector {{ '{{' }} $labels.collector {{ '}}' }} could not be scraped.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: IpmiSensorStateWarning
+ expr: ipmi_sensor_state == 1
+ for: 2m
+ labels:
+ severity: warning
+ annotations:
+ summary: IPMI Sensor state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "The IPMI sensor {{ '{{' }} $labels.name {{ '}}' }} has state: warning.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: IpmiSensorStateCritical
+ expr: ipmi_sensor_state == 2
+ for: 2m
+ labels:
+ severity: critical
+ annotations:
+ summary: IPMI Sensor state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "The IPMI sensor {{ '{{' }} $labels.name {{ '}}' }} has state: critical.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: IpmiFanSpeedStateWarning
+ expr: ipmi_fan_speed_state == 1
+ for: 2m
+ labels:
+ severity: warning
+ annotations:
+ summary: IPMI fan-speed state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "The IPMI fan speed {{ '{{' }} $labels.name {{ '}}' }} has state: warning.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: IpmiFanSpeedStateCritical
+ expr: ipmi_fan_speed_state == 2
+ for: 2m
+ labels:
+ severity: critical
+ annotations:
+ summary: IPMI fan-speed state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "The IPMI fan speed {{ '{{' }} $labels.name {{ '}}' }} has state: critical.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: IpmiTemperatureStateWarning
+ expr: ipmi_temperature_state == 1
+ for: 2m
+ labels:
+ severity: warning
+ annotations:
+ summary: IPMI Temperature state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "The IPMI temperature {{ '{{' }} $labels.name {{ '}}' }} has state: warning.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: IpmiTemperatureStateCritical
+ expr: ipmi_temperature_state == 2
+ for: 2m
+ labels:
+ severity: critical
+ annotations:
+ summary: IPMI Temperature state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "The IPMI temperature {{ '{{' }} $labels.name {{ '}}' }} has state: critical.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: IpmiVoltageStateWarning
+ expr: ipmi_voltage_state == 1
+ for: 2m
+ labels:
+ severity: warning
+ annotations:
+ summary: IPMI Voltage state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "The IPMI voltage {{ '{{' }} $labels.name {{ '}}' }} has state: warning.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: IpmiVoltageStateCritical
+ expr: ipmi_voltage_state == 2
+ for: 2m
+ labels:
+ severity: critical
+ annotations:
+ summary: IPMI Voltage state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "The IPMI voltage {{ '{{' }} $labels.name {{ '}}' }} has state: critical.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: IpmiCurrentStateWarning
+ expr: ipmi_current_state == 1
+ for: 2m
+ labels:
+ severity: warning
+ annotations:
+ summary: IPMI Current state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "The IPMI current {{ '{{' }} $labels.name {{ '}}' }} has state: warning.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: IpmiCurrentStateCritical
+ expr: ipmi_current_state == 2
+ for: 2m
+ labels:
+ severity: critical
+ annotations:
+ summary: IPMI Current state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "The IPMI current {{ '{{' }} $labels.name {{ '}}' }} has state: critical.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: IpmiPowerStateWarning
+ expr: ipmi_power_state == 1
+ for: 2m
+ labels:
+ severity: warning
+ annotations:
+ summary: IPMI Power state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "The IPMI power {{ '{{' }} $labels.name {{ '}}' }} has state: warning.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: IpmiPowerStateCritical
+ expr: ipmi_power_state == 2
+ for: 2m
+ labels:
+ severity: critical
+ annotations:
+ summary: IPMI Power state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "The IPMI power {{ '{{' }} $labels.name {{ '}}' }} has state: critical.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: IpmiSystemEventLogManyEvents
+ expr: deriv(ipmi_sel_logs_count[2m]) > 0.1
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: IPMI SEL grows quickly (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "The IPMI system log contains too many new events.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"
+
+ - alert: IpmiSystemEventLogFull
+ expr: ipmi_sel_free_space_bytes < 100
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: IPMI SEL is full (instance {{ '{{' }} $labels.instance {{ '}}' }})
+ description: "The IPMI system log ran out of space.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}"