diff options
-rw-r--r-- | chaos-at-home/ch-prometheus.yml | 4 | ||||
-rw-r--r-- | inventory/group_vars/promzone-chaos-at-home/vars.yml | 1 | ||||
-rw-r--r-- | inventory/host_vars/ch-prometheus.yml | 17 | ||||
-rw-r--r-- | inventory/hosts.ini | 1 | ||||
-rw-r--r-- | roles/monitoring/prometheus/exporter/base/tasks/main.yml | 5 | ||||
-rw-r--r-- | roles/monitoring/prometheus/exporter/base/tasks/tls.yml | 4 | ||||
-rw-r--r-- | roles/monitoring/prometheus/exporter/ipmi/templates/service.j2 | 5 | ||||
-rw-r--r-- | roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml | 137 |
8 files changed, 166 insertions, 8 deletions
diff --git a/chaos-at-home/ch-prometheus.yml b/chaos-at-home/ch-prometheus.yml index d7948562..0206228a 100644 --- a/chaos-at-home/ch-prometheus.yml +++ b/chaos-at-home/ch-prometheus.yml @@ -7,9 +7,11 @@ - role: core/sshd/base - role: core/zsh - role: core/cpu-microcode + - role: apt-repo/spreadspace + - role: nginx/base + - role: monitoring/prometheus/exporter - role: storage/luks/volumes - role: storage/zfs/pools - - role: apt-repo/spreadspace - role: storage/zfs/sanoid - role: chaos-at-home/fileserver - role: vm/host/base diff --git a/inventory/group_vars/promzone-chaos-at-home/vars.yml b/inventory/group_vars/promzone-chaos-at-home/vars.yml index 529bf3e7..a77fc0cb 100644 --- a/inventory/group_vars/promzone-chaos-at-home/vars.yml +++ b/inventory/group_vars/promzone-chaos-at-home/vars.yml @@ -13,6 +13,7 @@ prometheus_server_jobs: - nut/ups - blackbox - blackbox/probe + - ipmi prometheus_zone_name: chaos@home prometheus_zone_targets: "{{ groups['promzone-chaos-at-home'] }}" diff --git a/inventory/host_vars/ch-prometheus.yml b/inventory/host_vars/ch-prometheus.yml index 9381f0c2..1779c602 100644 --- a/inventory/host_vars/ch-prometheus.yml +++ b/inventory/host_vars/ch-prometheus.yml @@ -44,6 +44,23 @@ ssh_keys_root_extra: - ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC9AkOBxvf1wZ0B3wEyf7O3GbaIGx5o2f6cVuQIrOjeFfgMSAr1LwiB/gmHhMSEq6OSauD37TA5yDIrzk6NPPjVs/wiklsHgYtTqIxSPItTZFPX4gLvNwwGuRvEW9bTEiHd+bVPIiIT7HOje0kgacjan44rdgppX9DgcUp2j7uSZZabsxDCS/ms0slhwBNU1gtR31PoQ56vIya23D2uMauNAbRJzDEOfAjy4pHF8njYcXPas/yrbLi8PUZ1YO1u/AZto96EIYfHaCLWlstqeCX+R2JrTunvfTr8TF3AkFw8lHMzk3neUR+tPAAFQaqeTlqGPiSNq1Oyf+52XR16qwhd equinox@mail - ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIZK9NBainiE0+A8pT8dbwlNZ0k0AZVhLTzUSo3YtKJt ZFS Backup syncoid@epimetheus + +spreadspace_apt_repo_components: + - main + - prometheus + +prometheus_exporters_extra: + - ipmi + +prometheus_exporter_ipmi_modules: + default: + collectors: + - bmc + - ipmi + - chassis + - sel + + installer_storage: type: lvm vg: "{{ host_name }}" diff --git a/inventory/hosts.ini b/inventory/hosts.ini index 06d360a0..36d2751e 100644 --- a/inventory/hosts.ini +++ b/inventory/hosts.ini @@ -409,6 +409,7 @@ ch-mon [promzone-chaos-at-home] ch-router ch-testvm-prometheus +ch-prometheus [promzone-chaos-at-home:children] chaos-at-home-ap chaos-at-home-ups diff --git a/roles/monitoring/prometheus/exporter/base/tasks/main.yml b/roles/monitoring/prometheus/exporter/base/tasks/main.yml index 5f42867d..c69c6e05 100644 --- a/roles/monitoring/prometheus/exporter/base/tasks/main.yml +++ b/roles/monitoring/prometheus/exporter/base/tasks/main.yml @@ -13,6 +13,11 @@ home: /nonexistent create_home: no +- name: create base directory for exporter configs + file: + path: /etc/prometheus/exporter + state: directory + - name: create TLS certificate and key import_tasks: tls.yml diff --git a/roles/monitoring/prometheus/exporter/base/tasks/tls.yml b/roles/monitoring/prometheus/exporter/base/tasks/tls.yml index e34025e4..083ca930 100644 --- a/roles/monitoring/prometheus/exporter/base/tasks/tls.yml +++ b/roles/monitoring/prometheus/exporter/base/tasks/tls.yml @@ -25,7 +25,7 @@ owner: prometheus-exporter group: prometheus-exporter mode: 0400 - notify: restart prometheus-exporter-exporter + notify: reload nginx - name: create signing request for exporter certificate openssl_csr: @@ -86,7 +86,7 @@ copy: content: "{{ prometheus_exporter_server_cert.certificate }}" dest: /etc/ssl/prometheus/exporter/crt.pem - notify: restart prometheus-exporter-exporter + notify: reload nginx - name: slurp CA certificate delegate_to: "{{ prometheus_server }}" diff --git a/roles/monitoring/prometheus/exporter/ipmi/templates/service.j2 b/roles/monitoring/prometheus/exporter/ipmi/templates/service.j2 index 465215e8..d862e299 100644 --- a/roles/monitoring/prometheus/exporter/ipmi/templates/service.j2 +++ b/roles/monitoring/prometheus/exporter/ipmi/templates/service.j2 @@ -1,22 +1,21 @@ [Unit] Description=Prometheus ipmi exporter +After=systemd-modules-load.service [Service] Restart=always -User=prometheus-exporter ExecStart=/usr/bin/prometheus-ipmi-exporter --web.listen-address="127.0.0.1:9290" --config.file=/etc/prometheus/exporter/ipmi/config.yml --freeipmi.path="/usr/sbin" ExecReload=/bin/kill -HUP $MAINPID -{# TODO: test which hardening options need to be removed for IPMI to work... #} # systemd hardening-options AmbientCapabilities= CapabilityBoundingSet= DeviceAllow=/dev/null rw +DeviceAllow=char-ipmidev rw DevicePolicy=strict LockPersonality=true MemoryDenyWriteExecute=true NoNewPrivileges=true -PrivateDevices=true PrivateTmp=true ProtectControlGroups=true ProtectHome=true diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml index 41dcd7e9..31c15d51 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml @@ -1,4 +1,137 @@ --- prometheus_server_rules_ipmi_extra: [] -prometheus_server_rules_ipmi: [] -## TODO: add common IPMI alert rules +prometheus_server_rules_ipmi: + - alert: IpmiFailedToScrapeCollector + expr: ipmi_up == 0 + for: 2m + labels: + severity: warning + annotations: + summary: IPMI collector failed to scrape (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "The IPMI collector {{ '{{' }} $labels.collector {{ '}}' }} could not be scraped.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: IpmiSensorStateWarning + expr: ipmi_sensor_state == 1 + for: 2m + labels: + severity: warning + annotations: + summary: IPMI Sensor state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "The IPMI sensor {{ '{{' }} $labels.name {{ '}}' }} has state: warning.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: IpmiSensorStateCritical + expr: ipmi_sensor_state == 2 + for: 2m + labels: + severity: critical + annotations: + summary: IPMI Sensor state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "The IPMI sensor {{ '{{' }} $labels.name {{ '}}' }} has state: critical.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: IpmiFanSpeedStateWarning + expr: ipmi_fan_speed_state == 1 + for: 2m + labels: + severity: warning + annotations: + summary: IPMI fan-speed state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "The IPMI fanspeed {{ '{{' }} $labels.name {{ '}}' }} has state: warning.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: IpmiFanSpeedStateCritical + expr: ipmi_fan_speed_state == 2 + for: 2m + labels: + severity: critical + annotations: + summary: IPMI fan-speed state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "The IPMI fan speed {{ '{{' }} $labels.name {{ '}}' }} has state: critical.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: IpmiTemperatureStateWarning + expr: ipmi_temperature_state == 1 + for: 2m + labels: + severity: warning + annotations: + summary: IPMI Temperature state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "The IPMI temperature {{ '{{' }} $labels.name {{ '}}' }} has state: warning.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: IpmiTemperatureStateCritical + expr: ipmi_temperature_state == 2 + for: 2m + labels: + severity: critical + annotations: + summary: IPMI Temperature state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "The IPMI temperature {{ '{{' }} $labels.name {{ '}}' }} has state: critical.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: IpmiVoltageStateWarning + expr: ipmi_voltage_state == 1 + for: 2m + labels: + severity: warning + annotations: + summary: IPMI Voltage state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "The IPMI voltage {{ '{{' }} $labels.name {{ '}}' }} has state: warning.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: IpmiVoltageStateCritical + expr: ipmi_voltage_state == 2 + for: 2m + labels: + severity: critical + annotations: + summary: IPMI Voltage state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "The IPMI voltage {{ '{{' }} $labels.name {{ '}}' }} has state: critical.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: IpmiCurrentStateWarning + expr: ipmi_current_state == 1 + for: 2m + labels: + severity: warning + annotations: + summary: IPMI Current state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "The IPMI current {{ '{{' }} $labels.name {{ '}}' }} has state: warning.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: IpmiCurrentStateCritical + expr: ipmi_current_state == 2 + for: 2m + labels: + severity: critical + annotations: + summary: IPMI Current state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "The IPMI current {{ '{{' }} $labels.name {{ '}}' }} has state: critical.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: IpmiPowerStateWarning + expr: ipmi_power_state == 1 + for: 2m + labels: + severity: warning + annotations: + summary: IPMI Power state is warning (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "The IPMI power {{ '{{' }} $labels.name {{ '}}' }} has state: warning.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: IpmiPowerStateCritical + expr: ipmi_power_state == 2 + for: 2m + labels: + severity: critical + annotations: + summary: IPMI Power state is critical (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "The IPMI power {{ '{{' }} $labels.name {{ '}}' }} has state: critical.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: IpmiSystemEventLogManyEvents + expr: rate(ipmi_sel_logs_count[2m]) > 0.1 + for: 1m + labels: + severity: warning + annotations: + summary: IPMI SEL grows quickly (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "The IPMI system log contains too many new events.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: IpmiSystemEventLogFull + expr: ipmi_sel_free_space_bytes < 100 + for: 0m + labels: + severity: warning + annotations: + summary: IPMI SEL is full (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "The IPMI system log ran out of space.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" |