diff options
Diffstat (limited to 'roles/monitoring/prometheus/server')
28 files changed, 235 insertions, 148 deletions
diff --git a/roles/monitoring/prometheus/server/defaults/main/main.yml b/roles/monitoring/prometheus/server/defaults/main/main.yml index 95b9da6d..1e0ccf78 100644 --- a/roles/monitoring/prometheus/server/defaults/main/main.yml +++ b/roles/monitoring/prometheus/server/defaults/main/main.yml @@ -5,9 +5,8 @@ prometheus_server_retention: "15d" -prometheus_server_jobs_generic: +prometheus_server_jobs: - node -prometheus_server_jobs_special: [] #prometheus_server_jobs_extra: | # - job_name: ... @@ -16,9 +15,11 @@ prometheus_server_rules: node: "{{ prometheus_server_rules_node + prometheus_server_rules_node_extra }}" openwrt: "{{ prometheus_server_rules_openwrt + prometheus_server_rules_node_extra }}" nut: "{{ prometheus_server_rules_nut + prometheus_server_rules_nut_extra }}" - "blackbox-ping": "{{ prometheus_server_rules_blackbox_ping + prometheus_server_rules_blackbox_ping_extra }}" - "blackbox-https": "{{ prometheus_server_rules_blackbox_https + prometheus_server_rules_blackbox_https_extra }}" - "blackbox-ssh": "{{ prometheus_server_rules_blackbox_ssh + prometheus_server_rules_blackbox_ssh_extra }}" + nut/ups: "{{ prometheus_server_rules_nut__ups + prometheus_server_rules_nut__ups_extra }}" + blackbox: "{{ prometheus_server_rules_blackbox + prometheus_server_rules_blackbox_extra }}" + blackbox/probe: "{{ prometheus_server_rules_blackbox__probe + prometheus_server_rules_blackbox__probe_extra }}" + ipmi: "{{ prometheus_server_rules_ipmi + prometheus_server_rules_ipmi_extra }}" + ipmi/remote: "{{ prometheus_server_rules_ipmi__remote + prometheus_server_rules_ipmi__remote_extra }}" # prometheus_server_alertmanager: # url: "127.0.0.1:9093" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-https.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-https.yml deleted file mode 100644 index bb806075..00000000 --- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-https.yml +++ /dev/null @@ -1,3 +0,0 @@ ---- 
-prometheus_server_rules_blackbox_https_extra: [] -prometheus_server_rules_blackbox_https: [] diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ping.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ping.yml deleted file mode 100644 index 56c122f5..00000000 --- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ping.yml +++ /dev/null @@ -1,3 +0,0 @@ ---- -prometheus_server_rules_blackbox_ping_extra: [] -prometheus_server_rules_blackbox_ping: [] diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ssh.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ssh.yml deleted file mode 100644 index 727d2292..00000000 --- a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox-ssh.yml +++ /dev/null @@ -1,3 +0,0 @@ ---- -prometheus_server_rules_blackbox_ssh_extra: [] -prometheus_server_rules_blackbox_ssh: [] diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml new file mode 100644 index 00000000..99f2e83c --- /dev/null +++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox.yml @@ -0,0 +1,3 @@ +--- +prometheus_server_rules_blackbox_extra: [] +prometheus_server_rules_blackbox: [] diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml new file mode 100644 index 00000000..9f9d2292 --- /dev/null +++ b/roles/monitoring/prometheus/server/defaults/main/rules_blackbox__probe.yml @@ -0,0 +1,74 @@ +--- +prometheus_server_rules_blackbox__probe_extra: [] +prometheus_server_rules_blackbox__probe: + - alert: BlackboxProbeFailed + expr: probe_success == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox probe failed (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Probe failed\n VALUE = {{ '{{' }} $value {{ '}}' }}\n 
LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxSlowProbe + expr: avg_over_time(probe_duration_seconds[1m]) > 1 + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox slow probe (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Blackbox probe took more than 1s to complete\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxSslCertificateWillExpireSoon + expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30 + for: 0m + labels: + severity: warning + annotations: + summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "SSL certificate expires in 30 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxSslCertificateWillExpireSoon + expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox SSL certificate will expire soon (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "SSL certificate expires in 3 days\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxSslCertificateExpired + expr: probe_ssl_earliest_cert_expiry - time() <= 0 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox SSL certificate expired (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "SSL certificate has expired already\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxProbeHttpFailure + expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox probe HTTP failure (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "HTTP status code is not 200-399\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: 
BlackboxProbeSlowHttp + expr: avg_over_time(probe_http_duration_seconds[1m]) > 1 + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox probe slow HTTP (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "HTTP request took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: BlackboxProbeSlowPing + expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1 + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox probe slow ping (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Blackbox ping took more than 1s\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml new file mode 100644 index 00000000..41dcd7e9 --- /dev/null +++ b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi.yml @@ -0,0 +1,4 @@ +--- +prometheus_server_rules_ipmi_extra: [] +prometheus_server_rules_ipmi: [] +## TODO: add common IPMI alert rules diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml new file mode 100644 index 00000000..1f9338ea --- /dev/null +++ b/roles/monitoring/prometheus/server/defaults/main/rules_ipmi__remote.yml @@ -0,0 +1,4 @@ +--- +prometheus_server_rules_ipmi__remote_extra: [] +prometheus_server_rules_ipmi__remote: [] +## TODO: add remote-IPMI specific alert rules diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml index ab7317ac..55641534 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_node.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_node.yml @@ -92,6 +92,15 @@ prometheus_server_rules_node: summary: Host CPU steal noisy neighbor (instance {{ '{{' }} 
$labels.instance {{ '}}' }}) description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + - alert: HostSystemdNotRunning + expr: node_systemd_system_running == 0 + for: 0m + labels: + severity: warning + annotations: + summary: Host systemd is not in running state (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "systemd is not in running state.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + - alert: HostSystemdServiceCrashed expr: node_systemd_unit_state{state="failed"} == 1 for: 0m @@ -99,7 +108,7 @@ prometheus_server_rules_node: severity: warning annotations: summary: Host systemd service crashed (instance {{ '{{' }} $labels.instance {{ '}}' }}) - description: "systemd service crashed\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + description: "The systemd service unit {{ '{{' }} $labels.name {{ '}}' }} is in failed state.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" - alert: HostPhysicalComponentTooHot expr: node_hwmon_temp_celsius > 75 @@ -217,3 +226,30 @@ prometheus_server_rules_node: annotations: summary: Host clock not synchronising (instance {{ '{{' }} $labels.instance {{ '}}' }}) description: "Clock not synchronising.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: AptUpgradesPending + expr: sum by (instance) (apt_upgrades_pending) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Host has upgradeable packages (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Host {{ '{{' }} $labels.instance {{ '}}' }} has {{ '{{' }} $value {{ '}}' }} upgradable packages.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: AptAutoremovePending + expr: sum by (instance) 
(apt_autoremove_pending) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Host has packages that can be autoremoved (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Host {{ '{{' }} $labels.instance {{ '}}' }} has {{ '{{' }} $value {{ '}}' }} packages that can be autoremoved.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" + + - alert: HostNeedsRebooting + expr: node_reboot_required > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Host must be rebooted (instance {{ '{{' }} $labels.instance {{ '}}' }}) + description: "Host {{ '{{' }} $labels.instance {{ '}}' }} must be rebooted for security updates to take effect.\n VALUE = {{ '{{' }} $value {{ '}}' }}\n LABELS = {{ '{{' }} $labels {{ '}}' }}" diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml b/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml new file mode 100644 index 00000000..150a507e --- /dev/null +++ b/roles/monitoring/prometheus/server/defaults/main/rules_nut__ups.yml @@ -0,0 +1,4 @@ +--- +prometheus_server_rules_nut__ups_extra: [] +prometheus_server_rules_nut__ups: [] +## TODO: add NUT/UPS alert rules diff --git a/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml b/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml index 88d84f31..04b178f1 100644 --- a/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml +++ b/roles/monitoring/prometheus/server/defaults/main/rules_openwrt.yml @@ -1,3 +1,4 @@ --- prometheus_server_rules_openwrt_extra: [] prometheus_server_rules_openwrt: [] +## TODO: add openwrt specific alert rules diff --git a/roles/monitoring/prometheus/server/filter_plugins/prometheus.py b/roles/monitoring/prometheus/server/filter_plugins/prometheus.py index 5a8722c2..d91ef619 100644 --- a/roles/monitoring/prometheus/server/filter_plugins/prometheus.py +++ b/roles/monitoring/prometheus/server/filter_plugins/prometheus.py 
@@ -6,38 +6,31 @@ from functools import partial from ansible import errors -def prometheus_generic_job_targets(hostvars, jobs, targets): +def prometheus_job_targets(hostvars, jobs, targets): try: result = [] for job in jobs: for target in targets: - enabled = job in hostvars[target]['prometheus_exporters_default'] or job in hostvars[target]['prometheus_exporters_extra'] - result.append({'job': job, 'instance': target, 'enabled': enabled}) + multitarget_config_varname = 'prometheus_job_multitarget_' + job.replace('-', '_').replace('/', '__') + if multitarget_config_varname in hostvars[target]: + for exporter_hostname, configs in hostvars[target][multitarget_config_varname].items(): + for config in configs: + result.append({'job': job, 'instance': config['instance'], 'enabled': True, + 'exporter_hostname': exporter_hostname, 'config': config}) + + else: + enabled = job in hostvars[target]['prometheus_exporters_default'] or job in hostvars[target]['prometheus_exporters_extra'] + result.append({'job': job, 'instance': target, 'enabled': enabled}) return result except Exception as e: - raise errors.AnsibleFilterError("prometheus_generic_job_targets(): %s" % str(e)) - - -def prometheus_special_job_targets(hostvars, jobs, targets): - try: - result = [] - for job in jobs: - for target in targets: - config_varname = 'prometheus_special_job_' + job.replace('-', '_') - if config_varname in hostvars[target]: - for config in hostvars[target][config_varname]: - result.append({'job': job, 'instance': config['instance'], 'config': config}) - return result - except Exception as e: - raise errors.AnsibleFilterError("prometheus_special_job_targets(): %s" % str(e)) + raise errors.AnsibleFilterError("prometheus_job_targets(): %s" % str(e)) class FilterModule(object): ''' prometheus filters ''' filter_map = { - 'prometheus_generic_job_targets': prometheus_generic_job_targets, - 'prometheus_special_job_targets': prometheus_special_job_targets, + 'prometheus_job_targets': 
prometheus_job_targets, } def filters(self): diff --git a/roles/monitoring/prometheus/server/tasks/main.yml b/roles/monitoring/prometheus/server/tasks/main.yml index d0ccd8af..16167c9c 100644 --- a/roles/monitoring/prometheus/server/tasks/main.yml +++ b/roles/monitoring/prometheus/server/tasks/main.yml @@ -13,6 +13,7 @@ include_role: name: "storage/{{ prometheus_server_storage.type }}/volume" + ## TODO: pin version - name: install apt packages apt: name: prom-server @@ -37,50 +38,52 @@ - name: create configuration directories loop: - - jobs - rules - targets file: path: "/etc/prometheus/{{ item }}" state: directory -- name: create sub-directroy for all exporter types in jobs directory - loop: "{{ prometheus_server_jobs_generic + prometheus_server_jobs_special }}" +- name: create sub-directories for all jobs in targets directory + loop: "{{ prometheus_server_jobs }}" file: - path: "/etc/prometheus/jobs/{{ item }}" + path: "/etc/prometheus/targets/{{ item }}" state: directory -- name: generate generic targets config - loop: "{{ prometheus_zone_targets }}" - loop_control: - loop_var: target - template: - src: targets/generic.yml.j2 - dest: "/etc/prometheus/targets/{{ target }}.yml" - notify: reload prometheus +- name: enable/disable job targets + vars: + job_targets: "{{ hostvars | prometheus_job_targets(prometheus_server_jobs, prometheus_zone_targets) }}" + block: + - name: install files for enabled targets + loop: "{{ job_targets }}" + loop_control: + loop_var: target + label: "{{ target.job }} -> {{ target.instance }}" + when: target.enabled + template: + src: "{{ lookup('first_found', {'paths': ['templates/targets'], 'files': [target.job + '.yml.j2', 'generic.yml.j2']}) }}" + dest: "/etc/prometheus/targets/{{ target.job }}/{{ target.instance }}.yml" + notify: reload prometheus -- name: enable targets for generic jobs - loop: "{{ hostvars | prometheus_generic_job_targets(prometheus_server_jobs_generic, prometheus_zone_targets) }}" - loop_control: - label: "{{ 
item.job }} -> {{ item.instance }}" - file: - src: "{{ item.enabled | ternary('/etc/prometheus/targets/' + item.instance + '.yml', omit) }}" - path: "/etc/prometheus/jobs/{{ item.job }}/{{ item.instance }}.yml" - state: "{{ item.enabled | ternary('link', 'absent') }}" - notify: reload prometheus + - name: remove files for disabled targets + loop: "{{ job_targets }}" + loop_control: + loop_var: target + label: "{{ target.job }} -> {{ target.instance }}" + when: not target.enabled + file: + path: "/etc/prometheus/targets/{{ target.job }}/{{ target.instance }}.yml" + state: absent + notify: reload prometheus -- name: enable targets for special jobs - loop: "{{ hostvars | prometheus_special_job_targets(prometheus_server_jobs_special, prometheus_zone_targets) }}" - loop_control: - loop_var: target - label: "{{ target.job }} -> {{ target.instance }}" - template: - src: "targets/{{ target.job }}.yml.j2" - dest: "/etc/prometheus/jobs/{{ target.job }}/{{ target.instance }}.yml" - notify: reload prometheus +- name: create sub-directories for all jobs in rules directory + loop: "{{ prometheus_server_jobs | select('match', '.*/.*') | map('dirname') | unique }}" + file: + path: "/etc/prometheus/rules/{{ item }}" + state: directory - name: generate rules files for all jobs - loop: "{{ (prometheus_server_jobs_generic + prometheus_server_jobs_special) | union(['prometheus']) }}" + loop: "{{ prometheus_server_jobs | union(['prometheus']) }}" template: src: rules.yml.j2 dest: "/etc/prometheus/rules/{{ item }}.yml" diff --git a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-https.j2 b/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-https.j2 deleted file mode 100644 index 0a6d2dfa..00000000 --- a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-https.j2 +++ /dev/null @@ -1,14 +0,0 @@ - - job_name: '{{ job }}' - metrics_path: /proxy - params: - module: - - blackbox - - http_tls_2xx - scheme: https - tls_config: - ca_file: 
/etc/ssl/prometheus/ca-crt.pem - cert_file: /etc/ssl/prometheus/server/scrape-crt.pem - key_file: /etc/ssl/prometheus/server/scrape-key.pem - file_sd_configs: - - files: - - "/etc/prometheus/jobs/{{ job }}/*.yml" diff --git a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ping.j2 b/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ping.j2 deleted file mode 100644 index 7f4f12df..00000000 --- a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ping.j2 +++ /dev/null @@ -1,14 +0,0 @@ - - job_name: '{{ job }}' - metrics_path: /proxy - params: - module: - - blackbox - - icmp - scheme: https - tls_config: - ca_file: /etc/ssl/prometheus/ca-crt.pem - cert_file: /etc/ssl/prometheus/server/scrape-crt.pem - key_file: /etc/ssl/prometheus/server/scrape-key.pem - file_sd_configs: - - files: - - "/etc/prometheus/jobs/{{ job }}/*.yml" diff --git a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ssh.j2 b/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ssh.j2 deleted file mode 100644 index 18381e32..00000000 --- a/roles/monitoring/prometheus/server/templates/job-snippets/blackbox-ssh.j2 +++ /dev/null @@ -1,14 +0,0 @@ - - job_name: '{{ job }}' - metrics_path: /proxy - params: - module: - - blackbox - - ssh_banner - scheme: https - tls_config: - ca_file: /etc/ssl/prometheus/ca-crt.pem - cert_file: /etc/ssl/prometheus/server/scrape-crt.pem - key_file: /etc/ssl/prometheus/server/scrape-key.pem - file_sd_configs: - - files: - - "/etc/prometheus/jobs/{{ job }}/*.yml" diff --git a/roles/monitoring/prometheus/server/templates/job-snippets/generic.j2 b/roles/monitoring/prometheus/server/templates/jobs/generic.j2 index 87992eeb..65a95007 100644 --- a/roles/monitoring/prometheus/server/templates/job-snippets/generic.j2 +++ b/roles/monitoring/prometheus/server/templates/jobs/generic.j2 @@ -1,8 +1,5 @@ - job_name: '{{ job }}' - metrics_path: /proxy - params: - module: - - {{ job }} + metrics_path: /{{ job 
}} scheme: https tls_config: ca_file: /etc/ssl/prometheus/ca-crt.pem @@ -10,4 +7,4 @@ key_file: /etc/ssl/prometheus/server/scrape-key.pem file_sd_configs: - files: - - "/etc/prometheus/jobs/{{ job }}/*.yml" + - "/etc/prometheus/targets/{{ job }}/*.yml" diff --git a/roles/monitoring/prometheus/server/templates/jobs/node.j2 b/roles/monitoring/prometheus/server/templates/jobs/node.j2 new file mode 100644 index 00000000..1b14e1f6 --- /dev/null +++ b/roles/monitoring/prometheus/server/templates/jobs/node.j2 @@ -0,0 +1,17 @@ + - job_name: '{{ job }}' + metrics_path: /{{ job }} + scheme: https + tls_config: + ca_file: /etc/ssl/prometheus/ca-crt.pem + cert_file: /etc/ssl/prometheus/server/scrape-crt.pem + key_file: /etc/ssl/prometheus/server/scrape-key.pem + file_sd_configs: + - files: + - "/etc/prometheus/targets/{{ job }}/*.yml" + metric_relabel_configs: + - source_labels: [ "mountpoint" ] + regex: ".*/\\.snapshot/.*" + action: drop + - source_labels: [ "__name__", "state" ] + regex: "node_systemd_unit_state;(activating|deactivating|inactive)" + action: drop diff --git a/roles/monitoring/prometheus/server/templates/job-snippets/openwrt.j2 b/roles/monitoring/prometheus/server/templates/jobs/openwrt.j2 index 493a4fdb..e93f8be7 100644 --- a/roles/monitoring/prometheus/server/templates/job-snippets/openwrt.j2 +++ b/roles/monitoring/prometheus/server/templates/jobs/openwrt.j2 @@ -2,4 +2,4 @@ scheme: http file_sd_configs: - files: - - "/etc/prometheus/jobs/{{ job }}/*.yml" + - "/etc/prometheus/targets/{{ job }}/*.yml" diff --git a/roles/monitoring/prometheus/server/templates/prometheus.yml.j2 b/roles/monitoring/prometheus/server/templates/prometheus.yml.j2 index 4cfcc498..e73ca354 100644 --- a/roles/monitoring/prometheus/server/templates/prometheus.yml.j2 +++ b/roles/monitoring/prometheus/server/templates/prometheus.yml.j2 @@ -6,6 +6,9 @@ global: rule_files: - /etc/prometheus/rules/*.yml +{% for subdir in (prometheus_server_jobs | select('match', '.*/.*') | map('dirname') | 
unique) %} + - /etc/prometheus/rules/{{ subdir }}/*.yml +{% endfor %} {% if prometheus_server_alertmanager is defined %} alerting: @@ -25,7 +28,7 @@ scrape_configs: static_configs: - targets: ['localhost:9090'] labels: - instance: "{{ inventory_hostname }}" + instance: '{{ inventory_hostname }}' {% if prometheus_server_alertmanager is defined %} - job_name: 'alertmanager' @@ -35,9 +38,9 @@ scrape_configs: static_configs: - targets: ['{{ prometheus_server_alertmanager.url }}'] {% endif %} -{% for job in (prometheus_server_jobs_generic + prometheus_server_jobs_special) %} +{% for job in (prometheus_server_jobs) %} -{% include 'job-snippets/' + (lookup('first_found', {'paths': ['templates/job-snippets'], 'files': [job + '.j2', 'generic.j2']}) | basename) %}{{ '' }} +{% include lookup('first_found', {'paths': ['templates/jobs'], 'files': [job + '.j2', 'generic.j2']}) | relpath(template_fullpath | dirname) %}{{ '' }} {% endfor %} {% if prometheus_server_jobs_extra is defined %} diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox-https.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox-https.yml.j2 deleted file mode 100644 index e843de36..00000000 --- a/roles/monitoring/prometheus/server/templates/targets/blackbox-https.yml.j2 +++ /dev/null @@ -1,4 +0,0 @@ -- targets: [ "{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}" ] - labels: - instance: "{{ target.instance }}" - __param_target: {{ target.config.address }} diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox-ping.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox-ping.yml.j2 deleted file mode 100644 index e843de36..00000000 --- a/roles/monitoring/prometheus/server/templates/targets/blackbox-ping.yml.j2 +++ /dev/null @@ -1,4 +0,0 @@ -- targets: [ "{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}" ] - labels: - instance: "{{ target.instance }}" - __param_target: {{ 
target.config.address }} diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox-ssh.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox-ssh.yml.j2 deleted file mode 100644 index e843de36..00000000 --- a/roles/monitoring/prometheus/server/templates/targets/blackbox-ssh.yml.j2 +++ /dev/null @@ -1,4 +0,0 @@ -- targets: [ "{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}" ] - labels: - instance: "{{ target.instance }}" - __param_target: {{ target.config.address }} diff --git a/roles/monitoring/prometheus/server/templates/targets/blackbox/probe.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/blackbox/probe.yml.j2 new file mode 100644 index 00000000..4e336873 --- /dev/null +++ b/roles/monitoring/prometheus/server/templates/targets/blackbox/probe.yml.j2 @@ -0,0 +1,5 @@ +- targets: [ '{{ hostvars[target.exporter_hostname].prometheus_scrape_endpoint }}' ] + labels: + instance: '{{ target.instance }}' + __param_target: '{{ target.config.target }}' + __param_module: '{{ target.config.module }}' diff --git a/roles/monitoring/prometheus/server/templates/targets/generic.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/generic.yml.j2 index e83b6bf4..6591362b 100644 --- a/roles/monitoring/prometheus/server/templates/targets/generic.yml.j2 +++ b/roles/monitoring/prometheus/server/templates/targets/generic.yml.j2 @@ -1,3 +1,3 @@ -- targets: [ "{{ hostvars[target].prometheus_scrape_endpoint }}" ] +- targets: [ '{{ hostvars[target.instance].prometheus_scrape_endpoint }}' ] labels: - instance: "{{ target }}" + instance: '{{ target.instance }}' diff --git a/roles/monitoring/prometheus/server/templates/targets/ipmi/remote.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/ipmi/remote.yml.j2 new file mode 100644 index 00000000..4e336873 --- /dev/null +++ b/roles/monitoring/prometheus/server/templates/targets/ipmi/remote.yml.j2 @@ -0,0 +1,5 @@ +- targets: [ '{{ 
hostvars[target.exporter_hostname].prometheus_scrape_endpoint }}' ] + labels: + instance: '{{ target.instance }}' + __param_target: '{{ target.config.target }}' + __param_module: '{{ target.config.module }}' diff --git a/roles/monitoring/prometheus/server/templates/targets/nut.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/nut.yml.j2 deleted file mode 100644 index da3de3d7..00000000 --- a/roles/monitoring/prometheus/server/templates/targets/nut.yml.j2 +++ /dev/null @@ -1,17 +0,0 @@ -- targets: [ "{{ hostvars[target.config.exporter_hostname].prometheus_scrape_endpoint }}" ] - labels: - instance: "{{ target.instance }}" - __param_ups: {{ target.config.ups }} - __param_server: {{ target.config.server | default('127.0.0.1') }} -{% if 'username' in target.config %} - __param_username: {{ target.config.username }} -{% endif %} -{% if 'password' in target.config %} - __param_password: {{ target.config.password }} -{% endif %} -{% if 'variables' in target.config %} - __param_variables: {{ target.config.variables }} -{% endif %} -{% if 'statuses' in target.config %} - __param_statuses: {{ target.config.statuses }} -{% endif %} diff --git a/roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2 b/roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2 new file mode 100644 index 00000000..c60077c7 --- /dev/null +++ b/roles/monitoring/prometheus/server/templates/targets/nut/ups.yml.j2 @@ -0,0 +1,17 @@ +- targets: [ '{{ hostvars[target.exporter_hostname].prometheus_scrape_endpoint }}' ] + labels: + instance: '{{ target.instance }}' + __param_ups: '{{ target.config.ups }}' + __param_server: '{{ target.config.server | default('127.0.0.1') }}' +{% if 'username' in target.config %} + __param_username: '{{ target.config.username }}' +{% endif %} +{% if 'password' in target.config %} + __param_password: '{{ target.config.password }}' +{% endif %} +{% if 'variables' in target.config %} + __param_variables: '{{ target.config.variables }}' +{% 
endif %} +{% if 'statuses' in target.config %} + __param_statuses: '{{ target.config.statuses }}' +{% endif %} |