From 6cb1eaa514c267467b91b63d39c05b4ccebc555e Mon Sep 17 00:00:00 2001
From: Julian Labus
Date: Wed, 26 Sep 2018 15:35:47 +0200
Subject: [PATCH] role service-prometheus: configure alertmanager

---
 inventory/group_vars/ffmwu-monitoring         |   7 +
 playbooks/prometheus/alert.rules              | 124 ++++++++++++++++++
 playbooks/prometheus/prometheus.yml.j2        |   8 +-
 roles/service-prometheus/defaults/main.yml    |   2 +-
 .../service-prometheus/files/alertmanager.yml |  14 +-
 roles/service-prometheus/files/amtool.yml     |   8 ++
 .../service-prometheus/tasks/alertmanager.yml |  13 ++
 .../templates/prometheus_vhost.conf.j2        |  16 ++-
 8 files changed, 183 insertions(+), 9 deletions(-)
 create mode 100644 playbooks/prometheus/alert.rules
 create mode 100644 roles/service-prometheus/files/amtool.yml

diff --git a/inventory/group_vars/ffmwu-monitoring b/inventory/group_vars/ffmwu-monitoring
index 0177628..98271ac 100644
--- a/inventory/group_vars/ffmwu-monitoring
+++ b/inventory/group_vars/ffmwu-monitoring
@@ -15,11 +15,18 @@ http_prometheus_prefix: prom
 
 prometheus_conf_main: prometheus/prometheus.yml.j2
 
+prometheus_rule_files:
+  alert_rules:
+    src: prometheus/alert.rules
+    dest: alert.rules
+
 prometheus_components:
   - prometheus
   - alertmanager
   - node_exporter
   - blackbox_exporter
 
+alertmanager_opts: "--web.external-url=https://{{ http_prometheus_prefix }}.{{ http_domain_external }}/alertmanager/"
+
 yanic_blacklist:
   - 98ded0c5e0c0
diff --git a/playbooks/prometheus/alert.rules b/playbooks/prometheus/alert.rules
new file mode 100644
index 0000000..569c9d4
--- /dev/null
+++ b/playbooks/prometheus/alert.rules
@@ -0,0 +1,124 @@
+groups:
+- name: blackbox
+  rules:
+  - alert: Icmp4Timeout
+    expr: probe_success{job="icmp4"} == 0
+    for: 5m
+    labels:
+      severity: page
+    annotations:
+      description: 'ICMP requests to the primary IPv4 address timed out'
+      summary: 'Instance {{ $labels.instance }} does not respond to ICMPv4 echo requests'
+
+  - alert: Icmp6Timeout
+    expr: probe_success{job="icmp6"} == 0
+    for: 5m
+    labels:
+      severity: page
+    annotations:
+      description: 'ICMP requests to the primary IPv6 address timed out'
+      summary: 'Instance {{ $labels.instance }} does not respond to ICMPv6 echo requests'
+
+- name: node
+  rules:
+  - alert: InstanceDown
+    expr: up{job="node"} == 0
+    for: 5m
+    labels:
+      severity: page
+    annotations:
+      description: 'The instance is down for more than 5 minutes'
+      summary: 'Instance {{ $labels.instance }} is down'
+
+  - alert: ExporterDown
+    expr: up{job!="node"} == 0
+    for: 5m
+    annotations:
+      description: 'An exporter is down for more than 5 minutes'
+      summary: 'Exporter {{ $labels.instance }} is down'
+
+  - alert: InstanceHighCpu
+    expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) BY (instance) * 100) > 90
+    for: 5m
+    annotations:
+      description: 'CPU usage above 90% for more than 5m'
+      summary: 'Instance {{ $labels.instance }}: cpu usage at {{ $value }}'
+      value: '{{ $value }}'
+
+  - alert: InstanceHighCpuLong
+    expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) BY (instance) * 100) > 90
+    for: 30m
+    labels:
+      severity: page
+    annotations:
+      description: 'CPU usage above 90% for more than 30m'
+      summary: 'Instance {{ $labels.instance }}: persistent cpu usage at {{ $value }}'
+      value: '{{ $value }}'
+
+  - alert: InstanceLowMem
+    expr: node_memory_MemAvailable_bytes / 1024 / 1024 < node_memory_MemTotal_bytes / 1024 / 1024 / 10
+    for: 3m
+    labels:
+      severity: page
+    annotations:
+      description: 'Less than 10% of free memory'
+      summary: 'Instance {{ $labels.instance }}: {{ $value }}MB of free memory'
+      value: '{{ $value }}'
+
+  - alert: InstanceLowDiskPrediction4Hours
+    expr: predict_linear(node_filesystem_free_bytes{device=~"/dev/.*",job="node"}[1h], 4 * 3600) < 0
+    for: 30m
+    labels:
+      severity: page
+    annotations:
+      description: 'Disk {{ $labels.mountpoint }} ({{ $labels.device }}) will be full in less than 4 hours'
+      summary: 'Instance {{ $labels.instance }}: Disk {{ $labels.mountpoint }} ({{ $labels.device }}) will be full in less than 4 hours'
+
+  - alert: InstanceLowDiskPrediction12Hours
+    expr: predict_linear(node_filesystem_free_bytes{device=~"/dev/.*",job="node"}[3h], 12 * 3600) < 0
+    for: 60m
+    labels:
+      severity: page
+    annotations:
+      description: 'Disk {{ $labels.mountpoint }} ({{ $labels.device }}) will be full in less than 12 hours'
+      summary: 'Instance {{ $labels.instance }}: Disk {{ $labels.mountpoint }} ({{ $labels.device }}) will be full in less than 12 hours'
+
+  - alert: InstanceLowDiskAbs
+    expr: node_filesystem_avail_bytes{mountpoint="/"} / 1024 / 1024 < 1024
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      description: 'Less than 1GB of free disk space left on the root filesystem'
+      summary: 'Instance {{ $labels.instance }}: {{ $value }}MB free disk space on {{ $labels.device }}'
+      value: '{{ $value }}'
+
+  - alert: InstanceLowDiskPerc
+    expr: 100 * (node_filesystem_free_bytes / node_filesystem_size_bytes) < 10
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      description: 'Less than 10% of free disk space left on a device'
+      summary: 'Instance {{ $labels.instance }}: {{ $value }}% free disk space on {{ $labels.device }}'
+      value: '{{ $value }}'
+
+  - alert: ServiceFailed
+    expr: node_systemd_unit_state{state="failed"} > 0
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      description: 'A systemd unit went into failed state'
+      summary: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} failed'
+      value: '{{ $labels.name }}'
+
+  - alert: ServiceFlapping
+    expr: changes(node_systemd_unit_state{state="failed"}[5m]) > 5 or
+      (changes(node_systemd_unit_state{state="failed"}[1h]) > 15 unless changes(node_systemd_unit_state{state="failed"}[30m]) < 7)
+    labels:
+      severity: page
+    annotations:
+      description: 'A systemd service changed its state more than 5x/5min or 15x/1h'
+      summary: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} is flapping'
+      value: '{{ $labels.name }}'
diff --git a/playbooks/prometheus/prometheus.yml.j2 b/playbooks/prometheus/prometheus.yml.j2
index ac7c106..760e828 100644
--- a/playbooks/prometheus/prometheus.yml.j2
+++ b/playbooks/prometheus/prometheus.yml.j2
@@ -8,6 +8,13 @@ global:
   external_labels:
     monitor: 'master'
 
+alerting:
+  alertmanagers:
+  - scheme: https
+    path_prefix: /alertmanager/
+    static_configs:
+    - targets: ['{{ http_prometheus_prefix }}.{{ http_domain_external }}']
+
 {% if prometheus_rule_files is defined %}
 # Rule files specifies a list of files from which rules are read.
 rule_files:
@@ -18,7 +25,6 @@ rule_files:
 
 # A list of scrape configurations.
 scrape_configs:
-
 - job_name: 'prometheus'
   scrape_interval: 10s
   scrape_timeout: 10s
diff --git a/roles/service-prometheus/defaults/main.yml b/roles/service-prometheus/defaults/main.yml
index feaba90..4ccedb4 100644
--- a/roles/service-prometheus/defaults/main.yml
+++ b/roles/service-prometheus/defaults/main.yml
@@ -32,6 +32,6 @@ prometheus_goroot: "{{ prometheus_workdir }}/go"
 prometheus_gopath: "{{ prometheus_workdir }}/gopath"
 
 prometheus_default_opts: "--web.listen-address=localhost:9090 --config.file={{ prometheus_config_path }}/prometheus.yml --storage.tsdb.path={{ prometheus_db_path }}"
-alertmanager_default_opts: "--config.file={{ prometheus_config_path }}/alertmanager.yml --storage.path={{ alertmanager_db_path }}"
+alertmanager_default_opts: "--web.listen-address=localhost:9093 --config.file={{ prometheus_config_path }}/alertmanager.yml --storage.path={{ alertmanager_db_path }}"
 node_exporter_default_opts: "--web.listen-address=localhost:9100"
 blackbox_default_opts: "--web.listen-address=localhost:9115 --config.file={{ prometheus_config_path }}/blackbox.yml"
diff --git a/roles/service-prometheus/files/alertmanager.yml b/roles/service-prometheus/files/alertmanager.yml
index abab519..175f319 100644
--- a/roles/service-prometheus/files/alertmanager.yml
+++ b/roles/service-prometheus/files/alertmanager.yml
@@ -1,13 +1,17 @@
 global:
+  smtp_from: "admin@freifunk-mwu.de"
+  smtp_smarthost: "localhost:25"
+  smtp_require_tls: false
 
 route:
-  group_by: ['alertname', 'cluster']
+  group_by: ['alertname', 'alertstate', 'cluster', 'service']
   group_wait: 30s
   group_interval: 5m
   repeat_interval: 3h
-  receiver: 'default-pager'
+  receiver: 'email'
 
 receivers:
-  - name: 'default-pager'
-    pagerduty_configs:
-      - service_key:
+  - name: 'email'
+    email_configs:
+      - send_resolved: true
+        to: "admin@lists.freifunk-mwu.de"
diff --git a/roles/service-prometheus/files/amtool.yml b/roles/service-prometheus/files/amtool.yml
new file mode 100644
index 0000000..efa6116
--- /dev/null
+++ b/roles/service-prometheus/files/amtool.yml
@@ -0,0 +1,8 @@
+# Define the path that amtool can find your `alertmanager` instance at
+alertmanager.url: "http://localhost:9093/alertmanager"
+
+# Override the default author. (unset defaults to your username)
+author: admin@freifunk-mwu.de
+
+# Force amtool to give you an error if you don't include a comment on a silence
+comment_required: true
diff --git a/roles/service-prometheus/tasks/alertmanager.yml b/roles/service-prometheus/tasks/alertmanager.yml
index 6f44ea7..1ba92de 100644
--- a/roles/service-prometheus/tasks/alertmanager.yml
+++ b/roles/service-prometheus/tasks/alertmanager.yml
@@ -41,6 +41,14 @@
     group: "{{ prometheus_group }}"
     mode: "u=rwx,g=rx,o="
 
+- name: mkdir for amtool config
+  file:
+    path: "/etc/amtool"
+    state: directory
+    owner: "{{ prometheus_user }}"
+    group: "{{ prometheus_group }}"
+    mode: "u=rwx,g=rx,o=rx"
+
 - name: copy alertmanager systemd config
   template:
     src: "alertmanager.service.j2"
@@ -49,6 +57,11 @@
     - reload systemd
     - restart alertmanager
 
+- name: install amtool config file
+  copy:
+    src: "amtool.yml"
+    dest: "/etc/amtool/config.yml"
+
 - name: install alertmanager config file
   copy:
     src: "alertmanager.yml"
diff --git a/roles/service-prometheus/templates/prometheus_vhost.conf.j2 b/roles/service-prometheus/templates/prometheus_vhost.conf.j2
index 1c36b67..afbdd9d 100644
--- a/roles/service-prometheus/templates/prometheus_vhost.conf.j2
+++ b/roles/service-prometheus/templates/prometheus_vhost.conf.j2
@@ -17,9 +17,21 @@ server {
     include /etc/nginx/snippets/letsencrypt-acme-challenge.conf;
 
+    satisfy any;
+
+    allow 127.0.0.0/8;
+    allow ::1/128;
+    allow {{ lookup('dig', inventory_hostname, 'qtype=A') }};
+    allow {{ lookup('dig', inventory_hostname, 'qtype=AAAA') }};
+
+    auth_basic "Prometheus";
+    auth_basic_user_file /etc/nginx/htpasswd_prometheus;
+
+    location /alertmanager {
+        proxy_pass http://127.0.0.1:9093;
+    }
+
     location / {
-        auth_basic "Prometheus";
-        auth_basic_user_file /etc/nginx/htpasswd_prometheus;
         proxy_pass http://127.0.0.1:9090;
     }
 }