role service-prometheus: configure alertmanager

2018-09-26 15:35:47 +02:00 · 2018-09-26 15:35:47 +02:00 · 6cb1eaa514
commit 6cb1eaa514
parent 1d72eb8439
8 changed files with 183 additions and 9 deletions
--- a/inventory/group_vars/ffmwu-monitoring
+++ b/inventory/group_vars/ffmwu-monitoring
@ -15,11 +15,18 @@ http_prometheus_prefix: prom

 prometheus_conf_main: prometheus/prometheus.yml.j2

+# Prometheus rule files, keyed by a short name; every entry names the source
+# path and the destination file name.
+prometheus_rule_files:
+  alert_rules:
+    src: 'prometheus/alert.rules'
+    dest: 'alert.rules'
+
 prometheus_components:
  - prometheus
  - alertmanager
  - node_exporter
  - blackbox_exporter

+# Extra Alertmanager flag: the external URL lives under /alertmanager/ on the
+# Prometheus host name.
+alertmanager_opts: >-
+  --web.external-url=https://{{ http_prometheus_prefix }}.{{ http_domain_external }}/alertmanager/
+
 yanic_blacklist:
  - 98ded0c5e0c0
--- a/playbooks/prometheus/alert.rules
+++ b/playbooks/prometheus/alert.rules
@ -0,0 +1,124 @@
+groups:
+# Alerts on the probe_success series of the icmp4 and icmp6 jobs.
+- name: blackbox
+  rules:
+  # probe_success of the icmp4 job has been 0 for five minutes.
+  - alert: Icmp4Timeout
+    expr: 'probe_success{job="icmp4"} == 0'
+    for: 5m
+    labels:
+      severity: page
+    annotations:
+      summary: 'Instance {{ $labels.instance }} does not respond to ICMPv4 echo requests'
+      description: 'ICMP requests to the primary IPv4 address timed out'
+
+  # probe_success of the icmp6 job has been 0 for five minutes.
+  - alert: Icmp6Timeout
+    expr: 'probe_success{job="icmp6"} == 0'
+    for: 5m
+    labels:
+      severity: page
+    annotations:
+      summary: 'Instance {{ $labels.instance }} does not respond to ICMPv6 echo requests'
+      description: 'ICMP requests to the primary IPv6 address timed out'
+
+# Alerts on scrape health, CPU, memory, disk and systemd unit state.
+- name: node
+  rules:
+  # `up` of the node job has been 0 for five minutes.
+  - alert: InstanceDown
+    expr: 'up{job="node"} == 0'
+    for: 5m
+    labels:
+      severity: page
+    annotations:
+      summary: 'Instance {{ $labels.instance }} is down'
+      description: 'The instance is down for more than 5 minutes'
+
+  # `up` of any job other than node has been 0 for five minutes; this rule
+  # sets no severity label.
+  - alert: ExporterDown
+    expr: 'up{job!="node"} == 0'
+    for: 5m
+    annotations:
+      summary: 'Exporter {{ $labels.instance }} is down'
+      description: 'An exporter is down for more than 5 minutes'
+
+  # 100 minus the averaged idle rate (in percent) per instance exceeds 90 for
+  # five minutes; this rule sets no severity label.
+  - alert: InstanceHighCpu
+    expr: '100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) BY (instance) * 100) > 90'
+    for: 5m
+    annotations:
+      summary: 'Instance {{ $labels.instance }}: cpu usage at {{ $value }}'
+      description: 'CPU usage above 90% for more than 5m'
+      value: '{{ $value }}'
+
+  # Same expression as InstanceHighCpu, held for thirty minutes.
+  - alert: InstanceHighCpuLong
+    expr: '100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) BY (instance) * 100) > 90'
+    for: 30m
+    labels:
+      severity: page
+    annotations:
+      summary: 'Instance {{ $labels.instance }}: persistent cpu usage at {{ $value }}'
+      description: 'CPU usage above 90% for more than 30m'
+      value: '{{ $value }}'
+
+  # Available memory (bytes scaled by 1024 twice) is below a tenth of the
+  # equally scaled total for three minutes.
+  - alert: InstanceLowMem
+    expr: 'node_memory_MemAvailable_bytes / 1024 / 1024 < node_memory_MemTotal_bytes / 1024 / 1024 / 10'
+    for: 3m
+    labels:
+      severity: page
+    annotations:
+      summary: 'Instance {{ $labels.instance }}: {{ $value }}MB of free memory'
+      description: 'Less than 10% of free memory'
+      value: '{{ $value }}'
+
+  # Linear extrapolation of the last hour of free bytes on /dev/* devices
+  # drops below zero within 4 * 3600 seconds, held for thirty minutes.
+  - alert: InstanceLowDiskPrediction4Hours
+    expr: 'predict_linear(node_filesystem_free_bytes{device=~"/dev/.*",job="node"}[1h], 4 * 3600) < 0'
+    for: 30m
+    labels:
+      severity: page
+    annotations:
+      summary: 'Instance {{ $labels.instance }}: Disk {{ $labels.mountpoint }} ({{ $labels.device }}) will be full in less than 4 hours'
+      description: 'Disk {{ $labels.mountpoint }} ({{ $labels.device }}) will be full in less than 4 hours'
+
+  # Linear extrapolation of the last three hours of free bytes on /dev/*
+  # devices drops below zero within 12 * 3600 seconds, held for sixty minutes.
+  - alert: InstanceLowDiskPrediction12Hours
+    expr: 'predict_linear(node_filesystem_free_bytes{device=~"/dev/.*",job="node"}[3h], 12 * 3600) < 0'
+    for: 60m
+    labels:
+      severity: page
+    annotations:
+      summary: 'Instance {{ $labels.instance }}: Disk {{ $labels.mountpoint }} ({{ $labels.device }}) will be full in less than 12 hours'
+      description: 'Disk {{ $labels.mountpoint }} ({{ $labels.device }}) will be full in less than 12 hours'
+
+  # Available bytes on mountpoint "/" (scaled by 1024 twice) are below 1024
+  # for one minute.
+  - alert: InstanceLowDiskAbs
+    expr: 'node_filesystem_avail_bytes{mountpoint="/"} / 1024 / 1024 < 1024'
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: 'Instance {{ $labels.instance }}: {{ $value }}MB free disk space on {{ $labels.device }}'
+      description: 'Less than 1GB of free disk space left on the root filesystem'
+      value: '{{ $value }}'
+
+  # Free bytes relative to filesystem size, in percent, are below 10 for one
+  # minute.
+  - alert: InstanceLowDiskPerc
+    expr: '100 * (node_filesystem_free_bytes / node_filesystem_size_bytes) < 10'
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: 'Instance {{ $labels.instance }}: {{ $value }}% free disk space on {{ $labels.device }}'
+      description: 'Less than 10% of free disk space left on a device'
+      value: '{{ $value }}'
+
+  # A systemd unit reports state "failed" for one minute.
+  - alert: ServiceFailed
+    expr: 'node_systemd_unit_state{state="failed"} > 0'
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} failed'
+      description: 'A systemd unit went into failed state'
+      value: '{{ $labels.name }}'
+
+  # The "failed" state series changed more than 5 times in 5m, or more than
+  # 15 times in 1h unless it changed fewer than 7 times in the last 30m.
+  # No `for:` clause is set on this rule.
+  - alert: ServiceFlapping
+    expr: >-
+      changes(node_systemd_unit_state{state="failed"}[5m]) > 5 or
+      (changes(node_systemd_unit_state{state="failed"}[1h]) > 15 unless changes(node_systemd_unit_state{state="failed"}[30m]) < 7)
+    labels:
+      severity: page
+    annotations:
+      summary: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} is flapping'
+      description: 'A systemd service changed its state more than 5x/5min or 15x/1h'
+      value: '{{ $labels.name }}'
--- a/playbooks/prometheus/prometheus.yml.j2
+++ b/playbooks/prometheus/prometheus.yml.j2
@ -8,6 +8,13 @@ global:
  external_labels:
    monitor: 'master'

+# Alerts are sent over HTTPS to the Alertmanager reachable under
+# /alertmanager/ on the Prometheus host name.
+alerting:
+  alertmanagers:
+    - path_prefix: /alertmanager/
+      scheme: https
+      static_configs:
+        - targets:
+            - '{{ http_prometheus_prefix }}.{{ http_domain_external }}'
+
 {% if prometheus_rule_files is defined %}
 # Rule files specifies a list of files from which rules are read.
 rule_files:
@ -18,7 +25,6 @@ rule_files:

 # A list of scrape configurations.
 scrape_configs:
-
  - job_name: 'prometheus'
    scrape_interval: 10s
    scrape_timeout:  10s
--- a/roles/service-prometheus/defaults/main.yml
+++ b/roles/service-prometheus/defaults/main.yml
@ -32,6 +32,6 @@ prometheus_goroot: "{{ prometheus_workdir }}/go"
 prometheus_gopath: "{{ prometheus_workdir }}/gopath"

 prometheus_default_opts: "--web.listen-address=localhost:9090 --config.file={{ prometheus_config_path }}/prometheus.yml --storage.tsdb.path={{ prometheus_db_path }}"
-alertmanager_default_opts: "--config.file={{ prometheus_config_path }}/alertmanager.yml --storage.path={{ alertmanager_db_path }}"
+# Default Alertmanager flags: listen on localhost:9093 only, read
+# alertmanager.yml from the Prometheus config path, store data in
+# alertmanager_db_path.
+alertmanager_default_opts: "--web.listen-address=localhost:9093 --config.file={{ prometheus_config_path }}/alertmanager.yml --storage.path={{ alertmanager_db_path }}"
 node_exporter_default_opts: "--web.listen-address=localhost:9100"
 blackbox_default_opts: "--web.listen-address=localhost:9115 --config.file={{ prometheus_config_path }}/blackbox.yml"
--- a/roles/service-prometheus/files/alertmanager.yml
+++ b/roles/service-prometheus/files/alertmanager.yml
@ -1,13 +1,17 @@
 global:
+  # Outgoing mail: relay on localhost port 25, TLS not required, fixed sender.
+  smtp_smarthost: 'localhost:25'
+  smtp_require_tls: false
+  smtp_from: 'admin@freifunk-mwu.de'

 route:
-  group_by: ['alertname', 'cluster']
+  # Labels whose values define one notification group.
+  group_by:
+    - 'alertname'
+    - 'alertstate'
+    - 'cluster'
+    - 'service'
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 3h
-  receiver: 'default-pager'
+  receiver: 'email'

 receivers:
-  - name: 'default-pager'
-    pagerduty_configs:
-    - service_key: <team-X-key>
+  # Mail receiver; resolved alerts are notified as well.
+  - name: 'email'
+    email_configs:
+      - to: 'admin@lists.freifunk-mwu.de'
+        send_resolved: true
--- a/roles/service-prometheus/files/amtool.yml
+++ b/roles/service-prometheus/files/amtool.yml
@ -0,0 +1,8 @@
+# URL under which amtool reaches the `alertmanager` instance.
+alertmanager.url: 'http://localhost:9093/alertmanager'
+
+# Author recorded by amtool (when unset, amtool falls back to your username).
+author: 'admin@freifunk-mwu.de'
+
+# Make amtool fail when a silence is created without a comment.
+comment_required: true
--- a/roles/service-prometheus/tasks/alertmanager.yml
+++ b/roles/service-prometheus/tasks/alertmanager.yml
@ -41,6 +41,14 @@
    group: "{{ prometheus_group }}"
    mode: "u=rwx,g=rx,o="

+# Ensure /etc/amtool exists, owned by the Prometheus user and group and
+# readable/traversable by everyone.
+- name: mkdir for amtool config
+  file:
+    state: directory
+    path: /etc/amtool
+    mode: "u=rwx,g=rx,o=rx"
+    owner: "{{ prometheus_user }}"
+    group: "{{ prometheus_group }}"
+
 - name: copy alertmanager systemd config
  template:
    src: "alertmanager.service.j2"
@ -49,6 +57,11 @@
    - reload systemd
    - restart alertmanager

+# Copy the static amtool configuration to /etc/amtool/config.yml.
+# Owner, group and mode are set explicitly so the result does not depend on
+# the connecting user or its umask; ownership mirrors the /etc/amtool
+# directory and the file stays world-readable like that directory.
+- name: install amtool config file
+  copy:
+    src: "amtool.yml"
+    dest: "/etc/amtool/config.yml"
+    owner: "{{ prometheus_user }}"
+    group: "{{ prometheus_group }}"
+    mode: "u=rw,g=r,o=r"
+
 - name: install alertmanager config file
  copy:
    src: "alertmanager.yml"
--- a/roles/service-prometheus/templates/prometheus_vhost.conf.j2
+++ b/roles/service-prometheus/templates/prometheus_vhost.conf.j2
@ -17,9 +17,21 @@ server {

    include /etc/nginx/snippets/letsencrypt-acme-challenge.conf;

-    location / {
+    # Access is granted when EITHER the client address matches an allow rule
+    # below OR the request passes the basic-auth check configured further down.
+    satisfy any;
+
+    allow 127.0.0.0/8;
+    allow ::1/128;
+    # Allow this host's own published addresses. The dig lookup returns an
+    # empty string when there is no record of the requested type, the literal
+    # NXDOMAIN when the name does not resolve, and one item per record when
+    # several exist, so one allow line is emitted per real address and
+    # everything else is skipped to keep the rendered config valid.
+{% for record_type in ['A', 'AAAA'] %}
+{% for address in lookup('dig', inventory_hostname, 'qtype=' ~ record_type, wantlist=True) %}
+{% if address and address != 'NXDOMAIN' %}
+    allow {{ address }};
+{% endif %}
+{% endfor %}
+{% endfor %}
+
    auth_basic "Prometheus";
    auth_basic_user_file /etc/nginx/htpasswd_prometheus;
+
+    # Requests whose path starts with /alertmanager are proxied to the service
+    # on 127.0.0.1:9093. proxy_pass carries no URI part, so the request path is
+    # forwarded unchanged (prefix included).
+    location /alertmanager {
+      proxy_pass http://127.0.0.1:9093;
+    }
+
+    location / {
      proxy_pass http://127.0.0.1:9090;
    }
 }