role service-prometheus: configure alertmanager
parent 1d72eb8439
commit 6cb1eaa514

8 changed files with 183 additions and 9 deletions
@@ -15,11 +15,18 @@ http_prometheus_prefix: prom
 prometheus_conf_main: prometheus/prometheus.yml.j2

+prometheus_rule_files:
+  alert_rules:
+    src: prometheus/alert.rules
+    dest: alert.rules
+
 prometheus_components:
   - prometheus
   - alertmanager
   - node_exporter
   - blackbox_exporter

+alertmanager_opts: "--web.external-url=https://{{ http_prometheus_prefix }}.{{ http_domain_external }}/alertmanager/"
+
 yanic_blacklist:
   - 98ded0c5e0c0
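The new alertmanager_opts value gives Alertmanager its externally visible URL, so the links it puts into notifications and its HTTP route prefix line up with the nginx /alertmanager/ location added further down. With http_prometheus_prefix: prom (visible in the hunk header) and a hypothetical http_domain_external: example.org, the variable renders to:

alertmanager_opts: "--web.external-url=https://prom.example.org/alertmanager/"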
playbooks/prometheus/alert.rules (new file)

@@ -0,0 +1,124 @@
groups:
  - name: blackbox
    rules:
      - alert: Icmp4Timeout
        expr: probe_success{job="icmp4"} == 0
        for: 5m
        labels:
          severity: page
        annotations:
          description: 'ICMP requests to the primary IPv4 address timed out'
          summary: 'Instance {{ $labels.instance }} does not respond to ICMPv4 echo requests'

      - alert: Icmp6Timeout
        expr: probe_success{job="icmp6"} == 0
        for: 5m
        labels:
          severity: page
        annotations:
          description: 'ICMP requests to the primary IPv6 address timed out'
          summary: 'Instance {{ $labels.instance }} does not respond to ICMPv6 echo requests'

  - name: node
    rules:
      - alert: InstanceDown
        expr: up{job="node"} == 0
        for: 5m
        labels:
          severity: page
        annotations:
          description: 'The instance is down for more than 5 minutes'
          summary: 'Instance {{ $labels.instance }} is down'

      - alert: ExporterDown
        expr: up{job!="node"} == 0
        for: 5m
        annotations:
          description: 'An exporter is down for more than 5 minutes'
          summary: 'Exporter {{ $labels.instance }} is down'

      - alert: InstanceHighCpu
        expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) BY (instance) * 100) > 90
        for: 5m
        annotations:
          description: 'CPU usage above 90% for more than 5m'
          summary: 'Instance {{ $labels.instance }}: cpu usage at {{ $value }}'
          value: '{{ $value }}'

      - alert: InstanceHighCpuLong
        expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) BY (instance) * 100) > 90
        for: 30m
        labels:
          severity: page
        annotations:
          description: 'CPU usage above 90% for more than 30m'
          summary: 'Instance {{ $labels.instance }}: persistent cpu usage at {{ $value }}'
          value: '{{ $value }}'

      - alert: InstanceLowMem
        expr: node_memory_MemAvailable_bytes / 1024 / 1024 < node_memory_MemTotal_bytes / 1024 / 1024 / 10
        for: 3m
        labels:
          severity: page
        annotations:
          description: 'Less than 10% of free memory'
          summary: 'Instance {{ $labels.instance }}: {{ $value }}MB of free memory'
          value: '{{ $value }}'

      - alert: InstanceLowDiskPrediction4Hours
        expr: predict_linear(node_filesystem_free_bytes{device=~"/dev/.*",job="node"}[1h], 4 * 3600) < 0
        for: 30m
        labels:
          severity: page
        annotations:
          description: 'Disk {{ $labels.mountpoint }} ({{ $labels.device }}) will be full in less than 4 hours'
          summary: 'Instance {{ $labels.instance }}: Disk {{ $labels.mountpoint }} ({{ $labels.device }}) will be full in less than 4 hours'

      - alert: InstanceLowDiskPrediction12Hours
        expr: predict_linear(node_filesystem_free_bytes{device=~"/dev/.*",job="node"}[3h], 12 * 3600) < 0
        for: 60m
        labels:
          severity: page
        annotations:
          description: 'Disk {{ $labels.mountpoint }} ({{ $labels.device }}) will be full in less than 12 hours'
          summary: 'Instance {{ $labels.instance }}: Disk {{ $labels.mountpoint }} ({{ $labels.device }}) will be full in less than 12 hours'

      - alert: InstanceLowDiskAbs
        expr: node_filesystem_avail_bytes{mountpoint="/"} / 1024 / 1024 < 1024
        for: 1m
        labels:
          severity: page
        annotations:
          description: 'Less than 1GB of free disk space left on the root filesystem'
          summary: 'Instance {{ $labels.instance }}: {{ $value }}MB free disk space on {{ $labels.device }}'
          value: '{{ $value }}'

      - alert: InstanceLowDiskPerc
        expr: 100 * (node_filesystem_free_bytes / node_filesystem_size_bytes) < 10
        for: 1m
        labels:
          severity: page
        annotations:
          description: 'Less than 10% of free disk space left on a device'
          summary: 'Instance {{ $labels.instance }}: {{ $value }}% free disk space on {{ $labels.device }}'
          value: '{{ $value }}'

      - alert: ServiceFailed
        expr: node_systemd_unit_state{state="failed"} > 0
        for: 1m
        labels:
          severity: page
        annotations:
          description: 'A systemd unit went into failed state'
          summary: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} failed'
          value: '{{ $labels.name }}'

      - alert: ServiceFlapping
        expr: changes(node_systemd_unit_state{state="failed"}[5m]) > 5 or
              (changes(node_systemd_unit_state{state="failed"}[1h]) > 15 unless changes(node_systemd_unit_state{state="failed"}[30m]) < 7)
        labels:
          severity: page
        annotations:
          description: 'A systemd service changed its state more than 5x/5min or 15x/1h'
          summary: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} is flapping'
          value: '{{ $labels.name }}'
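A rule file like this can be syntax-checked before Prometheus picks it up. A minimal sketch of such a check as an extra task, assuming the file lands in {{ prometheus_config_path }} and promtool from the deployed Prometheus build is on the target's PATH (this task is not part of the commit):

- name: validate alert rules
  command: "promtool check rules {{ prometheus_config_path }}/alert.rules"
  changed_when: false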
prometheus/prometheus.yml.j2 (template):

@@ -8,6 +8,13 @@ global:
   external_labels:
     monitor: 'master'

+alerting:
+  alertmanagers:
+    - scheme: https
+      path_prefix: /alertmanager/
+      static_configs:
+        - targets: ['{{ http_prometheus_prefix }}.{{ http_domain_external }}']
+
 {% if prometheus_rule_files is defined %}
 # Rule files specifies a list of files from which rules are read.
 rule_files:

@@ -18,7 +25,6 @@ rule_files:
-
 # A list of scrape configurations.
 scrape_configs:

   - job_name: 'prometheus'
     scrape_interval: 10s
     scrape_timeout: 10s
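Prometheus is pointed at its own Alertmanager through the public HTTPS name rather than localhost:9093, hence scheme: https and the /alertmanager/ path prefix; the allow rules in the nginx change below let that request loop back in without basic auth. With the same hypothetical domain as above, the block renders to:

alerting:
  alertmanagers:
    - scheme: https
      path_prefix: /alertmanager/
      static_configs:
        - targets: ['prom.example.org']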
role variable defaults:

@@ -32,6 +32,6 @@ prometheus_goroot: "{{ prometheus_workdir }}/go"
 prometheus_gopath: "{{ prometheus_workdir }}/gopath"

 prometheus_default_opts: "--web.listen-address=localhost:9090 --config.file={{ prometheus_config_path }}/prometheus.yml --storage.tsdb.path={{ prometheus_db_path }}"
-alertmanager_default_opts: "--config.file={{ prometheus_config_path }}/alertmanager.yml --storage.path={{ alertmanager_db_path }}"
+alertmanager_default_opts: "--web.listen-address=localhost:9093 --config.file={{ prometheus_config_path }}/alertmanager.yml --storage.path={{ alertmanager_db_path }}"
 node_exporter_default_opts: "--web.listen-address=localhost:9100"
 blackbox_default_opts: "--web.listen-address=localhost:9115 --config.file={{ prometheus_config_path }}/blackbox.yml"
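Binding Alertmanager to localhost:9093 keeps the daemon reachable only through the nginx proxy. Assuming the systemd unit template concatenates the default options with alertmanager_opts (the unit file is not part of this diff), the effective flags work out to:

# Illustration only; this combined variable does not exist in the role:
alertmanager_opts_combined: >-
  --web.listen-address=localhost:9093
  --config.file={{ prometheus_config_path }}/alertmanager.yml
  --storage.path={{ alertmanager_db_path }}
  --web.external-url=https://{{ http_prometheus_prefix }}.{{ http_domain_external }}/alertmanager/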
alertmanager.yml:

@@ -1,13 +1,17 @@
 global:
+  smtp_from: "admin@freifunk-mwu.de"
+  smtp_smarthost: "localhost:25"
+  smtp_require_tls: false

 route:
-  group_by: ['alertname', 'cluster']
+  group_by: ['alertname', 'alertstate', 'cluster', 'service']
   group_wait: 30s
   group_interval: 5m
   repeat_interval: 3h
-  receiver: 'default-pager'
+  receiver: 'email'

 receivers:
-  - name: 'default-pager'
-    pagerduty_configs:
-      - service_key: <team-X-key>
+  - name: 'email'
+    email_configs:
+      - send_resolved: true
+        to: "admin@lists.freifunk-mwu.de"
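Every alert, grouped by the four labels, now goes to a single mailing-list receiver, and send_resolved: true also sends a mail when an alert clears. Should paging-severity alerts ever need separate handling, a child route on the severity: page label set by the rules above would be the natural extension (a sketch; the 'pager' receiver is hypothetical and not defined in this commit):

route:
  receiver: 'email'
  routes:
    - match:
        severity: page
      receiver: 'pager'
      repeat_interval: 1h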
roles/service-prometheus/files/amtool.yml (new file)

@@ -0,0 +1,8 @@
# Define the path that amtool can find your `alertmanager` instance at
alertmanager.url: "http://localhost:9093/alertmanager"

# Override the default author. (unset defaults to your username)
author: admin@freifunk-mwu.de

# Force amtool to give you an error if you don't include a comment on a silence
comment_required: true
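With this file in place amtool needs no extra flags on the host: it reads /etc/amtool/config.yml by default, and alertmanager.url already carries the /alertmanager route prefix implied by --web.external-url. An ad-hoc silence would then look like this (example values; comment_required: true makes --comment mandatory):

- name: silence InstanceDown during maintenance
  command: amtool silence add alertname=InstanceDown --duration=2h --comment="planned reboot"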
role tasks:

@@ -41,6 +41,14 @@
     group: "{{ prometheus_group }}"
     mode: "u=rwx,g=rx,o="

+- name: mkdir for amtool config
+  file:
+    path: "/etc/amtool"
+    state: directory
+    owner: "{{ prometheus_user }}"
+    group: "{{ prometheus_group }}"
+    mode: "u=rwx,g=rx,o=rx"
+
 - name: copy alertmanager systemd config
   template:
     src: "alertmanager.service.j2"

@@ -49,6 +57,11 @@
     - reload systemd
     - restart alertmanager

+- name: install amtool config file
+  copy:
+    src: "amtool.yml"
+    dest: "/etc/amtool/config.yml"
+
 - name: install alertmanager config file
   copy:
     src: "alertmanager.yml"
nginx vhost template:

@@ -17,9 +17,21 @@ server {

   include /etc/nginx/snippets/letsencrypt-acme-challenge.conf;

+  satisfy any;
+
+  allow 127.0.0.0/8;
+  allow ::1/128;
+  allow {{ lookup('dig', inventory_hostname, 'qtype=A') }};
+  allow {{ lookup('dig', inventory_hostname, 'qtype=AAAA') }};
+
+  auth_basic "Prometheus";
+  auth_basic_user_file /etc/nginx/htpasswd_prometheus;
+
+  location /alertmanager {
+    proxy_pass http://127.0.0.1:9093;
+  }
+
   location / {
-    auth_basic "Prometheus";
-    auth_basic_user_file /etc/nginx/htpasswd_prometheus;
     proxy_pass http://127.0.0.1:9090;
   }
 }
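satisfy any admits a request that passes either the allow list or basic auth; the two dig lookups whitelist the host's own A/AAAA records so Prometheus can reach Alertmanager via the external HTTPS name without credentials, while humans still get the password prompt. A quick end-to-end check could look like this (a sketch, not part of the role; /-/healthy is Alertmanager's standard health endpoint):

- name: verify alertmanager is reachable through nginx
  uri:
    url: "https://{{ http_prometheus_prefix }}.{{ http_domain_external }}/alertmanager/-/healthy"
    status_code: 200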