# Patch overview (commentary only; everything from the first "diff --git" onward is unchanged).
#
# inventory/group_vars/ffmwu-monitoring
#   Adds prometheus_opts with a --web.external-url built from http_prometheus_prefix and
#   http_domain_external, alongside the existing alertmanager_opts (same URL plus /alertmanager/).
#
# playbooks/prometheus/alert.rules
#   Removes the "labels: severity: page" pair from eleven alerts: Icmp4Timeout, Icmp6Timeout,
#   InstanceDown, InstanceHighCpuLong, InstanceLowMem, InstanceLowDiskPrediction4Hours,
#   InstanceLowDiskPrediction12Hours, InstanceLowDiskAbs, InstanceLowDiskPerc, ServiceFailed,
#   ServiceFlapping. Expressions, "for" durations and annotations are untouched.
#
# playbooks/prometheus/prometheus.yml.j2
#   - Drops global external_labels (monitor: 'master') together with its explanatory comment.
#   - Replaces the single "icmp6" probe job with a Jinja loop over ['icmp4','icmp6']; the loop
#     variable is used both as job_name and as the blackbox module parameter.
#   - Normalises "{%for" to "{% for".
#   - Target host names now use host.rsplit('.')[0] (the text before the first dot) instead of
#     stripping the literal '.freifunk-mwu.de' suffix.
#   - Skips 'extrasahne' when emitting targets from bgp_mwu_servers.
#   - Appends a blank line and the closing "{% endfor %}" for the job loop after the relabel rules.
#
# roles/service-prometheus/files/blackbox.yml
#   Renames the module key "icmp" to "icmp4" so it matches the module name the job loop passes.
#
# NOTE(review): the blackbox module "icmp" no longer exists after this patch; confirm nothing
#   outside these hunks still requests module=icmp.
# NOTE(review): the loop now generates a job named "icmp4"; confirm the unshown part of
#   prometheus.yml.j2 does not already define one (Prometheus requires unique job names).
# NOTE(review): if Alertmanager routing matches on severity="page" or on the external label
#   monitor="master", those routes will stop matching; verify the Alertmanager config.
# NOTE(review): for hosts with more than one label before the domain, rsplit('.')[0] yields a
#   shorter name than the old suffix strip did; presumably all hosts are single-label, verify.
# NOTE(review): this copy of the patch is wrapped onto a few physical lines, so indentation and
#   line breaks inside the hunks are not reliable here; apply from the original patch file.
diff --git a/inventory/group_vars/ffmwu-monitoring b/inventory/group_vars/ffmwu-monitoring index 98271ac..65fc8ee 100644 --- a/inventory/group_vars/ffmwu-monitoring +++ b/inventory/group_vars/ffmwu-monitoring @@ -26,6 +26,7 @@ prometheus_components: - node_exporter - blackbox_exporter +prometheus_opts: "--web.external-url=https://{{ http_prometheus_prefix }}.{{ http_domain_external }}" alertmanager_opts: "--web.external-url=https://{{ http_prometheus_prefix }}.{{ http_domain_external }}/alertmanager/" yanic_blacklist: diff --git a/playbooks/prometheus/alert.rules b/playbooks/prometheus/alert.rules index 569c9d4..7cf1f74 100644 --- a/playbooks/prometheus/alert.rules +++ b/playbooks/prometheus/alert.rules @@ -4,8 +4,6 @@ groups: - alert: Icmp4Timeout expr: probe_success{job="icmp4"} == 0 for: 5m - labels: - severity: page annotations: description: 'ICMP requests to the primary IPv4 address timed out' summary: 'Instance {{ $labels.instance }} does not respond to ICMPv4 echo requests' @@ -13,8 +11,6 @@ groups: - alert: Icmp6Timeout expr: probe_success{job="icmp6"} == 0 for: 5m - labels: - severity: page annotations: description: 'ICMP requests to the primary IPv6 address timed out' summary: 'Instance {{ $labels.instance }} does not respond to ICMPv6 echo requests' @@ -24,8 +20,6 @@ groups: - alert: InstanceDown expr: up{job="node"} == 0 for: 5m - labels: - severity: page annotations: description: 'The instance is down for more than 5 minutes' summary: 'Instance {{ $labels.instance }} is down' @@ -48,8 +42,6 @@ groups: - alert: InstanceHighCpuLong expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) BY (instance) * 100) > 90 for: 30m - labels: - severity: page annotations: description: 'CPU usage above 90% for more than 30m' summary: 'Instance {{ $labels.instance }}: persistent cpu usage at {{ $value }}' @@ -58,8 +50,6 @@ groups: - alert: InstanceLowMem expr: node_memory_MemAvailable_bytes / 1024 / 1024 < node_memory_MemTotal_bytes / 1024 / 1024 / 10 for: 3m 
- labels: - severity: page annotations: description: 'Less than 10% of free memory' summary: 'Instance {{ $labels.instance }}: {{ $value }}MB of free memory' @@ -68,8 +58,6 @@ groups: - alert: InstanceLowDiskPrediction4Hours expr: predict_linear(node_filesystem_free_bytes{device=~"/dev/.*",job="node"}[1h], 4 * 3600) < 0 for: 30m - labels: - severity: page annotations: description: 'Disk {{ $labels.mountpoint }} ({{ $labels.device }}) will be full in less than 4 hours' summary: 'Instance {{ $labels.instance }}: Disk {{ $labels.mountpoint }} ({{ $labels.device }}) will be full in less than 4 hours' @@ -77,8 +65,6 @@ groups: - alert: InstanceLowDiskPrediction12Hours expr: predict_linear(node_filesystem_free_bytes{device=~"/dev/.*",job="node"}[3h], 12 * 3600) < 0 for: 60m - labels: - severity: page annotations: description: 'Disk {{ $labels.mountpoint }} ({{ $labels.device }}) will be full in less than 12 hours' summary: 'Instance {{ $labels.instance }}: Disk {{ $labels.mountpoint }} ({{ $labels.device }}) will be full in less than 12 hours' @@ -86,8 +72,6 @@ groups: - alert: InstanceLowDiskAbs expr: node_filesystem_avail_bytes{mountpoint="/"} / 1024 / 1024 < 1024 for: 1m - labels: - severity: page annotations: description: 'Less than 1GB of free disk space left on the root filesystem' summary: 'Instance {{ $labels.instance }}: {{ $value }}MB free disk space on {{ $labels.device }}' @@ -96,8 +80,6 @@ groups: - alert: InstanceLowDiskPerc expr: 100 * (node_filesystem_free_bytes / node_filesystem_size_bytes) < 10 for: 1m - labels: - severity: page annotations: description: 'Less than 10% of free disk space left on a device' summary: 'Instance {{ $labels.instance }}: {{ $value }}% free disk space on {{ $labels.device }}' @@ -106,8 +88,6 @@ groups: - alert: ServiceFailed expr: node_systemd_unit_state{state="failed"} > 0 for: 1m - labels: - severity: page annotations: description: 'A systemd unit went into failed state' summary: 'Instance {{ $labels.instance }}: Service {{ 
$labels.name }} failed' @@ -116,8 +96,6 @@ groups: - alert: ServiceFlapping expr: changes(node_systemd_unit_state{state="failed"}[5m]) > 5 or (changes(node_systemd_unit_state{state="failed"}[1h]) > 15 unless changes(node_systemd_unit_state{state="failed"}[30m]) < 7) - labels: - severity: page annotations: description: 'A systemd service changed its state more than 5x/5min or 15x/1h' summary: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} is flapping' diff --git a/playbooks/prometheus/prometheus.yml.j2 b/playbooks/prometheus/prometheus.yml.j2 index 760e828..a0345e3 100644 --- a/playbooks/prometheus/prometheus.yml.j2 +++ b/playbooks/prometheus/prometheus.yml.j2 @@ -3,11 +3,6 @@ global: evaluation_interval: 15s # By default, scrape targets every 15 seconds. # scrape_timeout is set to the global default (10s). - # The labels to add to any time series or alerts when communicating with - # external systems (federation, remote storage, Alertmanager). - external_labels: - monitor: 'master' - alerting: alertmanagers: - scheme: https @@ -51,23 +46,26 @@ scrape_configs: - '{{ host }}:9281' {% endfor %} - - job_name: "icmp6" +{% for job in ['icmp4','icmp6'] %} + - job_name: "{{ job }}" metrics_path: /probe params: - module: ["icmp6"] + module: ["{{ job }}"] static_configs: - targets: -{%for group in prometheus_groups %} +{% for group in prometheus_groups %} {% for host in groups[group] %} {% if host != inventory_hostname %} - - "{{ host.rsplit('.freifunk-mwu.de')[0] }}.ffwi.org" - - "{{ host.rsplit('.freifunk-mwu.de')[0] }}.ffmz.org" + - "{{ host.rsplit('.')[0] }}.ffwi.org" + - "{{ host.rsplit('.')[0] }}.ffmz.org" {% endif %} {% endfor %} {% endfor %} {% for host, _ in bgp_mwu_servers.items() %} +{% if host not in ['extrasahne'] %} - "{{ host }}.ffwi.org" - "{{ host }}.ffmz.org" +{% endif %} {% endfor %} relabel_configs: - source_labels: [__address__] @@ -76,3 +74,5 @@ scrape_configs: target_label: instance - target_label: __address__ replacement: 127.0.0.1:9115 
+ +{% endfor %} diff --git a/roles/service-prometheus/files/blackbox.yml b/roles/service-prometheus/files/blackbox.yml index 83912cd..231df93 100644 --- a/roles/service-prometheus/files/blackbox.yml +++ b/roles/service-prometheus/files/blackbox.yml @@ -9,7 +9,7 @@ modules: tcp: query_response: - expect: "^SSH-2.0-" - icmp: + icmp4: prober: icmp icmp: preferred_ip_protocol: "ip4"