role service-prometheus: update targets and alerts
This commit is contained in:
parent
a287a61d5b
commit
8bed1b7c5b
4 changed files with 12 additions and 33 deletions
|
@ -26,6 +26,7 @@ prometheus_components:
|
||||||
- node_exporter
|
- node_exporter
|
||||||
- blackbox_exporter
|
- blackbox_exporter
|
||||||
|
|
||||||
|
prometheus_opts: "--web.external-url=https://{{ http_prometheus_prefix }}.{{ http_domain_external }}"
|
||||||
alertmanager_opts: "--web.external-url=https://{{ http_prometheus_prefix }}.{{ http_domain_external }}/alertmanager/"
|
alertmanager_opts: "--web.external-url=https://{{ http_prometheus_prefix }}.{{ http_domain_external }}/alertmanager/"
|
||||||
|
|
||||||
yanic_blacklist:
|
yanic_blacklist:
|
||||||
|
|
|
@ -4,8 +4,6 @@ groups:
|
||||||
- alert: Icmp4Timeout
|
- alert: Icmp4Timeout
|
||||||
expr: probe_success{job="icmp4"} == 0
|
expr: probe_success{job="icmp4"} == 0
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
|
||||||
severity: page
|
|
||||||
annotations:
|
annotations:
|
||||||
description: 'ICMP requests to the primary IPv4 address timed out'
|
description: 'ICMP requests to the primary IPv4 address timed out'
|
||||||
summary: 'Instance {{ $labels.instance }} does not respond to ICMPv4 echo requests'
|
summary: 'Instance {{ $labels.instance }} does not respond to ICMPv4 echo requests'
|
||||||
|
@ -13,8 +11,6 @@ groups:
|
||||||
- alert: Icmp6Timeout
|
- alert: Icmp6Timeout
|
||||||
expr: probe_success{job="icmp6"} == 0
|
expr: probe_success{job="icmp6"} == 0
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
|
||||||
severity: page
|
|
||||||
annotations:
|
annotations:
|
||||||
description: 'ICMP requests to the primary IPv6 address timed out'
|
description: 'ICMP requests to the primary IPv6 address timed out'
|
||||||
summary: 'Instance {{ $labels.instance }} does not respond to ICMPv6 echo requests'
|
summary: 'Instance {{ $labels.instance }} does not respond to ICMPv6 echo requests'
|
||||||
|
@ -24,8 +20,6 @@ groups:
|
||||||
- alert: InstanceDown
|
- alert: InstanceDown
|
||||||
expr: up{job="node"} == 0
|
expr: up{job="node"} == 0
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
|
||||||
severity: page
|
|
||||||
annotations:
|
annotations:
|
||||||
description: 'The instance is down for more than 5 minutes'
|
description: 'The instance is down for more than 5 minutes'
|
||||||
summary: 'Instance {{ $labels.instance }} is down'
|
summary: 'Instance {{ $labels.instance }} is down'
|
||||||
|
@ -48,8 +42,6 @@ groups:
|
||||||
- alert: InstanceHighCpuLong
|
- alert: InstanceHighCpuLong
|
||||||
expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) BY (instance) * 100) > 90
|
expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) BY (instance) * 100) > 90
|
||||||
for: 30m
|
for: 30m
|
||||||
labels:
|
|
||||||
severity: page
|
|
||||||
annotations:
|
annotations:
|
||||||
description: 'CPU usage above 90% for more than 30m'
|
description: 'CPU usage above 90% for more than 30m'
|
||||||
summary: 'Instance {{ $labels.instance }}: persistent cpu usage at {{ $value }}'
|
summary: 'Instance {{ $labels.instance }}: persistent cpu usage at {{ $value }}'
|
||||||
|
@ -58,8 +50,6 @@ groups:
|
||||||
- alert: InstanceLowMem
|
- alert: InstanceLowMem
|
||||||
expr: node_memory_MemAvailable_bytes / 1024 / 1024 < node_memory_MemTotal_bytes / 1024 / 1024 / 10
|
expr: node_memory_MemAvailable_bytes / 1024 / 1024 < node_memory_MemTotal_bytes / 1024 / 1024 / 10
|
||||||
for: 3m
|
for: 3m
|
||||||
labels:
|
|
||||||
severity: page
|
|
||||||
annotations:
|
annotations:
|
||||||
description: 'Less than 10% of free memory'
|
description: 'Less than 10% of free memory'
|
||||||
summary: 'Instance {{ $labels.instance }}: {{ $value }}MB of free memory'
|
summary: 'Instance {{ $labels.instance }}: {{ $value }}MB of free memory'
|
||||||
|
@ -68,8 +58,6 @@ groups:
|
||||||
- alert: InstanceLowDiskPrediction4Hours
|
- alert: InstanceLowDiskPrediction4Hours
|
||||||
expr: predict_linear(node_filesystem_free_bytes{device=~"/dev/.*",job="node"}[1h], 4 * 3600) < 0
|
expr: predict_linear(node_filesystem_free_bytes{device=~"/dev/.*",job="node"}[1h], 4 * 3600) < 0
|
||||||
for: 30m
|
for: 30m
|
||||||
labels:
|
|
||||||
severity: page
|
|
||||||
annotations:
|
annotations:
|
||||||
description: 'Disk {{ $labels.mountpoint }} ({{ $labels.device }}) will be full in less than 4 hours'
|
description: 'Disk {{ $labels.mountpoint }} ({{ $labels.device }}) will be full in less than 4 hours'
|
||||||
summary: 'Instance {{ $labels.instance }}: Disk {{ $labels.mountpoint }} ({{ $labels.device }}) will be full in less than 4 hours'
|
summary: 'Instance {{ $labels.instance }}: Disk {{ $labels.mountpoint }} ({{ $labels.device }}) will be full in less than 4 hours'
|
||||||
|
@ -77,8 +65,6 @@ groups:
|
||||||
- alert: InstanceLowDiskPrediction12Hours
|
- alert: InstanceLowDiskPrediction12Hours
|
||||||
expr: predict_linear(node_filesystem_free_bytes{device=~"/dev/.*",job="node"}[3h], 12 * 3600) < 0
|
expr: predict_linear(node_filesystem_free_bytes{device=~"/dev/.*",job="node"}[3h], 12 * 3600) < 0
|
||||||
for: 60m
|
for: 60m
|
||||||
labels:
|
|
||||||
severity: page
|
|
||||||
annotations:
|
annotations:
|
||||||
description: 'Disk {{ $labels.mountpoint }} ({{ $labels.device }}) will be full in less than 12 hours'
|
description: 'Disk {{ $labels.mountpoint }} ({{ $labels.device }}) will be full in less than 12 hours'
|
||||||
summary: 'Instance {{ $labels.instance }}: Disk {{ $labels.mountpoint }} ({{ $labels.device }}) will be full in less than 12 hours'
|
summary: 'Instance {{ $labels.instance }}: Disk {{ $labels.mountpoint }} ({{ $labels.device }}) will be full in less than 12 hours'
|
||||||
|
@ -86,8 +72,6 @@ groups:
|
||||||
- alert: InstanceLowDiskAbs
|
- alert: InstanceLowDiskAbs
|
||||||
expr: node_filesystem_avail_bytes{mountpoint="/"} / 1024 / 1024 < 1024
|
expr: node_filesystem_avail_bytes{mountpoint="/"} / 1024 / 1024 < 1024
|
||||||
for: 1m
|
for: 1m
|
||||||
labels:
|
|
||||||
severity: page
|
|
||||||
annotations:
|
annotations:
|
||||||
description: 'Less than 1GB of free disk space left on the root filesystem'
|
description: 'Less than 1GB of free disk space left on the root filesystem'
|
||||||
summary: 'Instance {{ $labels.instance }}: {{ $value }}MB free disk space on {{ $labels.device }}'
|
summary: 'Instance {{ $labels.instance }}: {{ $value }}MB free disk space on {{ $labels.device }}'
|
||||||
|
@ -96,8 +80,6 @@ groups:
|
||||||
- alert: InstanceLowDiskPerc
|
- alert: InstanceLowDiskPerc
|
||||||
expr: 100 * (node_filesystem_free_bytes / node_filesystem_size_bytes) < 10
|
expr: 100 * (node_filesystem_free_bytes / node_filesystem_size_bytes) < 10
|
||||||
for: 1m
|
for: 1m
|
||||||
labels:
|
|
||||||
severity: page
|
|
||||||
annotations:
|
annotations:
|
||||||
description: 'Less than 10% of free disk space left on a device'
|
description: 'Less than 10% of free disk space left on a device'
|
||||||
summary: 'Instance {{ $labels.instance }}: {{ $value }}% free disk space on {{ $labels.device }}'
|
summary: 'Instance {{ $labels.instance }}: {{ $value }}% free disk space on {{ $labels.device }}'
|
||||||
|
@ -106,8 +88,6 @@ groups:
|
||||||
- alert: ServiceFailed
|
- alert: ServiceFailed
|
||||||
expr: node_systemd_unit_state{state="failed"} > 0
|
expr: node_systemd_unit_state{state="failed"} > 0
|
||||||
for: 1m
|
for: 1m
|
||||||
labels:
|
|
||||||
severity: page
|
|
||||||
annotations:
|
annotations:
|
||||||
description: 'A systemd unit went into failed state'
|
description: 'A systemd unit went into failed state'
|
||||||
summary: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} failed'
|
summary: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} failed'
|
||||||
|
@ -116,8 +96,6 @@ groups:
|
||||||
- alert: ServiceFlapping
|
- alert: ServiceFlapping
|
||||||
expr: changes(node_systemd_unit_state{state="failed"}[5m]) > 5 or
|
expr: changes(node_systemd_unit_state{state="failed"}[5m]) > 5 or
|
||||||
(changes(node_systemd_unit_state{state="failed"}[1h]) > 15 unless changes(node_systemd_unit_state{state="failed"}[30m]) < 7)
|
(changes(node_systemd_unit_state{state="failed"}[1h]) > 15 unless changes(node_systemd_unit_state{state="failed"}[30m]) < 7)
|
||||||
labels:
|
|
||||||
severity: page
|
|
||||||
annotations:
|
annotations:
|
||||||
description: 'A systemd service changed its state more than 5x/5min or 15x/1h'
|
description: 'A systemd service changed its state more than 5x/5min or 15x/1h'
|
||||||
summary: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} is flapping'
|
summary: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} is flapping'
|
||||||
|
|
|
@ -3,11 +3,6 @@ global:
|
||||||
evaluation_interval: 15s # Evaluate rules every 15 seconds.
|
evaluation_interval: 15s # Evaluate rules every 15 seconds.
|
||||||
# scrape_timeout is set to the global default (10s).
|
# scrape_timeout is set to the global default (10s).
|
||||||
|
|
||||||
# The labels to add to any time series or alerts when communicating with
|
|
||||||
# external systems (federation, remote storage, Alertmanager).
|
|
||||||
external_labels:
|
|
||||||
monitor: 'master'
|
|
||||||
|
|
||||||
alerting:
|
alerting:
|
||||||
alertmanagers:
|
alertmanagers:
|
||||||
- scheme: https
|
- scheme: https
|
||||||
|
@ -51,23 +46,26 @@ scrape_configs:
|
||||||
- '{{ host }}:9281'
|
- '{{ host }}:9281'
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
|
|
||||||
- job_name: "icmp6"
|
{% for job in ['icmp4','icmp6'] %}
|
||||||
|
- job_name: "{{ job }}"
|
||||||
metrics_path: /probe
|
metrics_path: /probe
|
||||||
params:
|
params:
|
||||||
module: ["icmp6"]
|
module: ["{{ job }}"]
|
||||||
static_configs:
|
static_configs:
|
||||||
- targets:
|
- targets:
|
||||||
{% for group in prometheus_groups %}
|
{% for group in prometheus_groups %}
|
||||||
{% for host in groups[group] %}
|
{% for host in groups[group] %}
|
||||||
{% if host != inventory_hostname %}
|
{% if host != inventory_hostname %}
|
||||||
- "{{ host.rsplit('.freifunk-mwu.de')[0] }}.ffwi.org"
|
- "{{ host.rsplit('.')[0] }}.ffwi.org"
|
||||||
- "{{ host.rsplit('.freifunk-mwu.de')[0] }}.ffmz.org"
|
- "{{ host.rsplit('.')[0] }}.ffmz.org"
|
||||||
{% endif %}
|
{% endif %}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
{% for host, _ in bgp_mwu_servers.items() %}
|
{% for host, _ in bgp_mwu_servers.items() %}
|
||||||
|
{% if host not in ['extrasahne'] %}
|
||||||
- "{{ host }}.ffwi.org"
|
- "{{ host }}.ffwi.org"
|
||||||
- "{{ host }}.ffmz.org"
|
- "{{ host }}.ffmz.org"
|
||||||
|
{% endif %}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
relabel_configs:
|
relabel_configs:
|
||||||
- source_labels: [__address__]
|
- source_labels: [__address__]
|
||||||
|
@ -76,3 +74,5 @@ scrape_configs:
|
||||||
target_label: instance
|
target_label: instance
|
||||||
- target_label: __address__
|
- target_label: __address__
|
||||||
replacement: 127.0.0.1:9115
|
replacement: 127.0.0.1:9115
|
||||||
|
|
||||||
|
{% endfor %}
|
||||||
|
|
|
@ -9,7 +9,7 @@ modules:
|
||||||
tcp:
|
tcp:
|
||||||
query_response:
|
query_response:
|
||||||
- expect: "^SSH-2.0-"
|
- expect: "^SSH-2.0-"
|
||||||
icmp:
|
icmp4:
|
||||||
prober: icmp
|
prober: icmp
|
||||||
icmp:
|
icmp:
|
||||||
preferred_ip_protocol: "ip4"
|
preferred_ip_protocol: "ip4"
|
||||||
|
|
Loading…
Reference in a new issue