Role service-prometheus: make alert rules less verbose

This commit is contained in:
Julian Labus 2018-12-10 12:47:27 +01:00
parent 7b9d3352e8
commit ab252b3776
No known key found for this signature in database
GPG key ID: 8AF209F2C6B3572A
2 changed files with 18 additions and 4 deletions

View file

@ -2,14 +2,14 @@ groups:
- name: blackbox - name: blackbox
rules: rules:
# Side-by-side diff row: left column is the rule before the commit, right
# column is the rule after it.
# New rule: report an ICMPv4 timeout only for hosts without an active
# InstanceDown alert, plus unconditionally for the three hosts named in the
# second branch. ALERTS only has samples (value 1) while an alert is pending
# or firing, so "and ... ALERTS == 0" can never match; "unless" expresses
# "no active InstanceDown alert for this hostname".
- alert: Icmp4Timeout - alert: Icmp4Timeout
expr: probe_success{job="icmp4"} == 0 expr: (probe_success{job="icmp4"} == 0 unless on(hostname) ALERTS{alertname="InstanceDown"}) or (probe_success{job="icmp4",hostname=~"zuckerwatte|aubergine|glueckskeks"} == 0)
for: 5m for: 5m
annotations: annotations:
description: 'ICMP requests to the primary IPv4 address timed out' description: 'ICMP requests to the primary IPv4 address timed out'
summary: 'Instance {{ $labels.instance }} does not respond to ICMPv4 echo requests' summary: 'Instance {{ $labels.instance }} does not respond to ICMPv4 echo requests'
# Side-by-side diff row: left column is the rule before the commit, right
# column is the rule after it.
# New rule: report an ICMPv6 timeout only for hosts without an active
# InstanceDown alert, plus unconditionally for the three hosts named in the
# second branch. ALERTS only has samples (value 1) while an alert is pending
# or firing, so "and ... ALERTS == 0" can never match; "unless" expresses
# "no active InstanceDown alert for this hostname".
- alert: Icmp6Timeout - alert: Icmp6Timeout
expr: probe_success{job="icmp6"} == 0 expr: (probe_success{job="icmp6"} == 0 unless on(hostname) ALERTS{alertname="InstanceDown"}) or (probe_success{job="icmp6",hostname=~"zuckerwatte|aubergine|glueckskeks"} == 0)
for: 5m for: 5m
annotations: annotations:
description: 'ICMP requests to the primary IPv6 address timed out' description: 'ICMP requests to the primary IPv6 address timed out'
@ -25,7 +25,7 @@ groups:
summary: 'Instance {{ $labels.instance }} is down' summary: 'Instance {{ $labels.instance }} is down'
# Side-by-side diff row: left column is the rule before the commit, right
# column is the rule after it.
# New rule: report a failed scrape of any non-"node" job unless an
# InstanceDown alert is already active for the same hostname. ALERTS only has
# samples (value 1) while an alert is pending or firing, so filtering it with
# "== 0" yields nothing and the original "and" made this alert unable to fire;
# "unless" gives the intended suppression.
- alert: ExporterDown - alert: ExporterDown
expr: up{job!="node"} == 0 expr: up{job!="node"} == 0 unless ON(hostname) ALERTS{alertname="InstanceDown"}
for: 5m for: 5m
annotations: annotations:
description: 'An exporter is down for more than 5 minutes' description: 'An exporter is down for more than 5 minutes'
@ -104,7 +104,7 @@ groups:
- name: fastd - name: fastd
rules: rules:
# Side-by-side diff row: left column is the rule before the commit, right
# column is the rule after it.
# New rule: a fastd interface (names ending in "-1312" are excluded) with a
# zero TX rate only counts when the node job reports, for the same hostname,
# that more than 30 minutes have passed since boot:
# (time() - node_boot_time_seconds) / 60 > 30.
- alert: FastdNoTraffic - alert: FastdNoTraffic
expr: irate(fastd_tx_bytes{interface!~".*-1312"}[5m]) == 0 expr: irate(fastd_tx_bytes{interface!~".*-1312"}[5m]) == 0 and ON(hostname) (time() - node_boot_time_seconds{job="node"}) / 60 > 30
for: 5m for: 5m
annotations: annotations:
description: 'No TX data was seen on a fastd interface for more than 5 minutes' description: 'No TX data was seen on a fastd interface for more than 5 minutes'

View file

@ -37,6 +37,11 @@ scrape_configs:
labels: labels:
group: '{{ group }}' group: '{{ group }}'
{% endfor %} {% endfor %}
# Added by this commit: derive a `hostname` label from the scrape address by
# capturing the run of lowercase letters in front of the first dot.
# NOTE(review): a name with digits or hyphens before the first dot does not
# fit `[a-z]+` - confirm every target of this job is purely alphabetic.
relabel_configs:
- source_labels: [__address__]
regex: '([a-z]+)\..*'
replacement: '$1'
target_label: hostname
- job_name: "fastd" - job_name: "fastd"
scheme: "https" scheme: "https"
@ -45,6 +50,11 @@ scrape_configs:
{% for host in groups['ffmwu-gateways'] %} {% for host in groups['ffmwu-gateways'] %}
- '{{ host }}:9281' - '{{ host }}:9281'
{% endfor %} {% endfor %}
# Added by this commit: derive a `hostname` label from the scrape address by
# capturing the run of lowercase letters in front of the first dot.
# NOTE(review): a name with digits or hyphens before the first dot does not
# fit `[a-z]+` - confirm every host in the gateway group is purely alphabetic.
relabel_configs:
- source_labels: [__address__]
regex: '([a-z]+)\..*'
replacement: '$1'
target_label: hostname
{% for job in ['icmp4','icmp6'] %} {% for job in ['icmp4','icmp6'] %}
- job_name: "{{ job }}" - job_name: "{{ job }}"
@ -74,5 +84,9 @@ scrape_configs:
target_label: instance target_label: instance
- target_label: __address__ - target_label: __address__
replacement: 127.0.0.1:9115 replacement: 127.0.0.1:9115
# Added by this commit: derive a `hostname` label from the probe target
# (`__param_target`) by capturing the run of lowercase letters in front of the
# first dot. `__address__` cannot be used here because the preceding step
# overwrites it with 127.0.0.1:9115.
# NOTE(review): a target with digits or hyphens before the first dot does not
# fit `[a-z]+` - confirm every probed host is purely alphabetic.
- source_labels: [__param_target]
regex: '([a-z]+)\..*'
replacement: '$1'
target_label: hostname
{% endfor %} {% endfor %}