feat(grafana): provision basic alerting

This commit is contained in:
Rob Watson 2023-09-07 21:04:13 +02:00
parent 07f7b1b75f
commit e849494077
6 changed files with 298 additions and 11 deletions

View File

@ -35,6 +35,8 @@ data:
url: http://prometheus-server
contactpoints.yaml: |
apiVersion: 1
rules.yaml: |
apiVersion: 1
dashboardproviders.yaml: |
apiVersion: 1
providers:

View File

@ -26,7 +26,7 @@ spec:
app.kubernetes.io/name: grafana
app.kubernetes.io/instance: grafana
annotations:
checksum/config: 36a36abf9dd9e61eaa035cfc90acbb82d3e6c131aa9fd57eaf98ae5380401bf3
checksum/config: 008eb6b5d7e1de9723209fca089750d0b54ffa5b829c51598d8def8d878c44c5
checksum/dashboards-json-config: 2b3b91b055108de2da8951a904e7c7ea49b5a5a250d2649ba27b7b7b7ec34cfd
checksum/sc-dashboard-provider-config: 01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b
kubectl.kubernetes.io/default-container: grafana
@ -83,6 +83,9 @@ spec:
- name: config
mountPath: "/etc/grafana/provisioning/alerting/contactpoints.yaml"
subPath: "contactpoints.yaml"
- name: config
mountPath: "/etc/grafana/provisioning/alerting/rules.yaml"
subPath: "rules.yaml"
- name: config
mountPath: "/etc/grafana/provisioning/dashboards/dashboardproviders.yaml"
subPath: "dashboardproviders.yaml"

View File

@ -22,6 +22,8 @@ datasources:
alerting:
contactpoints.yaml:
apiVersion: 1
rules.yaml:
apiVersion: 1
dashboardProviders:
dashboardproviders.yaml:
apiVersion: 1

View File

@ -0,0 +1,288 @@
# Grafana alerting provisioning file (rule groups and their alert rules).
apiVersion: 1
groups:
# Rule group for the "Solar" folder, evaluated every 60 seconds.
# The group name is the rule-group name shown in Grafana; it is set to
# `every_minute` so that it agrees with the `ruleGroup` value carried by each
# rule below (it previously duplicated the ConfigMap key `rules.yaml`).
# NOTE(review): the per-rule `id`, `orgID`, `folderUID`, `ruleGroup` and
# `updated` keys look like fields from an HTTP API export rather than the
# file-provisioning schema — confirm Grafana ignores them before relying on
# them.
- name: every_minute
  interval: 60s
  folder: Solar
  rules:
  # Alerts when the most recent `grid_mode` row is anything other than 1
  # (classic condition `outside_range` with both bounds set to 1), sustained
  # for 5 minutes. Missing data is treated as alerting (`noDataState`).
  - id: 2
    uid: c40e8d57-9d65-4a28-8485-a46b810c033e
    orgID: 1
    folderUID: ded7fd24-65bf-4e04-95ec-0970287687cb
    ruleGroup: every_minute
    title: solar_grid_mode_unexpected_value
    condition: B
    data:
    # Query A: latest grid_mode value from the Postgres datasource.
    - refId: A
      queryType: ""
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: P0A2ACEDBDFD04F7F
      model:
        datasource:
          type: postgres
          uid: P0A2ACEDBDFD04F7F
        editorMode: code
        format: table
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        rawQuery: true
        rawSql: SELECT grid_mode FROM et_runtime_data ORDER BY "timestamp" DESC LIMIT 1
        refId: A
        # NOTE(review): `sql` below appears to be saved query-builder state;
        # with `rawQuery: true` the `rawSql` string is presumably what runs.
        sql:
          columns:
          - parameters:
            - name: grid_mode
              type: functionParameter
            type: function
          groupBy:
          - property:
              type: string
            type: groupBy
          limit: 5
          orderBy:
            property:
              name: '"timestamp"'
              type: string
            type: property
          orderByDirection: ASC
          whereJsonTree:
            children1:
            - id: a9b888a9-0123-4456-b89a-b18a6c43c585
              properties:
                field: null
                operator: null
                value: []
                valueSrc: []
              type: rule
            id: bb8b9bba-89ab-4cde-b012-318a673c506d
            type: group
        table: et_runtime_data
    # Expression B: classic condition over the last value of query A.
    - refId: B
      queryType: ""
      relativeTimeRange:
        from: 600
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params:
            - 1
            - 1
            type: outside_range
          operator:
            type: and
          query:
            params:
            - A
          reducer:
            params: []
            type: last
          type: query
        datasource:
          name: Expression
          type: __expr__
          uid: __expr__
        expression: ""
        intervalMs: 1000
        maxDataPoints: 43200
        refId: B
        type: classic_conditions
    updated: "2023-09-06T20:55:12Z"
    noDataState: Alerting
    execErrState: Error
    for: 5m
    isPaused: false
  # Alerts when the average of the recent `temperature` readings exceeds 60,
  # sustained for 5 minutes.
  - id: 4
    uid: e6c41854-ace4-4149-9aea-dfb74454a496
    orgID: 1
    folderUID: ded7fd24-65bf-4e04-95ec-0970287687cb
    ruleGroup: every_minute
    title: solar_temperature_high
    condition: B
    data:
    # Query A: up to 50 temperature rows newer than 15 minutes. The window is
    # hard-coded in the SQL; it does not come from `relativeTimeRange`.
    - refId: A
      queryType: ""
      relativeTimeRange:
        from: 600
        to: 0
      datasourceUid: P0A2ACEDBDFD04F7F
      model:
        datasource:
          type: postgres
          uid: P0A2ACEDBDFD04F7F
        editorMode: code
        format: table
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        rawQuery: true
        rawSql: 'SELECT temperature FROM et_runtime_data WHERE timestamp > (NOW() - ''15 minutes''::interval) ORDER BY "timestamp" DESC LIMIT 50 '
        refId: A
        sql:
          columns:
          - parameters:
            - name: temperature
              type: functionParameter
            type: function
          groupBy:
          - property:
              type: string
            type: groupBy
          limit: 50
          orderBy:
            property:
              name: '"timestamp"'
              type: string
            type: property
          orderByDirection: DESC
          whereJsonTree:
            children1:
            - id: 8bba9888-89ab-4cde-b012-318a70ec0037
              properties:
                field: '"timestamp"'
                operator: equal
                value:
                - null
                valueSrc:
                - value
                valueType:
                - datetime
              type: rule
            id: bbb98b9a-89ab-4cde-b012-318a70eb5f5f
            type: group
        table: et_runtime_data
    # Expression B: classic condition, avg(A) > 60.
    - refId: B
      queryType: ""
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params:
            - 60
            - 0
            type: gt
          operator:
            type: and
          query:
            params:
            - A
          reducer:
            params: []
            type: avg
          type: query
        datasource:
          name: Expression
          type: __expr__
          uid: __expr__
        expression: ""
        intervalMs: 1000
        maxDataPoints: 43200
        refId: B
        type: classic_conditions
    updated: "2023-09-07T18:37:57Z"
    noDataState: NoData
    execErrState: Error
    for: 5m
    isPaused: false
# Rule group for the "Nodes (General)" folder, evaluated every 60 seconds.
# The group name is the rule-group name shown in Grafana; it is set to
# `every_minute` so that it agrees with the `ruleGroup` value carried by the
# rule below (it previously duplicated the ConfigMap key `rules.yaml`).
# NOTE(review): the per-rule `id`, `orgID`, `folderUID`, `ruleGroup` and
# `updated` keys look like fields from an HTTP API export rather than the
# file-provisioning schema — confirm Grafana ignores them before relying on
# them.
- name: every_minute
  interval: 60s
  folder: Nodes (General)
  rules:
  # Alerts when used space on /mnt/volume_db of prod-db stays above 85 percent
  # for 5 minutes. Pipeline: A (Prometheus instant query) -> B (reduce) ->
  # C (threshold > 85).
  - id: 6
    uid: a33b6255-4262-4924-bc25-99893d3e6d2c
    orgID: 1
    folderUID: b2d32456-52c2-456e-8906-4652925c88c6
    ruleGroup: every_minute
    title: db_disk_utilization_high
    condition: C
    data:
    # Query A: percentage of the filesystem in use (100 - available percent).
    - refId: A
      queryType: ""
      relativeTimeRange:
        from: 600
        to: 0
      datasourceUid: PBFA97CFB590B2093
      model:
        datasource:
          type: prometheus
          uid: PBFA97CFB590B2093
        editorMode: code
        expr: 100 - ((node_filesystem_avail_bytes{instance="prod-db:9100",job="node",device!~'rootfs',mountpoint="/mnt/volume_db"} * 100) / node_filesystem_size_bytes{instance="prod-db:9100",job="node",device!~'rootfs',mountpoint="/mnt/volume_db"})
        hide: false
        instant: true
        intervalMs: 1000
        maxDataPoints: 43200
        range: false
        refId: A
    # Expression B: reduce A with `mean`.
    # NOTE(review): the `conditions` list here looks like leftover editor
    # defaults; for `type: reduce` the `expression`/`reducer` keys are
    # presumably what applies — confirm.
    - refId: B
      queryType: ""
      relativeTimeRange:
        from: 600
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params: []
            type: gt
          operator:
            type: and
          query:
            params:
            - B
          reducer:
            params: []
            type: last
          type: query
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: mean
        refId: B
        type: reduce
    # Expression C: threshold, B > 85. This is the rule's alert condition.
    - refId: C
      queryType: ""
      relativeTimeRange:
        from: 600
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params:
            - 85
            type: gt
          operator:
            type: and
          query:
            params:
            - C
          reducer:
            params: []
            type: last
          type: query
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        refId: C
        type: threshold
    updated: "2023-09-07T18:50:52Z"
    noDataState: NoData
    execErrState: Error
    for: 5m
    isPaused: false

View File

@ -28,6 +28,7 @@ configMapGenerator:
- grafana.ini=secrets/grafana-config.ini
- datasources.yaml=secrets/grafana-datasources.yaml
- contactpoints.yaml=grafana-contactpoints.yaml
- rules.yaml=grafana-rules.yaml
- name: invidious-config
files:
- config.yml=invidious-config.yaml

View File

@ -1,10 +1 @@
groups:
- name: default-group
rules:
- alert: DBRootFSUsed
expr: 100 - ((node_filesystem_avail_bytes{instance="prod-db:9100",job="node",mountpoint="/",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{instance="prod-db:9100",job="node",mountpoint="/",fstype!="rootfs"}) > 85
for: 10m
labels:
severity: alert
annotations:
summary: database disk space
groups: []