diff --git a/deploy/prod-ovh/deploy-grafana.yaml b/deploy/prod-ovh/deploy-grafana.yaml
new file mode 100644
index 0000000..954d5d7
--- /dev/null
+++ b/deploy/prod-ovh/deploy-grafana.yaml
@@ -0,0 +1,40 @@
+---
+# JSON 6902 patch: appends PostgreSQL connection settings (GF_DATABASE_*) to
+# the env list of the first container in the patched pod template.
+- op: add
+  path: /spec/template/spec/containers/0/env/-
+  value:
+    name: GF_DATABASE_TYPE
+    value: postgres
+- op: add
+  path: /spec/template/spec/containers/0/env/-
+  value:
+    name: GF_DATABASE_HOST
+    value: prod-db
+- op: add
+  path: /spec/template/spec/containers/0/env/-
+  value:
+    name: GF_DATABASE_PORT
+    value: "5432"  # env values must be strings, hence the quotes
+# NOTE(review): kustomize itself does not expand ${...}; presumably an external
+# templating step substitutes this placeholder - confirm before enabling.
+- op: add
+  path: /spec/template/spec/containers/0/env/-
+  value:
+    name: GF_DATABASE_USER
+    value: grafana
+- op: add
+  path: /spec/template/spec/containers/0/env/-
+  value:
+    name: GF_DATABASE_PASSWORD
+    value: '${GRAFANA_POSTGRESQL_PASSWORD}'  # quoted: must stay a string
+- op: add
+  path: /spec/template/spec/containers/0/env/-
+  value:
+    name: GF_DATABASE_NAME
+    value: grafana
+- op: add
+  path: /spec/template/spec/containers/0/env/-
+  value:
+    name: GF_DATABASE_SSL_MODE
+    value: require
diff --git a/deploy/prod-ovh/kustomization.yaml b/deploy/prod-ovh/kustomization.yaml
index d24dbbd..3144747 100644
--- a/deploy/prod-ovh/kustomization.yaml
+++ b/deploy/prod-ovh/kustomization.yaml
@@ -30,3 +30,45 @@ resources:
 - clusterissuer.yaml
 - cert-ingress-tls.yaml
 - ingress.yaml
+
+configMapGenerator:
+- name: prometheus-server
+  namespace: prometheus
+  behavior: merge  # merged into the existing ConfigMap of the same name
+  files:
+  - prometheus.yml=resources/prometheus.yaml
+  - alerting_rules.yml=resources/prometheus-alerting-rules.yaml
+  options:
+    labels:
+      app: prometheus
+- name: prometheus-alertmanager
+  namespace: prometheus
+  behavior: merge
+  files:
+  - alertmanager.yml=secrets/prometheus-alertmanager.yaml
+  options:
+    labels:
+      app: prometheus
+- name: grafana  # no namespace set here, unlike the two Prometheus entries
+  behavior: merge
+  files:
+  - grafana.ini=secrets/grafana-config.ini
+  - datasources.yaml=secrets/grafana-datasources.yaml
+  - contactpoints.yaml=resources/grafana-contactpoints.yaml
+  - rules.yaml=resources/grafana-rules.yaml
+
+secretGenerator:
+- name: prometheus-credentials
+  files:
+  - secrets/exporter-password
+- name: grafana-credentials
+  files:
+  - admin-user=secrets/grafana-admin-user
+  - admin-password=secrets/grafana-admin-password
+
+patches: []
+# To inject PostgreSQL credentials into Grafana, replace [] above with:
+# - target:
+#     kind: Deployment
+#     name: grafana
+#   path: deploy-grafana.yaml
diff --git a/deploy/prod-ovh/resources/element-config.json b/deploy/prod-ovh/resources/element-config.json
new file mode 100644
index 0000000..2de527a
--- /dev/null
+++ b/deploy/prod-ovh/resources/element-config.json
@@ -0,0 +1,55 @@
+{
+    "default_server_config": {
+        "m.homeserver": {
+            "base_url": "https://synapse.netflux.io"
+        },
+        "m.identity_server": {
+            "base_url": "https://vector.im"
+        }
+    },
+    "disable_custom_urls": false,
+    "disable_guests": false,
+    "disable_login_language_selector": false,
+    "disable_3pid_login": false,
+    "brand": "Element",
+    "integrations_ui_url": "https://scalar.vector.im/",
+    "integrations_rest_url": "https://scalar.vector.im/api",
+    "integrations_widgets_urls": [
+        "https://scalar.vector.im/_matrix/integrations/v1",
+        "https://scalar.vector.im/api",
+        "https://scalar-staging.vector.im/_matrix/integrations/v1",
+        "https://scalar-staging.vector.im/api",
+        "https://scalar-staging.riot.im/scalar/api"
+    ],
+    "bug_report_endpoint_url": "https://element.io/bugreports/submit",
+    "uisi_autorageshake_app": "element-auto-uisi",
+    "default_country_code": "GB",
+    "show_labs_settings": false,
+    "features": { },
+    "default_federate": true,
+    "default_theme": "light",
+    "room_directory": {
+        "servers": [
+            "matrix.org",
+            "gitter.im",
+            "privacytools.io"
+        ]
+    },
+    "piwik": {
+        "url": "https://piwik.riot.im/",
+        "whitelisted_hs_urls": ["https://matrix.org"],
+        "whitelisted_is_urls": ["https://vector.im", "https://matrix.org"],
+        "siteId": 1
+    },
+    "enable_presence_by_hs_url": {
+        "https://matrix.org": false,
+        "https://matrix-client.matrix.org": false
+    },
+    "setting_defaults": {
+        "breadcrumbs": true
+    },
+    "jitsi": {
+        "preferred_domain": "meet.element.io"
+    },
+    "map_style_url": "https://api.maptiler.com/maps/streets/style.json?key=fU3vlMsMn4Jb6dnEIFsx"
+}
diff --git a/deploy/prod-ovh/resources/grafana-contactpoints.yaml b/deploy/prod-ovh/resources/grafana-contactpoints.yaml
new file mode 100644
index 0000000..417c64b
--- /dev/null
+++ b/deploy/prod-ovh/resources/grafana-contactpoints.yaml
@@ -0,0 +1,9 @@
+apiVersion: 1
+contactPoints:
+- name: Prometheus Alertmanager
+  receivers:
+  - uid: prometheus-alertmanager-1
+    type: prometheus-alertmanager
+    disableResolveMessage: false
+    settings:
+      url: http://prod-prometheus-alertmanager:9093
diff --git a/deploy/prod-ovh/resources/grafana-rules.yaml b/deploy/prod-ovh/resources/grafana-rules.yaml
new file mode 100644
index 0000000..06aa0e9
--- /dev/null
+++ b/deploy/prod-ovh/resources/grafana-rules.yaml
@@ -0,0 +1,288 @@
+apiVersion: 1
+groups:
+- name: rules.yaml  # NOTE(review): both groups share this name - confirm
+  interval: 60s
+  folder: Solar
+  rules:
+  - id: 2
+    uid: c40e8d57-9d65-4a28-8485-a46b810c033e
+    orgID: 1
+    folderUID: ded7fd24-65bf-4e04-95ec-0970287687cb
+    ruleGroup: every_minute
+    title: solar_grid_mode_unexpected_value
+    condition: B  # the alert is decided by expression B below
+    data:
+    - refId: A  # query A: most recent grid_mode row
+      queryType: ""
+      relativeTimeRange:
+        from: 300
+        to: 0
+      datasourceUid: P0A2ACEDBDFD04F7F
+      model:
+        datasource:
+          type: postgres
+          uid: P0A2ACEDBDFD04F7F
+        editorMode: code
+        format: table
+        hide: false
+        intervalMs: 1000
+        maxDataPoints: 43200
+        rawQuery: true
+        rawSql: SELECT grid_mode FROM et_runtime_data ORDER BY "timestamp" DESC LIMIT 1
+        refId: A
+        sql:
+          columns:
+          - parameters:
+            - name: grid_mode
+              type: functionParameter
+            type: function
+          groupBy:
+          - property:
+              type: string
+            type: groupBy
+          limit: 5
+          orderBy:
+            property:
+              name: '"timestamp"'
+              type: string
+            type: property
+          orderByDirection: ASC
+          whereJsonTree:
+            children1:
+            - id: a9b888a9-0123-4456-b89a-b18a6c43c585
+              properties:
+                field: null
+                operator: null
+                value: []
+                valueSrc: []
+              type: rule
+            id: bb8b9bba-89ab-4cde-b012-318a673c506d
+            type: group
+        table: et_runtime_data
+    - refId: B  # condition: last value of A lies outside the range [1, 1]
+      queryType: ""
+      relativeTimeRange:
+        from: 600
+        to: 0
+      datasourceUid: __expr__
+      model:
+        conditions:
+        - evaluator:
+            params:
+            - 1
+            - 1
+            type: outside_range
+          operator:
+            type: and
+          query:
+            params:
+            - A
+          reducer:
+            params: []
+            type: last
+          type: query
+        datasource:
+          name: Expression
+          type: __expr__
+          uid: __expr__
+        expression: ""
+        intervalMs: 1000
+        maxDataPoints: 43200
+        refId: B
+        type: classic_conditions
+    updated: "2023-09-06T20:55:12Z"
+    noDataState: Alerting  # missing data is itself treated as an alert
+    execErrState: Error
+    for: 5m
+    isPaused: false
+  - id: 4
+    uid: e6c41854-ace4-4149-9aea-dfb74454a496
+    orgID: 1
+    folderUID: ded7fd24-65bf-4e04-95ec-0970287687cb
+    ruleGroup: every_minute
+    title: solar_temperature_high
+    condition: B
+    data:
+    - refId: A  # query A: up to 50 temperature rows from the last 15 minutes
+      queryType: ""
+      relativeTimeRange:
+        from: 600
+        to: 0
+      datasourceUid: P0A2ACEDBDFD04F7F
+      model:
+        datasource:
+          type: postgres
+          uid: P0A2ACEDBDFD04F7F
+        editorMode: code
+        format: table
+        hide: false
+        intervalMs: 1000
+        maxDataPoints: 43200
+        rawQuery: true
+        rawSql: 'SELECT temperature FROM et_runtime_data WHERE timestamp > (NOW() - ''15 minutes''::interval) ORDER BY "timestamp" DESC LIMIT 50 '
+        refId: A
+        sql:
+          columns:
+          - parameters:
+            - name: temperature
+              type: functionParameter
+            type: function
+          groupBy:
+          - property:
+              type: string
+            type: groupBy
+          limit: 50
+          orderBy:
+            property:
+              name: '"timestamp"'
+              type: string
+            type: property
+          orderByDirection: DESC
+          whereJsonTree:
+            children1:
+            - id: 8bba9888-89ab-4cde-b012-318a70ec0037
+              properties:
+                field: '"timestamp"'
+                operator: equal
+                value:
+                - null
+                valueSrc:
+                - value
+                valueType:
+                - datetime
+              type: rule
+            id: bbb98b9a-89ab-4cde-b012-318a70eb5f5f
+            type: group
+        table: et_runtime_data
+    - refId: B  # condition: average of A is greater than 60
+      queryType: ""
+      relativeTimeRange:
+        from: 0
+        to: 0
+      datasourceUid: __expr__
+      model:
+        conditions:
+        - evaluator:
+            params:
+            - 60
+            - 0
+            type: gt
+          operator:
+            type: and
+          query:
+            params:
+            - A
+          reducer:
+            params: []
+            type: avg
+          type: query
+        datasource:
+          name: Expression
+          type: __expr__
+          uid: __expr__
+        expression: ""
+        intervalMs: 1000
+        maxDataPoints: 43200
+        refId: B
+        type: classic_conditions
+    updated: "2023-09-07T18:37:57Z"
+    noDataState: NoData
+    execErrState: Error
+    for: 5m
+    isPaused: false
+- name: rules.yaml
+  interval: 60s
+  folder: Nodes (General)
+  rules:
+  - id: 6
+    uid: a33b6255-4262-4924-bc25-99893d3e6d2c
+    orgID: 1
+    folderUID: b2d32456-52c2-456e-8906-4652925c88c6
+    ruleGroup: every_minute
+    title: db_disk_utilization_high
+    condition: C  # chain: A (query) -> B (reduce) -> C (threshold)
+    data:
+    - refId: A  # query A: percentage of /mnt/volume_db in use on prod-db
+      queryType: ""
+      relativeTimeRange:
+        from: 600
+        to: 0
+      datasourceUid: PBFA97CFB590B2093
+      model:
+        datasource:
+          type: prometheus
+          uid: PBFA97CFB590B2093
+        editorMode: code
+        expr: 100 - ((node_filesystem_avail_bytes{instance="prod-db:9100",job="node",device!~'rootfs',mountpoint="/mnt/volume_db"} * 100) / node_filesystem_size_bytes{instance="prod-db:9100",job="node",device!~'rootfs',mountpoint="/mnt/volume_db"})
+        hide: false
+        instant: true
+        intervalMs: 1000
+        maxDataPoints: 43200
+        range: false
+        refId: A
+    - refId: B  # reduce step: mean of A
+      queryType: ""
+      relativeTimeRange:
+        from: 600
+        to: 0
+      datasourceUid: __expr__
+      model:
+        conditions:
+        - evaluator:
+            params: []
+            type: gt
+          operator:
+            type: and
+          query:
+            params:
+            - B
+          reducer:
+            params: []
+            type: last
+          type: query
+        datasource:
+          type: __expr__
+          uid: __expr__
+        expression: A
+        hide: false
+        intervalMs: 1000
+        maxDataPoints: 43200
+        reducer: mean
+        refId: B
+        type: reduce
+    - refId: C  # threshold step: B greater than 85
+      queryType: ""
+      relativeTimeRange:
+        from: 600
+        to: 0
+      datasourceUid: __expr__
+      model:
+        conditions:
+        - evaluator:
+            params:
+            - 85
+            type: gt
+          operator:
+            type: and
+          query:
+            params:
+            - C
+          reducer:
+            params: []
+            type: last
+          type: query
+        datasource:
+          type: __expr__
+          uid: __expr__
+        expression: B
+        hide: false
+        intervalMs: 1000
+        maxDataPoints: 43200
+        refId: C
+        type: threshold
+    updated: "2023-09-07T18:50:52Z"
+    noDataState: NoData
+    execErrState: Error
+    for: 5m
+    isPaused: false
+
diff --git a/deploy/prod-ovh/resources/invidious-config.yaml b/deploy/prod-ovh/resources/invidious-config.yaml
new file mode 100644
index 0000000..bf05369
--- /dev/null
+++ b/deploy/prod-ovh/resources/invidious-config.yaml
@@ -0,0 +1,9 @@
+domain: tube.netflux.io
+external_port: 443
+channel_threads: 2
+feed_threads: 2
+registration_enabled: false
+default_user_preferences:
+  dark_mode: true
+popular_enabled: false
+https_only: true
diff --git a/deploy/prod-ovh/resources/prometheus-alerting-rules.yaml b/deploy/prod-ovh/resources/prometheus-alerting-rules.yaml
new file mode 100644
index 0000000..2ae2220
--- /dev/null
+++ b/deploy/prod-ovh/resources/prometheus-alerting-rules.yaml
@@ -0,0 +1 @@
+groups: []
diff --git a/deploy/prod-ovh/resources/prometheus.yaml b/deploy/prod-ovh/resources/prometheus.yaml
new file mode 100644
index 0000000..3c696e5
--- /dev/null
+++ b/deploy/prod-ovh/resources/prometheus.yaml
@@ -0,0 +1,341 @@
+# This file is based on the default configuration file generated by the
+# Prometheus Helm chart. It is overridden here to avoid re-inflating the Helm
+# chart to update configuration.
+global:
+  evaluation_interval: 1m
+  scrape_interval: 15s
+  scrape_timeout: 10s
+rule_files:
+- /etc/config/recording_rules.yml
+- /etc/config/alerting_rules.yml
+- /etc/config/rules
+- /etc/config/alerts
+scrape_configs:
+- job_name: prometheus  # static target localhost:9090
+  static_configs:
+  - targets:
+    - localhost:9090
+- job_name: kubernetes-apiservers
+  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+  kubernetes_sd_configs:
+  - role: endpoints
+  relabel_configs:
+  - action: keep  # only the https port of default/kubernetes is kept
+    regex: default;kubernetes;https
+    source_labels:
+    - __meta_kubernetes_namespace
+    - __meta_kubernetes_service_name
+    - __meta_kubernetes_endpoint_port_name
+  scheme: https
+  tls_config:
+    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+    insecure_skip_verify: true
+- job_name: kubernetes-nodes
+  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+  kubernetes_sd_configs:
+  - role: node
+  relabel_configs:
+  - action: labelmap
+    regex: __meta_kubernetes_node_label_(.+)
+  - replacement: kubernetes.default.svc:443  # scrape via the API server
+    target_label: __address__
+  - regex: (.+)
+    replacement: /api/v1/nodes/$1/proxy/metrics
+    source_labels:
+    - __meta_kubernetes_node_name
+    target_label: __metrics_path__
+  scheme: https
+  tls_config:
+    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+    insecure_skip_verify: true
+- job_name: kubernetes-nodes-cadvisor
+  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+  kubernetes_sd_configs:
+  - role: node
+  relabel_configs:
+  - action: labelmap
+    regex: __meta_kubernetes_node_label_(.+)
+  - replacement: kubernetes.default.svc:443
+    target_label: __address__
+  - regex: (.+)
+    replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
+    source_labels:
+    - __meta_kubernetes_node_name
+    target_label: __metrics_path__
+  scheme: https
+  tls_config:
+    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+    insecure_skip_verify: true
+- job_name: kubernetes-service-endpoints
+  honor_labels: true
+  kubernetes_sd_configs:
+  - role: endpoints
+  relabel_configs:
+  - action: keep  # opt-in via the prometheus.io/scrape service annotation
+    regex: true
+    source_labels:
+    - __meta_kubernetes_service_annotation_prometheus_io_scrape
+  - action: drop  # scrape_slow targets belong to the -slow job below
+    regex: true
+    source_labels:
+    - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow
+  - action: replace
+    regex: (https?)
+    source_labels:
+    - __meta_kubernetes_service_annotation_prometheus_io_scheme
+    target_label: __scheme__
+  - action: replace
+    regex: (.+)
+    source_labels:
+    - __meta_kubernetes_service_annotation_prometheus_io_path
+    target_label: __metrics_path__
+  - action: replace
+    regex: (.+?)(?::\d+)?;(\d+)
+    replacement: $1:$2
+    source_labels:
+    - __address__
+    - __meta_kubernetes_service_annotation_prometheus_io_port
+    target_label: __address__
+  - action: labelmap
+    regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
+    replacement: __param_$1
+  - action: labelmap
+    regex: __meta_kubernetes_service_label_(.+)
+  - action: replace
+    source_labels:
+    - __meta_kubernetes_namespace
+    target_label: namespace
+  - action: replace
+    source_labels:
+    - __meta_kubernetes_service_name
+    target_label: service
+  - action: replace
+    source_labels:
+    - __meta_kubernetes_pod_node_name
+    target_label: node
+- job_name: kubernetes-service-endpoints-slow
+  honor_labels: true
+  kubernetes_sd_configs:
+  - role: endpoints
+  relabel_configs:
+  - action: keep  # opt-in via the prometheus.io/scrape_slow annotation
+    regex: true
+    source_labels:
+    - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow
+  - action: replace
+    regex: (https?)
+    source_labels:
+    - __meta_kubernetes_service_annotation_prometheus_io_scheme
+    target_label: __scheme__
+  - action: replace
+    regex: (.+)
+    source_labels:
+    - __meta_kubernetes_service_annotation_prometheus_io_path
+    target_label: __metrics_path__
+  - action: replace
+    regex: (.+?)(?::\d+)?;(\d+)
+    replacement: $1:$2
+    source_labels:
+    - __address__
+    - __meta_kubernetes_service_annotation_prometheus_io_port
+    target_label: __address__
+  - action: labelmap
+    regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
+    replacement: __param_$1
+  - action: labelmap
+    regex: __meta_kubernetes_service_label_(.+)
+  - action: replace
+    source_labels:
+    - __meta_kubernetes_namespace
+    target_label: namespace
+  - action: replace
+    source_labels:
+    - __meta_kubernetes_service_name
+    target_label: service
+  - action: replace
+    source_labels:
+    - __meta_kubernetes_pod_node_name
+    target_label: node
+  scrape_interval: 5m  # overrides the 15s global interval for this job
+  scrape_timeout: 30s
+- job_name: prometheus-pushgateway
+  honor_labels: true
+  kubernetes_sd_configs:
+  - role: service
+  relabel_configs:
+  - action: keep
+    regex: pushgateway
+    source_labels:
+    - __meta_kubernetes_service_annotation_prometheus_io_probe
+- job_name: kubernetes-services
+  honor_labels: true
+  kubernetes_sd_configs:
+  - role: service
+  metrics_path: /probe
+  params:
+    module:
+    - http_2xx
+  relabel_configs:
+  - action: keep
+    regex: true
+    source_labels:
+    - __meta_kubernetes_service_annotation_prometheus_io_probe
+  - source_labels:
+    - __address__
+    target_label: __param_target
+  - replacement: blackbox  # the request goes to "blackbox", not the service
+    target_label: __address__
+  - source_labels:
+    - __param_target
+    target_label: instance
+  - action: labelmap
+    regex: __meta_kubernetes_service_label_(.+)
+  - source_labels:
+    - __meta_kubernetes_namespace
+    target_label: namespace
+  - source_labels:
+    - __meta_kubernetes_service_name
+    target_label: service
+- job_name: kubernetes-pods
+  honor_labels: true
+  kubernetes_sd_configs:
+  - role: pod
+  relabel_configs:
+  - action: keep  # opt-in via the prometheus.io/scrape pod annotation
+    regex: true
+    source_labels:
+    - __meta_kubernetes_pod_annotation_prometheus_io_scrape
+  - action: drop  # scrape_slow pods belong to the -slow job below
+    regex: true
+    source_labels:
+    - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
+  - action: replace
+    regex: (https?)
+    source_labels:
+    - __meta_kubernetes_pod_annotation_prometheus_io_scheme
+    target_label: __scheme__
+  - action: replace
+    regex: (.+)
+    source_labels:
+    - __meta_kubernetes_pod_annotation_prometheus_io_path
+    target_label: __metrics_path__
+  - action: replace
+    regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
+    replacement: '[$2]:$1'
+    source_labels:
+    - __meta_kubernetes_pod_annotation_prometheus_io_port
+    - __meta_kubernetes_pod_ip
+    target_label: __address__
+  - action: replace
+    regex: (\d+);((([0-9]+?)(\.|$)){4})
+    replacement: $2:$1
+    source_labels:
+    - __meta_kubernetes_pod_annotation_prometheus_io_port
+    - __meta_kubernetes_pod_ip
+    target_label: __address__
+  - action: labelmap
+    regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
+    replacement: __param_$1
+  - action: labelmap
+    regex: __meta_kubernetes_pod_label_(.+)
+  - action: replace
+    source_labels:
+    - __meta_kubernetes_namespace
+    target_label: namespace
+  - action: replace
+    source_labels:
+    - __meta_kubernetes_pod_name
+    target_label: pod
+  - action: drop  # pods in any of these phases are not scraped
+    regex: Pending|Succeeded|Failed|Completed
+    source_labels:
+    - __meta_kubernetes_pod_phase
+  - action: replace
+    source_labels:
+    - __meta_kubernetes_pod_node_name
+    target_label: node
+- job_name: kubernetes-pods-slow
+  honor_labels: true
+  kubernetes_sd_configs:
+  - role: pod
+  relabel_configs:
+  - action: keep  # opt-in via the prometheus.io/scrape_slow pod annotation
+    regex: true
+    source_labels:
+    - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
+  - action: replace
+    regex: (https?)
+    source_labels:
+    - __meta_kubernetes_pod_annotation_prometheus_io_scheme
+    target_label: __scheme__
+  - action: replace
+    regex: (.+)
+    source_labels:
+    - __meta_kubernetes_pod_annotation_prometheus_io_path
+    target_label: __metrics_path__
+  - action: replace
+    regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
+    replacement: '[$2]:$1'
+    source_labels:
+    - __meta_kubernetes_pod_annotation_prometheus_io_port
+    - __meta_kubernetes_pod_ip
+    target_label: __address__
+  - action: replace
+    regex: (\d+);((([0-9]+?)(\.|$)){4})
+    replacement: $2:$1
+    source_labels:
+    - __meta_kubernetes_pod_annotation_prometheus_io_port
+    - __meta_kubernetes_pod_ip
+    target_label: __address__
+  - action: labelmap
+    regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
+    replacement: __param_$1
+  - action: labelmap
+    regex: __meta_kubernetes_pod_label_(.+)
+  - action: replace
+    source_labels:
+    - __meta_kubernetes_namespace
+    target_label: namespace
+  - action: replace
+    source_labels:
+    - __meta_kubernetes_pod_name
+    target_label: pod
+  - action: drop
+    regex: Pending|Succeeded|Failed|Completed
+    source_labels:
+    - __meta_kubernetes_pod_phase
+  - action: replace
+    source_labels:
+    - __meta_kubernetes_pod_node_name
+    target_label: node
+  scrape_interval: 5m
+  scrape_timeout: 30s
+- job_name: "node"  # static exporter targets on prod-db, basic auth over https
+  scheme: https
+  basic_auth:
+    username: metrics
+    password_file: /etc/secrets/exporter-password
+  tls_config:
+    insecure_skip_verify: true  # certificate verification is disabled here
+  static_configs:
+  - targets: ["prod-db:9100", "prod-db:9187"]
+alerting:
+  alertmanagers:
+  - kubernetes_sd_configs:
+    - role: pod
+    tls_config:
+      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+    relabel_configs:
+    - source_labels: [__meta_kubernetes_namespace]
+      regex: default  # NOTE(review): confirm Alertmanager runs in "default"
+      action: keep
+    - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance]
+      regex: prometheus
+      action: keep
+    - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
+      regex: alertmanager
+      action: keep
+    - source_labels: [__meta_kubernetes_pod_container_port_number]
+      regex: "9093"
+      action: keep