monitoring: add grafana and alertmanager
This commit is contained in:
parent
a875b0a42e
commit
06337f2b9d
@ -8,7 +8,7 @@ metadata:
|
||||
spec:
|
||||
interval: 1m0s
|
||||
ref:
|
||||
branch: main
|
||||
branch: feature/atlas-monitoring
|
||||
secretRef:
|
||||
name: flux-system-gitea
|
||||
url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
|
||||
|
||||
15
services/monitoring/README.md
Normal file
15
services/monitoring/README.md
Normal file
@ -0,0 +1,15 @@
|
||||
# services/monitoring
|
||||
|
||||
## Grafana admin secret
|
||||
|
||||
The Grafana Helm release expects a pre-existing secret named `grafana-admin`
|
||||
in the `monitoring` namespace. Create or rotate it with:
|
||||
|
||||
```bash
|
||||
kubectl create secret generic grafana-admin \
|
||||
--namespace monitoring \
|
||||
--from-literal=admin-user=admin \
|
||||
--from-literal=admin-password='REPLACE_ME'
|
||||
```
|
||||
|
||||
Update the password whenever you rotate credentials.
|
||||
227
services/monitoring/grafana-dashboard-public.yaml
Normal file
227
services/monitoring/grafana-dashboard-public.yaml
Normal file
@ -0,0 +1,227 @@
|
||||
# services/monitoring/grafana-dashboard-public.yaml
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: grafana-dashboard-public
|
||||
labels:
|
||||
grafana_dashboard: "1"
|
||||
data:
|
||||
atlas-public-overview.json: |
|
||||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "grafana"
|
||||
},
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": false,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "none"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 7,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"text": {},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.4.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "sum(kube_pod_status_phase{phase=\"Running\"})",
|
||||
"legendFormat": "",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Running pods",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"description": "Aggregated CPU usage across all schedulable nodes.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "continuous-BlYlRd"
|
||||
},
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "percentage",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 60
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 85
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 7,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 0
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"text": {},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"expr": "avg(100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100))",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Average node CPU",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 7
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"expr": "sum(rate(container_cpu_usage_seconds_total{namespace!=\"\", container!=\"\"}[5m])) by (namespace)",
|
||||
"legendFormat": "{{namespace}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Namespace CPU (5m avg)",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"atlas",
|
||||
"public"
|
||||
],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-6h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "Atlas Public Overview",
|
||||
"uid": "atlas-public",
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
||||
223
services/monitoring/grafana-dashboard-sre.yaml
Normal file
223
services/monitoring/grafana-dashboard-sre.yaml
Normal file
@ -0,0 +1,223 @@
|
||||
# services/monitoring/grafana-dashboard-sre.yaml
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: grafana-dashboard-sre
|
||||
labels:
|
||||
grafana_dashboard: "1"
|
||||
data:
|
||||
atlas-sre-overview.json: |
|
||||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "grafana"
|
||||
},
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"description": "Percentage of Ready nodes.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "continuous"
|
||||
},
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "percentage",
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 90
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 7,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 10,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"expr": "avg(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) * 100",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Ready nodes",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 7,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 0
|
||||
},
|
||||
"id": 11,
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"expr": "sum by (node)(node_filesystem_avail_bytes{mountpoint=\"/\"})",
|
||||
"legendFormat": "{{node}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Free root filesystem bytes",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 10,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 7
|
||||
},
|
||||
"id": 12,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"crypto\",container!=\"\"}[5m])) by (pod)",
|
||||
"legendFormat": "{{pod}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Crypto namespace CPU usage",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 17
|
||||
},
|
||||
"id": 13,
|
||||
"options": {
|
||||
"displayMode": "gradient",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"showUnfilled": false
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"expr": "count(sum(kube_pod_status_phase{phase=\"Failed\"}) by (namespace))",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Namespaces with failed pods",
|
||||
"type": "bargauge"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"atlas",
|
||||
"sre"
|
||||
],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-12h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"title": "Atlas SRE Overview",
|
||||
"uid": "atlas-sre",
|
||||
"version": 1
|
||||
}
|
||||
@ -210,3 +210,134 @@ spec:
|
||||
- action: keep
|
||||
source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app_kubernetes_io_part_of]
|
||||
regex: flux-system;flux
|
||||
|
||||
---
|
||||
|
||||
apiVersion: helm.toolkit.fluxcd.io/v2
|
||||
kind: HelmRelease
|
||||
metadata:
|
||||
name: grafana
|
||||
namespace: monitoring
|
||||
spec:
|
||||
interval: 15m
|
||||
chart:
|
||||
spec:
|
||||
chart: grafana
|
||||
version: "~8.5.0"
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: grafana
|
||||
namespace: flux-system
|
||||
values:
|
||||
admin:
|
||||
existingSecret: grafana-admin
|
||||
userKey: admin-user
|
||||
passwordKey: admin-password
|
||||
persistence:
|
||||
enabled: true
|
||||
size: 20Gi
|
||||
storageClassName: astreae
|
||||
service:
|
||||
type: ClusterIP
|
||||
env:
|
||||
- name: GF_AUTH_ANONYMOUS_ENABLED
|
||||
value: "true"
|
||||
- name: GF_AUTH_ANONYMOUS_ORG_ROLE
|
||||
value: Viewer
|
||||
- name: GF_SECURITY_ALLOW_EMBEDDING
|
||||
value: "true"
|
||||
grafana.ini:
|
||||
server:
|
||||
domain: reporting.bstein.dev
|
||||
root_url: https://reporting.bstein.dev/
|
||||
auth.anonymous:
|
||||
hide_version: true
|
||||
users:
|
||||
default_theme: dark
|
||||
ingress:
|
||||
enabled: true
|
||||
ingressClassName: traefik
|
||||
annotations:
|
||||
cert-manager.io/cluster-issuer: letsencrypt
|
||||
hosts:
|
||||
- reporting.bstein.dev
|
||||
tls:
|
||||
- secretName: grafana-reporting-tls
|
||||
hosts:
|
||||
- reporting.bstein.dev
|
||||
datasources:
|
||||
datasources.yaml:
|
||||
apiVersion: 1
|
||||
datasources:
|
||||
- name: VictoriaMetrics
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: http://victoria-metrics-single-server:8428
|
||||
isDefault: true
|
||||
jsonData:
|
||||
timeInterval: "15s"
|
||||
dashboardProviders:
|
||||
dashboardproviders.yaml:
|
||||
apiVersion: 1
|
||||
providers:
|
||||
- name: public
|
||||
orgId: 1
|
||||
folder: Atlas Public
|
||||
type: file
|
||||
disableDeletion: false
|
||||
allowUiUpdates: false
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards/public
|
||||
- name: sre
|
||||
orgId: 1
|
||||
folder: Atlas SRE
|
||||
type: file
|
||||
disableDeletion: false
|
||||
allowUiUpdates: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards/sre
|
||||
dashboardsConfigMaps:
|
||||
- configMapName: grafana-dashboard-public
|
||||
folder: public
|
||||
- configMapName: grafana-dashboard-sre
|
||||
folder: sre
|
||||
|
||||
---
|
||||
|
||||
apiVersion: helm.toolkit.fluxcd.io/v2
|
||||
kind: HelmRelease
|
||||
metadata:
|
||||
name: alertmanager
|
||||
namespace: monitoring
|
||||
spec:
|
||||
interval: 15m
|
||||
chart:
|
||||
spec:
|
||||
chart: alertmanager
|
||||
version: "~1.9.0"
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: prometheus
|
||||
namespace: flux-system
|
||||
values:
|
||||
ingress:
|
||||
enabled: true
|
||||
ingressClassName: traefik
|
||||
annotations:
|
||||
cert-manager.io/cluster-issuer: letsencrypt
|
||||
hosts:
|
||||
- alerts.bstein.dev
|
||||
tls:
|
||||
- secretName: alerts-bstein-dev-tls
|
||||
hosts:
|
||||
- alerts.bstein.dev
|
||||
config:
|
||||
global:
|
||||
resolve_timeout: 5m
|
||||
route:
|
||||
receiver: default
|
||||
group_wait: 30s
|
||||
group_interval: 5m
|
||||
repeat_interval: 2h
|
||||
receivers:
|
||||
- name: default
|
||||
|
||||
@ -5,4 +5,6 @@ namespace: monitoring
|
||||
resources:
|
||||
- namespace.yaml
|
||||
- rbac.yaml
|
||||
- grafana-dashboard-public.yaml
|
||||
- grafana-dashboard-sre.yaml
|
||||
- helmrelease.yaml
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user