monitoring: add grafana and alertmanager

This commit is contained in:
Brad Stein 2025-11-14 00:02:59 -03:00
parent a875b0a42e
commit 06337f2b9d
6 changed files with 599 additions and 1 deletions

View File

@ -8,7 +8,7 @@ metadata:
spec:
interval: 1m0s
ref:
branch: main
branch: feature/atlas-monitoring
secretRef:
name: flux-system-gitea
url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git

View File

@ -0,0 +1,15 @@
# services/monitoring
## Grafana admin secret
The Grafana Helm release expects a pre-existing secret named `grafana-admin`
in the `monitoring` namespace. Create or rotate it with:
```bash
kubectl create secret generic grafana-admin \
--namespace monitoring \
--from-literal=admin-user=admin \
--from-literal=admin-password='REPLACE_ME'
```
Update the password whenever you rotate credentials.

View File

@ -0,0 +1,227 @@
# services/monitoring/grafana-dashboard-public.yaml
# ConfigMap that ships the "Atlas Public Overview" Grafana dashboard
# (dashboard uid "atlas-public", marked non-editable) as a single JSON file.
# No namespace is set here; presumably the kustomization that lists this file
# supplies it - TODO confirm.
apiVersion: v1
kind: ConfigMap
metadata:
# The grafana HelmRelease refers to this ConfigMap by this exact name.
name: grafana-dashboard-public
labels:
# NOTE(review): this label is the usual selector for the Grafana dashboard
# sidecar; the HelmRelease in this change does not visibly enable a sidecar,
# so the label may be informational only - confirm.
grafana_dashboard: "1"
data:
# Key = file name of the dashboard; value = the dashboard JSON model.
# The payload is strict JSON, so it cannot carry comments of its own.
atlas-public-overview.json: |
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "datasource",
"uid": "grafana"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": false,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 6,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"text": {},
"textMode": "auto"
},
"pluginVersion": "10.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"editorMode": "code",
"expr": "sum(kube_pod_status_phase{phase=\"Running\"})",
"legendFormat": "",
"range": true,
"refId": "A"
}
],
"title": "Running pods",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"description": "Aggregated CPU usage across all schedulable nodes.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "continuous-BlYlRd"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "percentage",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 60
},
{
"color": "red",
"value": 85
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 6,
"x": 6,
"y": 0
},
"id": 2,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"text": {},
"textMode": "auto"
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"expr": "avg(100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100))",
"legendFormat": "",
"refId": "A"
}
],
"title": "Average node CPU",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 7
},
"id": 3,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"expr": "sum(rate(container_cpu_usage_seconds_total{namespace!=\"\", container!=\"\"}[5m])) by (namespace)",
"legendFormat": "{{namespace}}",
"refId": "A"
}
],
"title": "Namespace CPU (5m avg)",
"type": "timeseries"
}
],
"refresh": "30s",
"schemaVersion": 39,
"style": "dark",
"tags": [
"atlas",
"public"
],
"templating": {
"list": []
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Atlas Public Overview",
"uid": "atlas-public",
"version": 1,
"weekStart": ""
}

View File

@ -0,0 +1,223 @@
# services/monitoring/grafana-dashboard-sre.yaml
# ConfigMap that ships the "Atlas SRE Overview" Grafana dashboard
# (dashboard uid "atlas-sre", marked editable) as a single JSON file.
# No namespace is set here; presumably the kustomization that lists this file
# supplies it - TODO confirm.
apiVersion: v1
kind: ConfigMap
metadata:
# The grafana HelmRelease refers to this ConfigMap by this exact name.
name: grafana-dashboard-sre
labels:
# NOTE(review): this label is the usual selector for the Grafana dashboard
# sidecar; the HelmRelease in this change does not visibly enable a sidecar,
# so the label may be informational only - confirm.
grafana_dashboard: "1"
data:
# Key = file name of the dashboard; value = the dashboard JSON model.
# The payload is strict JSON, so it cannot carry comments of its own.
atlas-sre-overview.json: |
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "datasource",
"uid": "grafana"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"links": [],
"panels": [
{
  "datasource": {
    "type": "prometheus",
    "uid": "atlas-vm"
  },
  "description": "Percentage of Ready nodes.",
  "fieldConfig": {
    "defaults": {
      "color": {
        "mode": "thresholds"
      },
      "mappings": [],
      "max": 100,
      "min": 0,
      "thresholds": {
        "mode": "percentage",
        "steps": [
          {
            "color": "red",
            "value": null
          },
          {
            "color": "green",
            "value": 90
          }
        ]
      },
      "unit": "percent"
    },
    "overrides": []
  },
  "gridPos": {
    "h": 7,
    "w": 6,
    "x": 0,
    "y": 0
  },
  "id": 10,
  "options": {
    "colorMode": "value",
    "graphMode": "none",
    "justifyMode": "center",
    "orientation": "auto",
    "reduceOptions": {
      "calcs": [
        "lastNotNull"
      ],
      "fields": "",
      "values": false
    }
  },
  "targets": [
    {
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
      },
      "expr": "avg(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) * 100",
      "refId": "A"
    }
  ],
  "title": "Ready nodes",
  "type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 7,
"w": 6,
"x": 6,
"y": 0
},
"id": 11,
"options": {
"legend": {
"displayMode": "table",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"expr": "sum by (node)(node_filesystem_avail_bytes{mountpoint=\"/\"})",
"legendFormat": "{{node}}",
"refId": "A"
}
],
"title": "Free root filesystem bytes",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 7
},
"id": 12,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"crypto\",container!=\"\"}[5m])) by (pod)",
"legendFormat": "{{pod}}",
"refId": "A"
}
],
"title": "Crypto namespace CPU usage",
"type": "timeseries"
},
{
  "datasource": {
    "type": "prometheus",
    "uid": "atlas-vm"
  },
  "description": "Number of namespaces that currently contain at least one pod in the Failed phase.",
  "gridPos": {
    "h": 9,
    "w": 12,
    "x": 0,
    "y": 17
  },
  "id": 13,
  "options": {
    "displayMode": "gradient",
    "orientation": "horizontal",
    "reduceOptions": {
      "calcs": [
        "lastNotNull"
      ],
      "fields": "",
      "values": false
    },
    "showUnfilled": false
  },
  "targets": [
    {
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
      },
      "expr": "count(sum(kube_pod_status_phase{phase=\"Failed\"}) by (namespace) > 0) or vector(0)",
      "legendFormat": "",
      "refId": "A"
    }
  ],
  "title": "Namespaces with failed pods",
  "type": "bargauge"
}
],
"schemaVersion": 39,
"style": "dark",
"tags": [
"atlas",
"sre"
],
"templating": {
"list": []
},
"time": {
"from": "now-12h",
"to": "now"
},
"timepicker": {},
"title": "Atlas SRE Overview",
"uid": "atlas-sre",
"version": 1
}

View File

@ -210,3 +210,134 @@ spec:
- action: keep
source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app_kubernetes_io_part_of]
regex: flux-system;flux
---
# Flux HelmRelease that installs Grafana in the monitoring namespace, exposes it
# at https://reporting.bstein.dev/ through Traefik, provisions one
# Prometheus-compatible datasource and loads two file-based dashboard folders
# from ConfigMaps.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: grafana
  namespace: monitoring
spec:
  interval: 15m
  chart:
    spec:
      chart: grafana
      version: "~8.5.0"
      sourceRef:
        kind: HelmRepository
        name: grafana
        namespace: flux-system
  values:
    # Admin credentials come from a pre-existing secret; it must exist in the
    # monitoring namespace before this release is reconciled.
    admin:
      existingSecret: grafana-admin
      userKey: admin-user
      passwordKey: admin-password
    persistence:
      enabled: true
      size: 20Gi
      storageClassName: astreae
    service:
      type: ClusterIP
    # The Grafana chart expects `env` as a name -> value map, not a list of
    # {name, value} objects. Values stay quoted so they remain strings.
    # NOTE(review): anonymous access is granted the org-wide Viewer role, which
    # is not limited to the "Atlas Public" folder - confirm the SRE dashboards
    # are meant to be readable without login.
    env:
      GF_AUTH_ANONYMOUS_ENABLED: "true"
      GF_AUTH_ANONYMOUS_ORG_ROLE: Viewer
      GF_SECURITY_ALLOW_EMBEDDING: "true"
    grafana.ini:
      server:
        domain: reporting.bstein.dev
        root_url: https://reporting.bstein.dev/
      auth.anonymous:
        hide_version: true
      users:
        default_theme: dark
    ingress:
      enabled: true
      ingressClassName: traefik
      annotations:
        cert-manager.io/cluster-issuer: letsencrypt
      hosts:
        - reporting.bstein.dev
      tls:
        - secretName: grafana-reporting-tls
          hosts:
            - reporting.bstein.dev
    datasources:
      datasources.yaml:
        apiVersion: 1
        datasources:
          - name: VictoriaMetrics
            # Every panel in the bundled dashboards references the datasource
            # by uid "atlas-vm"; without a fixed uid Grafana generates a random
            # one and the panels cannot resolve their datasource.
            uid: atlas-vm
            type: prometheus
            access: proxy
            # NOTE(review): short service name, so this presumably resolves
            # within the monitoring namespace - verify the service name.
            url: http://victoria-metrics-single-server:8428
            isDefault: true
            jsonData:
              timeInterval: "15s"
    dashboardProviders:
      dashboardproviders.yaml:
        apiVersion: 1
        providers:
          - name: public
            orgId: 1
            folder: Atlas Public
            type: file
            disableDeletion: false
            allowUiUpdates: false
            options:
              path: /var/lib/grafana/dashboards/public
          - name: sre
            orgId: 1
            folder: Atlas SRE
            type: file
            disableDeletion: false
            allowUiUpdates: true
            options:
              path: /var/lib/grafana/dashboards/sre
    # The chart expects a map of <provider name> -> <ConfigMap name>; each
    # ConfigMap is mounted under /var/lib/grafana/dashboards/<provider name>,
    # which matches the provider paths declared above.
    dashboardsConfigMaps:
      public: grafana-dashboard-public
      sre: grafana-dashboard-sre
---
# Flux HelmRelease that installs Alertmanager in the monitoring namespace and
# exposes it at alerts.bstein.dev through Traefik with a cert-manager issued
# certificate.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: alertmanager
  namespace: monitoring
spec:
  interval: 15m
  chart:
    spec:
      chart: alertmanager
      version: "~1.9.0"
      sourceRef:
        kind: HelmRepository
        name: prometheus
        namespace: flux-system
  values:
    # The alertmanager chart's ingress schema differs from the Grafana chart:
    # the class is set with `className`, and each host is an object carrying
    # its own `paths` list rather than a bare string.
    # NOTE(review): nothing in this release puts authentication in front of
    # the Alertmanager UI/API - confirm that public exposure is intended.
    ingress:
      enabled: true
      className: traefik
      annotations:
        cert-manager.io/cluster-issuer: letsencrypt
      hosts:
        - host: alerts.bstein.dev
          paths:
            - path: /
              pathType: Prefix
      tls:
        - secretName: alerts-bstein-dev-tls
          hosts:
            - alerts.bstein.dev
    config:
      global:
        resolve_timeout: 5m
      route:
        receiver: default
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 2h
      receivers:
        # NOTE(review): the default receiver has no notification integration
        # configured, so routed alerts are accepted but delivered nowhere.
        - name: default

View File

@ -5,4 +5,6 @@ namespace: monitoring
resources:
- namespace.yaml
- rbac.yaml
- grafana-dashboard-public.yaml
- grafana-dashboard-sre.yaml
- helmrelease.yaml