diff --git a/clusters/atlas/flux-system/gotk-sync.yaml b/clusters/atlas/flux-system/gotk-sync.yaml index 473ab99..46f65d3 100644 --- a/clusters/atlas/flux-system/gotk-sync.yaml +++ b/clusters/atlas/flux-system/gotk-sync.yaml @@ -8,7 +8,7 @@ metadata: spec: interval: 1m0s ref: - branch: main + branch: feature/atlas-monitoring secretRef: name: flux-system-gitea url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git diff --git a/services/monitoring/README.md b/services/monitoring/README.md new file mode 100644 index 0000000..74baf08 --- /dev/null +++ b/services/monitoring/README.md @@ -0,0 +1,16 @@ +# services/monitoring + +## Grafana admin secret + +The Grafana Helm release expects a pre-existing secret named `grafana-admin` +in the `monitoring` namespace. Create or rotate it with: + +```bash +kubectl create secret generic grafana-admin \ + --namespace monitoring \ + --from-literal=admin-user=admin \ + --from-literal=admin-password='REPLACE_ME' \ + --dry-run=client -o yaml | kubectl apply -f - +``` + +Re-run it with the new password whenever you rotate credentials; the `kubectl apply` pipe updates the existing secret in place (a bare `kubectl create` fails with AlreadyExists). 
diff --git a/services/monitoring/grafana-dashboard-public.yaml b/services/monitoring/grafana-dashboard-public.yaml new file mode 100644 index 0000000..db5d6c1 --- /dev/null +++ b/services/monitoring/grafana-dashboard-public.yaml @@ -0,0 +1,227 @@ +# services/monitoring/grafana-dashboard-public.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-public + labels: + grafana_dashboard: "1" +data: + atlas-public-overview.json: | + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": false, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "editorMode": "code", + "expr": "sum(kube_pod_status_phase{phase=\"Running\"})", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Running pods", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "description": "Aggregated CPU usage across all schedulable nodes.", + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"continuous-BlYlRd" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 60 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "expr": "avg(100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100))", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Average node CPU", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace!=\"\", container!=\"\"}[5m])) by (namespace)", + "legendFormat": "{{namespace}}", + "refId": "A" + } + ], + "title": "Namespace CPU (5m avg)", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "public" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Atlas Public Overview", + "uid": "atlas-public", + "version": 1, + "weekStart": "" + } diff --git a/services/monitoring/grafana-dashboard-sre.yaml b/services/monitoring/grafana-dashboard-sre.yaml new 
file mode 100644 index 0000000..12995af --- /dev/null +++ b/services/monitoring/grafana-dashboard-sre.yaml @@ -0,0 +1,223 @@ +# services/monitoring/grafana-dashboard-sre.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-sre + labels: + grafana_dashboard: "1" +data: + atlas-sre-overview.json: | + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "description": "Percentage of Ready nodes.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-RdYlGr" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "expr": "avg(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) * 100", + "refId": "A" + } + ], + "title": "Ready nodes", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 11, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, 
+ "expr": "sum by (node)(node_filesystem_avail_bytes{mountpoint=\"/\"})", + "legendFormat": "{{node}}", + "refId": "A" + } + ], + "title": "Free root filesystem bytes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"crypto\",container!=\"\"}[5m])) by (pod)", + "legendFormat": "{{pod}}", + "refId": "A" + } + ], + "title": "Crypto namespace CPU usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 13, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": false + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "expr": "count(sum(kube_pod_status_phase{phase=\"Failed\"}) by (namespace) > 0) or vector(0)", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Namespaces with failed pods", + "type": "bargauge" + } + ], + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "sre" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-12h", + "to": "now" + }, + "timepicker": {}, + "title": "Atlas SRE Overview", + "uid": "atlas-sre", + "version": 1 + } diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 22bc2b1..3341e9d 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -210,3 +210,134 @@ spec: - action: keep source_labels: [__meta_kubernetes_namespace, 
__meta_kubernetes_pod_label_app_kubernetes_io_part_of] regex: flux-system;flux + +--- + +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: grafana + namespace: monitoring +spec: + interval: 15m + chart: + spec: + chart: grafana + version: "~8.5.0" + sourceRef: + kind: HelmRepository + name: grafana + namespace: flux-system + values: + admin: + existingSecret: grafana-admin + userKey: admin-user + passwordKey: admin-password + persistence: + enabled: true + size: 20Gi + storageClassName: astreae + service: + type: ClusterIP + env: # the chart iterates env as a KEY: value map, not a name/value list + GF_AUTH_ANONYMOUS_ENABLED: "true" + GF_AUTH_ANONYMOUS_ORG_ROLE: Viewer + GF_SECURITY_ALLOW_EMBEDDING: "true" + grafana.ini: + server: + domain: reporting.bstein.dev + root_url: https://reporting.bstein.dev/ + auth.anonymous: + hide_version: true + users: + default_theme: dark + ingress: + enabled: true + ingressClassName: traefik + annotations: + cert-manager.io/cluster-issuer: letsencrypt + hosts: + - reporting.bstein.dev + tls: + - secretName: grafana-reporting-tls + hosts: + - reporting.bstein.dev + datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - name: VictoriaMetrics + uid: atlas-vm # pinned so the dashboards' "uid": "atlas-vm" datasource references resolve + type: prometheus + access: proxy + url: http://victoria-metrics-single-server:8428 + isDefault: true + jsonData: + timeInterval: "15s" + dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: public + orgId: 1 + folder: Atlas Public + type: file + disableDeletion: false + allowUiUpdates: false + options: + path: /var/lib/grafana/dashboards/public + - name: sre + orgId: 1 + folder: Atlas SRE + type: file + disableDeletion: false + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards/sre + # Map of provider name -> ConfigMap name; each is mounted at /var/lib/grafana/dashboards/<provider name>. + dashboardsConfigMaps: + public: grafana-dashboard-public + sre: grafana-dashboard-sre + +--- + +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: alertmanager + 
namespace: monitoring +spec: + interval: 15m + chart: + spec: + chart: alertmanager + version: "~1.9.0" + sourceRef: + kind: HelmRepository + name: prometheus + namespace: flux-system + values: + ingress: + enabled: true + className: traefik # this chart reads ingress.className + annotations: + cert-manager.io/cluster-issuer: letsencrypt + hosts: + - host: alerts.bstein.dev + paths: + - path: / + pathType: Prefix + tls: + - secretName: alerts-bstein-dev-tls + hosts: + - alerts.bstein.dev + config: + global: + resolve_timeout: 5m + route: + receiver: default + group_wait: 30s + group_interval: 5m + repeat_interval: 2h + receivers: + - name: default diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index 036afa3..bb321b5 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -5,4 +5,6 @@ namespace: monitoring resources: - namespace.yaml - rbac.yaml + - grafana-dashboard-public.yaml + - grafana-dashboard-sre.yaml - helmrelease.yaml