# services/monitoring/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: kube-state-metrics
  namespace: monitoring
spec:
  interval: 15m
  chart:
    spec:
      chart: kube-state-metrics
      version: "~6.0.0"
      sourceRef:
        kind: HelmRepository
        name: prometheus
        namespace: flux-system
  values:
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: kubernetes.io/hostname
                  operator: NotIn
                  values:
                    - titan-22
    # no scrape annotation on the Service; a dedicated kube-state-metrics job
    # is defined in the VictoriaMetrics scrape config below
    prometheusScrape: false
    metricLabelsAllowlist:
      - cronjobs=[atlas.bstein.dev/glue] # surface this CronJob label in kube_cronjob_labels
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: node-exporter
  namespace: monitoring
spec:
  interval: 15m
  install:
    disableWait: true
  chart:
    spec:
      chart: prometheus-node-exporter
      version: "~4.0.0"
      sourceRef:
        kind: HelmRepository
        name: prometheus
        namespace: flux-system
  upgrade:
    disableWait: true
  values:
    rbac:
      pspEnabled: false
    service:
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "9100"
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: victoria-metrics-single
  namespace: monitoring
spec:
  interval: 15m
  chart:
    spec:
      chart: victoria-metrics-single
      version: "~0.15.0" # or omit to track the latest chart release
      sourceRef:
        kind: HelmRepository
        name: victoria-metrics
        namespace: flux-system
  values:
    server:
      # keep 1 year of data; the flag accepts units such as "d" and "y"
      extraArgs:
        retentionPeriod: "1y" # passed through as -retentionPeriod=1y
      persistentVolume:
        enabled: true
        size: 100Gi
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/hostname
                    operator: NotIn
                    values:
                      - titan-22
      # Enable built-in Kubernetes scraping
      scrape:
        enabled: true # renders a config for vmsingle's embedded promscrape
        config:
          global:
            scrape_interval: 15s
          scrape_configs:
            # VM self-metrics
            - job_name: victoriametrics
              static_configs:
                - targets: ["localhost:8428"]
            # --- K8s control-plane & nodes (from the VictoriaMetrics docs guide) ---
            - job_name: "kubernetes-apiservers"
              kubernetes_sd_configs: [{ role: endpoints }]
              scheme: https
              tls_config:
                ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
                  regex: default;kubernetes;https
            - job_name: "kubernetes-nodes"
              scheme: https
              tls_config:
                ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              kubernetes_sd_configs: [{ role: node }]
              relabel_configs:
                - action: labelmap
                  regex: __meta_kubernetes_node_label_(.+)
                - target_label: __address__
                  replacement: kubernetes.default.svc:443
                - source_labels: [__meta_kubernetes_node_name]
                  regex: (.+)
                  target_label: __metrics_path__
                  replacement: /api/v1/nodes/$1/proxy/metrics
            - job_name: "kubernetes-nodes-cadvisor"
              scheme: https
              tls_config:
                ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              kubernetes_sd_configs: [{ role: node }]
              relabel_configs:
                - action: labelmap
                  regex: __meta_kubernetes_node_label_(.+)
                - target_label: __address__
                  replacement: kubernetes.default.svc:443
                - source_labels: [__meta_kubernetes_node_name]
                  regex: (.+)
                  target_label: __metrics_path__
                  replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
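            # The two node jobs above pin __address__ to the API server and
            # rewrite __metrics_path__ to the kubelet proxy path, so kubelet
            # and cAdvisor metrics are pulled through the API server rather
            # than directly from each node. A quick sanity check, given
            # kubectl access with nodes/proxy RBAC (node name is a placeholder):
            #
            #   kubectl get --raw "/api/v1/nodes/<node-name>/proxy/metrics/cadvisor" | head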
            # --- Annotated Services (generic autodiscovery) ---
            - job_name: "kubernetes-service-endpoints"
              kubernetes_sd_configs: [{ role: endpoints }]
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
                  regex: "true"
                - action: replace
                  source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
                  regex: (https?)
                  target_label: __scheme__
                - action: replace
                  source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
                  target_label: __metrics_path__
                - action: replace
                  # join the discovered address with the prometheus.io/port
                  # annotation; any port already on the address is optional
                  regex: ([^:]+)(?::\d+)?;(\d+)
                  replacement: $1:$2
                  source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
                  target_label: __address__
            # --- Annotated Pods (generic autodiscovery) ---
            - job_name: "kubernetes-pods"
              kubernetes_sd_configs: [{ role: pod }]
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
                  regex: "true"
                - action: drop
                  source_labels: [__meta_kubernetes_pod_container_port_name]
                  regex: ".*health.*"
                - action: replace
                  source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
                  target_label: __metrics_path__
                - action: replace
                  # same port rewrite as above; the optional port group also
                  # covers pods that declare no container ports
                  regex: ([^:]+)(?::\d+)?;(\d+)
                  replacement: $1:$2
                  source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
                  target_label: __address__
            # --- kube-state-metrics (via its Service) ---
            - job_name: "kube-state-metrics"
              kubernetes_sd_configs: [{ role: endpoints }]
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name]
                  regex: kube-state-metrics
            # --- Longhorn ---
            - job_name: "longhorn-backend"
              static_configs:
                - targets: ["longhorn-backend.longhorn-system.svc:9500"]
              metrics_path: /metrics
            # --- titan-db node_exporter (external control-plane DB host) ---
            - job_name: "titan-db"
              static_configs:
                - targets: ["192.168.22.10:9100"]
              relabel_configs:
                - source_labels: [__address__]
                  target_label: instance
                  replacement: titan-db
            # --- titan-jh node_exporter (external control-plane host) ---
            - job_name: "titan-jh"
              static_configs:
                - targets: ["192.168.22.8:9100"]
              relabel_configs:
                - source_labels: [__address__]
                  target_label: instance
                  replacement: titan-jh
            # --- cert-manager (pods expose metrics on 9402) ---
            - job_name: "cert-manager"
              kubernetes_sd_configs: [{ role: pod }]
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app_kubernetes_io_name]
                  regex: cert-manager;cert-manager
                - action: drop
                  source_labels: [__meta_kubernetes_pod_container_port_name]
                  regex: ".*health.*"
                - action: replace
                  source_labels: [__address__]
                  regex: "(.+):\\d+"
                  replacement: "$1:9402"
                  target_label: __address__
            # --- Flux controllers (default :8080/metrics) ---
            - job_name: "flux"
              kubernetes_sd_configs: [{ role: pod }]
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app_kubernetes_io_part_of]
                  regex: flux-system;flux
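
# For reference, the two generic autodiscovery jobs above pick up any Service
# or Pod that opts in through annotations. A minimal illustrative sketch (the
# Service name, port, and labels are placeholders, not part of this release):
#
#   apiVersion: v1
#   kind: Service
#   metadata:
#     name: my-app
#     annotations:
#       prometheus.io/scrape: "true"    # matched by the keep rule
#       prometheus.io/port: "8080"      # rewritten into __address__
#       prometheus.io/path: "/metrics"  # optional; defaults to /metrics
#   spec:
#     selector:
#       app: my-app
#     ports:
#       - port: 8080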
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: grafana
  namespace: monitoring
spec:
  interval: 15m
  chart:
    spec:
      chart: grafana
      version: "~8.5.0"
      sourceRef:
        kind: HelmRepository
        name: grafana
        namespace: flux-system
  install:
    remediation: { retries: 3 }
    timeout: 15m
  upgrade:
    remediation:
      retries: 3
      remediateLastFailure: true
    cleanupOnFail: true
    timeout: 15m
  values:
    admin:
      existingSecret: grafana-admin
      userKey: admin-user
      passwordKey: admin-password
    serviceAccount:
      create: false
      name: monitoring-vault-sync
      automountServiceAccountToken: true
    podAnnotations:
      vault.hashicorp.com/agent-inject: "true"
      vault.hashicorp.com/role: "monitoring"
      vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
      vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
        {{ with secret "kv/data/atlas/monitoring/grafana-admin" }}
        export GF_SECURITY_ADMIN_USER="{{ index .Data.data "admin-user" }}"
        export GF_SECURITY_ADMIN_PASSWORD="{{ index .Data.data "admin-password" }}"
        {{ end }}
        {{ with secret "kv/data/atlas/shared/postmark-relay" }}
        export GF_SMTP_USER="{{ index .Data.data "apikey" }}"
        export GF_SMTP_PASSWORD="{{ index .Data.data "apikey" }}"
        {{ end }}
    persistence:
      enabled: true
      size: 20Gi
      storageClassName: astreae
    nodeSelector:
      kubernetes.io/arch: arm64
      node-role.kubernetes.io/worker: "true"
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: hardware
                  operator: In
                  values:
                    - rpi5
                    - rpi4
        preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 100
            preference:
              matchExpressions:
                - key: hardware
                  operator: In
                  values:
                    - rpi5
          - weight: 70
            preference:
              matchExpressions:
                - key: hardware
                  operator: In
                  values:
                    - rpi4
    deploymentStrategy:
      type: Recreate
    service:
      type: ClusterIP
    env:
      GF_AUTH_GENERIC_OAUTH_CLIENT_ID: "grafana"
      GF_AUTH_GENERIC_OAUTH_CLIENT_SECRET: ""
      GF_AUTH_ANONYMOUS_ENABLED: "true"
      GF_AUTH_ANONYMOUS_ORG_NAME: "Overview"
      GF_AUTH_ANONYMOUS_ORG_ROLE: "Viewer"
      GF_SMTP_ENABLED: "true"
      GF_SMTP_HOST: "smtp.postmarkapp.com:587"
      GF_SMTP_FROM_ADDRESS: "no-reply-grafana@bstein.dev"
      GF_SMTP_FROM_NAME: "Atlas Grafana"
      GRAFANA_ALERT_EMAILS: "brad@bstein.dev"
      GF_SECURITY_ALLOW_EMBEDDING: "true"
      GF_AUTH_GENERIC_OAUTH_ENABLED: "true"
      GF_AUTH_GENERIC_OAUTH_NAME: "Keycloak"
      GF_AUTH_GENERIC_OAUTH_ALLOW_SIGN_UP: "true"
      GF_AUTH_GENERIC_OAUTH_SCOPES: "openid profile email groups"
      GF_AUTH_GENERIC_OAUTH_AUTH_URL: "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/auth"
      GF_AUTH_GENERIC_OAUTH_TOKEN_URL: "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/token"
      GF_AUTH_GENERIC_OAUTH_API_URL: "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/userinfo"
      GF_AUTH_GENERIC_OAUTH_ROLE_ATTRIBUTE_PATH: "contains(groups, 'admin') && 'Admin' || 'Viewer'"
      GF_AUTH_GENERIC_OAUTH_USE_PKCE: "true"
      GF_AUTH_GENERIC_OAUTH_TLS_SKIP_VERIFY_INSECURE: "false"
      GF_AUTH_GENERIC_OAUTH_ALLOW_INSECURE_EMAIL_LOOKUP: "true"
      GF_AUTH_GENERIC_OAUTH_EMAIL_ATTRIBUTE_PATH: "email"
      GF_AUTH_SIGNOUT_REDIRECT_URL: "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/logout?redirect_uri=https://metrics.bstein.dev/"
    grafana.ini:
      server:
        domain: metrics.bstein.dev
        root_url: https://metrics.bstein.dev/
      dashboards:
        default_home_dashboard_path: /var/lib/grafana/dashboards/overview/atlas-overview.json
      auth.anonymous:
        hide_version: true
      users:
        default_theme: dark
    ingress:
      enabled: true
      ingressClassName: traefik
      annotations:
        cert-manager.io/cluster-issuer: letsencrypt
        traefik.ingress.kubernetes.io/router.entrypoints: websecure
        traefik.ingress.kubernetes.io/router.tls: "true"
      hosts:
        - metrics.bstein.dev
      path: /
      tls:
        - secretName: grafana-metrics-tls
          hosts:
            - metrics.bstein.dev
    datasources:
      datasources.yaml:
        apiVersion: 1
        datasources:
          - name: VictoriaMetrics
            type: prometheus
            access: proxy
            url: http://victoria-metrics-single-server:8428
            isDefault: true
            jsonData:
              timeInterval: "15s"
            uid: atlas-vm
            orgId: 1
          - name: VictoriaMetrics
            type: prometheus
            access: proxy
            url: http://victoria-metrics-single-server:8428
            isDefault: true
            jsonData:
              timeInterval: "15s"
            uid: atlas-vm
            orgId: 2
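    # The same datasource is provisioned twice on purpose: Grafana datasource
    # provisioning is per-organization, so org 1 (the internal org) and org 2
    # (the anonymous "Overview" org) each need their own copy, distinguished
    # only by orgId.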
    dashboardProviders:
      dashboardproviders.yaml:
        apiVersion: 1
        providers:
          - name: overview
            orgId: 1
            folder: Overview
            type: file
            disableDeletion: false
            editable: false
            options:
              path: /var/lib/grafana/dashboards/overview
          - name: overview-public
            orgId: 2
            folder: Overview
            type: file
            disableDeletion: false
            editable: false
            options:
              path: /var/lib/grafana/dashboards/overview-public
          - name: pods
            orgId: 1
            folder: Atlas Internal
            type: file
            disableDeletion: false
            editable: true
            options:
              path: /var/lib/grafana/dashboards/pods
          - name: nodes
            orgId: 1
            folder: Atlas Internal
            type: file
            disableDeletion: false
            editable: true
            options:
              path: /var/lib/grafana/dashboards/nodes
          - name: storage
            orgId: 1
            folder: Atlas Internal
            type: file
            disableDeletion: false
            editable: true
            options:
              path: /var/lib/grafana/dashboards/storage
          - name: gpu
            orgId: 1
            folder: Atlas Internal
            type: file
            disableDeletion: false
            editable: true
            options:
              path: /var/lib/grafana/dashboards/gpu
          - name: network
            orgId: 1
            folder: Atlas Internal
            type: file
            disableDeletion: false
            editable: true
            options:
              path: /var/lib/grafana/dashboards/network
          - name: mail
            orgId: 1
            folder: Atlas Internal
            type: file
            disableDeletion: false
            editable: true
            options:
              path: /var/lib/grafana/dashboards/mail
          - name: jobs
            orgId: 1
            folder: Atlas Internal
            type: file
            disableDeletion: false
            editable: true
            options:
              path: /var/lib/grafana/dashboards/jobs
    dashboardsConfigMaps:
      overview: grafana-dashboard-overview
      overview-public: grafana-dashboard-overview
      pods: grafana-dashboard-pods
      nodes: grafana-dashboard-nodes
      storage: grafana-dashboard-storage
      gpu: grafana-dashboard-gpu
      network: grafana-dashboard-network
      mail: grafana-dashboard-mail
      jobs: grafana-dashboard-jobs
    extraConfigmapMounts:
      - name: grafana-folders
        mountPath: /etc/grafana/provisioning/folders
        configMap: grafana-folders
        readOnly: true
      - name: grafana-alerting
        mountPath: /etc/grafana/provisioning/alerting
        configMap: grafana-alerting
        readOnly: true
  postRenderers:
    - kustomize:
        patches:
          - target:
              kind: Deployment
              name: grafana
            patch: |-
              apiVersion: apps/v1
              kind: Deployment
              metadata:
                name: grafana
              spec:
                template:
                  spec:
                    serviceAccountName: monitoring-vault-sync
                    automountServiceAccountToken: true
                    containers:
                      - name: grafana
                        command:
                          - /entrypoint.sh
                        args:
                          - /run.sh
                        env:
                          - name: GF_SECURITY_ADMIN_USER
                            $patch: delete
                          - name: GF_SECURITY_ADMIN_PASSWORD
                            $patch: delete
                          - name: GF_SMTP_USER
                            $patch: delete
                          - name: GF_SMTP_PASSWORD
                            $patch: delete
                          - name: VAULT_ENV_FILE
                            value: /vault/secrets/grafana-env.sh
                        volumeMounts:
                          - name: monitoring-vault-entrypoint
                            mountPath: /entrypoint.sh
                            subPath: vault-entrypoint.sh
                    volumes:
                      - name: monitoring-vault-entrypoint
                        configMap:
                          name: monitoring-vault-entrypoint
                          defaultMode: 493 # 0755, so the wrapper is executable
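
# The monitoring-vault-entrypoint ConfigMap mounted above is defined elsewhere.
# Its vault-entrypoint.sh is assumed to be a thin wrapper that sources the
# Vault-rendered env file before handing off to the chart's normal startup
# script, roughly:
#
#   #!/bin/sh
#   if [ -f "${VAULT_ENV_FILE:-}" ]; then
#     . "$VAULT_ENV_FILE"
#   fi
#   exec "$@"   # args from the patch: /run.sh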
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: alertmanager
  namespace: monitoring
spec:
  interval: 15m
  chart:
    spec:
      chart: alertmanager
      version: "~1.9.0"
      sourceRef:
        kind: HelmRepository
        name: prometheus
        namespace: flux-system
  values:
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: kubernetes.io/hostname
                  operator: NotIn
                  values:
                    - titan-22
    ingress:
      enabled: true
      ingressClassName: traefik
      annotations:
        cert-manager.io/cluster-issuer: letsencrypt
        traefik.ingress.kubernetes.io/router.entrypoints: websecure
        traefik.ingress.kubernetes.io/router.tls: "true"
      hosts:
        - host: alerts.bstein.dev
          paths:
            - path: /
              pathType: Prefix
      tls:
        - secretName: alerts-bstein-dev-tls
          hosts:
            - alerts.bstein.dev
    config:
      global:
        resolve_timeout: 5m
      route:
        receiver: default
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 2h
      receivers:
        - name: default
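
# The "default" receiver above has no integrations configured, so routed
# alerts are silently discarded. An illustrative sketch of wiring it to the
# same Postmark relay Grafana uses (addresses and credentials are
# placeholders):
#
#   receivers:
#     - name: default
#       email_configs:
#         - to: brad@bstein.dev
#           from: no-reply-alerts@bstein.dev
#           smarthost: smtp.postmarkapp.com:587
#           auth_username: "<postmark-server-token>"
#           auth_password: "<postmark-server-token>"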