# services/monitoring/kube-state-metrics-helmrelease.yaml
#
# Monitoring stack, deployed as Flux HelmReleases into the "monitoring"
# namespace:
#   - kube-state-metrics      : Kubernetes object-state metrics
#   - node-exporter           : per-node host metrics (annotated service)
#   - victoria-metrics-single : TSDB with built-in promscrape-style scraping
#   - grafana                 : dashboards (VictoriaMetrics datasource)
#   - alertmanager            : alert routing
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: kube-state-metrics
  namespace: monitoring
spec:
  interval: 15m
  chart:
    spec:
      chart: kube-state-metrics
      version: "~6.0.0"
      sourceRef:
        kind: HelmRepository
        name: prometheus
        namespace: flux-system
  values:
    # Disabled: scraped by the dedicated "kube-state-metrics" job below
    # instead of the generic annotation-based discovery.
    prometheusScrape: false
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: node-exporter
  namespace: monitoring
spec:
  interval: 15m
  install:
    disableWait: true
  upgrade:
    disableWait: true
  chart:
    spec:
      chart: prometheus-node-exporter
      version: "~4.0.0"
      sourceRef:
        kind: HelmRepository
        name: prometheus
        namespace: flux-system
  values:
    rbac:
      pspEnabled: false
    service:
      # Picked up by the "kubernetes-service-endpoints" job below.
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "9100"
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: victoria-metrics-single
  namespace: monitoring
spec:
  interval: 15m
  chart:
    spec:
      chart: victoria-metrics-single
      version: "~0.15.0"  # or omit to track appVersion
      sourceRef:
        kind: HelmRepository
        name: victoria-metrics
        namespace: flux-system
  values:
    server:
      # Keep ~3 months of data (VM flag -retentionPeriod=90d; the flag
      # supports "d" and "y" suffixes).
      # NOTE(review): the chart also exposes server.retentionPeriod, which
      # renders its own -retentionPeriod flag — confirm the extraArgs value
      # takes precedence over the chart default.
      extraArgs:
        retentionPeriod: "90d"
      persistentVolume:
        enabled: true
        size: 100Gi
      # Enable VM's built-in Kubernetes scraping (promscrape-compatible).
      scrape:
        enabled: true
        config:
          global:
            scrape_interval: 15s
          scrape_configs:
            # VM self-metrics.
            - job_name: victoriametrics
              static_configs:
                - targets: ["localhost:8428"]

            # --- K8s control-plane & nodes (from VM docs guide) ---
            - job_name: "kubernetes-apiservers"
              kubernetes_sd_configs:
                - role: endpoints
              scheme: https
              tls_config:
                ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              relabel_configs:
                # Keep only the apiserver endpoint itself.
                - action: keep
                  source_labels:
                    - __meta_kubernetes_namespace
                    - __meta_kubernetes_service_name
                    - __meta_kubernetes_endpoint_port_name
                  regex: default;kubernetes;https

            # Kubelet metrics, proxied through the apiserver.
            - job_name: "kubernetes-nodes"
              scheme: https
              tls_config:
                ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              kubernetes_sd_configs:
                - role: node
              relabel_configs:
                - action: labelmap
                  regex: __meta_kubernetes_node_label_(.+)
                - target_label: __address__
                  replacement: kubernetes.default.svc:443
                - source_labels: [__meta_kubernetes_node_name]
                  regex: (.+)
                  target_label: __metrics_path__
                  replacement: /api/v1/nodes/$1/proxy/metrics

            # cAdvisor (container) metrics, proxied through the apiserver.
            - job_name: "kubernetes-nodes-cadvisor"
              scheme: https
              tls_config:
                ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              kubernetes_sd_configs:
                - role: node
              relabel_configs:
                - action: labelmap
                  regex: __meta_kubernetes_node_label_(.+)
                - target_label: __address__
                  replacement: kubernetes.default.svc:443
                - source_labels: [__meta_kubernetes_node_name]
                  regex: (.+)
                  target_label: __metrics_path__
                  replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor

            # --- Annotated Services (generic autodiscovery) ---
            - job_name: "kubernetes-service-endpoints"
              kubernetes_sd_configs:
                - role: endpoints
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
                  regex: "true"
                - action: replace
                  source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
                  regex: (https?)
                  target_label: __scheme__
                - action: replace
                  source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
                  target_label: __metrics_path__
                # FIX: the existing port must be OPTIONAL — endpoints
                # discovered as a bare IP previously failed to match, so the
                # prometheus.io/port annotation was silently ignored.
                - action: replace
                  source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
                  regex: '([^:]+)(?::\d+)?;(\d+)'
                  replacement: $1:$2
                  target_label: __address__

            # --- Annotated Pods (generic autodiscovery) ---
            - job_name: "kubernetes-pods"
              kubernetes_sd_configs:
                - role: pod
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
                  regex: "true"
                # Skip health/liveness ports.
                - action: drop
                  source_labels: [__meta_kubernetes_pod_container_port_name]
                  regex: ".*health.*"
                - action: replace
                  source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
                  target_label: __metrics_path__
                # FIX: same optional-port fix as the service-endpoints job;
                # the old regex required "ip:port" and never matched bare
                # pod IPs, dropping the prometheus.io/port annotation.
                - action: replace
                  source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
                  regex: '([^:]+)(?::\d+)?;(\d+)'
                  replacement: $1:$2
                  target_label: __address__

            # --- kube-state-metrics (via its Service) ---
            - job_name: "kube-state-metrics"
              kubernetes_sd_configs:
                - role: endpoints
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name]
                  regex: kube-state-metrics

            # --- Longhorn ---
            - job_name: "longhorn-backend"
              static_configs:
                - targets: ["longhorn-backend.longhorn-system.svc:9500"]
              metrics_path: /metrics

            # --- cert-manager (pods expose metrics on 9402) ---
            - job_name: "cert-manager"
              kubernetes_sd_configs:
                - role: pod
              relabel_configs:
                - action: keep
                  source_labels:
                    - __meta_kubernetes_namespace
                    - __meta_kubernetes_pod_label_app_kubernetes_io_name
                  regex: cert-manager;cert-manager
                - action: drop
                  source_labels: [__meta_kubernetes_pod_container_port_name]
                  regex: ".*health.*"
                # Force the metrics port (existing port, if any, is replaced).
                - action: replace
                  source_labels: [__address__]
                  regex: '([^:]+)(?::\d+)?'
                  replacement: "$1:9402"
                  target_label: __address__

            # --- Flux controllers (default :8080/metrics) ---
            - job_name: "flux"
              kubernetes_sd_configs:
                - role: pod
              relabel_configs:
                - action: keep
                  source_labels:
                    - __meta_kubernetes_namespace
                    - __meta_kubernetes_pod_label_app_kubernetes_io_part_of
                  regex: flux-system;flux
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: grafana
  namespace: monitoring
spec:
  interval: 15m
  chart:
    spec:
      chart: grafana
      version: "~8.5.0"
      sourceRef:
        kind: HelmRepository
        name: grafana
        namespace: flux-system
  values:
    # Admin credentials come from an existing Secret, not chart-generated.
    admin:
      existingSecret: grafana-admin
      userKey: admin-user
      passwordKey: admin-password
    persistence:
      enabled: true
      size: 20Gi
      storageClassName: astreae
    service:
      type: ClusterIP
    env:
      GF_AUTH_ANONYMOUS_ENABLED: "true"
      GF_AUTH_ANONYMOUS_ORG_ROLE: Viewer
      GF_SECURITY_ALLOW_EMBEDDING: "true"
    grafana.ini:
      server:
        domain: atlas.metrics.bstein.dev
        root_url: https://atlas.metrics.bstein.dev/
      auth.anonymous:
        hide_version: true
      users:
        default_theme: dark
    ingress:
      enabled: true
      ingressClassName: traefik
      annotations:
        cert-manager.io/cluster-issuer: letsencrypt
      hosts:
        - atlas.metrics.bstein.dev
      path: /
      tls:
        - secretName: grafana-atlas-metrics-tls
          hosts:
            - atlas.metrics.bstein.dev
    datasources:
      datasources.yaml:
        apiVersion: 1
        datasources:
          # VictoriaMetrics speaks the Prometheus query API.
          - name: VictoriaMetrics
            type: prometheus
            access: proxy
            url: http://victoria-metrics-single-server:8428
            isDefault: true
            jsonData:
              timeInterval: "15s"
    dashboardProviders:
      dashboardproviders.yaml:
        apiVersion: 1
        providers:
          - name: public
            orgId: 1
            folder: Atlas Public
            type: file
            disableDeletion: false
            editable: false
            options:
              path: /var/lib/grafana/dashboards/public
          - name: sre
            orgId: 1
            folder: Atlas SRE
            type: file
            disableDeletion: false
            editable: true
            options:
              path: /var/lib/grafana/dashboards/sre
    # Maps each provider above to the ConfigMap holding its dashboards.
    dashboardsConfigMaps:
      public: grafana-dashboard-public
      sre: grafana-dashboard-sre
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: alertmanager
  namespace: monitoring
spec:
  interval: 15m
  chart:
    spec:
      chart: alertmanager
      version: "~1.9.0"
      sourceRef:
        kind: HelmRepository
        name: prometheus
        namespace: flux-system
  values:
    ingress:
      enabled: true
      ingressClassName: traefik
      annotations:
        cert-manager.io/cluster-issuer: letsencrypt
      hosts:
        - host: atlas.alerts.bstein.dev
          paths:
            - path: /
              pathType: Prefix
      tls:
        - secretName: alerts-bstein-dev-tls
          hosts:
            - atlas.alerts.bstein.dev
    config:
      global:
        resolve_timeout: 5m
      route:
        receiver: default
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 2h
      receivers:
        # NOTE(review): "default" has no notification integrations configured,
        # so alerts are grouped but delivered nowhere — confirm this is
        # intentional (e.g. a webhook/email block is added elsewhere).
        - name: default