# titan-iac/services/monitoring/helmrelease.yaml (212 lines, 7.9 KiB, YAML)
# services/monitoring/kube-state-metrics-helmrelease.yaml
# kube-state-metrics: exposes cluster-object state (deployments, pods, ...)
# as Prometheus metrics on :8080/metrics.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: kube-state-metrics
  namespace: monitoring
spec:
  interval: 15m
  chart:
    spec:
      chart: kube-state-metrics
      version: "~6.0.0"
      sourceRef:
        kind: HelmRepository
        name: prometheus-community
        namespace: flux-system
  values:
    # Adds prometheus.io/* annotations so annotation-based scrapers
    # (e.g. the "kubernetes-service-endpoints" job below) auto-discover /metrics.
    prometheusScrape: true
    service:
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"  # ksm serves metrics on 8080 by default
        prometheus.io/path: "/metrics"
---
# node-exporter: per-node host metrics (CPU, memory, disk, network) on :9100.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: node-exporter
  namespace: monitoring
spec:
  interval: 15m
  chart:
    spec:
      chart: prometheus-node-exporter
      version: "~4.0.0"
      sourceRef:
        kind: HelmRepository
        name: prometheus-community
        namespace: flux-system
  values:
    service:
      # Annotations picked up by the annotation-based service-endpoints scrape job.
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "9100"  # node-exporter default metrics port
---
# victoria-metrics-single: single-node VictoriaMetrics acting as the cluster's
# Prometheus-compatible TSDB + scraper (chart-embedded promscrape config below).
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: victoria-metrics-single
  namespace: monitoring
spec:
  interval: 15m
  chart:
    spec:
      chart: victoria-metrics-single
      version: "~0.15.0"  # or omit to track appVersion
      sourceRef:
        kind: HelmRepository
        name: victoria-metrics
        namespace: flux-system
  values:
    server:
      # Keep ~3 months; change as you like. VM's -retentionPeriod flag
      # supports "d" and "y" suffixes (maps to -retentionPeriod=90d).
      extraArgs:
        retentionPeriod: "90d"
      persistentVolume:
        enabled: true
        size: 100Gi  # adjust; uses default StorageClass (Longhorn)
        # storageClassName: ""  # set if you want a specific class
      # Enable built-in Kubernetes scraping (chart enables promscrape).
      scrape:
        enabled: true
        config:
          global:
            scrape_interval: 15s
          scrape_configs:
            # VM self-metrics
            - job_name: victoriametrics
              static_configs:
                - targets: ["localhost:8428"]
            # --- K8s control-plane & nodes (from VM docs guide) ---
            - job_name: "kubernetes-apiservers"
              kubernetes_sd_configs: [{ role: endpoints }]
              scheme: https
              tls_config:
                ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              relabel_configs:
                # Keep only the default/kubernetes service's "https" endpoint port.
                - action: keep
                  source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
                  regex: default;kubernetes;https
            # Kubelet metrics, proxied through the API server.
            - job_name: "kubernetes-nodes"
              scheme: https
              tls_config:
                ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              kubernetes_sd_configs: [{ role: node }]
              relabel_configs:
                - action: labelmap
                  regex: __meta_kubernetes_node_label_(.+)
                - target_label: __address__
                  replacement: kubernetes.default.svc:443
                - source_labels: [__meta_kubernetes_node_name]
                  regex: (.+)
                  target_label: __metrics_path__
                  replacement: /api/v1/nodes/$1/proxy/metrics
            # Per-node cAdvisor (container) metrics, proxied through the API server.
            - job_name: "kubernetes-nodes-cadvisor"
              scheme: https
              tls_config:
                ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              kubernetes_sd_configs: [{ role: node }]
              relabel_configs:
                - action: labelmap
                  regex: __meta_kubernetes_node_label_(.+)
                - target_label: __address__
                  replacement: kubernetes.default.svc:443
                - source_labels: [__meta_kubernetes_node_name]
                  regex: (.+)
                  target_label: __metrics_path__
                  replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
            # --- Annotated Services (generic autodiscovery) ---
            - job_name: "kubernetes-service-endpoints"
              kubernetes_sd_configs: [{ role: endpoints }]
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
                  regex: "true"
                - action: replace
                  source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
                  regex: (https?)
                  target_label: __scheme__
                - action: replace
                  source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
                  target_label: __metrics_path__
                # Rewrite __address__ to the annotated port; the existing port
                # is optional so portless endpoint addresses are handled too.
                - action: replace
                  regex: ([^:]+)(?::\d+)?;(\d+)
                  replacement: $1:$2
                  source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
                  target_label: __address__
            # --- Annotated Pods (generic autodiscovery) ---
            - job_name: "kubernetes-pods"
              kubernetes_sd_configs: [{ role: pod }]
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
                  regex: "true"
                # Skip health/probe ports so they aren't scraped as metrics endpoints.
                - action: drop
                  source_labels: [__meta_kubernetes_pod_container_port_name]
                  regex: ".*health.*"
                - action: replace
                  source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
                  target_label: __metrics_path__
                # Rewrite __address__ to the annotated port; pods whose address
                # carries no port (no declared containerPort) match as well.
                - action: replace
                  regex: ([^:]+)(?::\d+)?;(\d+)
                  replacement: $1:$2
                  source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
                  target_label: __address__
            # --- kube-state-metrics (via its Service) ---
            - job_name: "kube-state-metrics"
              kubernetes_sd_configs: [{ role: endpoints }]
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name]
                  regex: kube-state-metrics
            # --- Longhorn ---
            - job_name: "longhorn-backend"
              static_configs:
                - targets: ["longhorn-backend.longhorn-system.svc:9500"]
              metrics_path: /metrics
            # --- cert-manager (pods expose on 9402) ---
            - job_name: "cert-manager"
              kubernetes_sd_configs: [{ role: pod }]
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app_kubernetes_io_name]
                  regex: cert-manager;cert-manager
                - action: drop
                  source_labels: [__meta_kubernetes_pod_container_port_name]
                  regex: ".*health.*"
                # Force the metrics port; existing port (if any) is optional.
                - action: replace
                  source_labels: [__address__]
                  regex: '([^:]+)(?::\d+)?'
                  replacement: "$1:9402"
                  target_label: __address__
            # --- Flux controllers (default :8080/metrics) ---
            - job_name: "flux"
              kubernetes_sd_configs: [{ role: pod }]
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app_kubernetes_io_part_of]
                  regex: flux-system;flux