# services/monitoring/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: kube-state-metrics
  namespace: monitoring
spec:
  interval: 15m
  chart:
    spec:
      chart: kube-state-metrics
      version: "~6.0.0"
      sourceRef:
        kind: HelmRepository
        name: prometheus
        namespace: flux-system
  values:
    prometheusScrape: false
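    # The chart's own prometheus.io scrape annotations stay off because the
    # VictoriaMetrics scrape config below has a dedicated kube-state-metrics
    # job that discovers the Service by label.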
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: node-exporter
  namespace: monitoring
spec:
  interval: 15m
  install:
    disableWait: true
  chart:
    spec:
      chart: prometheus-node-exporter
      version: "~4.0.0"
      sourceRef:
        kind: HelmRepository
        name: prometheus
        namespace: flux-system
  upgrade:
    disableWait: true
  values:
    rbac:
      pspEnabled: false
    service:
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "9100"
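        # These annotations opt the Service into the generic
        # "kubernetes-service-endpoints" job defined in the VictoriaMetrics
        # scrape config below.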
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: victoria-metrics-single
  namespace: monitoring
spec:
  interval: 15m
  chart:
    spec:
      chart: victoria-metrics-single
      version: "~0.15.0" # or omit to track the latest chart version
      sourceRef:
        kind: HelmRepository
        name: victoria-metrics
        namespace: flux-system
  values:
    server:
      # Keep ~3 months of data; adjust as needed (supports "d" and "y" suffixes).
      extraArgs:
        retentionPeriod: "90d" # VM flag -retentionPeriod=90d
      persistentVolume:
        enabled: true
        size: 100Gi
      # Enable built-in Kubernetes scraping
      scrape:
        enabled: true # the chart enables the embedded promscrape
        config:
          global:
            scrape_interval: 15s
          scrape_configs:
            # VM self-metrics
            - job_name: victoriametrics
              static_configs:
                - targets: ["localhost:8428"]
            # --- K8s control-plane & nodes (from the VM docs guide) ---
            - job_name: "kubernetes-apiservers"
              kubernetes_sd_configs: [{ role: endpoints }]
              scheme: https
              tls_config:
                ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
                  regex: default;kubernetes;https
            - job_name: "kubernetes-nodes"
              scheme: https
              tls_config:
                ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              kubernetes_sd_configs: [{ role: node }]
              relabel_configs:
                - action: labelmap
                  regex: __meta_kubernetes_node_label_(.+)
                - target_label: __address__
                  replacement: kubernetes.default.svc:443
                - source_labels: [__meta_kubernetes_node_name]
                  regex: (.+)
                  target_label: __metrics_path__
                  replacement: /api/v1/nodes/$1/proxy/metrics
            - job_name: "kubernetes-nodes-cadvisor"
              scheme: https
              tls_config:
                ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              kubernetes_sd_configs: [{ role: node }]
              relabel_configs:
                - action: labelmap
                  regex: __meta_kubernetes_node_label_(.+)
                - target_label: __address__
                  replacement: kubernetes.default.svc:443
                - source_labels: [__meta_kubernetes_node_name]
                  regex: (.+)
                  target_label: __metrics_path__
                  replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
            # --- Annotated Services (generic autodiscovery) ---
            - job_name: "kubernetes-service-endpoints"
              kubernetes_sd_configs: [{ role: endpoints }]
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
                  regex: "true"
                - action: replace
                  source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
                  regex: (https?)
                  target_label: __scheme__
                - action: replace
                  source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
                  regex: (.+)
                  target_label: __metrics_path__
                - action: replace
                  regex: ([^:]+)(?::\d+)?;(\d+)
                  replacement: $1:$2
                  source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
                  target_label: __address__
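            # A Service opts in with the standard annotations; a hypothetical
            # example (names are illustrative, not from this repo):
            #
            #   apiVersion: v1
            #   kind: Service
            #   metadata:
            #     name: example-app
            #     annotations:
            #       prometheus.io/scrape: "true"
            #       prometheus.io/port: "8080"
            #       prometheus.io/path: "/metrics" # optional, defaults to /metrics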
            # --- Annotated Pods (generic autodiscovery) ---
            - job_name: "kubernetes-pods"
              kubernetes_sd_configs: [{ role: pod }]
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
                  regex: "true"
                - action: drop
                  source_labels: [__meta_kubernetes_pod_container_port_name]
                  regex: ".*health.*"
                - action: replace
                  source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
                  regex: (.+)
                  target_label: __metrics_path__
                - action: replace
                  regex: ([^:]+)(?::\d+)?;(\d+)
                  replacement: $1:$2
                  source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
                  target_label: __address__
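            # Pods opt in the same way via pod annotations; a hypothetical
            # Deployment pod template (illustrative names and port):
            #
            #   template:
            #     metadata:
            #       annotations:
            #         prometheus.io/scrape: "true"
            #         prometheus.io/port: "9090"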
            # --- kube-state-metrics (via its Service) ---
            - job_name: "kube-state-metrics"
              kubernetes_sd_configs: [{ role: endpoints }]
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name]
                  regex: kube-state-metrics
            # --- Longhorn ---
            - job_name: "longhorn-backend"
              static_configs:
                - targets: ["longhorn-backend.longhorn-system.svc:9500"]
              metrics_path: /metrics
            # --- cert-manager (pods expose metrics on 9402) ---
            - job_name: "cert-manager"
              kubernetes_sd_configs: [{ role: pod }]
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app_kubernetes_io_name]
                  regex: cert-manager;cert-manager
                - action: drop
                  source_labels: [__meta_kubernetes_pod_container_port_name]
                  regex: ".*health.*"
                - action: replace
                  source_labels: [__address__]
                  regex: "(.+):\\d+"
                  replacement: "$1:9402"
                  target_label: __address__
            # --- Flux controllers (default :8080/metrics) ---
            - job_name: "flux"
              kubernetes_sd_configs: [{ role: pod }]
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app_kubernetes_io_part_of]
                  regex: flux-system;flux
            - job_name: "titan-db"
              static_configs:
                - targets: ["titan-db:9100"]
              relabel_configs:
                - source_labels: [__address__]
                  target_label: instance
              metric_relabel_configs:
                - source_labels: [instance]
                  target_label: node
                  replacement: titan-db
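    # After a Flux reconcile, the embedded scraper's target page should list
    # every job above; a quick check (Service name matches the Grafana
    # datasource URL below):
    #   kubectl -n monitoring port-forward svc/victoria-metrics-single-server 8428:8428
    #   # then browse http://localhost:8428/targets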
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: grafana
  namespace: monitoring
spec:
  interval: 15m
  chart:
    spec:
      chart: grafana
      version: "~8.5.0"
      sourceRef:
        kind: HelmRepository
        name: grafana
        namespace: flux-system
  values:
    admin:
      existingSecret: grafana-admin
      userKey: admin-user
      passwordKey: admin-password
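    # The grafana-admin Secret is expected to exist already; a minimal sketch
    # (the password value is a placeholder):
    #   kubectl -n monitoring create secret generic grafana-admin \
    #     --from-literal=admin-user=admin \
    #     --from-literal=admin-password='<redacted>'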
    persistence:
      enabled: true
      size: 20Gi
      storageClassName: astreae
    service:
      type: ClusterIP
    env:
      GF_AUTH_ANONYMOUS_ENABLED: "true"
      GF_AUTH_ANONYMOUS_ORG_ROLE: Viewer
      GF_SECURITY_ALLOW_EMBEDDING: "true"
    grafana.ini:
      server:
        domain: metrics.bstein.dev
        root_url: https://metrics.bstein.dev/
      dashboards:
        default_home_dashboard_path: /var/lib/grafana/dashboards/overview/atlas-overview.json
      auth.anonymous:
        hide_version: true
      users:
        default_theme: dark
    ingress:
      enabled: true
      ingressClassName: traefik
      annotations:
        cert-manager.io/cluster-issuer: letsencrypt
      hosts:
        - metrics.bstein.dev
      path: /
      tls:
        - secretName: grafana-metrics-tls
          hosts:
            - metrics.bstein.dev
    datasources:
      datasources.yaml:
        apiVersion: 1
        datasources:
          - name: VictoriaMetrics
            type: prometheus
            access: proxy
            url: http://victoria-metrics-single-server:8428
            isDefault: true
            jsonData:
              timeInterval: "15s"
            uid: atlas-vm
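            # A fixed uid lets provisioned dashboards pin this datasource
            # deterministically, e.g. in a panel's JSON:
            #   "datasource": { "type": "prometheus", "uid": "atlas-vm" }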
    dashboardProviders:
      dashboardproviders.yaml:
        apiVersion: 1
        providers:
          - name: overview
            orgId: 1
            folder: Overview
            type: file
            disableDeletion: false
            editable: false
            options:
              path: /var/lib/grafana/dashboards/overview
          - name: pods
            orgId: 1
            folder: Atlas Internal
            type: file
            disableDeletion: false
            editable: true
            options:
              path: /var/lib/grafana/dashboards/pods
          - name: nodes
            orgId: 1
            folder: Atlas Internal
            type: file
            disableDeletion: false
            editable: true
            options:
              path: /var/lib/grafana/dashboards/nodes
          - name: storage
            orgId: 1
            folder: Atlas Internal
            type: file
            disableDeletion: false
            editable: true
            options:
              path: /var/lib/grafana/dashboards/storage
          - name: gpu
            orgId: 1
            folder: Atlas Internal
            type: file
            disableDeletion: false
            editable: true
            options:
              path: /var/lib/grafana/dashboards/gpu
          - name: network
            orgId: 1
            folder: Atlas Internal
            type: file
            disableDeletion: false
            editable: true
            options:
              path: /var/lib/grafana/dashboards/network
    dashboardsConfigMaps:
      overview: grafana-dashboard-overview
      pods: grafana-dashboard-pods
      nodes: grafana-dashboard-nodes
      storage: grafana-dashboard-storage
      gpu: grafana-dashboard-gpu
      network: grafana-dashboard-network
    extraConfigmapMounts:
      - name: grafana-folders
        mountPath: /etc/grafana/provisioning/folders
        configMap: grafana-folders
        readOnly: true
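    # Each referenced ConfigMap must already exist in the monitoring
    # namespace; a minimal sketch for one of them (file name taken from the
    # default_home_dashboard_path above):
    #   kubectl -n monitoring create configmap grafana-dashboard-overview \
    #     --from-file=atlas-overview.json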
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: alertmanager
  namespace: monitoring
spec:
  interval: 15m
  chart:
    spec:
      chart: alertmanager
      version: "~1.9.0"
      sourceRef:
        kind: HelmRepository
        name: prometheus
        namespace: flux-system
  values:
    ingress:
      enabled: true
      ingressClassName: traefik
      annotations:
        cert-manager.io/cluster-issuer: letsencrypt
      hosts:
        - host: alerts.bstein.dev
          paths:
            - path: /
              pathType: Prefix
      tls:
        - secretName: alerts-bstein-dev-tls
          hosts:
            - alerts.bstein.dev
    config:
      global:
        resolve_timeout: 5m
      route:
        receiver: default
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 2h
      receivers:
        - name: default
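      # The "default" receiver has no notification integrations, so alerts
      # are grouped and routed but not delivered anywhere. A hypothetical
      # webhook hookup (URL is illustrative) would extend it like:
      #   receivers:
      #     - name: default
      #       webhook_configs:
      #         - url: http://alert-sink.monitoring.svc/hook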