396 lines
12 KiB
YAML
396 lines
12 KiB
YAML
# services/monitoring/kube-state-metrics-helmrelease.yaml
|
|
# Flux HelmRelease: installs the kube-state-metrics chart into the
# "monitoring" namespace.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  namespace: monitoring
  name: kube-state-metrics
spec:
  # Reconciliation interval for this release.
  interval: 15m
  chart:
    spec:
      # Chart is pulled from the "prometheus" HelmRepository in flux-system.
      sourceRef:
        kind: HelmRepository
        namespace: flux-system
        name: prometheus
      chart: kube-state-metrics
      # Semver range constraint (tilde: patch-level updates within 6.0.x).
      version: '~6.0.0'
  values:
    # NOTE(review): presumably turns off the chart's prometheus.io/scrape
    # Service annotation; the victoria-metrics-single release in this file
    # defines a dedicated "kube-state-metrics" scrape job. Confirm against
    # the chart's values reference.
    prometheusScrape: false
|
|
|
|
---
|
|
|
|
# Flux HelmRelease: installs the prometheus-node-exporter chart into the
# "monitoring" namespace.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  namespace: monitoring
  name: node-exporter
spec:
  interval: 15m
  chart:
    spec:
      sourceRef:
        kind: HelmRepository
        namespace: flux-system
        name: prometheus
      chart: prometheus-node-exporter
      # Semver range constraint (tilde: patch-level updates within 4.0.x).
      version: '~4.0.0'
  # Both the install and the upgrade action run with waiting disabled.
  install:
    disableWait: true
  upgrade:
    disableWait: true
  values:
    rbac:
      pspEnabled: false
    service:
      # These annotations are what the "kubernetes-service-endpoints" scrape
      # job in the victoria-metrics-single release of this file matches on
      # (scrape flag) and uses to rewrite the target port.
      annotations:
        prometheus.io/port: '9100'
        prometheus.io/scrape: 'true'
|
|
|
|
---
|
|
|
|
# Flux HelmRelease: single-node VictoriaMetrics with the built-in scraper
# enabled. The scrape configuration below covers the Kubernetes control
# plane, nodes, annotated Services/Pods and a handful of named workloads.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: victoria-metrics-single
  namespace: monitoring
spec:
  interval: 15m
  chart:
    spec:
      chart: victoria-metrics-single
      version: "~0.15.0"  # or omit to track the latest chart version
      sourceRef:
        kind: HelmRepository
        name: victoria-metrics
        namespace: flux-system
  values:
    server:
      # keep ~3 months; change as you like (suffixes such as "d", "w", "y")
      extraArgs:
        retentionPeriod: "90d"  # VM flag -retentionPeriod=90d

      persistentVolume:
        enabled: true
        size: 100Gi

      # Enable built-in Kubernetes scraping
      scrape:
        enabled: true  # chart enables promscrape
        config:
          global:
            scrape_interval: 15s

          scrape_configs:
            # VM self-metrics
            - job_name: victoriametrics
              static_configs:
                - targets: ["localhost:8428"]

            # --- K8s control-plane & nodes (from VM docs guide) ---
            # Keeps only the "https" port of the default/kubernetes Service.
            - job_name: "kubernetes-apiservers"
              kubernetes_sd_configs:
                - role: endpoints
              scheme: https
              tls_config:
                ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                # NOTE(review): skipping verification means ca_file above is
                # not enforced. Left unchanged; confirm whether verification
                # can be turned on for this cluster.
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              relabel_configs:
                - action: keep
                  source_labels:
                    - __meta_kubernetes_namespace
                    - __meta_kubernetes_service_name
                    - __meta_kubernetes_endpoint_port_name
                  regex: default;kubernetes;https

            # Node metrics, fetched through the API server proxy path
            # /api/v1/nodes/<node>/proxy/metrics.
            - job_name: "kubernetes-nodes"
              scheme: https
              tls_config:
                ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                # NOTE(review): see kubernetes-apiservers; ca_file is not
                # enforced while this is true.
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              kubernetes_sd_configs:
                - role: node
              relabel_configs:
                # Copy every Kubernetes node label onto the target.
                - action: labelmap
                  regex: __meta_kubernetes_node_label_(.+)
                - target_label: __address__
                  replacement: kubernetes.default.svc:443
                - source_labels: [__meta_kubernetes_node_name]
                  regex: (.+)
                  target_label: __metrics_path__
                  replacement: /api/v1/nodes/$1/proxy/metrics

            # Same as kubernetes-nodes, but for the .../proxy/metrics/cadvisor
            # path of each node.
            - job_name: "kubernetes-nodes-cadvisor"
              scheme: https
              tls_config:
                ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                # NOTE(review): see kubernetes-apiservers; ca_file is not
                # enforced while this is true.
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              kubernetes_sd_configs:
                - role: node
              relabel_configs:
                - action: labelmap
                  regex: __meta_kubernetes_node_label_(.+)
                - target_label: __address__
                  replacement: kubernetes.default.svc:443
                - source_labels: [__meta_kubernetes_node_name]
                  regex: (.+)
                  target_label: __metrics_path__
                  replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor

            # --- Annotated Services (generic autodiscovery) ---
            # Driven by the prometheus.io/{scrape,scheme,path,port} Service
            # annotations.
            - job_name: "kubernetes-service-endpoints"
              kubernetes_sd_configs:
                - role: endpoints
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
                  regex: "true"
                - action: replace
                  source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
                  regex: (https?)
                  target_label: __scheme__
                # Only override the metrics path when the annotation is set;
                # without the guard an absent annotation is copied as an
                # empty path.
                - action: replace
                  source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
                  regex: (.+)
                  target_label: __metrics_path__
                # Swap in the annotated port. The discovered port is optional
                # in the pattern so port-free addresses are rewritten too.
                - action: replace
                  source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
                  regex: '(.+?)(?::\d+)?;(\d+)'
                  replacement: $1:$2
                  target_label: __address__

            # --- Annotated Pods (generic autodiscovery) ---
            # Driven by the prometheus.io/{scrape,path,port} Pod annotations.
            - job_name: "kubernetes-pods"
              kubernetes_sd_configs:
                - role: pod
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
                  regex: "true"
                # Skip container ports whose name contains "health".
                - action: drop
                  source_labels: [__meta_kubernetes_pod_container_port_name]
                  regex: ".*health.*"
                # Only override the metrics path when the annotation is set.
                - action: replace
                  source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
                  regex: (.+)
                  target_label: __metrics_path__
                # Pods whose containers declare no ports are discovered with a
                # port-free address; the optional port group lets the
                # annotated port be applied to those targets as well.
                - action: replace
                  source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
                  regex: '(.+?)(?::\d+)?;(\d+)'
                  replacement: $1:$2
                  target_label: __address__

            # --- kube-state-metrics (via its Service) ---
            # Selected by the app.kubernetes.io/name Service label.
            - job_name: "kube-state-metrics"
              kubernetes_sd_configs:
                - role: endpoints
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name]
                  regex: kube-state-metrics

            # --- Longhorn ---
            - job_name: "longhorn-backend"
              static_configs:
                - targets: ["longhorn-backend.longhorn-system.svc:9500"]
              metrics_path: /metrics

            # --- cert-manager (pods expose on 9402) ---
            - job_name: "cert-manager"
              kubernetes_sd_configs:
                - role: pod
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app_kubernetes_io_name]
                  regex: cert-manager;cert-manager
                - action: drop
                  source_labels: [__meta_kubernetes_pod_container_port_name]
                  regex: ".*health.*"
                # Force the scrape port to 9402 regardless of the discovered one.
                - action: replace
                  source_labels: [__address__]
                  regex: '(.+):\d+'
                  replacement: "$1:9402"
                  target_label: __address__

            # --- Flux controllers (default :8080/metrics) ---
            - job_name: "flux"
              kubernetes_sd_configs:
                - role: pod
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app_kubernetes_io_part_of]
                  regex: flux-system;flux
                # Pod discovery emits one target per declared container port.
                # Keep only the metrics port so the other ports of each
                # controller do not show up as failing targets.
                # NOTE(review): "http-prom" is the port name used by the
                # upstream Flux controller manifests; confirm for this install.
                - action: keep
                  source_labels: [__meta_kubernetes_pod_container_port_name]
                  regex: http-prom

            # Static target (no service discovery). "instance" is set to the
            # scrape address and every scraped sample gets node="titan-db".
            - job_name: "titan-db"
              static_configs:
                - targets: ["titan-db:9100"]
              relabel_configs:
                - source_labels: [__address__]
                  target_label: instance
              metric_relabel_configs:
                - source_labels: [instance]
                  target_label: node
                  replacement: titan-db
|
|
|
|
---
|
|
|
|
# Flux HelmRelease: Grafana for metrics.bstein.dev, provisioned with a
# VictoriaMetrics datasource and file-based dashboard providers.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  namespace: monitoring
  name: grafana
spec:
  interval: 15m
  chart:
    spec:
      sourceRef:
        kind: HelmRepository
        namespace: flux-system
        name: grafana
      chart: grafana
      version: '~8.5.0'
  values:
    # Admin credentials are referenced from the existing "grafana-admin"
    # Secret via the two key names below; nothing secret is stored here.
    admin:
      existingSecret: grafana-admin
      userKey: admin-user
      passwordKey: admin-password

    persistence:
      enabled: true
      storageClassName: astreae
      size: 20Gi

    service:
      type: ClusterIP

    # Anonymous access is switched on with the Viewer role, and embedding
    # is allowed.
    env:
      GF_AUTH_ANONYMOUS_ENABLED: 'true'
      GF_AUTH_ANONYMOUS_ORG_ROLE: 'Viewer'
      GF_SECURITY_ALLOW_EMBEDDING: 'true'

    grafana.ini:
      server:
        domain: metrics.bstein.dev
        root_url: 'https://metrics.bstein.dev/'
      dashboards:
        # Home dashboard file; the path sits under the "overview" provider
        # directory configured in dashboardProviders below.
        default_home_dashboard_path: /var/lib/grafana/dashboards/overview/atlas-overview.json
      auth.anonymous:
        hide_version: true
      users:
        default_theme: dark

    # Traefik ingress for metrics.bstein.dev; the cert-manager annotation
    # names the "letsencrypt" cluster issuer.
    ingress:
      enabled: true
      ingressClassName: traefik
      annotations:
        cert-manager.io/cluster-issuer: letsencrypt
      path: /
      hosts:
        - metrics.bstein.dev
      tls:
        - secretName: grafana-metrics-tls
          hosts:
            - metrics.bstein.dev

    # Single default datasource: the victoria-metrics-single server on port
    # 8428, queried as a Prometheus-type datasource.
    datasources:
      datasources.yaml:
        apiVersion: 1
        datasources:
          - name: VictoriaMetrics
            uid: atlas-vm
            type: prometheus
            access: proxy
            url: 'http://victoria-metrics-single-server:8428'
            isDefault: true
            jsonData:
              timeInterval: '15s'

    # One file provider per dashboard group. "overview" is read-only and
    # lives in its own folder; the others are editable and share the
    # "Atlas Internal" folder.
    dashboardProviders:
      dashboardproviders.yaml:
        apiVersion: 1
        providers:
          - name: overview
            type: file
            orgId: 1
            folder: Overview
            editable: false
            disableDeletion: false
            options:
              path: /var/lib/grafana/dashboards/overview
          - name: pods
            type: file
            orgId: 1
            folder: Atlas Internal
            editable: true
            disableDeletion: false
            options:
              path: /var/lib/grafana/dashboards/pods
          - name: nodes
            type: file
            orgId: 1
            folder: Atlas Internal
            editable: true
            disableDeletion: false
            options:
              path: /var/lib/grafana/dashboards/nodes
          - name: storage
            type: file
            orgId: 1
            folder: Atlas Internal
            editable: true
            disableDeletion: false
            options:
              path: /var/lib/grafana/dashboards/storage
          - name: gpu
            type: file
            orgId: 1
            folder: Atlas Internal
            editable: true
            disableDeletion: false
            options:
              path: /var/lib/grafana/dashboards/gpu
          - name: network
            type: file
            orgId: 1
            folder: Atlas Internal
            editable: true
            disableDeletion: false
            options:
              path: /var/lib/grafana/dashboards/network

    # Provider name -> ConfigMap name; keys match the provider names above.
    dashboardsConfigMaps:
      overview: grafana-dashboard-overview
      pods: grafana-dashboard-pods
      nodes: grafana-dashboard-nodes
      storage: grafana-dashboard-storage
      gpu: grafana-dashboard-gpu
      network: grafana-dashboard-network

    # Mounts the "grafana-folders" ConfigMap read-only at the folders
    # provisioning path.
    extraConfigmapMounts:
      - name: grafana-folders
        configMap: grafana-folders
        mountPath: /etc/grafana/provisioning/folders
        readOnly: true
|
|
|
|
---
|
|
|
|
apiVersion: helm.toolkit.fluxcd.io/v2
|
|
kind: HelmRelease
|
|
metadata:
|
|
name: alertmanager
|
|
namespace: monitoring
|
|
spec:
|
|
interval: 15m
|
|
chart:
|
|
spec:
|
|
chart: alertmanager
|
|
version: "~1.9.0"
|
|
sourceRef:
|
|
kind: HelmRepository
|
|
name: prometheus
|
|
namespace: flux-system
|
|
values:
|
|
ingress:
|
|
enabled: true
|
|
ingressClassName: traefik
|
|
annotations:
|
|
cert-manager.io/cluster-issuer: letsencrypt
|
|
hosts:
|
|
- host: alerts.bstein.dev
|
|
paths:
|
|
- path: /
|
|
pathType: Prefix
|
|
tls:
|
|
- secretName: alerts-bstein-dev-tls
|
|
hosts:
|
|
- alerts.bstein.dev
|
|
config:
|
|
global:
|
|
resolve_timeout: 5m
|
|
route:
|
|
receiver: default
|
|
group_wait: 30s
|
|
group_interval: 5m
|
|
repeat_interval: 2h
|
|
receivers:
|
|
- name: default
|