# services/monitoring/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: kube-state-metrics
  namespace: monitoring
spec:
  interval: 15m
  chart:
    spec:
      chart: kube-state-metrics
      version: "~6.0.0"
      sourceRef:
        kind: HelmRepository
        name: prometheus
        namespace: flux-system
  values:
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: kubernetes.io/hostname
                  operator: NotIn
                  values:
                    - titan-22
    prometheusScrape: false
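    # Note: prometheusScrape is disabled because kube-state-metrics is scraped
    # by the dedicated "kube-state-metrics" job in the VictoriaMetrics scrape
    # config below (matched on its app.kubernetes.io/name service label), not
    # by the generic prometheus.io/scrape annotation.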
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: node-exporter
  namespace: monitoring
spec:
  interval: 15m
  install:
    disableWait: true
  chart:
    spec:
      chart: prometheus-node-exporter
      version: "~4.0.0"
      sourceRef:
        kind: HelmRepository
        name: prometheus
        namespace: flux-system
  upgrade:
    disableWait: true
  values:
    rbac:
      pspEnabled: false
    service:
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "9100"
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: victoria-metrics-single
  namespace: monitoring
spec:
  interval: 15m
  chart:
    spec:
      chart: victoria-metrics-single
      version: "~0.15.0" # or omit to track the latest chart release
      sourceRef:
        kind: HelmRepository
        name: victoria-metrics
        namespace: flux-system
  values:
    server:
      # keep 1 year of data; the flag accepts suffixes such as "d" and "y"
      extraArgs:
        retentionPeriod: "1y" # VM flag -retentionPeriod=1y
      persistentVolume:
        enabled: true
        size: 100Gi
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/hostname
                    operator: NotIn
                    values:
                      - titan-22
      # Enable built-in Kubernetes scraping (the chart enables promscrape)
      scrape:
        enabled: true
        config:
          global:
            scrape_interval: 15s
          scrape_configs:
            # VM self-metrics
            - job_name: victoriametrics
              static_configs:
                - targets: ["localhost:8428"]
            # --- K8s control-plane & nodes (from VM docs guide) ---
            - job_name: "kubernetes-apiservers"
              kubernetes_sd_configs: [{ role: endpoints }]
              scheme: https
              tls_config:
                ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
                  regex: default;kubernetes;https
            - job_name: "kubernetes-nodes"
              scheme: https
              tls_config:
                ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              kubernetes_sd_configs: [{ role: node }]
              relabel_configs:
                - action: labelmap
                  regex: __meta_kubernetes_node_label_(.+)
                - target_label: __address__
                  replacement: kubernetes.default.svc:443
                - source_labels: [__meta_kubernetes_node_name]
                  regex: (.+)
                  target_label: __metrics_path__
                  replacement: /api/v1/nodes/$1/proxy/metrics
            - job_name: "kubernetes-nodes-cadvisor"
              scheme: https
              tls_config:
                ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              kubernetes_sd_configs: [{ role: node }]
              relabel_configs:
                - action: labelmap
                  regex: __meta_kubernetes_node_label_(.+)
                - target_label: __address__
                  replacement: kubernetes.default.svc:443
                - source_labels: [__meta_kubernetes_node_name]
                  regex: (.+)
                  target_label: __metrics_path__
                  replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
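            # Both node jobs scrape the kubelet through the API-server proxy:
            # __address__ is rewritten to kubernetes.default.svc:443 and the
            # node name is folded into the path, so a node such as titan-22
            # ends up scraped at /api/v1/nodes/titan-22/proxy/metrics (or
            # .../proxy/metrics/cadvisor for container-level metrics).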
            # --- Annotated Services (generic autodiscovery) ---
            - job_name: "kubernetes-service-endpoints"
              kubernetes_sd_configs: [{ role: endpoints }]
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
                  regex: "true"
                - action: replace
                  source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
                  regex: (https?)
                  target_label: __scheme__
                - action: replace
                  source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
                  regex: (.+) # only rewrite the path when the annotation is set
                  target_label: __metrics_path__
                - action: replace
                  regex: ([^:]+)(?::\d+)?;(\d+) # port in __address__ is optional
                  replacement: $1:$2
                  source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
                  target_label: __address__
            # --- Annotated Pods (generic autodiscovery) ---
            - job_name: "kubernetes-pods"
              kubernetes_sd_configs: [{ role: pod }]
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
                  regex: "true"
                - action: drop
                  source_labels: [__meta_kubernetes_pod_container_port_name]
                  regex: ".*health.*"
                - action: replace
                  source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
                  regex: (.+) # only rewrite the path when the annotation is set
                  target_label: __metrics_path__
                - action: replace
                  regex: ([^:]+)(?::\d+)?;(\d+) # port in __address__ is optional
                  replacement: $1:$2
                  source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
                  target_label: __address__
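            # The two jobs above define the opt-in contract for cluster
            # workloads. A minimal sketch of a Service that the
            # "kubernetes-service-endpoints" job would pick up (name and port
            # are illustrative; the node-exporter Service above opts in the
            # same way):
            #
            #   apiVersion: v1
            #   kind: Service
            #   metadata:
            #     name: example-app                  # hypothetical
            #     annotations:
            #       prometheus.io/scrape: "true"
            #       prometheus.io/port: "8080"
            #       prometheus.io/path: "/metrics"   # optional, defaults to /metrics
            #   spec:
            #     selector:
            #       app: example-app
            #     ports:
            #       - name: http
            #         port: 8080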
            # --- kube-state-metrics (via its Service) ---
            - job_name: "kube-state-metrics"
              kubernetes_sd_configs: [{ role: endpoints }]
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name]
                  regex: kube-state-metrics
            # --- Longhorn ---
            - job_name: "longhorn-backend"
              static_configs:
                - targets: ["longhorn-backend.longhorn-system.svc:9500"]
              metrics_path: /metrics
            # --- titan-db node_exporter (external control-plane DB host) ---
            - job_name: "titan-db"
              static_configs:
                - targets: ["192.168.22.10:9100"]
              relabel_configs:
                - source_labels: [__address__]
                  target_label: instance
                  replacement: titan-db
            # --- titan-jh node_exporter (external control-plane host) ---
            - job_name: "titan-jh"
              static_configs:
                - targets: ["192.168.22.8:9100"]
              relabel_configs:
                - source_labels: [__address__]
                  target_label: instance
                  replacement: titan-jh
            # --- cert-manager (pods expose metrics on 9402) ---
            - job_name: "cert-manager"
              kubernetes_sd_configs: [{ role: pod }]
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app_kubernetes_io_name]
                  regex: cert-manager;cert-manager
                - action: drop
                  source_labels: [__meta_kubernetes_pod_container_port_name]
                  regex: ".*health.*"
                - action: replace
                  source_labels: [__address__]
                  regex: "(.+):\\d+"
                  replacement: "$1:9402"
                  target_label: __address__
            # --- Flux controllers (default :8080/metrics) ---
            - job_name: "flux"
              kubernetes_sd_configs: [{ role: pod }]
              relabel_configs:
                - action: keep
                  source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app_kubernetes_io_part_of]
                  regex: flux-system;flux
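            # Note: the kubernetes_sd_configs above need RBAC to list nodes,
            # endpoints, pods, and services. The victoria-metrics-single chart
            # is expected to ship a suitable ClusterRole when
            # server.scrape.enabled is set; worth verifying against the chart
            # version pinned above.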
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: grafana
  namespace: monitoring
spec:
  interval: 15m
  chart:
    spec:
      chart: grafana
      version: "~8.5.0"
      sourceRef:
        kind: HelmRepository
        name: grafana
        namespace: flux-system
  install:
    remediation: { retries: 3 }
    timeout: 15m
  upgrade:
    remediation:
      retries: 3
      remediateLastFailure: true
    cleanupOnFail: true
    timeout: 15m
  values:
    admin:
      existingSecret: grafana-admin
      userKey: admin-user
      passwordKey: admin-password
    serviceAccount:
      create: false
      name: monitoring-vault-sync
      automountServiceAccountToken: true
    podAnnotations:
      vault.hashicorp.com/agent-inject: "true"
      vault.hashicorp.com/role: "monitoring"
      vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
      vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
        {{ with secret "kv/data/atlas/monitoring/grafana-admin" }}
        export GF_SECURITY_ADMIN_USER="{{ index .Data.data "admin-user" }}"
        export GF_SECURITY_ADMIN_PASSWORD="{{ index .Data.data "admin-password" }}"
        {{ end }}
        {{ with secret "kv/data/atlas/shared/postmark-relay" }}
        export GF_SMTP_USER="{{ index .Data.data "apikey" }}"
        export GF_SMTP_PASSWORD="{{ index .Data.data "apikey" }}"
        {{ end }}
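    # The Vault agent sidecar renders the template above to
    # /vault/secrets/grafana-env.sh. The postRenderer patch at the bottom of
    # this release points VAULT_ENV_FILE at that file and swaps in an
    # entrypoint that presumably sources it before exec'ing the chart's
    # /run.sh, keeping the admin and SMTP credentials out of the pod spec.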
    persistence:
      enabled: true
      size: 20Gi
      storageClassName: astreae
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: kubernetes.io/hostname
                  operator: NotIn
                  values:
                    - titan-22
    deploymentStrategy:
      type: Recreate
    service:
      type: ClusterIP
    env:
      GF_AUTH_GENERIC_OAUTH_CLIENT_ID: "grafana"
      GF_AUTH_GENERIC_OAUTH_CLIENT_SECRET: ""
      GF_AUTH_ANONYMOUS_ENABLED: "true"
      GF_AUTH_ANONYMOUS_ORG_NAME: "Overview"
      GF_AUTH_ANONYMOUS_ORG_ROLE: "Viewer"
      GF_SMTP_ENABLED: "true"
      GF_SMTP_HOST: "mail.bstein.dev:587"
      GF_SMTP_FROM: "no-reply-grafana@bstein.dev"
      GF_SMTP_FROM_NAME: "Atlas Grafana"
      GRAFANA_ALERT_EMAILS: "alerts@bstein.dev"
      GF_SECURITY_ALLOW_EMBEDDING: "true"
      GF_AUTH_GENERIC_OAUTH_ENABLED: "true"
      GF_AUTH_GENERIC_OAUTH_NAME: "Keycloak"
      GF_AUTH_GENERIC_OAUTH_ALLOW_SIGN_UP: "true"
      GF_AUTH_GENERIC_OAUTH_SCOPES: "openid profile email groups"
      GF_AUTH_GENERIC_OAUTH_AUTH_URL: "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/auth"
      GF_AUTH_GENERIC_OAUTH_TOKEN_URL: "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/token"
      GF_AUTH_GENERIC_OAUTH_API_URL: "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/userinfo"
      GF_AUTH_GENERIC_OAUTH_ROLE_ATTRIBUTE_PATH: "contains(groups, 'admin') && 'Admin' || 'Viewer'"
      GF_AUTH_GENERIC_OAUTH_USE_PKCE: "true"
      GF_AUTH_GENERIC_OAUTH_TLS_SKIP_VERIFY_INSECURE: "false"
      GF_AUTH_SIGNOUT_REDIRECT_URL: "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/logout?redirect_uri=https://metrics.bstein.dev/"
    grafana.ini:
      server:
        domain: metrics.bstein.dev
        root_url: https://metrics.bstein.dev/
      dashboards:
        default_home_dashboard_path: /var/lib/grafana/dashboards/overview/atlas-overview.json
      auth.anonymous:
        hide_version: true
      users:
        default_theme: dark
    ingress:
      enabled: true
      ingressClassName: traefik
      annotations:
        cert-manager.io/cluster-issuer: letsencrypt
        traefik.ingress.kubernetes.io/router.entrypoints: websecure
        traefik.ingress.kubernetes.io/router.tls: "true"
      hosts:
        - metrics.bstein.dev
      path: /
      tls:
        - secretName: grafana-metrics-tls
          hosts:
            - metrics.bstein.dev
    datasources:
      datasources.yaml:
        apiVersion: 1
        datasources:
          - name: VictoriaMetrics
            type: prometheus
            access: proxy
            url: http://victoria-metrics-single-server:8428
            isDefault: true
            jsonData:
              timeInterval: "15s"
            uid: atlas-vm
            orgId: 1
          - name: VictoriaMetrics
            type: prometheus
            access: proxy
            url: http://victoria-metrics-single-server:8428
            isDefault: true
            jsonData:
              timeInterval: "15s"
            uid: atlas-vm
            orgId: 2
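          # The two entries are not a duplicate: with apiVersion 1
          # provisioning, each orgId gets its own copy of the datasource.
          # Org 1 is the internal org; org 2 appears to be the anonymous
          # "Overview" org referenced by GF_AUTH_ANONYMOUS_ORG_NAME above and
          # the overview-public provider below.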
    dashboardProviders:
      dashboardproviders.yaml:
        apiVersion: 1
        providers:
          - name: overview
            orgId: 1
            folder: Overview
            type: file
            disableDeletion: false
            editable: false
            options:
              path: /var/lib/grafana/dashboards/overview
          - name: overview-public
            orgId: 2
            folder: Overview
            type: file
            disableDeletion: false
            editable: false
            options:
              path: /var/lib/grafana/dashboards/overview-public
          - name: pods
            orgId: 1
            folder: Atlas Internal
            type: file
            disableDeletion: false
            editable: true
            options:
              path: /var/lib/grafana/dashboards/pods
          - name: nodes
            orgId: 1
            folder: Atlas Internal
            type: file
            disableDeletion: false
            editable: true
            options:
              path: /var/lib/grafana/dashboards/nodes
          - name: storage
            orgId: 1
            folder: Atlas Internal
            type: file
            disableDeletion: false
            editable: true
            options:
              path: /var/lib/grafana/dashboards/storage
          - name: gpu
            orgId: 1
            folder: Atlas Internal
            type: file
            disableDeletion: false
            editable: true
            options:
              path: /var/lib/grafana/dashboards/gpu
          - name: network
            orgId: 1
            folder: Atlas Internal
            type: file
            disableDeletion: false
            editable: true
            options:
              path: /var/lib/grafana/dashboards/network
          - name: mail
            orgId: 1
            folder: Atlas Internal
            type: file
            disableDeletion: false
            editable: true
            options:
              path: /var/lib/grafana/dashboards/mail
    dashboardsConfigMaps:
      overview: grafana-dashboard-overview
      overview-public: grafana-dashboard-overview
      pods: grafana-dashboard-pods
      nodes: grafana-dashboard-nodes
      storage: grafana-dashboard-storage
      gpu: grafana-dashboard-gpu
      network: grafana-dashboard-network
      mail: grafana-dashboard-mail
    extraConfigmapMounts:
      - name: grafana-folders
        mountPath: /etc/grafana/provisioning/folders
        configMap: grafana-folders
        readOnly: true
      - name: grafana-alerting
        mountPath: /etc/grafana/provisioning/alerting
        configMap: grafana-alerting
        readOnly: true
  postRenderers:
    - kustomize:
        patches:
          - target:
              kind: Deployment
              name: grafana
            patch: |-
              apiVersion: apps/v1
              kind: Deployment
              metadata:
                name: grafana
              spec:
                template:
                  spec:
                    serviceAccountName: monitoring-vault-sync
                    automountServiceAccountToken: true
                    containers:
                      - name: grafana
                        command:
                          - /entrypoint.sh
                        args:
                          - /run.sh
                        env:
                          - name: GF_SECURITY_ADMIN_USER
                            $patch: delete
                          - name: GF_SECURITY_ADMIN_PASSWORD
                            $patch: delete
                          - name: GF_SMTP_USER
                            $patch: delete
                          - name: GF_SMTP_PASSWORD
                            $patch: delete
                          - name: VAULT_ENV_FILE
                            value: /vault/secrets/grafana-env.sh
                        volumeMounts:
                          - name: monitoring-vault-entrypoint
                            mountPath: /entrypoint.sh
                            subPath: vault-entrypoint.sh
                    volumes:
                      - name: monitoring-vault-entrypoint
                        configMap:
                          name: monitoring-vault-entrypoint
                          defaultMode: 493 # decimal for octal 0755 (executable)
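  # The "$patch: delete" directives are strategic-merge-patch instructions:
  # they strip the chart-rendered admin/SMTP env vars from the container so
  # the only source of those credentials is the Vault-rendered env file.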
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: alertmanager
  namespace: monitoring
spec:
  interval: 15m
  chart:
    spec:
      chart: alertmanager
      version: "~1.9.0"
      sourceRef:
        kind: HelmRepository
        name: prometheus
        namespace: flux-system
  values:
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: kubernetes.io/hostname
                  operator: NotIn
                  values:
                    - titan-22
    ingress:
      enabled: true
      ingressClassName: traefik
      annotations:
        cert-manager.io/cluster-issuer: letsencrypt
        traefik.ingress.kubernetes.io/router.entrypoints: websecure
        traefik.ingress.kubernetes.io/router.tls: "true"
      hosts:
        - host: alerts.bstein.dev
          paths:
            - path: /
              pathType: Prefix
      tls:
        - secretName: alerts-bstein-dev-tls
          hosts:
            - alerts.bstein.dev
    config:
      global:
        resolve_timeout: 5m
      route:
        receiver: default
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 2h
      receivers:
        - name: default
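        # The "default" receiver currently has no notification integrations,
        # so alerts are grouped and routed but not delivered anywhere. A
        # minimal sketch of an email integration, assuming the same SMTP
        # relay Grafana uses above (the from-address is illustrative and the
        # credentials would still need to be wired in, e.g. from a secret):
        #
        #   receivers:
        #     - name: default
        #       email_configs:
        #         - to: alerts@bstein.dev
        #           from: no-reply-alertmanager@bstein.dev
        #           smarthost: mail.bstein.dev:587
        #           auth_username: "<postmark-api-key>"
        #           auth_password: "<postmark-api-key>"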