monitoring add, jellyfin/pegasus update, and traefik tweaks

This commit is contained in:
Brad Stein 2025-10-07 23:26:27 -05:00
parent 41292eff0b
commit ae85dcfeaa
21 changed files with 759 additions and 2 deletions

View File

@ -0,0 +1,14 @@
# infrastructure/flux-system/kustomization-monitoring.yaml
# Flux Kustomization that applies the manifests under ./services/monitoring
# from the flux-system GitRepository, re-reconciling every 10 minutes.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: monitoring
namespace: flux-system
spec:
interval: 10m
path: ./services/monitoring
# Garbage-collect objects that disappear from Git.
prune: true
sourceRef:
kind: GitRepository
name: flux-system
# NOTE(review): there is no dependsOn here, yet the HelmReleases under this
# path reference HelmRepository objects in flux-system. Confirm whichever
# Kustomization creates those sources is applied first, or add a dependsOn.
wait: true

View File

@ -0,0 +1,18 @@
# infrastructure/flux-system/kustomization-traefik.yaml
# Flux Kustomization that applies ./infrastructure/traefik into the traefik
# namespace, re-reconciling every 10 minutes.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: traefik
namespace: flux-system
spec:
interval: 10m
path: ./infrastructure/traefik
# Namespace applied to the objects built from the path above.
targetNamespace: traefik
# Garbage-collect objects that disappear from Git.
prune: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
# Reconciled only after the "core" Kustomization.
# NOTE(review): "core" is not defined in this change; confirm it exists in flux-system.
dependsOn:
- name: core
wait: true

View File

@ -11,8 +11,10 @@ resources:
- kustomization-vault.yaml
- kustomization-jitsi.yaml
- kustomization-crypto.yaml
- kustomization-traefik.yaml
- kustomization-monerod.yaml
- kustomization-pegasus.yaml
- kustomization-jellyfin.yaml
- kustomization-xmr-miner.yaml
- kustomization-monitoring.yaml
- kustomization-longhorn-ui.yaml

View File

@ -1,3 +1,4 @@
# infrastructure/sources/helm/grafana.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:

View File

@ -1,3 +1,4 @@
# infrastructure/sources/helm/hashicorp.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:

View File

@ -1,3 +1,4 @@
# infrastructure/sources/helm/jetstack.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:

View File

@ -1,3 +1,4 @@
# infrastructure/sources/helm/prometheus.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:

View File

@ -0,0 +1,9 @@
# infrastructure/sources/helm/victoria-metrics.yaml
# Helm chart source referenced by the victoria-metrics-single HelmRelease
# (sourceRef name "victoria-metrics", namespace "flux-system").
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: victoria-metrics
namespace: flux-system
spec:
# Repository index refresh period.
interval: 1h
url: https://victoriametrics.github.io/helm-charts/

View File

@ -35,6 +35,12 @@ items:
- --entrypoints.web.address=:80
- --entrypoints.websecure.address=:443
- --api.dashboard=true
- --metrics.prometheus=true
- --metrics.prometheus.addEntryPointsLabels=true
- --metrics.prometheus.addRoutersLabels=true
- --metrics.prometheus.addServicesLabels=true
- --entrypoints.metrics.address=:9100
- --metrics.prometheus.entryPoint=metrics
image: traefik:v3.3.3
imagePullPolicy: IfNotPresent
name: traefik
@ -48,6 +54,9 @@ items:
- containerPort: 8080
name: admin
protocol: TCP
- containerPort: 9100
name: metrics
protocol: TCP
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
dnsPolicy: ClusterFirst

View File

@ -9,3 +9,4 @@ resources:
- serviceaccount.yaml
- clusterrole.yaml
- clusterrolebinding.yaml
- service.yaml

View File

@ -0,0 +1,20 @@
# infrastructure/traefik/service.yaml
# ClusterIP Service exposing Traefik's dedicated Prometheus entry point (:9100).
apiVersion: v1
kind: Service
metadata:
name: traefik-metrics
namespace: traefik
labels:
app: traefik
# These annotations are what the "kubernetes-service-endpoints" scrape job
# keys on (scrape flag, port, metrics path).
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9100"
prometheus.io/path: "/metrics"
spec:
type: ClusterIP
# Must match the labels on the Traefik pods.
selector:
app: traefik
ports:
- name: metrics
port: 9100
# Resolved by name against the container port called "metrics".
targetPort: metrics

2
scripts/longhorn_volume_usage.fish Normal file → Executable file
View File

@ -1,3 +1,5 @@
#!/usr/bin/env fish
function pvc-usage --description "Show Longhorn PVC usage (human-readable) mapped to namespace/name"
begin
kubectl -n longhorn-system get volumes.longhorn.io -o json \

View File

@ -0,0 +1,218 @@
#!/usr/bin/env bash
set -euo pipefail

# Step 0: make sure the dedicated kiosk account exists.
if id -u styx >/dev/null 2>&1; then
  : # account already present, nothing to create
else
  sudo useradd -m -s /bin/bash styx
  echo "Created user 'styx'"
fi

# Step 1: application directory (with a keys/ subdirectory), owned by styx.
sudo mkdir -p /opt/styx-kiosk/keys
sudo chown -R styx:styx /opt/styx-kiosk

# Step 2: write the kiosk application (the heredoc that follows) into place.
sudo tee /opt/styx-kiosk/kiosk.py >/dev/null <<'PY'
#!/usr/bin/env python3
import base64, json, os, subprocess, threading, tempfile
from datetime import datetime
import tkinter as tk
from tkinter import ttk, messagebox
# Window title of the Tk application.
APP_TITLE = "STYX Airgap Signer"
# Video device passed to zbarcam; override with the ZBAR_DEV environment variable.
CAMERA_DEV = os.environ.get("ZBAR_DEV", "/dev/video0")
# PEM private key used for signing; override with STYX_KEY.
KEY_PATH = os.environ.get("STYX_KEY", "/vault/keys/signer_ed25519.pem") # in the LUKS vault
# Signature algorithm selector; override with STYX_ALGO.
ALGO = os.environ.get("STYX_ALGO", "ed25519") # or 'secp256r1'
# Fixed path the QR PNG is written to by qrencode and read back for display.
# NOTE(review): predictable file name in a world-writable directory; consider a per-user or tempfile path.
QR_TMP = "/tmp/styx_signed.png"
def zbar_scan_oneshot():
    """Run ``zbarcam`` once against CAMERA_DEV and return the decoded text.

    Flags used: ``--raw`` prints only the decoded data, ``--nodisplay``
    suppresses the preview window, ``--oneshot`` exits after the first code.

    Returns:
        The stripped decoder output, or ``None`` when nothing was decoded,
        the command failed, or it did not finish within 30 seconds.
    """
    argv = ["zbarcam", "--raw", "--nodisplay", "--oneshot", CAMERA_DEV]
    try:
        decoded = subprocess.check_output(argv, text=True, timeout=30).strip()
    except Exception:
        # Best effort: every failure is reported to the caller as "no scan".
        return None
    return decoded or None
def openssl_pub_der_b64(key_path):
    """Return the public half of the key at ``key_path`` as base64 text.

    The key is converted with ``openssl pkey -pubout -outform DER`` and the
    resulting bytes are base64-encoded.

    Raises:
        subprocess.CalledProcessError: if openssl exits non-zero.
    """
    argv = ["openssl", "pkey", "-in", key_path, "-pubout", "-outform", "DER"]
    public_der = subprocess.check_output(argv)
    encoded = base64.b64encode(public_der)
    return encoded.decode()
def sign_bytes(msg: bytes, key_path: str, algo: str) -> bytes:
    """Sign ``msg`` with the private key at ``key_path`` using the openssl CLI.

    Args:
        msg: Exact bytes to sign.
        key_path: Path to the private key file handed to openssl.
        algo: ``"ed25519"`` or one of ``"secp256r1"``/``"prime256v1"``/``"p256"``
            (case-insensitive).

    Returns:
        The raw bytes openssl writes to stdout (the signature).

    Raises:
        RuntimeError: if ``algo`` is not supported.
        subprocess.CalledProcessError: if openssl exits non-zero.
    """
    algo_name = algo.lower()
    if algo_name == "ed25519":
        # Ed25519 signs the raw message; -rawin tells pkeyutl not to expect a digest.
        argv = ["openssl", "pkeyutl", "-sign", "-inkey", key_path, "-rawin", "-in"]
    elif algo_name in ("secp256r1", "prime256v1", "p256"):
        # ECDSA over P-256 with SHA-256; the message file is the last argument.
        argv = ["openssl", "dgst", "-sha256", "-sign", key_path]
    else:
        # Reject before the message is ever written to disk.
        raise RuntimeError(f"Unsupported algo: {algo}")

    # openssl reads the message from a file, so stage it in a temporary one.
    with tempfile.NamedTemporaryFile(delete=False) as f:
        f.write(msg)
        msg_path = f.name
    try:
        return subprocess.check_output(argv + [msg_path])
    finally:
        try:
            os.unlink(msg_path)
        except OSError:
            # Removal is best effort; only filesystem errors are ignored.
            pass
def make_signed_envelope(scanned_text: str, key_path: str, algo: str) -> dict:
    """Sign the scanned request and wrap the signature in a JSON-ready dict.

    Message selection:
      * JSON object with ``tx_bytes``: the base64-decoded value is signed.
      * JSON object with ``message``: the UTF-8 encoded string is signed.
      * Any other JSON object: its canonical form (sorted keys, compact
        separators) is signed.
      * Anything else (not JSON, or JSON that is not an object): the scanned
        text itself is signed.

    A request that declares ``tx_bytes`` or ``message`` but carries an invalid
    value raises instead of silently signing the raw scanned text.

    Args:
        scanned_text: Text decoded from the QR code.
        key_path: Private key path passed to ``sign_bytes``.
        algo: Algorithm name passed to ``sign_bytes``.

    Returns:
        Dict with the signature, public key, payload digest, the original
        scanned text, optional ``request_id``, host name and UTC timestamp.

    Raises:
        ValueError: if ``tx_bytes`` is not valid base64 or ``message`` is not a string.
    """
    # Function-scope imports keep this block independent of the file header.
    import hashlib
    from datetime import timezone

    try:
        obj = json.loads(scanned_text)
    except ValueError:
        obj = None  # not JSON, handled as raw text below

    request_id = None
    if isinstance(obj, dict):
        if "tx_bytes" in obj:
            # validate=True rejects non-alphabet characters instead of dropping them.
            msg = base64.b64decode(obj["tx_bytes"], validate=True)
        elif "message" in obj:
            message = obj["message"]
            if not isinstance(message, str):
                raise ValueError("'message' must be a string")
            msg = message.encode()
        else:
            # JSON without known fields: sign its canonical byte form.
            msg = json.dumps(obj, sort_keys=True, separators=(",", ":")).encode()
        request_id = obj.get("request_id")
    else:
        msg = scanned_text.encode()

    sig = sign_bytes(msg, key_path, algo)
    env = {
        "algo": algo.lower(),
        "signature_b64": base64.b64encode(sig).decode(),
        "pubkey_spki_der_b64": openssl_pub_der_b64(key_path),
        "payload_sha256_b64": base64.b64encode(hashlib.sha256(msg).digest()).decode(),
        "quote_raw": scanned_text,
        "request_id": request_id,
        "device": os.uname().nodename,
        # Same "YYYY-MM-DDTHH:MM:SSZ" shape as before, without the deprecated utcnow().
        "ts_utc": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    }
    return env
def qrencode_to_file(text: str, path: str):
    """Render ``text`` as a QR code PNG at ``path`` with the ``qrencode`` CLI.

    The text is fed on stdin. Options: error-correction level M, module
    size 16, PNG output.

    Raises:
        subprocess.CalledProcessError: if qrencode exits non-zero.
    """
    argv = ["qrencode", "-l", "M", "-s", "16", "-t", "PNG", "-o", path]
    payload = text.encode()
    subprocess.run(argv, input=payload, check=True)
class App(tk.Tk):
    """Fullscreen kiosk window: SCAN reads a QR code, signs it, and shows the result as a QR."""

    def __init__(self):
        """Build the widgets and warn immediately if the signing key is missing."""
        super().__init__()
        self.title(APP_TITLE)
        self.attributes("-fullscreen", True)
        self.configure(background="black")
        self.bind("<Escape>", lambda e: self.quit())  # for maintenance only
        s = ttk.Style(self)
        s.configure("Big.TButton", font=("DejaVu Sans", 48), padding=24)
        s.configure("Big.TLabel", font=("DejaVu Sans", 32), foreground="white", background="black")
        self.container = tk.Frame(self, bg="black")
        self.container.pack(expand=True, fill="both")
        self.status = ttk.Label(self.container, text="Ready", style="Big.TLabel")
        self.status.pack(pady=20)
        self.scan_btn = ttk.Button(self.container, text="SCAN", style="Big.TButton", command=self.start_scan)
        self.scan_btn.pack(pady=20)
        self.image_label = tk.Label(self.container, bg="black")
        self.image_label.pack(pady=10)
        # Created hidden; packed only once a signed QR is on screen.
        self.new_btn = ttk.Button(self.container, text="NEW SCAN", style="Big.TButton", command=self.reset)
        self.new_btn.pack_forget()
        self.note = ttk.Label(self.container, text="", style="Big.TLabel")
        self.note.pack(pady=10)
        if not os.path.exists(KEY_PATH):
            self.status.config(text=f"Key not found at {KEY_PATH}\nInsert/unlock vault to proceed.")

    def reset(self):
        """Clear the displayed QR and return to the initial 'Ready' state."""
        self.image_label.configure(image="")
        self.image_label.image = None
        self.new_btn.pack_forget()
        self.note.config(text="")
        self.status.config(text="Ready")
        self.scan_btn.config(state="normal")

    def start_scan(self):
        """Handle SCAN: check the key exists, then scan and sign on a worker thread."""
        if not os.path.exists(KEY_PATH):
            messagebox.showerror("Key missing", f"Signing key not found at:\n{KEY_PATH}\nUnlock your vault.")
            return
        self.status.config(text="Scanning…")
        self.scan_btn.config(state="disabled")
        threading.Thread(target=self._do_scan_and_sign, daemon=True).start()

    def _do_scan_and_sign(self):
        """Worker thread body: scan, sign, render the QR, then schedule the UI update."""
        scanned = zbar_scan_oneshot()
        if not scanned:
            self.after(0, self._scan_failed)
            return
        try:
            envelope = make_signed_envelope(scanned, KEY_PATH, ALGO)
            payload = json.dumps(envelope, separators=(",",":"))
            qrencode_to_file(payload, QR_TMP)
            self.after(0, self._show_qr, envelope)
        except Exception as e:
            # Capture the text now: `e` is unbound once this except block exits,
            # and the scheduled callback runs later.
            self.after(0, self._error, str(e))

    def _scan_failed(self):
        """Tell the operator nothing was decoded and re-enable SCAN."""
        self.status.config(text="No QR detected. Try again.")
        self.scan_btn.config(state="normal")

    def _error(self, message):
        """Show a signing/rendering failure and re-enable SCAN so the operator can retry."""
        self.status.config(text=f"Error: {message}")
        self.scan_btn.config(state="normal")

    def _show_qr(self, envelope):
        """Display the PNG written by qrencode plus a short summary of the envelope."""
        try:
            img = tk.PhotoImage(file=QR_TMP)
            self.image_label.configure(image=img)
            # Keep a reference on the widget so the image is not garbage-collected.
            self.image_label.image = img
        except Exception as e:
            self.status.config(text=f"QR render failed: {e}")
            self.scan_btn.config(state="normal")
            return
        self.status.config(text="Signed. Show this QR to your online box.")
        self.note.config(text=f"Algo: {envelope['algo']} Host: {envelope['device']}")
        self.new_btn.pack(pady=20)
# Script entry point: create the kiosk window and run the Tk event loop.
if __name__ == "__main__":
App().mainloop()
PY
sudo chmod +x /opt/styx-kiosk/kiosk.py
sudo chown -R styx:styx /opt/styx-kiosk

# 3) Minimal X session: openbox + kiosk; no mouse pointer
sudo -u styx tee /home/styx/.xinitrc >/dev/null <<'XRC'
xset -dpms
xset s off
xset s noblank
# If 'unclutter' is installed, uncomment the next line to hide cursor:
# unclutter -idle 0 -root &
openbox-session &
/opt/styx-kiosk/kiosk.py
XRC
sudo chown styx:styx /home/styx/.xinitrc
sudo chmod 0755 /home/styx/.xinitrc

# 4) Autologin the 'styx' user on tty1, auto-start X
sudo mkdir -p /etc/systemd/system/getty@tty1.service.d
sudo tee /etc/systemd/system/getty@tty1.service.d/override.conf >/dev/null <<'OVR'
[Service]
ExecStart=
ExecStart=-/sbin/agetty --autologin styx --noclear %I $TERM
Type=idle
OVR

# Append the startx hook only once, so re-running this installer does not
# stack duplicate blocks in ~/.bash_profile. A missing file counts as "not
# present" (grep fails), so the hook is written on the first run.
if ! sudo grep -qF 'exec startx -- -nocursor' /home/styx/.bash_profile 2>/dev/null; then
  sudo -u styx tee -a /home/styx/.bash_profile >/dev/null <<'BRC'
# Start X on the first tty automatically, headless
if [ -z "$DISPLAY" ] && [ "$(tty)" = "/dev/tty1" ]; then
  exec startx -- -nocursor
fi
BRC
fi

sudo systemctl daemon-reload
sudo systemctl enable getty@tty1.service
echo "Done. Reboot to try the kiosk."

195
scripts/styx_prep.sh Executable file
View File

@ -0,0 +1,195 @@
#!/usr/bin/env bash
set -euo pipefail

# === CONFIG ===
# Each value can be overridden from the environment (the script re-executes
# itself with `sudo -E`); the defaults are unchanged.
STYX_USER="${STYX_USER:-styx}"
# Temporary password only: change it at first login, or export STYX_PASS so
# the real value does not live in this file.
STYX_PASS="${STYX_PASS:-TempPass#123}"
STYX_HOSTNAME="${STYX_HOSTNAME:-styx}"
# e.g., 'ssh-ed25519 AAAA... your@host' (optional)
SSH_PUBKEY="${SSH_PUBKEY:-}"
# === helpers ===
require_root() {
  # Already root: nothing to do.
  [[ $EUID -eq 0 ]] && return 0
  # Otherwise replace this process with a sudo re-run of the script, keeping
  # the environment and forwarding whatever arguments this function was given.
  exec sudo -E "$0" "$@"
}
ensure_binfmt_arm64() {
  # Register the arm64 binfmt handler through Docker unless it is already present.
  local handler=/proc/sys/fs/binfmt_misc/qemu-aarch64
  if [[ -e "$handler" ]]; then
    return 0
  fi
  if ! command -v docker >/dev/null; then
    echo "Docker required to register binfmt (sudo pacman -S docker)"
    exit 1
  fi
  # Starting the daemon is best effort; the docker run below fails loudly if it is not up.
  sudo systemctl enable --now docker >/dev/null 2>&1 || true
  sudo docker run --rm --privileged tonistiigi/binfmt --install arm64
}
find_parts() {
  # Locate the image's partitions by filesystem label and store their device
  # paths in the globals BOOT and ROOT (first match of each).
  BOOT=$(lsblk -o LABEL,PATH -nr | awk '$1=="system-boot"{print $2}' | head -n1)
  ROOT=$(lsblk -o LABEL,PATH -nr | awk '$1=="writable"{print $2}' | head -n1)
  if [[ -n "${BOOT:-}" && -n "${ROOT:-}" ]]; then
    return 0
  fi
  # At least one label is missing: show what is attached and abort.
  echo "Could not find 'system-boot'/'writable' on any device."
  lsblk -o NAME,SIZE,FSTYPE,LABEL,PATH -nr
  exit 1
}
mount_parts() {
  # Mount the target root and boot partitions and prepare the root for chroot use.
  local root_mnt=/mnt/pi-root
  local boot_mnt=/mnt/pi-boot
  local resolv="$root_mnt/etc/resolv.conf"

  mkdir -p "$boot_mnt" "$root_mnt"
  mount "$ROOT" "$root_mnt"
  mount "$BOOT" "$boot_mnt"

  # Bind only what we need (avoid /run to prevent postinst fights)
  for d in dev dev/pts proc sys; do
    mount --bind "/$d" "$root_mnt/$d"
  done

  # A symlinked or missing resolv.conf is replaced by a copy of the host's
  # file so name resolution works inside the chroot.
  # NOTE(review): the original symlink is not restored afterwards, so the
  # image boots with this static copy; confirm that is intended.
  if [[ -L "$resolv" || ! -e "$resolv" ]]; then
    rm -f "$resolv"
    cat /etc/resolv.conf > "$resolv"
  fi
}
prep_chroot() {
  # Provision the ARM64 root filesystem from inside a chroot: upgrade, remove
  # snaps, install packages, create the user, set hostname, enable SSH.
  #
  # STYX_USER / STYX_PASS / STYX_HOSTNAME / SSH_PUBKEY are handed to the
  # chroot shell as environment variables and expanded there inside double
  # quotes. They are no longer pasted into the script text, so quotes, '&'
  # or other shell metacharacters in a value cannot corrupt the commands.

  # Block service starts inside chroot (no systemd there)
  cat >/mnt/pi-root/usr/sbin/policy-rc.d <<'EOF'
#!/bin/sh
exit 101
EOF
  chmod +x /mnt/pi-root/usr/sbin/policy-rc.d

  # All the work happens inside the ARM64 rootfs. The heredoc is quoted, so
  # every $VAR below is expanded by the chroot shell, not by this one.
  local CHCMD
  CHCMD=$(cat <<'EOS'
set -euo pipefail
export DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC
# Ensure sbin is in PATH so user/group tools work
export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"

apt-get update
apt-get -y full-upgrade

# Remove snaps and keep them gone (Ubuntu for Pi ships with snaps)
apt-get -y purge snapd || true
rm -rf /snap /var/snap /var/lib/snapd /home/*/snap || true
mkdir -p /etc/apt/preferences.d
printf 'Package: snapd\nPin: release *\nPin-Priority: -10\n' > /etc/apt/preferences.d/nosnap.pref

# Ensure user/group tools exist
apt-get install -y passwd adduser || true
getent group i2c >/dev/null || /usr/sbin/groupadd i2c

# Base packages
BASE_PKGS="openssh-server git i2c-tools python3-smbus python3-pil zbar-tools qrencode lm-sensors"
apt-get install -y $BASE_PKGS

# ------- OLED (Luma) -------
# Prefer distro package; fall back to pip if not present in this release
if ! dpkg -s python3-luma.oled >/dev/null 2>&1; then
  apt-get update
  if ! apt-get install -y python3-luma.oled; then
    apt-get install -y python3-pip
    pip3 install --no-input --break-system-packages luma.oled
  fi
fi

# ------- Camera apps -------
# Ubuntu renamed libcamera-apps -> rpicam-apps for Raspberry Pi.
# Try in order; tolerate absence (the box might be display-only).
apt-get update
if ! apt-get install -y rpicam-apps; then
  apt-get install -y libcamera-apps || apt-get install -y libcamera-tools || true
fi

# Enable SSH on boot (no systemctl in chroot)
mkdir -p /etc/systemd/system/multi-user.target.wants
ln -sf /lib/systemd/system/ssh.service /etc/systemd/system/multi-user.target.wants/ssh.service

# Create user and set password
if ! id -u "$STYX_USER" >/dev/null 2>&1; then
  /usr/sbin/useradd -m -s /bin/bash -G sudo,video,i2c "$STYX_USER"
fi
printf '%s:%s\n' "$STYX_USER" "$STYX_PASS" | /usr/sbin/chpasswd

# Optional: preload SSH key (added only if not already present, so re-runs
# do not duplicate the entry)
if [[ -n "$SSH_PUBKEY" && "$SSH_PUBKEY" == *ssh-* ]]; then
  install -d -m700 "/home/$STYX_USER/.ssh"
  touch "/home/$STYX_USER/.ssh/authorized_keys"
  grep -qxF -- "$SSH_PUBKEY" "/home/$STYX_USER/.ssh/authorized_keys" \
    || printf '%s\n' "$SSH_PUBKEY" >> "/home/$STYX_USER/.ssh/authorized_keys"
  chmod 600 "/home/$STYX_USER/.ssh/authorized_keys"
  chown -R "$STYX_USER:$STYX_USER" "/home/$STYX_USER/.ssh"
fi

# Freenove code (clone failure, e.g. directory already exists, is tolerated)
git clone https://github.com/Freenove/Freenove_Computer_Case_Kit_for_Raspberry_Pi.git /opt/freenove || true

# Hostname
printf '%s\n' "$STYX_HOSTNAME" > /etc/hostname
if grep -q '^127\.0\.1\.1' /etc/hosts; then
  sed -i "s/^127\.0\.1\.1.*/127.0.1.1\t${STYX_HOSTNAME}/" /etc/hosts
else
  printf '127.0.1.1\t%s\n' "$STYX_HOSTNAME" >> /etc/hosts
fi

apt-get clean
EOS
)

  # Pass configuration through the environment of the chroot'ed shell.
  STYX_USER="$STYX_USER" \
  STYX_PASS="$STYX_PASS" \
  STYX_HOSTNAME="$STYX_HOSTNAME" \
  SSH_PUBKEY="$SSH_PUBKEY" \
    chroot /mnt/pi-root /bin/bash -lc "$CHCMD"
}
install_service_host() {
  # Write the Freenove unit into the mounted rootfs and enable it by creating
  # the multi-user.target.wants symlink by hand (no systemctl here).
  local unit_dir=/mnt/pi-root/etc/systemd/system
  local wants_dir="$unit_dir/multi-user.target.wants"

  mkdir -p "$wants_dir"
  cat >"$unit_dir/freenove-case.service" <<'SERVICE'
[Unit]
Description=Freenove Case OLED/Fans/LEDs
After=multi-user.target
[Service]
Type=simple
ExecStart=/usr/bin/python3 /opt/freenove/Code/application.py
Restart=on-failure
[Install]
WantedBy=multi-user.target
SERVICE

  # The link target is the path as seen from the booted system, not from the host.
  ln -sf /etc/systemd/system/freenove-case.service \
    "$wants_dir/freenove-case.service" || true
}
boot_tweaks() {
  # Adjust files on the mounted BOOT partition: enable I2C in config.txt and
  # add the video= settings to the kernel command line.
  local config=/mnt/pi-boot/config.txt
  local cmdline=/mnt/pi-boot/cmdline.txt

  if ! grep -q 'dtparam=i2c_arm=on' "$config"; then
    echo 'dtparam=i2c_arm=on' >> "$config"
  fi

  # Already present: leave the kernel cmdline untouched.
  if grep -q 'DSI-1:800x480@60D' "$cmdline" 2>/dev/null; then
    return 0
  fi
  # Append to the end of line 1; a failing sed is tolerated.
  sed -i '1 s#$# video=DSI-1:800x480@60D video=HDMI-A-1:off video=HDMI-A-2:off#' "$cmdline" || true
}
cleanup() {
  # Undo mount_parts/prep_chroot. Every step is best effort so the function
  # can run on a partially set up tree.
  local rel
  local -a mount_points=(pi-root/dev/pts pi-root/dev pi-root/proc pi-root/sys pi-boot pi-root)

  rm -f /mnt/pi-root/usr/sbin/policy-rc.d || true
  # Nested bind mounts first, then boot, then the root itself.
  for rel in "${mount_points[@]}"; do
    umount -lf "/mnt/$rel" 2>/dev/null || true
  done
  sync || true
}
main() {
  # Entry point: become root, make sure arm64 emulation is registered, find
  # and mount the image, provision it, then unmount.
  require_root "$@"   # forward CLI arguments to the sudo re-exec
  ensure_binfmt_arm64
  find_parts

  # errtrace makes the ERR trap fire for failures inside the helper functions
  # too; without it only commands directly in main would trigger it.
  set -o errtrace
  trap 'echo "ERROR at line $LINENO" >&2' ERR
  # Cleanup is tied to EXIT so it runs on every exit path (set -e abort,
  # explicit exit, or signal).
  trap cleanup EXIT
  # On Ctrl-C / TERM stop the script instead of resuming after the handler;
  # the EXIT trap then unmounts.
  trap 'exit 130' INT TERM

  mount_parts
  prep_chroot
  install_service_host
  boot_tweaks
  cleanup
  trap - EXIT   # already cleaned up; do not run it a second time

  echo "✅ Done. Move the NVMe to the Pi and boot."
  echo " Login: user '${STYX_USER}' pass '${STYX_PASS}' (change with 'passwd')."
  echo " Quick checks on the Pi:"
  echo " sudo i2cdetect -y 1"
  echo " rpicam-still -n -o test.jpg # (if rpicam-apps installed)"
  echo " libcamera-still -n -o test.jpg # (if legacy libcamera-apps installed)"
  echo " systemctl status freenove-case"
}
main "$@"

View File

@ -75,4 +75,4 @@ spec:
claimName: jellyfin-cache-astreae
- name: media
persistentVolumeClaim:
claimName: jellyfin-media-asteria
claimName: jellyfin-media-asteria-new

View File

@ -38,3 +38,17 @@ spec:
requests:
storage: 4Ti
storageClassName: asteria
---
# Replacement media claim; the jellyfin and pegasus volume definitions in this
# change now reference this claim name.
# NOTE(review): the previous jellyfin-media-asteria claim is not shown being
# removed; confirm whether it should stay and how its data is migrated.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: jellyfin-media-asteria-new
namespace: jellyfin
spec:
accessModes: ["ReadWriteMany"]
resources:
requests:
storage: 4Ti
storageClassName: asteria

View File

@ -0,0 +1,206 @@
# services/monitoring/kube-state-metrics-helmrelease.yaml
# Installs kube-state-metrics from the prometheus-community HelmRepository.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: kube-state-metrics
namespace: monitoring
spec:
interval: 15m
chart:
spec:
chart: kube-state-metrics
version: "~6.0.0"
sourceRef:
kind: HelmRepository
name: prometheus-community
namespace: flux-system
values:
# NOTE(review): prometheusScrape and the explicit service annotations below
# both opt this Service into annotation-based scraping; one is likely enough.
prometheusScrape: true # annotates for /metrics auto-scrape
service:
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080" # ksm serves metrics on 8080 by default
prometheus.io/path: "/metrics"
---
# Installs prometheus-node-exporter from the prometheus-community HelmRepository.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: node-exporter
namespace: monitoring
spec:
interval: 15m
chart:
spec:
chart: prometheus-node-exporter
version: "~4.0.0"
sourceRef:
kind: HelmRepository
name: prometheus-community
namespace: flux-system
values:
service:
# Opts the Service into the annotation-driven "kubernetes-service-endpoints" scrape job.
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9100"
---
# Installs single-node VictoriaMetrics and configures its built-in scraper.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: victoria-metrics-single
namespace: monitoring
spec:
interval: 15m
chart:
spec:
chart: victoria-metrics-single
version: "~0.15.0" # semver range for the chart; NOTE(review): omitting it should track the latest chart version, not appVersion — confirm in the Flux docs
sourceRef:
kind: HelmRepository
name: victoria-metrics
namespace: flux-system
values:
server:
# keep ~3 months; change as you like (supports "d", "y")
extraArgs:
retentionPeriod: "90d" # VM flag -retentionPeriod=90d
persistentVolume:
enabled: true
size: 100Gi # adjust; uses default StorageClass (Longhorn)
# storageClassName: "" # set if you want a specific class
# Enable built-in Kubernetes scraping
scrape:
enabled: true # turns on the scrape config below
config:
global:
scrape_interval: 15s
scrape_configs:
# VM self-metrics
- job_name: victoriametrics
static_configs:
- targets: ["localhost:8428"]
# --- K8s control-plane & nodes (from VM docs guide) ---
# NOTE(review): the three jobs below set ca_file and also
# insecure_skip_verify: true, which disables the verification the CA file
# is there for; confirm whether the skip is still needed.
- job_name: "kubernetes-apiservers"
kubernetes_sd_configs: [{ role: endpoints }]
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
# Keep only the default/kubernetes Service's "https" endpoint port.
- action: keep
source_labels: [__meta_kubernetes_namespace,__meta_kubernetes_service_name,__meta_kubernetes_endpoint_port_name]
regex: default;kubernetes;https
- job_name: "kubernetes-nodes"
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs: [{ role: node }]
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
# Scrape each node through the API server proxy instead of directly.
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/$1/proxy/metrics
- job_name: "kubernetes-nodes-cadvisor"
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs: [{ role: node }]
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
# --- Annotated Services (generic autodiscovery) ---
- job_name: "kubernetes-service-endpoints"
kubernetes_sd_configs: [{ role: endpoints }]
relabel_configs:
- action: keep
source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
regex: "true"
- action: replace
source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
regex: (https?)
target_label: __scheme__
- action: replace
source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
target_label: __metrics_path__
# Rewrites host:port to host:<annotated port>.
# NOTE(review): this regex requires the address to carry a port already;
# the usual upstream form is ([^:]+)(?::\d+)?;(\d+) — confirm the difference is intended.
- action: replace
regex: (.+)(?::\d+);(\d+)
replacement: $1:$2
source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
target_label: __address__
# --- Annotated Pods (generic autodiscovery) ---
- job_name: "kubernetes-pods"
kubernetes_sd_configs: [{ role: pod }]
relabel_configs:
- action: keep
source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
regex: "true"
- action: replace
source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
target_label: __metrics_path__
# NOTE(review): as above, an address without a port does not match and is left unchanged.
- action: replace
regex: (.+):(?:\d+);(\d+)
replacement: $1:$2
source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
target_label: __address__
# --- kube-state-metrics (via its Service) ---
# NOTE(review): the kube-state-metrics Service is also annotated for the
# "kubernetes-service-endpoints" job, so it is probably scraped twice under
# two job labels; keep one of the two paths.
- job_name: "kube-state-metrics"
kubernetes_sd_configs: [{ role: endpoints }]
relabel_configs:
- action: keep
source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name]
regex: kube-state-metrics
# --- Longhorn ---
- job_name: "longhorn-backend"
static_configs:
- targets: ["longhorn-backend.longhorn-system.svc:9500"]
metrics_path: /metrics
# --- cert-manager (pods expose on 9402) ---
- job_name: "cert-manager"
kubernetes_sd_configs: [{ role: pod }]
relabel_configs:
- action: keep
source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app_kubernetes_io_name]
regex: cert-manager;cert-manager
# Force the scrape port to 9402 regardless of the discovered port.
- action: replace
source_labels: [__address__]
regex: "(.+):\\d+"
replacement: "$1:9402"
target_label: __address__
# --- Flux controllers (default :8080/metrics) ---
# NOTE(review): this keeps every discovered port of matching pods and does not
# pin the port to 8080; also verify the controller pods carry the
# app.kubernetes.io/part-of=flux label, otherwise nothing is kept.
- job_name: "flux"
kubernetes_sd_configs: [{ role: pod }]
relabel_configs:
- action: keep
source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app_kubernetes_io_part_of]
regex: flux-system;flux

View File

@ -0,0 +1,8 @@
# services/monitoring/kustomization.yaml
# Kustomize entry point for the monitoring stack; sets the namespace on the
# listed resources.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: monitoring
resources:
- namespace.yaml
- rbac.yaml
# NOTE(review): the HelmRelease manifest in this change is headed
# "services/monitoring/kube-state-metrics-helmrelease.yaml"; confirm the file
# on disk is really named helmrelease.yaml, otherwise the build fails.
- helmrelease.yaml

View File

@ -0,0 +1,4 @@
# services/monitoring/namespace.yaml
# Namespace that holds the monitoring HelmReleases.
apiVersion: v1
kind: Namespace
metadata:
name: monitoring

View File

@ -0,0 +1,33 @@
# services/monitoring/rbac.yaml
# Read-only permissions the VictoriaMetrics scraper needs for Kubernetes
# service discovery and for scraping through the API server.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: vmsingle-scrape
rules:
  # Discovery of nodes/services/endpoints/pods, plus nodes/proxy for the
  # jobs that scrape kubelet and cAdvisor via the API server proxy path.
  - apiGroups: [""]
    resources:
      - nodes
      - nodes/proxy
      - nodes/metrics
      - services
      - endpoints
      - pods
    verbs: ["get", "list", "watch"]
  - apiGroups: ["networking.k8s.io", "extensions"]
    resources: ["ingresses"]
    verbs: ["get", "list", "watch"]
  # The kubernetes-apiservers job requests /metrics on the API server itself.
  # That is a non-resource URL, which the resource rules above do not cover.
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]
---
# Grants the vmsingle-scrape ClusterRole to the scraper's ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: vmsingle-scrape
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: vmsingle-scrape
subjects:
# NOTE(review): this must be the ServiceAccount the victoria-metrics-single pod
# actually runs as; charts often suffix the name (e.g. "-server"). Confirm with
# `kubectl -n monitoring get sa`.
- kind: ServiceAccount
name: victoria-metrics-single
namespace: monitoring

View File

@ -107,7 +107,7 @@ spec:
volumes:
- name: media
persistentVolumeClaim:
claimName: jellyfin-media-asteria
claimName: jellyfin-media-asteria-new
- name: config
configMap: { name: pegasus-user-map }
- name: tmp