Merge pull request 'feature/ariadne' (#11) from feature/ariadne into main

Reviewed-on: #11
bstein 2026-01-28 14:05:38 +00:00
commit cc51eb6d1e
171 changed files with 17181 additions and 1444 deletions

.gitignore (1 line changed)
View File

@ -6,4 +6,5 @@ __pycache__/
*.py[cod]
.pytest_cache
.venv
.venv-ci
tmp/

Jenkinsfile (new file, 77 lines changed)
View File

@ -0,0 +1,77 @@
// Mirror of ci/Jenkinsfile.titan-iac for multibranch discovery.
pipeline {
agent {
kubernetes {
defaultContainer 'python'
yaml """
apiVersion: v1
kind: Pod
spec:
nodeSelector:
hardware: rpi5
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
containers:
- name: python
image: python:3.12-slim
command:
- cat
tty: true
"""
}
}
environment {
PIP_DISABLE_PIP_VERSION_CHECK = '1'
PYTHONUNBUFFERED = '1'
}
stages {
stage('Checkout') {
steps {
checkout scm
}
}
stage('Install deps') {
steps {
sh 'pip install --no-cache-dir -r ci/requirements.txt'
}
}
stage('Glue tests') {
steps {
sh 'pytest -q ci/tests/glue'
}
}
stage('Resolve Flux branch') {
steps {
script {
env.FLUX_BRANCH = sh(
returnStdout: true,
script: "awk '/branch:/{print $2; exit}' clusters/atlas/flux-system/gotk-sync.yaml"
).trim()
if (!env.FLUX_BRANCH) {
error('Flux branch not found in gotk-sync.yaml')
}
echo "Flux branch: ${env.FLUX_BRANCH}"
}
}
}
stage('Promote') {
when {
expression {
def branch = env.BRANCH_NAME ?: (env.GIT_BRANCH ?: '').replaceFirst('origin/', '')
return env.FLUX_BRANCH && branch == env.FLUX_BRANCH
}
}
steps {
withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) {
sh '''
set +x
git config user.email "jenkins@bstein.dev"
git config user.name "jenkins"
git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git
git push origin HEAD:${FLUX_BRANCH}
'''
}
}
}
}
}
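
For context, a rough Python equivalent of the awk lookup in the "Resolve Flux branch" stage — not part of this PR, and it assumes gotk-sync.yaml follows the standard Flux bootstrap layout, where the file contains a GitRepository whose spec.ref.branch names the branch Flux reconciles:

# Sketch only: what the Resolve Flux branch stage extracts (assumed layout, not from this repo).
import yaml

with open("clusters/atlas/flux-system/gotk-sync.yaml", encoding="utf-8") as handle:
    documents = [doc for doc in yaml.safe_load_all(handle) if doc]

flux_branch = next(
    doc["spec"]["ref"]["branch"]
    for doc in documents
    if doc.get("kind") == "GitRepository"
)
print(flux_branch)  # the branch the Promote stage pushes to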

View File

@ -6,6 +6,10 @@ pipeline {
apiVersion: v1
kind: Pod
spec:
nodeSelector:
hardware: rpi5
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
containers:
- name: python
image: python:3.12-slim
@ -18,7 +22,6 @@ spec:
environment {
PIP_DISABLE_PIP_VERSION_CHECK = '1'
PYTHONUNBUFFERED = '1'
DEPLOY_BRANCH = 'deploy'
}
stages {
stage('Checkout') {
@ -36,7 +39,27 @@ spec:
sh 'pytest -q ci/tests/glue'
}
}
stage('Resolve Flux branch') {
steps {
script {
env.FLUX_BRANCH = sh(
returnStdout: true,
script: "awk '/branch:/{print $2; exit}' clusters/atlas/flux-system/gotk-sync.yaml"
).trim()
if (!env.FLUX_BRANCH) {
error('Flux branch not found in gotk-sync.yaml')
}
echo "Flux branch: ${env.FLUX_BRANCH}"
}
}
}
stage('Promote') {
when {
expression {
def branch = env.BRANCH_NAME ?: (env.GIT_BRANCH ?: '').replaceFirst('origin/', '')
return env.FLUX_BRANCH && branch == env.FLUX_BRANCH
}
}
steps {
withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) {
sh '''
@ -44,7 +67,7 @@ spec:
git config user.email "jenkins@bstein.dev"
git config user.name "jenkins"
git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git
git push origin HEAD:${DEPLOY_BRANCH}
git push origin HEAD:${FLUX_BRANCH}
'''
}
}

View File

@ -1,7 +1,16 @@
max_success_age_hours: 48
allow_suspended:
- bstein-dev-home/vaultwarden-cred-sync
- comms/othrys-room-reset
- comms/pin-othrys-invite
- comms/seed-othrys-room
- finance/firefly-user-sync
- health/wger-admin-ensure
- health/wger-user-sync
- mailu-mailserver/mailu-sync-nightly
- nextcloud/nextcloud-mail-sync
ariadne_schedule_tasks:
- schedule.mailu_sync
- schedule.nextcloud_sync
- schedule.vaultwarden_sync
- schedule.wger_admin

View File

@ -1,11 +1,19 @@
from __future__ import annotations
import os
from pathlib import Path
import requests
import yaml
VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server:8428").rstrip("/")
CONFIG_PATH = Path(__file__).with_name("config.yaml")
def _load_config() -> dict:
with CONFIG_PATH.open("r", encoding="utf-8") as handle:
return yaml.safe_load(handle) or {}
def _query(promql: str) -> list[dict]:
@ -27,3 +35,14 @@ def test_glue_metrics_success_join():
)
series = _query(query)
assert series, "No glue cronjob last success series found"
def test_ariadne_schedule_metrics_present():
cfg = _load_config()
expected = cfg.get("ariadne_schedule_tasks", [])
if not expected:
return
series = _query("ariadne_schedule_next_run_timestamp_seconds")
tasks = {item.get("metric", {}).get("task") for item in series}
missing = [task for task in expected if task not in tasks]
assert not missing, f"Missing Ariadne schedule metrics for: {', '.join(missing)}"
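
The body of _query is elided by the hunk above; a minimal sketch of what such a helper typically looks like against VictoriaMetrics' Prometheus-compatible HTTP API (an assumption about its shape, not the file's actual implementation):

def _query(promql: str) -> list[dict]:
    # Hypothetical body: instant query against the Prometheus-compatible endpoint.
    response = requests.get(f"{VM_URL}/api/v1/query", params={"query": promql}, timeout=30)
    response.raise_for_status()
    payload = response.json()
    assert payload.get("status") == "success", payload
    return payload.get("data", {}).get("result", [])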

View File

@ -0,0 +1,17 @@
# clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: bstein-dev-home-migrations
namespace: flux-system
spec:
interval: 10m
path: ./services/bstein-dev-home/oneoffs/migrations
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: bstein-dev-home
wait: false
suspend: true

View File

@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: bstein-dev-home
namespace: flux-system
namespace: bstein-dev-home
spec:
interval: 1m0s
sourceRef:
@ -13,14 +13,14 @@ spec:
git:
checkout:
ref:
branch: feature/vault-consumption
branch: feature/ariadne
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(bstein-dev-home): update images to {{range .Updated.Images}}{{.}}{{end}}"
messageTemplate: "chore(bstein-dev-home): automated image update"
push:
branch: feature/vault-consumption
branch: feature/ariadne
update:
strategy: Setters
path: services/bstein-dev-home

View File

@ -13,11 +13,6 @@ spec:
kind: GitRepository
name: flux-system
namespace: flux-system
healthChecks:
- apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
name: harbor
namespace: harbor
wait: false
dependsOn:
- name: core

View File

@ -12,6 +12,7 @@ resources:
- pegasus/image-automation.yaml
- bstein-dev-home/kustomization.yaml
- bstein-dev-home/image-automation.yaml
- bstein-dev-home-migrations/kustomization.yaml
- harbor/kustomization.yaml
- harbor/image-automation.yaml
- jellyfin/kustomization.yaml

View File

@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: pegasus
namespace: flux-system
namespace: jellyfin
spec:
interval: 1m0s
sourceRef:

View File

@ -11,6 +11,7 @@ resources:
- monitoring/kustomization.yaml
- logging/kustomization.yaml
- maintenance/kustomization.yaml
- maintenance/image-automation.yaml
- longhorn-adopt/kustomization.yaml
- longhorn/kustomization.yaml
- longhorn-ui/kustomization.yaml

View File

@ -0,0 +1,26 @@
# clusters/atlas/flux-system/platform/maintenance/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: maintenance
namespace: maintenance
spec:
interval: 1m0s
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
git:
checkout:
ref:
branch: feature/ariadne
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(maintenance): automated image update"
push:
branch: feature/ariadne
update:
strategy: Setters
path: services/maintenance

View File

@ -8,6 +8,7 @@ spec:
interval: 10m
path: ./services/maintenance
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system

View File

@ -32,6 +32,9 @@ data:
192.168.22.9 notes.bstein.dev
192.168.22.9 office.bstein.dev
192.168.22.9 pegasus.bstein.dev
3.136.224.193 pm-bounces.bstein.dev
3.150.68.49 pm-bounces.bstein.dev
18.189.137.81 pm-bounces.bstein.dev
192.168.22.9 registry.bstein.dev
192.168.22.9 scm.bstein.dev
192.168.22.9 secret.bstein.dev

View File

@ -6,5 +6,6 @@ resources:
- ../modules/profiles/atlas-ha
- coredns-custom.yaml
- coredns-deployment.yaml
- ntp-sync-daemonset.yaml
- ../sources/cert-manager/letsencrypt.yaml
- ../sources/cert-manager/letsencrypt-prod.yaml

View File

@ -0,0 +1,50 @@
# infrastructure/core/ntp-sync-daemonset.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: ntp-sync
namespace: kube-system
labels:
app: ntp-sync
spec:
selector:
matchLabels:
app: ntp-sync
template:
metadata:
labels:
app: ntp-sync
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node-role.kubernetes.io/control-plane
operator: DoesNotExist
- key: node-role.kubernetes.io/master
operator: DoesNotExist
containers:
- name: ntp-sync
image: public.ecr.aws/docker/library/busybox:1.36.1
imagePullPolicy: IfNotPresent
command: ["/bin/sh", "-c"]
args:
- |
set -eu
while true; do
ntpd -q -p pool.ntp.org || true
sleep 300
done
securityContext:
capabilities:
add: ["SYS_TIME"]
runAsUser: 0
runAsGroup: 0
resources:
requests:
cpu: 10m
memory: 16Mi
limits:
cpu: 50m
memory: 64Mi

View File

@ -11,7 +11,7 @@ spec:
roleName: "longhorn"
objects: |
- objectName: "harbor-pull__dockerconfigjson"
secretPath: "kv/data/atlas/harbor-pull/longhorn"
secretPath: "kv/data/atlas/shared/harbor-pull"
secretKey: "dockerconfigjson"
secretObjects:
- secretName: longhorn-registry

View File

@ -4,6 +4,10 @@ kind: Service
metadata:
name: postgres-service
namespace: postgres
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9187"
prometheus.io/path: "/metrics"
spec:
clusterIP: None
ports:
@ -11,5 +15,9 @@ spec:
port: 5432
protocol: TCP
targetPort: 5432
- name: metrics
port: 9187
protocol: TCP
targetPort: 9187
selector:
app: postgres

View File

@ -58,6 +58,23 @@ spec:
- name: vault-secrets
mountPath: /mnt/vault
readOnly: true
- name: postgres-exporter
image: quay.io/prometheuscommunity/postgres-exporter:v0.15.0
ports:
- name: metrics
containerPort: 9187
protocol: TCP
env:
- name: DATA_SOURCE_URI
value: "localhost:5432/postgres?sslmode=disable"
- name: DATA_SOURCE_USER
value: postgres
- name: DATA_SOURCE_PASS_FILE
value: /mnt/vault/postgres_password
volumeMounts:
- name: vault-secrets
mountPath: /mnt/vault
readOnly: true
volumes:
- name: vault-secrets
csi:
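
A hypothetical follow-up glue test, not included in this PR, that would confirm the new exporter sidecar is actually scraped once the prometheus.io annotations above take effect; it assumes the _query helper from ci/tests/glue and the standard pg_up metric exposed by postgres-exporter:

def test_postgres_exporter_up():
    # pg_up is postgres-exporter's liveness gauge: 1 when it can reach PostgreSQL.
    series = _query('pg_up{namespace="postgres"}')
    assert series, "postgres-exporter metrics not found in VictoriaMetrics"
    assert all(float(item["value"][1]) == 1 for item in series), "pg_up reports the database as down"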

View File

@ -5,7 +5,7 @@ metadata:
name: letsencrypt-prod
spec:
acme:
email: brad.stein@gmail.com
email: brad@bstein.dev
server: https://acme-v02.api.letsencrypt.org/directory
privateKeySecretRef:
name: letsencrypt-prod-account-key

View File

@ -5,7 +5,7 @@ metadata:
name: letsencrypt
spec:
acme:
email: brad.stein@gmail.com
email: brad@bstein.dev
server: https://acme-v02.api.letsencrypt.org/directory
privateKeySecretRef:
name: letsencrypt-account-key

View File

@ -17,4 +17,5 @@ spec:
values:
syncSecret:
enabled: true
enableSecretRotation: false
enableSecretRotation: true
rotationPollInterval: 2m

View File

@ -1,8 +1,8 @@
{
"counts": {
"helmrelease_host_hints": 17,
"http_endpoints": 37,
"services": 43,
"workloads": 54
"helmrelease_host_hints": 19,
"http_endpoints": 45,
"services": 47,
"workloads": 74
}
}

File diff suppressed because it is too large

View File

@ -8,6 +8,15 @@ sources:
- name: bstein-dev-home
path: services/bstein-dev-home
targetNamespace: bstein-dev-home
- name: bstein-dev-home-migrations
path: services/bstein-dev-home/migrations
targetNamespace: bstein-dev-home
- name: cert-manager
path: infrastructure/cert-manager
targetNamespace: cert-manager
- name: cert-manager-cleanup
path: infrastructure/cert-manager/cleanup
targetNamespace: cert-manager
- name: comms
path: services/comms
targetNamespace: comms
@ -17,6 +26,9 @@ sources:
- name: crypto
path: services/crypto
targetNamespace: crypto
- name: finance
path: services/finance
targetNamespace: finance
- name: flux-system
path: clusters/atlas/flux-system
targetNamespace: null
@ -29,6 +41,9 @@ sources:
- name: harbor
path: services/harbor
targetNamespace: harbor
- name: health
path: services/health
targetNamespace: health
- name: helm
path: infrastructure/sources/helm
targetNamespace: flux-system
@ -44,6 +59,12 @@ sources:
- name: logging
path: services/logging
targetNamespace: null
- name: longhorn
path: infrastructure/longhorn/core
targetNamespace: longhorn-system
- name: longhorn-adopt
path: infrastructure/longhorn/adopt
targetNamespace: longhorn-system
- name: longhorn-ui
path: infrastructure/longhorn/ui-ingress
targetNamespace: longhorn-system
@ -98,9 +119,15 @@ sources:
- name: vault-csi
path: infrastructure/vault-csi
targetNamespace: kube-system
- name: vault-injector
path: infrastructure/vault-injector
targetNamespace: vault
- name: vaultwarden
path: services/vaultwarden
targetNamespace: vaultwarden
- name: wallet-monero-temp
path: services/crypto/wallet-monero-temp
targetNamespace: crypto
- name: xmr-miner
path: services/crypto/xmr-miner
targetNamespace: crypto
@ -124,7 +151,7 @@ workloads:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
images:
- registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92
- registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-157
- kind: Deployment
namespace: bstein-dev-home
name: bstein-dev-home-frontend
@ -135,13 +162,22 @@ workloads:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
images:
- registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92
- registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-157
- kind: Deployment
namespace: bstein-dev-home
name: bstein-dev-home-vault-sync
labels:
app: bstein-dev-home-vault-sync
serviceAccountName: bstein-dev-home-vault-sync
nodeSelector: {}
images:
- alpine:3.20
- kind: Deployment
namespace: bstein-dev-home
name: chat-ai-gateway
labels:
app: chat-ai-gateway
serviceAccountName: null
serviceAccountName: bstein-dev-home
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
@ -157,12 +193,21 @@ workloads:
hardware: rpi5
images:
- python:3.11-slim
- kind: Deployment
namespace: comms
name: comms-vault-sync
labels:
app: comms-vault-sync
serviceAccountName: comms-vault
nodeSelector: {}
images:
- alpine:3.20
- kind: Deployment
namespace: comms
name: coturn
labels:
app: coturn
serviceAccountName: null
serviceAccountName: comms-vault
nodeSelector:
hardware: rpi5
images:
@ -182,7 +227,7 @@ workloads:
name: livekit
labels:
app: livekit
serviceAccountName: null
serviceAccountName: comms-vault
nodeSelector:
hardware: rpi5
images:
@ -192,17 +237,17 @@ workloads:
name: livekit-token-service
labels:
app: livekit-token-service
serviceAccountName: null
serviceAccountName: comms-vault
nodeSelector:
hardware: rpi5
images:
- ghcr.io/element-hq/lk-jwt-service:0.3.0
- registry.bstein.dev/tools/lk-jwt-service-vault:0.3.0
- kind: Deployment
namespace: comms
name: matrix-authentication-service
labels:
app: matrix-authentication-service
serviceAccountName: null
serviceAccountName: comms-vault
nodeSelector:
hardware: rpi5
images:
@ -212,7 +257,7 @@ workloads:
name: matrix-guest-register
labels:
app.kubernetes.io/name: matrix-guest-register
serviceAccountName: null
serviceAccountName: comms-vault
nodeSelector: {}
images:
- python:3.11-slim
@ -235,12 +280,21 @@ workloads:
node-role.kubernetes.io/worker: 'true'
images:
- ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9
- kind: Deployment
namespace: crypto
name: crypto-vault-sync
labels:
app: crypto-vault-sync
serviceAccountName: crypto-vault-sync
nodeSelector: {}
images:
- alpine:3.20
- kind: Deployment
namespace: crypto
name: monero-p2pool
labels:
app: monero-p2pool
serviceAccountName: null
serviceAccountName: crypto-vault-sync
nodeSelector:
node-role.kubernetes.io/worker: 'true'
images:
@ -255,6 +309,38 @@ workloads:
node-role.kubernetes.io/worker: 'true'
images:
- registry.bstein.dev/crypto/monerod:0.18.4.1
- kind: Deployment
namespace: crypto
name: wallet-monero-temp
labels:
app: wallet-monero-temp
serviceAccountName: crypto-vault-sync
nodeSelector:
node-role.kubernetes.io/worker: 'true'
images:
- registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1
- kind: Deployment
namespace: finance
name: actual-budget
labels:
app: actual-budget
serviceAccountName: finance-vault
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
images:
- actualbudget/actual-server:26.1.0-alpine@sha256:34aae5813fdfee12af2a50c4d0667df68029f1d61b90f45f282473273eb70d0d
- kind: Deployment
namespace: finance
name: firefly
labels:
app: firefly
serviceAccountName: finance-vault
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
images:
- fireflyiii/core:version-6.4.15
- kind: Deployment
namespace: flux-system
name: helm-controller
@ -344,17 +430,38 @@ workloads:
name: gitea
labels:
app: gitea
serviceAccountName: null
serviceAccountName: gitea-vault
nodeSelector:
node-role.kubernetes.io/worker: 'true'
images:
- gitea/gitea:1.23
- kind: Deployment
namespace: harbor
name: harbor-vault-sync
labels:
app: harbor-vault-sync
serviceAccountName: harbor-vault-sync
nodeSelector: {}
images:
- alpine:3.20
- kind: Deployment
namespace: health
name: wger
labels:
app: wger
serviceAccountName: health-vault-sync
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
images:
- nginx:1.27.5-alpine@sha256:65645c7bb6a0661892a8b03b89d0743208a18dd2f3f17a54ef4b76fb8e2f2a10
- wger/server@sha256:710588b78af4e0aa0b4d8a8061e4563e16eae80eeaccfe7f9e0d9cbdd7f0cbc5
- kind: Deployment
namespace: jellyfin
name: jellyfin
labels:
app: jellyfin
serviceAccountName: null
serviceAccountName: pegasus-vault-sync
nodeSelector: {}
images:
- docker.io/jellyfin/jellyfin:10.11.5
@ -363,13 +470,22 @@ workloads:
name: pegasus
labels:
app: pegasus
serviceAccountName: null
serviceAccountName: pegasus-vault-sync
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
images:
- alpine:3.20
- registry.bstein.dev/streaming/pegasus:1.2.32
- registry.bstein.dev/streaming/pegasus-vault:1.2.32
- kind: Deployment
namespace: jellyfin
name: pegasus-vault-sync
labels:
app: pegasus-vault-sync
serviceAccountName: pegasus-vault-sync
nodeSelector: {}
images:
- alpine:3.20
- kind: Deployment
namespace: jenkins
name: jenkins
@ -381,6 +497,26 @@ workloads:
node-role.kubernetes.io/worker: 'true'
images:
- jenkins/jenkins:2.528.3-jdk21
- kind: Deployment
namespace: jenkins
name: jenkins-vault-sync
labels:
app: jenkins-vault-sync
serviceAccountName: jenkins-vault-sync
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
images:
- alpine:3.20
- kind: DaemonSet
namespace: kube-system
name: ntp-sync
labels:
app: ntp-sync
serviceAccountName: null
nodeSelector: {}
images:
- public.ecr.aws/docker/library/busybox:1.36.1
- kind: DaemonSet
namespace: kube-system
name: nvidia-device-plugin-jetson
@ -427,6 +563,16 @@ workloads:
kubernetes.io/os: linux
images:
- hashicorp/vault-csi-provider:1.7.0
- kind: Deployment
namespace: kube-system
name: coredns
labels:
k8s-app: kube-dns
serviceAccountName: coredns
nodeSelector:
kubernetes.io/os: linux
images:
- registry.bstein.dev/infra/coredns:1.12.1
- kind: DaemonSet
namespace: logging
name: node-image-gc-rpi4
@ -457,22 +603,41 @@ workloads:
hardware: rpi5
images:
- bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
- kind: Deployment
namespace: logging
name: logging-vault-sync
labels:
app: logging-vault-sync
serviceAccountName: logging-vault-sync
nodeSelector: {}
images:
- alpine:3.20
- kind: Deployment
namespace: logging
name: oauth2-proxy-logs
labels:
app: oauth2-proxy-logs
serviceAccountName: null
serviceAccountName: logging-vault-sync
nodeSelector:
node-role.kubernetes.io/worker: 'true'
images:
- quay.io/oauth2-proxy/oauth2-proxy:v7.6.0
- registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0
- kind: Deployment
namespace: longhorn-system
name: longhorn-vault-sync
labels:
app: longhorn-vault-sync
serviceAccountName: longhorn-vault-sync
nodeSelector:
node-role.kubernetes.io/worker: 'true'
images:
- alpine:3.20
- kind: Deployment
namespace: longhorn-system
name: oauth2-proxy-longhorn
labels:
app: oauth2-proxy-longhorn
serviceAccountName: null
serviceAccountName: longhorn-vault
nodeSelector:
node-role.kubernetes.io/worker: 'true'
images:
@ -489,13 +654,34 @@ workloads:
- registry.bstein.dev/bstein/kubectl:1.35.0
- kind: Deployment
namespace: mailu-mailserver
name: mailu-sync-listener
name: mailu-vault-sync
labels:
app: mailu-sync-listener
serviceAccountName: null
app: mailu-vault-sync
serviceAccountName: mailu-vault-sync
nodeSelector: {}
images:
- python:3.11-alpine
- alpine:3.20
- kind: DaemonSet
namespace: maintenance
name: disable-k3s-traefik
labels:
app: disable-k3s-traefik
serviceAccountName: disable-k3s-traefik
nodeSelector:
node-role.kubernetes.io/control-plane: 'true'
images:
- bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
- kind: DaemonSet
namespace: maintenance
name: k3s-agent-restart
labels:
app: k3s-agent-restart
serviceAccountName: node-nofile
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
images:
- bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
- kind: DaemonSet
namespace: maintenance
name: node-image-sweeper
@ -515,6 +701,26 @@ workloads:
nodeSelector: {}
images:
- bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
- kind: Deployment
namespace: maintenance
name: ariadne
labels:
app: ariadne
serviceAccountName: ariadne
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
images:
- registry.bstein.dev/bstein/ariadne:0.1.0-49
- kind: Deployment
namespace: maintenance
name: maintenance-vault-sync
labels:
app: maintenance-vault-sync
serviceAccountName: maintenance-vault-sync
nodeSelector: {}
images:
- alpine:3.20
- kind: DaemonSet
namespace: monitoring
name: dcgm-exporter
@ -534,12 +740,21 @@ workloads:
jetson: 'true'
images:
- python:3.10-slim
- kind: Deployment
namespace: monitoring
name: monitoring-vault-sync
labels:
app: monitoring-vault-sync
serviceAccountName: monitoring-vault-sync
nodeSelector: {}
images:
- alpine:3.20
- kind: Deployment
namespace: monitoring
name: postmark-exporter
labels:
app: postmark-exporter
serviceAccountName: null
serviceAccountName: monitoring-vault-sync
nodeSelector: {}
images:
- python:3.12-alpine
@ -558,7 +773,7 @@ workloads:
name: nextcloud
labels:
app: nextcloud
serviceAccountName: null
serviceAccountName: nextcloud-vault
nodeSelector:
hardware: rpi5
images:
@ -568,7 +783,7 @@ workloads:
name: outline
labels:
app: outline
serviceAccountName: null
serviceAccountName: outline-vault
nodeSelector:
node-role.kubernetes.io/worker: 'true'
images:
@ -588,7 +803,7 @@ workloads:
name: planka
labels:
app: planka
serviceAccountName: null
serviceAccountName: planka-vault
nodeSelector:
node-role.kubernetes.io/worker: 'true'
images:
@ -603,13 +818,16 @@ workloads:
node-role.kubernetes.io/worker: 'true'
images:
- postgres:15
- quay.io/prometheuscommunity/postgres-exporter:v0.15.0
- kind: Deployment
namespace: sso
name: keycloak
labels:
app: keycloak
serviceAccountName: null
nodeSelector: {}
serviceAccountName: sso-vault
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
images:
- quay.io/keycloak/keycloak:26.0.7
- kind: Deployment
@ -617,17 +835,26 @@ workloads:
name: oauth2-proxy
labels:
app: oauth2-proxy
serviceAccountName: null
serviceAccountName: sso-vault
nodeSelector:
node-role.kubernetes.io/worker: 'true'
images:
- quay.io/oauth2-proxy/oauth2-proxy:v7.6.0
- registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0
- kind: Deployment
namespace: sso
name: sso-vault-sync
labels:
app: sso-vault-sync
serviceAccountName: sso-vault-sync
nodeSelector: {}
images:
- alpine:3.20
- kind: StatefulSet
namespace: sso
name: openldap
labels:
app: openldap
serviceAccountName: null
serviceAccountName: sso-vault
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
@ -640,7 +867,7 @@ workloads:
app: sui-metrics
serviceAccountName: sui-metrics
nodeSelector:
kubernetes.io/hostname: titan-24
hardware: rpi5
images:
- victoriametrics/vmagent:v1.103.0
- kind: Deployment
@ -648,6 +875,8 @@ workloads:
name: traefik
labels:
app: traefik
app.kubernetes.io/instance: traefik-kube-system
app.kubernetes.io/name: traefik
serviceAccountName: traefik-ingress-controller
nodeSelector:
node-role.kubernetes.io/worker: 'true'
@ -669,10 +898,12 @@ workloads:
name: vaultwarden
labels:
app: vaultwarden
serviceAccountName: null
nodeSelector: {}
serviceAccountName: vaultwarden-vault
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
images:
- vaultwarden/server:1.33.2
- vaultwarden/server:1.35.2
services:
- namespace: ai
name: ollama
@ -1040,6 +1271,36 @@ services:
port: 3333
targetPort: 3333
protocol: TCP
- namespace: crypto
name: wallet-monero-temp
type: ClusterIP
selector:
app: wallet-monero-temp
ports:
- name: rpc
port: 18083
targetPort: 18083
protocol: TCP
- namespace: finance
name: actual-budget
type: ClusterIP
selector:
app: actual-budget
ports:
- name: http
port: 80
targetPort: 5006
protocol: TCP
- namespace: finance
name: firefly
type: ClusterIP
selector:
app: firefly
ports:
- name: http
port: 80
targetPort: 8080
protocol: TCP
- namespace: flux-system
name: notification-controller
type: ClusterIP
@ -1082,7 +1343,7 @@ services:
protocol: TCP
- namespace: gitea
name: gitea-ssh
type: NodePort
type: LoadBalancer
selector:
app: gitea
ports:
@ -1090,6 +1351,16 @@ services:
port: 2242
targetPort: 2242
protocol: TCP
- namespace: health
name: wger
type: ClusterIP
selector:
app: wger
ports:
- name: http
port: 80
targetPort: http
protocol: TCP
- namespace: jellyfin
name: jellyfin
type: ClusterIP
@ -1124,21 +1395,6 @@ services:
port: 50000
targetPort: 50000
protocol: TCP
- namespace: kube-system
name: traefik
type: LoadBalancer
selector:
app.kubernetes.io/instance: traefik-kube-system
app.kubernetes.io/name: traefik
ports:
- name: web
port: 80
targetPort: web
protocol: TCP
- name: websecure
port: 443
targetPort: websecure
protocol: TCP
- namespace: logging
name: oauth2-proxy-logs
type: ClusterIP
@ -1191,15 +1447,15 @@ services:
port: 4190
targetPort: 4190
protocol: TCP
- namespace: mailu-mailserver
name: mailu-sync-listener
- namespace: maintenance
name: ariadne
type: ClusterIP
selector:
app: mailu-sync-listener
app: ariadne
ports:
- name: http
port: 8080
targetPort: 8080
port: 80
targetPort: http
protocol: TCP
- namespace: monitoring
name: dcgm-exporter
@ -1291,6 +1547,10 @@ services:
port: 5432
targetPort: 5432
protocol: TCP
- name: metrics
port: 9187
targetPort: 9187
protocol: TCP
- namespace: sso
name: keycloak
type: ClusterIP
@ -1335,6 +1595,20 @@ services:
port: 8429
targetPort: 8429
protocol: TCP
- namespace: traefik
name: traefik
type: LoadBalancer
selector:
app: traefik
ports:
- name: web
port: 80
targetPort: web
protocol: TCP
- name: websecure
port: 443
targetPort: websecure
protocol: TCP
- namespace: traefik
name: traefik-metrics
type: ClusterIP
@ -1447,6 +1721,19 @@ http_endpoints:
kind: Ingress
name: bstein-dev-home
source: bstein-dev-home
- host: budget.bstein.dev
path: /
backend:
namespace: finance
service: actual-budget
port: 80
workloads:
- kind: Deployment
name: actual-budget
via:
kind: Ingress
name: actual-budget
source: finance
- host: call.live.bstein.dev
path: /
backend:
@ -1499,6 +1786,19 @@ http_endpoints:
kind: Ingress
name: nextcloud
source: nextcloud
- host: health.bstein.dev
path: /
backend:
namespace: health
service: wger
port: 80
workloads:
- kind: Deployment
name: wger
via:
kind: Ingress
name: wger
source: health
- host: kit.live.bstein.dev
path: /livekit/jwt
backend:
@ -1558,6 +1858,65 @@ http_endpoints:
kind: Ingress
name: matrix-routing
source: comms
- host: live.bstein.dev
path: /_matrix/client/r0/register
backend:
namespace: comms
service: matrix-guest-register
port: 8080
workloads: &id003
- kind: Deployment
name: matrix-guest-register
via:
kind: Ingress
name: matrix-routing
source: comms
- host: live.bstein.dev
path: /_matrix/client/v3/login
backend:
namespace: comms
service: matrix-authentication-service
port: 8080
workloads: &id002
- kind: Deployment
name: matrix-authentication-service
via:
kind: Ingress
name: matrix-routing
source: comms
- host: live.bstein.dev
path: /_matrix/client/v3/logout
backend:
namespace: comms
service: matrix-authentication-service
port: 8080
workloads: *id002
via:
kind: Ingress
name: matrix-routing
source: comms
- host: live.bstein.dev
path: /_matrix/client/v3/refresh
backend:
namespace: comms
service: matrix-authentication-service
port: 8080
workloads: *id002
via:
kind: Ingress
name: matrix-routing
source: comms
- host: live.bstein.dev
path: /_matrix/client/v3/register
backend:
namespace: comms
service: matrix-guest-register
port: 8080
workloads: *id003
via:
kind: Ingress
name: matrix-routing
source: comms
- host: logs.bstein.dev
path: /
backend:
@ -1601,9 +1960,7 @@ http_endpoints:
namespace: comms
service: matrix-authentication-service
port: 8080
workloads: &id002
- kind: Deployment
name: matrix-authentication-service
workloads: *id002
via:
kind: Ingress
name: matrix-routing
@ -1647,9 +2004,7 @@ http_endpoints:
namespace: comms
service: matrix-guest-register
port: 8080
workloads: &id003
- kind: Deployment
name: matrix-guest-register
workloads: *id003
via:
kind: Ingress
name: matrix-routing
@ -1722,6 +2077,19 @@ http_endpoints:
kind: Ingress
name: monerod
source: monerod
- host: money.bstein.dev
path: /
backend:
namespace: finance
service: firefly
port: 80
workloads:
- kind: Deployment
name: firefly
via:
kind: Ingress
name: firefly
source: finance
- host: notes.bstein.dev
path: /
backend:
@ -1845,7 +2213,6 @@ helmrelease_host_hints:
- live.bstein.dev
- matrix.live.bstein.dev
comms:comms/othrys-synapse:
- bstein.dev
- kit.live.bstein.dev
- live.bstein.dev
- matrix.live.bstein.dev
@ -1856,6 +2223,8 @@ helmrelease_host_hints:
- registry.bstein.dev
logging:logging/data-prepper:
- registry.bstein.dev
longhorn:longhorn-system/longhorn:
- registry.bstein.dev
mailu:mailu-mailserver/mailu:
- bstein.dev
- mail.bstein.dev
@ -1863,5 +2232,8 @@ helmrelease_host_hints:
- alerts.bstein.dev
monitoring:monitoring/grafana:
- bstein.dev
- mail.bstein.dev
- metrics.bstein.dev
- sso.bstein.dev
monitoring:monitoring/kube-state-metrics:
- atlas.bstein.dev

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

View File

@ -17,6 +17,11 @@ flowchart LR
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend
wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"]
svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend
host_budget_bstein_dev["budget.bstein.dev"]
svc_finance_actual_budget["finance/actual-budget (Service)"]
host_budget_bstein_dev --> svc_finance_actual_budget
wl_finance_actual_budget["finance/actual-budget (Deployment)"]
svc_finance_actual_budget --> wl_finance_actual_budget
host_call_live_bstein_dev["call.live.bstein.dev"]
svc_comms_element_call["comms/element-call (Service)"]
host_call_live_bstein_dev --> svc_comms_element_call
@ -37,6 +42,11 @@ flowchart LR
host_cloud_bstein_dev --> svc_nextcloud_nextcloud
wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"]
svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud
host_health_bstein_dev["health.bstein.dev"]
svc_health_wger["health/wger (Service)"]
host_health_bstein_dev --> svc_health_wger
wl_health_wger["health/wger (Deployment)"]
svc_health_wger --> wl_health_wger
host_kit_live_bstein_dev["kit.live.bstein.dev"]
svc_comms_livekit_token_service["comms/livekit-token-service (Service)"]
host_kit_live_bstein_dev --> svc_comms_livekit_token_service
@ -50,6 +60,14 @@ flowchart LR
host_live_bstein_dev --> svc_comms_matrix_wellknown
svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"]
host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
host_live_bstein_dev --> svc_comms_matrix_guest_register
wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
host_live_bstein_dev --> svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
host_logs_bstein_dev["logs.bstein.dev"]
svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"]
host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs
@ -64,21 +82,20 @@ flowchart LR
svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"]
host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front
host_matrix_live_bstein_dev["matrix.live.bstein.dev"]
svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown
host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register
wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
host_monero_bstein_dev["monero.bstein.dev"]
svc_crypto_monerod["crypto/monerod (Service)"]
host_monero_bstein_dev --> svc_crypto_monerod
wl_crypto_monerod["crypto/monerod (Deployment)"]
svc_crypto_monerod --> wl_crypto_monerod
host_money_bstein_dev["money.bstein.dev"]
svc_finance_firefly["finance/firefly (Service)"]
host_money_bstein_dev --> svc_finance_firefly
wl_finance_firefly["finance/firefly (Deployment)"]
svc_finance_firefly --> wl_finance_firefly
host_notes_bstein_dev["notes.bstein.dev"]
svc_outline_outline["outline/outline (Service)"]
host_notes_bstein_dev --> svc_outline_outline
@ -143,19 +160,29 @@ flowchart LR
svc_comms_livekit
wl_comms_livekit
svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service
svc_comms_matrix_guest_register
wl_comms_matrix_guest_register
svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service
end
subgraph crypto[crypto]
svc_crypto_monerod
wl_crypto_monerod
end
subgraph finance[finance]
svc_finance_actual_budget
wl_finance_actual_budget
svc_finance_firefly
wl_finance_firefly
end
subgraph gitea[gitea]
svc_gitea_gitea
wl_gitea_gitea
end
subgraph health[health]
svc_health_wger
wl_health_wger
end
subgraph jellyfin[jellyfin]
svc_jellyfin_pegasus
wl_jellyfin_pegasus

View File

@ -70,6 +70,7 @@ WORKER_NODES = [
"titan-13",
"titan-14",
"titan-15",
"titan-16",
"titan-17",
"titan-18",
"titan-19",
@ -207,7 +208,66 @@ def namespace_ram_raw(scope_var):
def namespace_gpu_usage_instant(scope_var):
return f"sum(DCGM_FI_DEV_GPU_UTIL{{{namespace_gpu_selector(scope_var)}}}) by (namespace)"
return gpu_usage_by_namespace(scope_var)
def jetson_gpu_util_by_node():
return 'max by (node) (jetson_gr3d_freq_percent{node!=""})'
def dcgm_gpu_util_by_node():
dcgm_pod = 'label_replace(DCGM_FI_DEV_GPU_UTIL, "pod", "$1", "Hostname", "(.*)")'
dcgm_ns = 'label_replace(' + dcgm_pod + ', "namespace", "monitoring", "", "")'
return (
"avg by (node) ("
f"{dcgm_ns} * on(namespace,pod) group_left(node) "
'kube_pod_info{namespace="monitoring"}'
")"
)
def gpu_util_by_node():
return f"{dcgm_gpu_util_by_node()} or {jetson_gpu_util_by_node()}"
def gpu_util_by_hostname():
return 'label_replace(' + gpu_util_by_node() + ', "Hostname", "$1", "node", "(.*)")'
def gpu_node_labels():
return 'kube_node_labels{label_accelerator=~".+"} or kube_node_labels{label_jetson="true"}'
def gpu_requests_by_namespace_node(scope_var):
return (
"sum by (namespace,node) ("
f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} '
"* on(namespace,pod) group_left(node) kube_pod_info "
f"* on(node) group_left() ({gpu_node_labels()})"
")"
)
def gpu_usage_by_namespace(scope_var):
requests_by_ns = gpu_requests_by_namespace_node(scope_var)
total_by_node = f"sum by (node) ({requests_by_ns})"
return (
"sum by (namespace) ("
f"({requests_by_ns}) / clamp_min({total_by_node}, 1) "
f"* on(node) group_left() ({gpu_util_by_node()})"
")"
)
def jetson_gpu_usage_by_namespace(scope_var):
requests_by_ns = jetson_gpu_requests(scope_var)
total_by_node = f"sum by (node) ({requests_by_ns})"
return (
"sum by (namespace) ("
f"({requests_by_ns}) / clamp_min({total_by_node}, 1) "
f"* on(node) group_left() {jetson_gpu_util_by_node()}"
")"
)
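# Worked example with made-up numbers (not from the cluster) of what
# gpu_usage_by_namespace() computes: each namespace receives the node's GPU
# utilisation weighted by its share of GPU requests on that node.
requests_by_ns_node = {("ai", "titan-20"): 1, ("jellyfin", "titan-20"): 1, ("ai", "titan-21"): 2}
util_by_node = {"titan-20": 80.0, "titan-21": 40.0}
totals = {}
for (_, node), req in requests_by_ns_node.items():
    totals[node] = totals.get(node, 0) + req
usage = {}
for (ns, node), req in requests_by_ns_node.items():
    usage[ns] = usage.get(ns, 0.0) + req / max(totals[node], 1) * util_by_node[node]
print(usage)  # {'ai': 80.0, 'jellyfin': 40.0}: ai gets 0.5*80 + 1.0*40, jellyfin gets 0.5*80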
def namespace_share_expr(resource_expr):
@ -227,7 +287,7 @@ def namespace_gpu_share_expr(scope_var):
usage = namespace_gpu_usage_instant(scope_var)
total = f"(sum({usage}) or on() vector(0))"
share = f"100 * ({usage}) / clamp_min({total}, 1)"
idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)"
idle = 'label_replace(vector(100), "namespace", "idle", "", "") * scalar(' + total + " == bool 0)"
return f"({share}) or ({idle})"
@ -333,9 +393,60 @@ GLUE_STALE = f"({GLUE_LAST_SUCCESS_AGE} > bool {GLUE_STALE_WINDOW_SEC})"
GLUE_MISSING = f"({GLUE_JOBS} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time)"
GLUE_STALE_ACTIVE = f"({GLUE_STALE} unless on(namespace,cronjob) {GLUE_SUSPENDED})"
GLUE_MISSING_ACTIVE = f"({GLUE_MISSING} unless on(namespace,cronjob) {GLUE_SUSPENDED})"
GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE}))"
GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE})"
GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})"
GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE})) or on() vector(0)"
GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE}) or on() vector(0)"
GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED}) or on() vector(0)"
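# The trailing `or on() vector(0)` makes each count evaluate to 0 when no series match,
# so the stat panels render 0 instead of "No data".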
ARIADNE_TASK_ERRORS_RANGE = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[$__range]))'
ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))'
ARIADNE_TASK_ERRORS_1H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[1h]))'
ARIADNE_TASK_ERRORS_30D = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[30d]))'
ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))'
ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))'
ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))'
ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))'
ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))'
ARIADNE_TASK_ATTEMPTS_SERIES = 'sum(increase(ariadne_task_runs_total[$__interval]))'
ARIADNE_TASK_FAILURES_SERIES = 'sum(increase(ariadne_task_runs_total{status="error"}[$__interval]))'
ARIADNE_TASK_WARNINGS_SERIES = (
'sum(increase(ariadne_task_runs_total{status!~"ok|error"}[$__interval])) or on() vector(0)'
)
ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600"
ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600"
ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS = (
"(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600"
)
ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = (
"(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600"
)
ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}'
ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}'
ARIADNE_TEST_SUCCESS_RATE = (
"100 * "
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[30d])) '
"/ clamp_min("
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[30d])), 1)'
)
ARIADNE_TEST_FAILURES_24H = (
'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))'
)
POSTGRES_CONN_USED = (
'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") '
'or label_replace(max(pg_settings_max_connections), "conn", "max", "__name__", ".*")'
)
POSTGRES_CONN_HOTTEST = 'topk(1, sum by (datname) (pg_stat_activity_count))'
ONEOFF_JOB_OWNER = (
'label_replace(kube_job_owner{owner_kind="CronJob"}, "owner_name", "$1", "job_name", "(.*)")'
)
ONEOFF_JOB_PODS = f'(kube_pod_owner{{owner_kind="Job"}} unless on(namespace, owner_name) {ONEOFF_JOB_OWNER})'
ONEOFF_JOB_POD_AGE_HOURS = (
'((time() - kube_pod_start_time{pod!=""}) / 3600) '
f'* on(namespace,pod) group_left(owner_name) {ONEOFF_JOB_PODS} '
'* on(namespace,pod) group_left(phase) '
'max by (namespace,pod,phase) (kube_pod_status_phase{phase=~"Running|Succeeded"})'
)
GLUE_LAST_SUCCESS_RANGE_HOURS = f"(time() - max_over_time({GLUE_LAST_SUCCESS}[$__range])) / 3600"
GLUE_LAST_SCHEDULE_RANGE_HOURS = f"(time() - max_over_time({GLUE_LAST_SCHEDULE}[$__range])) / 3600"
GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
GPU_NODE_REGEX = "|".join(GPU_NODES)
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
@ -513,6 +624,7 @@ def timeseries_panel(
grid,
*,
unit="none",
max_value=None,
legend=None,
legend_display="table",
legend_placement="bottom",
@ -537,6 +649,8 @@ def timeseries_panel(
"tooltip": {"mode": "multi"},
},
}
if max_value is not None:
panel["fieldConfig"]["defaults"]["max"] = max_value
if legend:
panel["targets"][0]["legendFormat"] = legend
if legend_calcs:
@ -688,13 +802,22 @@ def bargauge_panel(
grid,
*,
unit="none",
legend=None,
links=None,
limit=None,
sort_order="desc",
thresholds=None,
decimals=None,
instant=False,
overrides=None,
):
"""Return a bar gauge panel with label-aware reduction."""
cleaned_expr = expr.strip()
if not cleaned_expr.startswith(("sort(", "sort_desc(")):
if sort_order == "desc":
expr = f"sort_desc({expr})"
elif sort_order == "asc":
expr = f"sort({expr})"
panel = {
"id": panel_id,
"type": "bargauge",
@ -702,7 +825,12 @@ def bargauge_panel(
"datasource": PROM_DS,
"gridPos": grid,
"targets": [
{"expr": expr, "refId": "A", "legendFormat": "{{node}}", **({"instant": True} if instant else {})}
{
"expr": expr,
"refId": "A",
"legendFormat": legend or "{{node}}",
**({"instant": True} if instant else {}),
}
],
"fieldConfig": {
"defaults": {
@ -732,6 +860,8 @@ def bargauge_panel(
},
},
}
if overrides:
panel["fieldConfig"]["overrides"].extend(overrides)
if decimals is not None:
panel["fieldConfig"]["defaults"]["decimals"] = decimals
if links:
@ -740,7 +870,7 @@ def bargauge_panel(
panel["transformations"] = [
{
"id": "sortBy",
"options": {"fields": ["Value"], "order": "desc"},
"options": {"fields": ["Value"], "order": sort_order},
}
]
if limit:
@ -780,6 +910,15 @@ def build_overview():
{"color": "red", "value": 3},
],
}
age_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 6},
{"color": "orange", "value": 24},
{"color": "red", "value": 48},
],
}
row1_stats = [
{
@ -982,7 +1121,7 @@ def build_overview():
30,
"Mail Sent (1d)",
'max(postmark_outbound_sent{window="1d"})',
{"h": 2, "w": 5, "x": 0, "y": 8},
{"h": 3, "w": 4, "x": 0, "y": 8},
unit="none",
links=link_to("atlas-mail"),
)
@ -993,7 +1132,7 @@ def build_overview():
"type": "stat",
"title": "Mail Bounces (1d)",
"datasource": PROM_DS,
"gridPos": {"h": 2, "w": 5, "x": 10, "y": 8},
"gridPos": {"h": 3, "w": 4, "x": 8, "y": 8},
"targets": [
{
"expr": 'max(postmark_outbound_bounce_rate{window="1d"})',
@ -1039,7 +1178,7 @@ def build_overview():
32,
"Mail Success Rate (1d)",
'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)',
{"h": 2, "w": 5, "x": 5, "y": 8},
{"h": 3, "w": 4, "x": 4, "y": 8},
unit="percent",
thresholds=mail_success_thresholds,
decimals=1,
@ -1051,13 +1190,38 @@ def build_overview():
33,
"Mail Limit Used (30d)",
"max(postmark_sending_limit_used_percent)",
{"h": 2, "w": 5, "x": 15, "y": 8},
{"h": 3, "w": 4, "x": 12, "y": 8},
unit="percent",
thresholds=mail_limit_thresholds,
decimals=1,
links=link_to("atlas-mail"),
)
)
panels.append(
stat_panel(
34,
"Postgres Connections Used",
POSTGRES_CONN_USED,
{"h": 3, "w": 4, "x": 16, "y": 8},
decimals=0,
text_mode="name_and_value",
legend="{{conn}}",
instant=True,
)
)
panels.append(
stat_panel(
35,
"Postgres Hottest Connections",
POSTGRES_CONN_HOTTEST,
{"h": 3, "w": 4, "x": 20, "y": 8},
unit="none",
decimals=0,
text_mode="name_and_value",
legend="{{datname}}",
instant=True,
)
)
storage_panels = [
(23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"),
@ -1071,13 +1235,104 @@ def build_overview():
panel_id,
title,
expr,
{"h": 6, "w": 6, "x": 6 * idx, "y": 10},
{"h": 3, "w": 6, "x": 6 * idx, "y": 11},
unit=unit,
thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
links=link_to("atlas-storage"),
)
)
panels.append(
bargauge_panel(
40,
"One-off Job Pods (age hours)",
ONEOFF_JOB_POD_AGE_HOURS,
{"h": 6, "w": 6, "x": 0, "y": 14},
unit="h",
instant=True,
legend="{{namespace}}/{{pod}}",
thresholds=age_thresholds,
limit=8,
decimals=2,
)
)
panels.append(
{
"id": 41,
"type": "timeseries",
"title": "Ariadne Attempts / Failures",
"datasource": PROM_DS,
"gridPos": {"h": 6, "w": 6, "x": 6, "y": 14},
"targets": [
{"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"},
{"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"},
],
"fieldConfig": {
"defaults": {"unit": "none"},
"overrides": [
{
"matcher": {"id": "byName", "options": "Attempts"},
"properties": [
{"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}}
],
},
{
"matcher": {"id": "byName", "options": "Failures"},
"properties": [
{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}
],
},
],
},
"options": {
"legend": {"displayMode": "table", "placement": "right"},
"tooltip": {"mode": "multi"},
},
}
)
panels.append(
timeseries_panel(
42,
"Ariadne Test Success Rate",
ARIADNE_TEST_SUCCESS_RATE,
{"h": 6, "w": 6, "x": 12, "y": 14},
unit="percent",
max_value=100,
legend=None,
legend_display="list",
)
)
panels.append(
bargauge_panel(
43,
"Tests with Failures (24h)",
ARIADNE_TEST_FAILURES_24H,
{"h": 6, "w": 6, "x": 18, "y": 14},
unit="none",
instant=True,
legend="{{result}}",
overrides=[
{
"matcher": {"id": "byName", "options": "error"},
"properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}}],
},
{
"matcher": {"id": "byName", "options": "failed"},
"properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}],
},
],
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 5},
{"color": "red", "value": 10},
],
},
)
)
cpu_scope = "$namespace_scope_cpu"
gpu_scope = "$namespace_scope_gpu"
ram_scope = "$namespace_scope_ram"
@ -1087,7 +1342,7 @@ def build_overview():
11,
"Namespace CPU Share",
namespace_cpu_share_expr(cpu_scope),
{"h": 9, "w": 8, "x": 0, "y": 16},
{"h": 9, "w": 8, "x": 0, "y": 20},
links=namespace_scope_links("namespace_scope_cpu"),
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
)
@ -1097,7 +1352,7 @@ def build_overview():
12,
"Namespace GPU Share",
namespace_gpu_share_expr(gpu_scope),
{"h": 9, "w": 8, "x": 8, "y": 16},
{"h": 9, "w": 8, "x": 8, "y": 20},
links=namespace_scope_links("namespace_scope_gpu"),
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
)
@ -1107,7 +1362,7 @@ def build_overview():
13,
"Namespace RAM Share",
namespace_ram_share_expr(ram_scope),
{"h": 9, "w": 8, "x": 16, "y": 16},
{"h": 9, "w": 8, "x": 16, "y": 20},
links=namespace_scope_links("namespace_scope_ram"),
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
)
@ -1119,7 +1374,7 @@ def build_overview():
14,
"Worker Node CPU",
node_cpu_expr(worker_filter),
{"h": 12, "w": 12, "x": 0, "y": 32},
{"h": 12, "w": 12, "x": 0, "y": 36},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
@ -1133,7 +1388,7 @@ def build_overview():
15,
"Worker Node RAM",
node_mem_expr(worker_filter),
{"h": 12, "w": 12, "x": 12, "y": 32},
{"h": 12, "w": 12, "x": 12, "y": 36},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
@ -1148,7 +1403,7 @@ def build_overview():
16,
"Control plane CPU",
node_cpu_expr(CONTROL_ALL_REGEX),
{"h": 10, "w": 12, "x": 0, "y": 44},
{"h": 10, "w": 12, "x": 0, "y": 48},
unit="percent",
legend="{{node}}",
legend_display="table",
@ -1160,7 +1415,7 @@ def build_overview():
17,
"Control plane RAM",
node_mem_expr(CONTROL_ALL_REGEX),
{"h": 10, "w": 12, "x": 12, "y": 44},
{"h": 10, "w": 12, "x": 12, "y": 48},
unit="percent",
legend="{{node}}",
legend_display="table",
@ -1173,7 +1428,7 @@ def build_overview():
28,
"Node Pod Share",
'(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100',
{"h": 10, "w": 12, "x": 0, "y": 54},
{"h": 10, "w": 12, "x": 0, "y": 58},
)
)
panels.append(
@ -1181,7 +1436,7 @@ def build_overview():
29,
"Top Nodes by Pod Count",
'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))',
{"h": 10, "w": 12, "x": 12, "y": 54},
{"h": 10, "w": 12, "x": 12, "y": 58},
unit="none",
limit=12,
decimals=0,
@ -1203,7 +1458,7 @@ def build_overview():
18,
"Cluster Ingress Throughput",
NET_INGRESS_EXPR,
{"h": 7, "w": 8, "x": 0, "y": 25},
{"h": 7, "w": 8, "x": 0, "y": 29},
unit="Bps",
legend="Ingress (Traefik)",
legend_display="list",
@ -1216,7 +1471,7 @@ def build_overview():
19,
"Cluster Egress Throughput",
NET_EGRESS_EXPR,
{"h": 7, "w": 8, "x": 8, "y": 25},
{"h": 7, "w": 8, "x": 8, "y": 29},
unit="Bps",
legend="Egress (Traefik)",
legend_display="list",
@ -1229,7 +1484,7 @@ def build_overview():
20,
"Intra-Cluster Throughput",
NET_INTERNAL_EXPR,
{"h": 7, "w": 8, "x": 16, "y": 25},
{"h": 7, "w": 8, "x": 16, "y": 29},
unit="Bps",
legend="Internal traffic",
legend_display="list",
@ -1243,7 +1498,7 @@ def build_overview():
21,
"Root Filesystem Usage",
root_usage_expr(),
{"h": 16, "w": 12, "x": 0, "y": 64},
{"h": 16, "w": 12, "x": 0, "y": 68},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
@ -1258,7 +1513,7 @@ def build_overview():
22,
"Nodes Closest to Full Root Disks",
f"topk(12, {root_usage_expr()})",
{"h": 16, "w": 12, "x": 12, "y": 64},
{"h": 16, "w": 12, "x": 12, "y": 68},
unit="percent",
thresholds=PERCENT_THRESHOLDS,
links=link_to("atlas-storage"),
@ -2153,16 +2408,103 @@ def build_mail_dashboard():
}
def build_testing_dashboard():
def build_jobs_dashboard():
panels = []
sort_desc = [{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}]
age_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 6},
{"color": "orange", "value": 24},
{"color": "red", "value": 48},
],
}
recent_error_thresholds = {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": 1},
{"color": "yellow", "value": 6},
{"color": "green", "value": 24},
],
}
task_error_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 3},
{"color": "red", "value": 5},
],
}
panels.append(
stat_panel(
bargauge_panel(
1,
"Ariadne Task Errors (range)",
ARIADNE_TASK_ERRORS_RANGE,
{"h": 7, "w": 8, "x": 0, "y": 0},
unit="none",
instant=True,
legend="{{task}}",
thresholds=task_error_thresholds,
)
)
panels.append(
{
"id": 2,
"type": "timeseries",
"title": "Ariadne Attempts / Failures",
"datasource": PROM_DS,
"gridPos": {"h": 7, "w": 8, "x": 8, "y": 0},
"targets": [
{"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"},
{"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"},
],
"fieldConfig": {
"defaults": {"unit": "none"},
"overrides": [
{
"matcher": {"id": "byName", "options": "Attempts"},
"properties": [
{"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}}
],
},
{
"matcher": {"id": "byName", "options": "Failures"},
"properties": [
{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}
],
},
],
},
"options": {
"legend": {"displayMode": "table", "placement": "right"},
"tooltip": {"mode": "multi"},
},
}
)
panels.append(
bargauge_panel(
3,
"One-off Job Pods (age hours)",
ONEOFF_JOB_POD_AGE_HOURS,
{"h": 7, "w": 8, "x": 16, "y": 0},
unit="h",
instant=True,
legend="{{namespace}}/{{pod}}",
thresholds=age_thresholds,
limit=12,
decimals=2,
)
)
panels.append(
stat_panel(
4,
"Glue Jobs Stale (>36h)",
GLUE_STALE_COUNT,
{"h": 4, "w": 6, "x": 0, "y": 0},
{"h": 4, "w": 4, "x": 0, "y": 7},
unit="none",
thresholds={
"mode": "absolute",
@ -2176,64 +2518,164 @@ def build_testing_dashboard():
)
)
panels.append(
table_panel(
2,
"Glue Jobs Missing Success",
GLUE_MISSING_ACTIVE,
{"h": 4, "w": 6, "x": 6, "y": 0},
unit="none",
transformations=sort_desc,
instant=True,
)
)
panels.append(
table_panel(
3,
"Glue Jobs Suspended",
GLUE_SUSPENDED,
{"h": 4, "w": 6, "x": 12, "y": 0},
unit="none",
transformations=sort_desc,
instant=True,
)
)
panels.append(
table_panel(
4,
"Glue Jobs Active Runs",
GLUE_ACTIVE,
{"h": 4, "w": 6, "x": 18, "y": 0},
unit="none",
transformations=sort_desc,
instant=True,
)
)
panels.append(
table_panel(
stat_panel(
5,
"Glue Jobs Last Success (hours ago)",
GLUE_LAST_SUCCESS_AGE_HOURS,
{"h": 8, "w": 12, "x": 0, "y": 4},
"Glue Jobs Missing Success",
GLUE_MISSING_COUNT,
{"h": 4, "w": 4, "x": 4, "y": 7},
unit="none",
)
)
panels.append(
stat_panel(
6,
"Glue Jobs Suspended",
GLUE_SUSPENDED_COUNT,
{"h": 4, "w": 4, "x": 8, "y": 7},
unit="none",
)
)
panels.append(
stat_panel(
7,
"Ariadne Task Errors (1h)",
ARIADNE_TASK_ERRORS_1H_TOTAL,
{"h": 4, "w": 4, "x": 12, "y": 7},
unit="none",
)
)
panels.append(
stat_panel(
8,
"Ariadne Task Errors (24h)",
ARIADNE_TASK_ERRORS_24H_TOTAL,
{"h": 4, "w": 4, "x": 16, "y": 7},
unit="none",
)
)
panels.append(
stat_panel(
9,
"Ariadne Task Runs (1h)",
ARIADNE_TASK_RUNS_1H_TOTAL,
{"h": 4, "w": 4, "x": 20, "y": 7},
unit="none",
)
)
panels.append(
bargauge_panel(
10,
"Ariadne Schedule Last Error (hours ago)",
ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS,
{"h": 6, "w": 12, "x": 0, "y": 17},
unit="h",
transformations=sort_desc,
instant=True,
legend="{{task}}",
thresholds=recent_error_thresholds,
decimals=2,
)
)
panels.append(
bargauge_panel(
11,
"Ariadne Schedule Last Success (hours ago)",
ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS,
{"h": 6, "w": 12, "x": 12, "y": 17},
unit="h",
instant=True,
legend="{{task}}",
thresholds=age_thresholds,
decimals=2,
)
)
panels.append(
bargauge_panel(
12,
"Glue Jobs Last Success (hours ago)",
GLUE_LAST_SUCCESS_RANGE_HOURS,
{"h": 6, "w": 12, "x": 0, "y": 23},
unit="h",
instant=True,
legend="{{namespace}}/{{cronjob}}",
thresholds=age_thresholds,
decimals=2,
)
)
panels.append(
bargauge_panel(
13,
"Glue Jobs Last Schedule (hours ago)",
GLUE_LAST_SCHEDULE_RANGE_HOURS,
{"h": 6, "w": 12, "x": 12, "y": 23},
unit="h",
instant=True,
legend="{{namespace}}/{{cronjob}}",
thresholds=age_thresholds,
decimals=2,
)
)
panels.append(
bargauge_panel(
14,
"Ariadne Task Errors (1h)",
ARIADNE_TASK_ERRORS_1H,
{"h": 6, "w": 12, "x": 0, "y": 29},
unit="none",
instant=True,
legend="{{task}}",
thresholds=task_error_thresholds,
)
)
panels.append(
bargauge_panel(
15,
"Ariadne Task Errors (30d)",
ARIADNE_TASK_ERRORS_30D,
{"h": 6, "w": 12, "x": 12, "y": 29},
unit="none",
instant=True,
legend="{{task}}",
thresholds=task_error_thresholds,
)
)
panels.append(
bargauge_panel(
16,
"Ariadne Access Requests",
ARIADNE_ACCESS_REQUESTS,
{"h": 6, "w": 8, "x": 0, "y": 11},
unit="none",
instant=True,
legend="{{status}}",
)
)
panels.append(
stat_panel(
17,
"Ariadne CI Coverage (%)",
ARIADNE_CI_COVERAGE,
{"h": 6, "w": 4, "x": 8, "y": 11},
unit="percent",
decimals=1,
instant=True,
legend="{{branch}}",
)
)
panels.append(
table_panel(
6,
"Glue Jobs Last Schedule (hours ago)",
GLUE_LAST_SCHEDULE_AGE_HOURS,
{"h": 8, "w": 12, "x": 12, "y": 4},
unit="h",
transformations=sort_desc,
18,
"Ariadne CI Tests (latest)",
ARIADNE_CI_TESTS,
{"h": 6, "w": 12, "x": 12, "y": 11},
unit="none",
transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}],
instant=True,
)
)
return {
"uid": "atlas-testing",
"title": "Atlas Testing",
"uid": "atlas-jobs",
"title": "Atlas Jobs",
"folderUid": PRIVATE_FOLDER,
"editable": True,
"panels": panels,
@ -2241,7 +2683,7 @@ def build_testing_dashboard():
"annotations": {"list": []},
"schemaVersion": 39,
"style": "dark",
"tags": ["atlas", "testing"],
"tags": ["atlas", "jobs", "glue"],
}
@ -2274,7 +2716,7 @@ def build_gpu_dashboard():
timeseries_panel(
3,
"GPU Util by Node",
'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})',
gpu_util_by_hostname(),
{"h": 8, "w": 12, "x": 0, "y": 8},
unit="percent",
legend="{{Hostname}}",
@ -2338,9 +2780,9 @@ DASHBOARDS = {
"builder": build_mail_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-mail.yaml",
},
"atlas-testing": {
"builder": build_testing_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-testing.yaml",
"atlas-jobs": {
"builder": build_jobs_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-jobs.yaml",
},
"atlas-gpu": {
"builder": build_gpu_dashboard,

View File

@ -20,11 +20,13 @@ import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
import shutil
from typing import Any, Iterable
import yaml
REPO_ROOT = Path(__file__).resolve().parents[1]
DASHBOARD_DIR = REPO_ROOT / "services" / "monitoring" / "dashboards"
CLUSTER_SCOPED_KINDS = {
"Namespace",
@ -60,6 +62,70 @@ def _run(cmd: list[str], *, cwd: Path) -> str:
return res.stdout
def _sync_tree(source: Path, dest: Path) -> None:
if dest.exists():
shutil.rmtree(dest)
shutil.copytree(source, dest)
def _iter_dashboard_panels(dashboard: dict[str, Any]) -> Iterable[dict[str, Any]]:
panels = dashboard.get("panels") if isinstance(dashboard.get("panels"), list) else []
for panel in panels:
if not isinstance(panel, dict):
continue
if panel.get("type") == "row" and isinstance(panel.get("panels"), list):
yield from _iter_dashboard_panels({"panels": panel.get("panels")})
continue
yield panel
def _extract_metrics_index(dashboard_dir: Path) -> list[dict[str, Any]]:
index: list[dict[str, Any]] = []
for path in sorted(dashboard_dir.glob("*.json")):
try:
data = json.loads(path.read_text(encoding="utf-8"))
except json.JSONDecodeError:
continue
if not isinstance(data, dict):
continue
dash_title = data.get("title") or path.stem
dash_tags = data.get("tags") or []
for panel in _iter_dashboard_panels(data):
targets = panel.get("targets")
if not isinstance(targets, list):
continue
exprs: list[str] = []
for target in targets:
if not isinstance(target, dict):
continue
expr = target.get("expr")
if isinstance(expr, str) and expr.strip():
exprs.append(expr.strip())
if not exprs:
continue
datasource = panel.get("datasource") or {}
if isinstance(datasource, dict):
ds_uid = datasource.get("uid")
ds_type = datasource.get("type")
else:
ds_uid = None
ds_type = None
index.append(
{
"dashboard": dash_title,
"panel_title": panel.get("title") or "",
"panel_id": panel.get("id"),
"panel_type": panel.get("type"),
"description": panel.get("description") or "",
"tags": dash_tags,
"datasource_uid": ds_uid,
"datasource_type": ds_type,
"exprs": exprs,
}
)
return index
def kustomize_build(path: Path) -> str:
rel = path.relative_to(REPO_ROOT)
try:
@ -472,6 +538,11 @@ def main() -> int:
action="store_true",
help="Write generated files (otherwise just print a summary).",
)
ap.add_argument(
"--sync-comms",
action="store_true",
help="Mirror rendered knowledge into services/comms/knowledge for atlasbot.",
)
args = ap.parse_args()
out_dir = REPO_ROOT / args.out
@ -504,6 +575,7 @@ def main() -> int:
summary_path = out_dir / "catalog" / "atlas-summary.json"
diagram_path = out_dir / "diagrams" / "atlas-http.mmd"
runbooks_json_path = out_dir / "catalog" / "runbooks.json"
metrics_json_path = out_dir / "catalog" / "metrics.json"
catalog_rel = catalog_path.relative_to(REPO_ROOT).as_posix()
catalog_path.write_text(
@ -517,9 +589,14 @@ def main() -> int:
diagram_path.write_text(diagram, encoding="utf-8")
# Render runbooks into JSON for lightweight, dependency-free consumption in-cluster.
runbooks_dir = out_dir / "runbooks"
runbook_dirs = [
out_dir / "runbooks",
out_dir / "software",
]
runbooks: list[dict[str, Any]] = []
if runbooks_dir.exists():
for runbooks_dir in runbook_dirs:
if not runbooks_dir.exists():
continue
for md_file in sorted(runbooks_dir.glob("*.md")):
raw = md_file.read_text(encoding="utf-8")
fm: dict[str, Any] = {}
@ -543,12 +620,22 @@ def main() -> int:
}
)
runbooks_json_path.write_text(json.dumps(runbooks, indent=2, sort_keys=False) + "\n", encoding="utf-8")
metrics_index = _extract_metrics_index(DASHBOARD_DIR)
metrics_json_path.write_text(
json.dumps(metrics_index, indent=2, sort_keys=False) + "\n", encoding="utf-8"
)
print(f"Wrote {catalog_path.relative_to(REPO_ROOT)}")
print(f"Wrote {catalog_json_path.relative_to(REPO_ROOT)}")
print(f"Wrote {summary_path.relative_to(REPO_ROOT)}")
print(f"Wrote {diagram_path.relative_to(REPO_ROOT)}")
print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}")
print(f"Wrote {metrics_json_path.relative_to(REPO_ROOT)}")
if args.sync_comms:
comms_dir = REPO_ROOT / "services" / "comms" / "knowledge"
_sync_tree(out_dir, comms_dir)
print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {comms_dir.relative_to(REPO_ROOT)}")
return 0
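For reference, a typical invocation of this render script might look like the sketch below. The write flag name is not visible in this hunk, so it is an assumption; `--sync-comms` is the flag added above.

```sh
# Sketch (write flag name assumed): regenerate the knowledge artifacts and mirror
# them into services/comms/knowledge so the atlasbot ConfigMap generator picks them up.
python scripts/knowledge_render_atlas.py --write --sync-comms
git status --short services/comms/knowledge   # review the mirrored files before committing
```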

View File

@ -20,8 +20,9 @@ spec:
labels:
app: ollama
annotations:
ai.bstein.dev/model: qwen2.5-coder:7b-instruct-q4_0
ai.bstein.dev/gpu: GPU pool (titan-20/21/22/24)
ai.bstein.dev/model: qwen2.5:14b-instruct-q4_0
ai.bstein.dev/gpu: GPU pool (titan-22/24)
ai.bstein.dev/restartedAt: "2026-01-26T12:00:00Z"
spec:
affinity:
nodeAffinity:
@ -31,8 +32,6 @@ spec:
- key: kubernetes.io/hostname
operator: In
values:
- titan-20
- titan-21
- titan-22
- titan-24
runtimeClassName: nvidia
@ -53,7 +52,7 @@ spec:
- name: OLLAMA_MODELS
value: /root/.ollama
- name: OLLAMA_MODEL
value: qwen2.5-coder:7b-instruct-q4_0
value: qwen2.5:14b-instruct-q4_0
command:
- /bin/sh
- -c
@ -68,8 +67,8 @@ spec:
mountPath: /root/.ollama
resources:
requests:
cpu: 250m
memory: 1Gi
cpu: 500m
memory: 2Gi
nvidia.com/gpu.shared: 1
limits:
nvidia.com/gpu.shared: 1
@ -96,10 +95,10 @@ spec:
mountPath: /root/.ollama
resources:
requests:
cpu: "2"
memory: 8Gi
cpu: "4"
memory: 16Gi
nvidia.com/gpu.shared: 1
limits:
cpu: "4"
memory: 12Gi
cpu: "8"
memory: 24Gi
nvidia.com/gpu.shared: 1

View File

@ -28,6 +28,7 @@ spec:
{{ with secret "kv/data/atlas/shared/chat-ai-keys-runtime" }}
export CHAT_KEY_MATRIX="{{ .Data.data.matrix }}"
export CHAT_KEY_HOMEPAGE="{{ .Data.data.homepage }}"
export AI_ATLASBOT_TOKEN="{{ .Data.data.homepage }}"
{{ end }}
{{ with secret "kv/data/atlas/shared/portal-e2e-client" }}
export PORTAL_E2E_CLIENT_ID="{{ .Data.data.client_id }}"
@ -58,14 +59,18 @@ spec:
args:
- >-
. /vault/secrets/portal-env.sh
&& exec gunicorn -b 0.0.0.0:8080 --workers 2 --timeout 180 app:app
&& exec gunicorn -b 0.0.0.0:8080 --workers 2 --timeout 600 app:app
env:
- name: AI_CHAT_API
value: http://ollama.ai.svc.cluster.local:11434
- name: AI_CHAT_MODEL
value: qwen2.5-coder:7b-instruct-q4_0
- name: AI_CHAT_TIMEOUT_SEC
value: "60"
value: "480"
- name: AI_ATLASBOT_ENDPOINT
value: http://atlasbot.comms.svc.cluster.local:8090/v1/answer
- name: AI_ATLASBOT_TIMEOUT_SEC
value: "30"
- name: AI_NODE_NAME
valueFrom:
fieldRef:
@ -91,10 +96,28 @@ spec:
value: atlas
- name: KEYCLOAK_ADMIN_CLIENT_ID
value: bstein-dev-home-admin
- name: ARIADNE_URL
value: http://ariadne.maintenance.svc.cluster.local
- name: ARIADNE_TIMEOUT_SEC
value: "10"
- name: ACCOUNT_ALLOWED_GROUPS
value: ""
- name: HTTP_CHECK_TIMEOUT_SEC
value: "2"
- name: PORTAL_DB_POOL_MIN
value: "0"
- name: PORTAL_DB_POOL_MAX
value: "5"
- name: PORTAL_DB_CONNECT_TIMEOUT_SEC
value: "5"
- name: PORTAL_DB_LOCK_TIMEOUT_SEC
value: "5"
- name: PORTAL_DB_STATEMENT_TIMEOUT_SEC
value: "30"
- name: PORTAL_DB_IDLE_IN_TX_TIMEOUT_SEC
value: "10"
- name: PORTAL_RUN_MIGRATIONS
value: "false"
- name: ACCESS_REQUEST_SUBMIT_RATE_LIMIT
value: "30"
- name: ACCESS_REQUEST_SUBMIT_RATE_WINDOW_SEC

View File

@ -47,6 +47,8 @@ spec:
env:
- name: UPSTREAM_URL
value: http://bstein-dev-home-backend/api/chat
- name: UPSTREAM_TIMEOUT_SEC
value: "600"
ports:
- name: http
containerPort: 8080
@ -65,10 +67,10 @@ spec:
resources:
requests:
cpu: 20m
memory: 64Mi
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
memory: 512Mi
volumeMounts:
- name: code
mountPath: /app/gateway.py

View File

@ -7,6 +7,8 @@ metadata:
spec:
image: registry.bstein.dev/bstein/bstein-dev-home-frontend
interval: 1m0s
secretRef:
name: harbor-regcred
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
@ -28,6 +30,8 @@ metadata:
spec:
image: registry.bstein.dev/bstein/bstein-dev-home-backend
interval: 1m0s
secretRef:
name: harbor-regcred
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy

View File

@ -16,13 +16,13 @@ resources:
- backend-deployment.yaml
- backend-service.yaml
- vaultwarden-cred-sync-cronjob.yaml
- portal-onboarding-e2e-test-job.yaml
- oneoffs/portal-onboarding-e2e-test-job.yaml
- ingress.yaml
images:
- name: registry.bstein.dev/bstein/bstein-dev-home-frontend
newTag: 0.1.1-102 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"}
newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
- name: registry.bstein.dev/bstein/bstein-dev-home-backend
newTag: 0.1.1-103 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"}
newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
configMapGenerator:
- name: chat-ai-gateway
namespace: bstein-dev-home

View File

@ -0,0 +1,6 @@
# services/bstein-dev-home/oneoffs/migrations/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: bstein-dev-home
resources:
- portal-migrate-job.yaml

View File

@ -0,0 +1,48 @@
# services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml
# One-off job for bstein-dev-home/bstein-dev-home-portal-migrate-36.
# Purpose: bstein dev home portal migrate 36 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: bstein-dev-home-portal-migrate-36
namespace: bstein-dev-home
annotations:
kustomize.toolkit.fluxcd.io/force: "true"
spec:
suspend: true
backoffLimit: 1
ttlSecondsAfterFinished: 3600
template:
metadata:
labels:
app: bstein-dev-home-portal-migrate
annotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "bstein-dev-home"
vault.hashicorp.com/agent-inject-secret-portal-env.sh: "kv/data/atlas/portal/atlas-portal-db"
vault.hashicorp.com/agent-inject-template-portal-env.sh: |
{{ with secret "kv/data/atlas/portal/atlas-portal-db" }}
export PORTAL_DATABASE_URL="{{ .Data.data.PORTAL_DATABASE_URL }}"
{{ end }}
spec:
serviceAccountName: bstein-dev-home
restartPolicy: Never
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
imagePullSecrets:
- name: harbor-regcred
containers:
- name: migrate
image: registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-95
imagePullPolicy: Always
command: ["/bin/sh", "-c"]
args:
- >-
. /vault/secrets/portal-env.sh
&& exec python -m atlas_portal.migrate
env:
- name: PORTAL_RUN_MIGRATIONS
value: "true"

View File

@ -1,10 +1,15 @@
# services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
# services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml
# One-off job for bstein-dev-home/portal-onboarding-e2e-test-27.
# Purpose: portal onboarding e2e test 27 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: portal-onboarding-e2e-test-19
name: portal-onboarding-e2e-test-27
namespace: bstein-dev-home
spec:
suspend: true
backoffLimit: 0
template:
metadata:

View File

@ -6,6 +6,7 @@ from urllib import request, error
UPSTREAM = os.environ.get("UPSTREAM_URL", "http://bstein-dev-home-backend/api/chat")
KEY_MATRIX = os.environ.get("CHAT_KEY_MATRIX", "")
KEY_HOMEPAGE = os.environ.get("CHAT_KEY_HOMEPAGE", "")
UPSTREAM_TIMEOUT_SEC = float(os.environ.get("UPSTREAM_TIMEOUT_SEC", "90"))
ALLOWED = {k for k in (KEY_MATRIX, KEY_HOMEPAGE) if k}
@ -41,7 +42,7 @@ class Handler(BaseHTTPRequestHandler):
headers={"Content-Type": "application/json"},
method="POST",
)
with request.urlopen(upstream_req, timeout=90) as resp:
with request.urlopen(upstream_req, timeout=UPSTREAM_TIMEOUT_SEC) as resp:
data = resp.read()
self.send_response(resp.status)
for k, v in resp.headers.items():

View File

@ -11,7 +11,7 @@ spec:
roleName: "bstein-dev-home"
objects: |
- objectName: "harbor-pull__dockerconfigjson"
secretPath: "kv/data/atlas/harbor-pull/bstein-dev-home"
secretPath: "kv/data/atlas/shared/harbor-pull"
secretKey: "dockerconfigjson"
secretObjects:
- secretName: harbor-regcred

View File

@ -8,6 +8,7 @@ metadata:
atlas.bstein.dev/glue: "true"
spec:
schedule: "*/15 * * * *"
suspend: true
concurrencyPolicy: Forbid
successfulJobsHistoryLimit: 1
failedJobsHistoryLimit: 3

View File

@ -16,7 +16,7 @@ spec:
labels:
app: atlasbot
annotations:
checksum/atlasbot-configmap: manual-atlasbot-4
checksum/atlasbot-configmap: manual-atlasbot-101
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "comms"
vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
@ -73,12 +73,33 @@ spec:
value: /kb
- name: VM_URL
value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428
- name: ARIADNE_STATE_URL
value: http://ariadne.maintenance.svc.cluster.local/api/internal/cluster/state
- name: BOT_USER
value: atlasbot
- name: BOT_MENTIONS
value: atlasbot,aatlasbot,atlas_quick,atlas_smart
- name: OLLAMA_URL
value: https://chat.ai.bstein.dev/
value: http://ollama.ai.svc.cluster.local:11434
- name: OLLAMA_MODEL
value: qwen2.5-coder:7b-instruct-q4_0
value: qwen2.5:14b-instruct
- name: ATLASBOT_MODEL_FAST
value: qwen2.5:14b-instruct-q4_0
- name: ATLASBOT_MODEL_DEEP
value: qwen2.5:14b-instruct
- name: OLLAMA_FALLBACK_MODEL
value: qwen2.5:14b-instruct-q4_0
- name: OLLAMA_TIMEOUT_SEC
value: "600"
- name: ATLASBOT_THINKING_INTERVAL_SEC
value: "120"
- name: ATLASBOT_SNAPSHOT_TTL_SEC
value: "30"
- name: ATLASBOT_HTTP_PORT
value: "8090"
ports:
- name: http
containerPort: 8090
resources:
requests:
cpu: 100m
@ -110,6 +131,8 @@ spec:
path: catalog/atlas.json
- key: atlas-summary.json
path: catalog/atlas-summary.json
- key: metrics.json
path: catalog/metrics.json
- key: runbooks.json
path: catalog/runbooks.json
- key: atlas-http.mmd

View File

@ -0,0 +1,15 @@
apiVersion: v1
kind: Service
metadata:
name: atlasbot
namespace: comms
labels:
app: atlasbot
spec:
selector:
app: atlasbot
ports:
- name: http
port: 8090
targetPort: 8090
type: ClusterIP

View File

@ -8,7 +8,7 @@ metadata:
atlas.bstein.dev/glue: "true"
spec:
schedule: "*/1 * * * *"
suspend: false
suspend: true
concurrencyPolicy: Forbid
successfulJobsHistoryLimit: 1
failedJobsHistoryLimit: 1

View File

@ -140,6 +140,7 @@ spec:
autocreate_auto_join_rooms: true
default_room_version: "11"
experimental_features:
msc4108_enabled: true
msc3266_enabled: true
msc4143_enabled: true
msc4222_enabled: true

View File

@ -1,8 +1,8 @@
{
"counts": {
"helmrelease_host_hints": 17,
"http_endpoints": 37,
"services": 43,
"workloads": 54
"helmrelease_host_hints": 19,
"http_endpoints": 45,
"services": 47,
"workloads": 74
}
}

File diff suppressed because it is too large

View File

@ -1,4 +1,4 @@
# services/comms/knowledge/catalog/atlas.yaml
# knowledge/catalog/atlas.yaml
# Generated by scripts/knowledge_render_atlas.py (do not edit by hand)
cluster: atlas
sources:
@ -8,6 +8,15 @@ sources:
- name: bstein-dev-home
path: services/bstein-dev-home
targetNamespace: bstein-dev-home
- name: bstein-dev-home-migrations
path: services/bstein-dev-home/migrations
targetNamespace: bstein-dev-home
- name: cert-manager
path: infrastructure/cert-manager
targetNamespace: cert-manager
- name: cert-manager-cleanup
path: infrastructure/cert-manager/cleanup
targetNamespace: cert-manager
- name: comms
path: services/comms
targetNamespace: comms
@ -17,6 +26,9 @@ sources:
- name: crypto
path: services/crypto
targetNamespace: crypto
- name: finance
path: services/finance
targetNamespace: finance
- name: flux-system
path: clusters/atlas/flux-system
targetNamespace: null
@ -29,6 +41,9 @@ sources:
- name: harbor
path: services/harbor
targetNamespace: harbor
- name: health
path: services/health
targetNamespace: health
- name: helm
path: infrastructure/sources/helm
targetNamespace: flux-system
@ -44,6 +59,12 @@ sources:
- name: logging
path: services/logging
targetNamespace: null
- name: longhorn
path: infrastructure/longhorn/core
targetNamespace: longhorn-system
- name: longhorn-adopt
path: infrastructure/longhorn/adopt
targetNamespace: longhorn-system
- name: longhorn-ui
path: infrastructure/longhorn/ui-ingress
targetNamespace: longhorn-system
@ -98,9 +119,15 @@ sources:
- name: vault-csi
path: infrastructure/vault-csi
targetNamespace: kube-system
- name: vault-injector
path: infrastructure/vault-injector
targetNamespace: vault
- name: vaultwarden
path: services/vaultwarden
targetNamespace: vaultwarden
- name: wallet-monero-temp
path: services/crypto/wallet-monero-temp
targetNamespace: crypto
- name: xmr-miner
path: services/crypto/xmr-miner
targetNamespace: crypto
@ -124,7 +151,7 @@ workloads:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
images:
- registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92
- registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-157
- kind: Deployment
namespace: bstein-dev-home
name: bstein-dev-home-frontend
@ -135,13 +162,22 @@ workloads:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
images:
- registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92
- registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-157
- kind: Deployment
namespace: bstein-dev-home
name: bstein-dev-home-vault-sync
labels:
app: bstein-dev-home-vault-sync
serviceAccountName: bstein-dev-home-vault-sync
nodeSelector: {}
images:
- alpine:3.20
- kind: Deployment
namespace: bstein-dev-home
name: chat-ai-gateway
labels:
app: chat-ai-gateway
serviceAccountName: null
serviceAccountName: bstein-dev-home
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
@ -157,12 +193,21 @@ workloads:
hardware: rpi5
images:
- python:3.11-slim
- kind: Deployment
namespace: comms
name: comms-vault-sync
labels:
app: comms-vault-sync
serviceAccountName: comms-vault
nodeSelector: {}
images:
- alpine:3.20
- kind: Deployment
namespace: comms
name: coturn
labels:
app: coturn
serviceAccountName: null
serviceAccountName: comms-vault
nodeSelector:
hardware: rpi5
images:
@ -182,7 +227,7 @@ workloads:
name: livekit
labels:
app: livekit
serviceAccountName: null
serviceAccountName: comms-vault
nodeSelector:
hardware: rpi5
images:
@ -192,17 +237,17 @@ workloads:
name: livekit-token-service
labels:
app: livekit-token-service
serviceAccountName: null
serviceAccountName: comms-vault
nodeSelector:
hardware: rpi5
images:
- ghcr.io/element-hq/lk-jwt-service:0.3.0
- registry.bstein.dev/tools/lk-jwt-service-vault:0.3.0
- kind: Deployment
namespace: comms
name: matrix-authentication-service
labels:
app: matrix-authentication-service
serviceAccountName: null
serviceAccountName: comms-vault
nodeSelector:
hardware: rpi5
images:
@ -212,7 +257,7 @@ workloads:
name: matrix-guest-register
labels:
app.kubernetes.io/name: matrix-guest-register
serviceAccountName: null
serviceAccountName: comms-vault
nodeSelector: {}
images:
- python:3.11-slim
@ -235,12 +280,21 @@ workloads:
node-role.kubernetes.io/worker: 'true'
images:
- ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9
- kind: Deployment
namespace: crypto
name: crypto-vault-sync
labels:
app: crypto-vault-sync
serviceAccountName: crypto-vault-sync
nodeSelector: {}
images:
- alpine:3.20
- kind: Deployment
namespace: crypto
name: monero-p2pool
labels:
app: monero-p2pool
serviceAccountName: null
serviceAccountName: crypto-vault-sync
nodeSelector:
node-role.kubernetes.io/worker: 'true'
images:
@ -255,6 +309,38 @@ workloads:
node-role.kubernetes.io/worker: 'true'
images:
- registry.bstein.dev/crypto/monerod:0.18.4.1
- kind: Deployment
namespace: crypto
name: wallet-monero-temp
labels:
app: wallet-monero-temp
serviceAccountName: crypto-vault-sync
nodeSelector:
node-role.kubernetes.io/worker: 'true'
images:
- registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1
- kind: Deployment
namespace: finance
name: actual-budget
labels:
app: actual-budget
serviceAccountName: finance-vault
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
images:
- actualbudget/actual-server:26.1.0-alpine@sha256:34aae5813fdfee12af2a50c4d0667df68029f1d61b90f45f282473273eb70d0d
- kind: Deployment
namespace: finance
name: firefly
labels:
app: firefly
serviceAccountName: finance-vault
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
images:
- fireflyiii/core:version-6.4.15
- kind: Deployment
namespace: flux-system
name: helm-controller
@ -344,17 +430,38 @@ workloads:
name: gitea
labels:
app: gitea
serviceAccountName: null
serviceAccountName: gitea-vault
nodeSelector:
node-role.kubernetes.io/worker: 'true'
images:
- gitea/gitea:1.23
- kind: Deployment
namespace: harbor
name: harbor-vault-sync
labels:
app: harbor-vault-sync
serviceAccountName: harbor-vault-sync
nodeSelector: {}
images:
- alpine:3.20
- kind: Deployment
namespace: health
name: wger
labels:
app: wger
serviceAccountName: health-vault-sync
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
images:
- nginx:1.27.5-alpine@sha256:65645c7bb6a0661892a8b03b89d0743208a18dd2f3f17a54ef4b76fb8e2f2a10
- wger/server@sha256:710588b78af4e0aa0b4d8a8061e4563e16eae80eeaccfe7f9e0d9cbdd7f0cbc5
- kind: Deployment
namespace: jellyfin
name: jellyfin
labels:
app: jellyfin
serviceAccountName: null
serviceAccountName: pegasus-vault-sync
nodeSelector: {}
images:
- docker.io/jellyfin/jellyfin:10.11.5
@ -363,13 +470,22 @@ workloads:
name: pegasus
labels:
app: pegasus
serviceAccountName: null
serviceAccountName: pegasus-vault-sync
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
images:
- alpine:3.20
- registry.bstein.dev/streaming/pegasus:1.2.32
- registry.bstein.dev/streaming/pegasus-vault:1.2.32
- kind: Deployment
namespace: jellyfin
name: pegasus-vault-sync
labels:
app: pegasus-vault-sync
serviceAccountName: pegasus-vault-sync
nodeSelector: {}
images:
- alpine:3.20
- kind: Deployment
namespace: jenkins
name: jenkins
@ -381,6 +497,26 @@ workloads:
node-role.kubernetes.io/worker: 'true'
images:
- jenkins/jenkins:2.528.3-jdk21
- kind: Deployment
namespace: jenkins
name: jenkins-vault-sync
labels:
app: jenkins-vault-sync
serviceAccountName: jenkins-vault-sync
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
images:
- alpine:3.20
- kind: DaemonSet
namespace: kube-system
name: ntp-sync
labels:
app: ntp-sync
serviceAccountName: null
nodeSelector: {}
images:
- public.ecr.aws/docker/library/busybox:1.36.1
- kind: DaemonSet
namespace: kube-system
name: nvidia-device-plugin-jetson
@ -427,6 +563,16 @@ workloads:
kubernetes.io/os: linux
images:
- hashicorp/vault-csi-provider:1.7.0
- kind: Deployment
namespace: kube-system
name: coredns
labels:
k8s-app: kube-dns
serviceAccountName: coredns
nodeSelector:
kubernetes.io/os: linux
images:
- registry.bstein.dev/infra/coredns:1.12.1
- kind: DaemonSet
namespace: logging
name: node-image-gc-rpi4
@ -457,22 +603,41 @@ workloads:
hardware: rpi5
images:
- bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
- kind: Deployment
namespace: logging
name: logging-vault-sync
labels:
app: logging-vault-sync
serviceAccountName: logging-vault-sync
nodeSelector: {}
images:
- alpine:3.20
- kind: Deployment
namespace: logging
name: oauth2-proxy-logs
labels:
app: oauth2-proxy-logs
serviceAccountName: null
serviceAccountName: logging-vault-sync
nodeSelector:
node-role.kubernetes.io/worker: 'true'
images:
- quay.io/oauth2-proxy/oauth2-proxy:v7.6.0
- registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0
- kind: Deployment
namespace: longhorn-system
name: longhorn-vault-sync
labels:
app: longhorn-vault-sync
serviceAccountName: longhorn-vault-sync
nodeSelector:
node-role.kubernetes.io/worker: 'true'
images:
- alpine:3.20
- kind: Deployment
namespace: longhorn-system
name: oauth2-proxy-longhorn
labels:
app: oauth2-proxy-longhorn
serviceAccountName: null
serviceAccountName: longhorn-vault
nodeSelector:
node-role.kubernetes.io/worker: 'true'
images:
@ -489,13 +654,34 @@ workloads:
- registry.bstein.dev/bstein/kubectl:1.35.0
- kind: Deployment
namespace: mailu-mailserver
name: mailu-sync-listener
name: mailu-vault-sync
labels:
app: mailu-sync-listener
serviceAccountName: null
app: mailu-vault-sync
serviceAccountName: mailu-vault-sync
nodeSelector: {}
images:
- python:3.11-alpine
- alpine:3.20
- kind: DaemonSet
namespace: maintenance
name: disable-k3s-traefik
labels:
app: disable-k3s-traefik
serviceAccountName: disable-k3s-traefik
nodeSelector:
node-role.kubernetes.io/control-plane: 'true'
images:
- bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
- kind: DaemonSet
namespace: maintenance
name: k3s-agent-restart
labels:
app: k3s-agent-restart
serviceAccountName: node-nofile
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
images:
- bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
- kind: DaemonSet
namespace: maintenance
name: node-image-sweeper
@ -515,6 +701,26 @@ workloads:
nodeSelector: {}
images:
- bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
- kind: Deployment
namespace: maintenance
name: ariadne
labels:
app: ariadne
serviceAccountName: ariadne
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
images:
- registry.bstein.dev/bstein/ariadne:0.1.0-49
- kind: Deployment
namespace: maintenance
name: maintenance-vault-sync
labels:
app: maintenance-vault-sync
serviceAccountName: maintenance-vault-sync
nodeSelector: {}
images:
- alpine:3.20
- kind: DaemonSet
namespace: monitoring
name: dcgm-exporter
@ -534,12 +740,21 @@ workloads:
jetson: 'true'
images:
- python:3.10-slim
- kind: Deployment
namespace: monitoring
name: monitoring-vault-sync
labels:
app: monitoring-vault-sync
serviceAccountName: monitoring-vault-sync
nodeSelector: {}
images:
- alpine:3.20
- kind: Deployment
namespace: monitoring
name: postmark-exporter
labels:
app: postmark-exporter
serviceAccountName: null
serviceAccountName: monitoring-vault-sync
nodeSelector: {}
images:
- python:3.12-alpine
@ -558,7 +773,7 @@ workloads:
name: nextcloud
labels:
app: nextcloud
serviceAccountName: null
serviceAccountName: nextcloud-vault
nodeSelector:
hardware: rpi5
images:
@ -568,7 +783,7 @@ workloads:
name: outline
labels:
app: outline
serviceAccountName: null
serviceAccountName: outline-vault
nodeSelector:
node-role.kubernetes.io/worker: 'true'
images:
@ -588,7 +803,7 @@ workloads:
name: planka
labels:
app: planka
serviceAccountName: null
serviceAccountName: planka-vault
nodeSelector:
node-role.kubernetes.io/worker: 'true'
images:
@ -603,13 +818,16 @@ workloads:
node-role.kubernetes.io/worker: 'true'
images:
- postgres:15
- quay.io/prometheuscommunity/postgres-exporter:v0.15.0
- kind: Deployment
namespace: sso
name: keycloak
labels:
app: keycloak
serviceAccountName: null
nodeSelector: {}
serviceAccountName: sso-vault
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
images:
- quay.io/keycloak/keycloak:26.0.7
- kind: Deployment
@ -617,17 +835,26 @@ workloads:
name: oauth2-proxy
labels:
app: oauth2-proxy
serviceAccountName: null
serviceAccountName: sso-vault
nodeSelector:
node-role.kubernetes.io/worker: 'true'
images:
- quay.io/oauth2-proxy/oauth2-proxy:v7.6.0
- registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0
- kind: Deployment
namespace: sso
name: sso-vault-sync
labels:
app: sso-vault-sync
serviceAccountName: sso-vault-sync
nodeSelector: {}
images:
- alpine:3.20
- kind: StatefulSet
namespace: sso
name: openldap
labels:
app: openldap
serviceAccountName: null
serviceAccountName: sso-vault
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
@ -640,7 +867,7 @@ workloads:
app: sui-metrics
serviceAccountName: sui-metrics
nodeSelector:
kubernetes.io/hostname: titan-24
hardware: rpi5
images:
- victoriametrics/vmagent:v1.103.0
- kind: Deployment
@ -648,6 +875,8 @@ workloads:
name: traefik
labels:
app: traefik
app.kubernetes.io/instance: traefik-kube-system
app.kubernetes.io/name: traefik
serviceAccountName: traefik-ingress-controller
nodeSelector:
node-role.kubernetes.io/worker: 'true'
@ -669,10 +898,12 @@ workloads:
name: vaultwarden
labels:
app: vaultwarden
serviceAccountName: null
nodeSelector: {}
serviceAccountName: vaultwarden-vault
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: 'true'
images:
- vaultwarden/server:1.33.2
- vaultwarden/server:1.35.2
services:
- namespace: ai
name: ollama
@ -1040,6 +1271,36 @@ services:
port: 3333
targetPort: 3333
protocol: TCP
- namespace: crypto
name: wallet-monero-temp
type: ClusterIP
selector:
app: wallet-monero-temp
ports:
- name: rpc
port: 18083
targetPort: 18083
protocol: TCP
- namespace: finance
name: actual-budget
type: ClusterIP
selector:
app: actual-budget
ports:
- name: http
port: 80
targetPort: 5006
protocol: TCP
- namespace: finance
name: firefly
type: ClusterIP
selector:
app: firefly
ports:
- name: http
port: 80
targetPort: 8080
protocol: TCP
- namespace: flux-system
name: notification-controller
type: ClusterIP
@ -1082,7 +1343,7 @@ services:
protocol: TCP
- namespace: gitea
name: gitea-ssh
type: NodePort
type: LoadBalancer
selector:
app: gitea
ports:
@ -1090,6 +1351,16 @@ services:
port: 2242
targetPort: 2242
protocol: TCP
- namespace: health
name: wger
type: ClusterIP
selector:
app: wger
ports:
- name: http
port: 80
targetPort: http
protocol: TCP
- namespace: jellyfin
name: jellyfin
type: ClusterIP
@ -1124,21 +1395,6 @@ services:
port: 50000
targetPort: 50000
protocol: TCP
- namespace: kube-system
name: traefik
type: LoadBalancer
selector:
app.kubernetes.io/instance: traefik-kube-system
app.kubernetes.io/name: traefik
ports:
- name: web
port: 80
targetPort: web
protocol: TCP
- name: websecure
port: 443
targetPort: websecure
protocol: TCP
- namespace: logging
name: oauth2-proxy-logs
type: ClusterIP
@ -1191,15 +1447,15 @@ services:
port: 4190
targetPort: 4190
protocol: TCP
- namespace: mailu-mailserver
name: mailu-sync-listener
- namespace: maintenance
name: ariadne
type: ClusterIP
selector:
app: mailu-sync-listener
app: ariadne
ports:
- name: http
port: 8080
targetPort: 8080
port: 80
targetPort: http
protocol: TCP
- namespace: monitoring
name: dcgm-exporter
@ -1291,6 +1547,10 @@ services:
port: 5432
targetPort: 5432
protocol: TCP
- name: metrics
port: 9187
targetPort: 9187
protocol: TCP
- namespace: sso
name: keycloak
type: ClusterIP
@ -1335,6 +1595,20 @@ services:
port: 8429
targetPort: 8429
protocol: TCP
- namespace: traefik
name: traefik
type: LoadBalancer
selector:
app: traefik
ports:
- name: web
port: 80
targetPort: web
protocol: TCP
- name: websecure
port: 443
targetPort: websecure
protocol: TCP
- namespace: traefik
name: traefik-metrics
type: ClusterIP
@ -1447,6 +1721,19 @@ http_endpoints:
kind: Ingress
name: bstein-dev-home
source: bstein-dev-home
- host: budget.bstein.dev
path: /
backend:
namespace: finance
service: actual-budget
port: 80
workloads:
- kind: Deployment
name: actual-budget
via:
kind: Ingress
name: actual-budget
source: finance
- host: call.live.bstein.dev
path: /
backend:
@ -1499,6 +1786,19 @@ http_endpoints:
kind: Ingress
name: nextcloud
source: nextcloud
- host: health.bstein.dev
path: /
backend:
namespace: health
service: wger
port: 80
workloads:
- kind: Deployment
name: wger
via:
kind: Ingress
name: wger
source: health
- host: kit.live.bstein.dev
path: /livekit/jwt
backend:
@ -1558,6 +1858,65 @@ http_endpoints:
kind: Ingress
name: matrix-routing
source: comms
- host: live.bstein.dev
path: /_matrix/client/r0/register
backend:
namespace: comms
service: matrix-guest-register
port: 8080
workloads: &id003
- kind: Deployment
name: matrix-guest-register
via:
kind: Ingress
name: matrix-routing
source: comms
- host: live.bstein.dev
path: /_matrix/client/v3/login
backend:
namespace: comms
service: matrix-authentication-service
port: 8080
workloads: &id002
- kind: Deployment
name: matrix-authentication-service
via:
kind: Ingress
name: matrix-routing
source: comms
- host: live.bstein.dev
path: /_matrix/client/v3/logout
backend:
namespace: comms
service: matrix-authentication-service
port: 8080
workloads: *id002
via:
kind: Ingress
name: matrix-routing
source: comms
- host: live.bstein.dev
path: /_matrix/client/v3/refresh
backend:
namespace: comms
service: matrix-authentication-service
port: 8080
workloads: *id002
via:
kind: Ingress
name: matrix-routing
source: comms
- host: live.bstein.dev
path: /_matrix/client/v3/register
backend:
namespace: comms
service: matrix-guest-register
port: 8080
workloads: *id003
via:
kind: Ingress
name: matrix-routing
source: comms
- host: logs.bstein.dev
path: /
backend:
@ -1601,9 +1960,7 @@ http_endpoints:
namespace: comms
service: matrix-authentication-service
port: 8080
workloads: &id002
- kind: Deployment
name: matrix-authentication-service
workloads: *id002
via:
kind: Ingress
name: matrix-routing
@ -1647,9 +2004,7 @@ http_endpoints:
namespace: comms
service: matrix-guest-register
port: 8080
workloads: &id003
- kind: Deployment
name: matrix-guest-register
workloads: *id003
via:
kind: Ingress
name: matrix-routing
@ -1722,6 +2077,19 @@ http_endpoints:
kind: Ingress
name: monerod
source: monerod
- host: money.bstein.dev
path: /
backend:
namespace: finance
service: firefly
port: 80
workloads:
- kind: Deployment
name: firefly
via:
kind: Ingress
name: firefly
source: finance
- host: notes.bstein.dev
path: /
backend:
@ -1845,7 +2213,6 @@ helmrelease_host_hints:
- live.bstein.dev
- matrix.live.bstein.dev
comms:comms/othrys-synapse:
- bstein.dev
- kit.live.bstein.dev
- live.bstein.dev
- matrix.live.bstein.dev
@ -1856,6 +2223,8 @@ helmrelease_host_hints:
- registry.bstein.dev
logging:logging/data-prepper:
- registry.bstein.dev
longhorn:longhorn-system/longhorn:
- registry.bstein.dev
mailu:mailu-mailserver/mailu:
- bstein.dev
- mail.bstein.dev
@ -1863,5 +2232,8 @@ helmrelease_host_hints:
- alerts.bstein.dev
monitoring:monitoring/grafana:
- bstein.dev
- mail.bstein.dev
- metrics.bstein.dev
- sso.bstein.dev
monitoring:monitoring/kube-state-metrics:
- atlas.bstein.dev

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

View File

@ -17,6 +17,11 @@ flowchart LR
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend
wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"]
svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend
host_budget_bstein_dev["budget.bstein.dev"]
svc_finance_actual_budget["finance/actual-budget (Service)"]
host_budget_bstein_dev --> svc_finance_actual_budget
wl_finance_actual_budget["finance/actual-budget (Deployment)"]
svc_finance_actual_budget --> wl_finance_actual_budget
host_call_live_bstein_dev["call.live.bstein.dev"]
svc_comms_element_call["comms/element-call (Service)"]
host_call_live_bstein_dev --> svc_comms_element_call
@ -37,6 +42,11 @@ flowchart LR
host_cloud_bstein_dev --> svc_nextcloud_nextcloud
wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"]
svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud
host_health_bstein_dev["health.bstein.dev"]
svc_health_wger["health/wger (Service)"]
host_health_bstein_dev --> svc_health_wger
wl_health_wger["health/wger (Deployment)"]
svc_health_wger --> wl_health_wger
host_kit_live_bstein_dev["kit.live.bstein.dev"]
svc_comms_livekit_token_service["comms/livekit-token-service (Service)"]
host_kit_live_bstein_dev --> svc_comms_livekit_token_service
@ -50,6 +60,14 @@ flowchart LR
host_live_bstein_dev --> svc_comms_matrix_wellknown
svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"]
host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
host_live_bstein_dev --> svc_comms_matrix_guest_register
wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
host_live_bstein_dev --> svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
host_logs_bstein_dev["logs.bstein.dev"]
svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"]
host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs
@ -64,21 +82,20 @@ flowchart LR
svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"]
host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front
host_matrix_live_bstein_dev["matrix.live.bstein.dev"]
svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown
host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register
wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
host_monero_bstein_dev["monero.bstein.dev"]
svc_crypto_monerod["crypto/monerod (Service)"]
host_monero_bstein_dev --> svc_crypto_monerod
wl_crypto_monerod["crypto/monerod (Deployment)"]
svc_crypto_monerod --> wl_crypto_monerod
host_money_bstein_dev["money.bstein.dev"]
svc_finance_firefly["finance/firefly (Service)"]
host_money_bstein_dev --> svc_finance_firefly
wl_finance_firefly["finance/firefly (Deployment)"]
svc_finance_firefly --> wl_finance_firefly
host_notes_bstein_dev["notes.bstein.dev"]
svc_outline_outline["outline/outline (Service)"]
host_notes_bstein_dev --> svc_outline_outline
@ -143,19 +160,29 @@ flowchart LR
svc_comms_livekit
wl_comms_livekit
svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service
svc_comms_matrix_guest_register
wl_comms_matrix_guest_register
svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service
end
subgraph crypto[crypto]
svc_crypto_monerod
wl_crypto_monerod
end
subgraph finance[finance]
svc_finance_actual_budget
wl_finance_actual_budget
svc_finance_firefly
wl_finance_firefly
end
subgraph gitea[gitea]
svc_gitea_gitea
wl_gitea_gitea
end
subgraph health[health]
svc_health_wger
wl_health_wger
end
subgraph jellyfin[jellyfin]
svc_jellyfin_pegasus
wl_jellyfin_pegasus

View File

@ -0,0 +1,26 @@
# Metis (node recovery)
## Node classes (current map)
- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)
- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)
- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks)
- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent)
- amd64 agents: titan-22/24 (Debian 13, k3s agent)
- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, future titan-20/21 (when added), plus any newcomers.
## Longhorn disk UUIDs (critical nodes)
- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4)
- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4)
- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4)
- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4)
## Metis repo (~/Development/metis)
- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`).
- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints).
- `AGENTS.md` in repo is untracked and holds raw notes.
## Next implementation steps
- Add per-class golden image refs and checksums (Harbor or file://) when ready.
- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths.
- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection.
- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited.
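A rough shell sketch of the Linux burn path outlined above (checksum, raw write, then mount to inject per-node config). Device names, image name, and hostname are placeholders, not part of the Metis implementation:

```sh
# Placeholder sketch only: verify a golden image and write it to a target disk.
IMG=rpi5-worker-golden.img.xz   # hypothetical per-class image
DEV=/dev/sdX                    # target disk -- double-check before writing
sha256sum -c "${IMG}.sha256"                     # refuse to burn on checksum mismatch
xz -dc "$IMG" | sudo dd of="$DEV" bs=4M conv=fsync status=progress
sudo partprobe "$DEV"
# Mount boot/root to inject hostname, k3s token/URL, labels/taints, journald drop-ins,
# and (for Longhorn nodes) the /mnt/astreae and /mnt/asteria fstab entries by UUID.
sudo mount "${DEV}2" /mnt/root && sudo mount "${DEV}1" /mnt/root/boot/firmware
echo titan-XX | sudo tee /mnt/root/etc/hostname
sudo umount -R /mnt/root
```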

View File

@ -0,0 +1,30 @@
---
title: Othrys verification checklist
tags:
- comms
- matrix
- element
- livekit
entrypoints:
- https://live.bstein.dev
- https://matrix.live.bstein.dev
---
1) Guest join:
- Open a private window and visit:
`https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join`
- Confirm the guest join flow works and the displayname becomes `<word>-<word>`.
2) Keycloak login:
- Log in from `https://live.bstein.dev` and confirm MAS -> Keycloak -> Element redirect.
3) Video rooms:
- Start an Element Call room and confirm audio/video with a second account.
- Check that guests can read public rooms but cannot start calls.
4) Well-known:
- `https://live.bstein.dev/.well-known/matrix/client` returns JSON.
- `https://matrix.live.bstein.dev/.well-known/matrix/client` returns JSON.
5) TURN reachability:
- Confirm `turn.live.bstein.dev:3478` and `turns:5349` are reachable from WAN.
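For the well-known checks in step 4 (and a quick port probe for step 5), a command-line spot check might look like the following; `jq` is optional and the TCP probe does not replace a full TURN allocation test:

```sh
# Quick spot checks for steps 4 and 5 (hosts from this checklist).
curl -fsS https://live.bstein.dev/.well-known/matrix/client | jq .
curl -fsS https://matrix.live.bstein.dev/.well-known/matrix/client | jq .
# TURN reachability from outside the LAN (TCP only; real calls also need UDP/TLS to work):
nc -vz turn.live.bstein.dev 3478
nc -vz turn.live.bstein.dev 5349
```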

View File

@ -0,0 +1,73 @@
# Metis (node recovery)
## Node classes (current map)
- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)
- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)
- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks)
- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent)
- amd64 agents: titan-22/24 (Debian 13, k3s agent)
- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, plus any newcomers.
### Jetson nodes (titan-20/21)
- Ubuntu 20.04.6 (Focal), kernel 5.10.104-tegra, CRI containerd 2.0.5-k3s2, arch arm64.
- Storage: NVMe 232G at / (ext4); onboard mmc partitions present but root on NVMe; 1.9T sda present (unused).
- k3s agent with drop-in 99-nofile.conf.
## Longhorn disk UUIDs (critical nodes)
- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4)
- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4)
- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4)
- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4)
## Metis repo (~/Development/metis)
- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`).
- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints).
- `AGENTS.md` in repo is untracked and holds raw notes.
## Next implementation steps
- Add per-class golden image refs and checksums (Harbor or file://) when ready.
- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths.
- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection.
- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited.
## Node OS/Kernel/CRI snapshot (Jan 2026)
- titan-04: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-05: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-06: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-07: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-08: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-09: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-0a: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-0b: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-0c: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-10: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-11: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-12: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
- titan-13: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
- titan-14: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
- titan-15: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
- titan-17: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
- titan-18: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
- titan-19: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
- titan-20: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64
- titan-21: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64
- titan-22: Debian 13 (trixie), kernel 6.12.41+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64
- titan-24: Debian 13 (trixie), kernel 6.12.57+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64
### External hosts
- titan-db: Ubuntu 24.10, kernel 6.11.0-1015-raspi, root on /dev/sda2 ext4 (465G), boot vfat /dev/sda1; PostgreSQL service enabled.
- titan-jh: Arch Linux ARM (rolling), kernel 6.18.4-2-rpi, NVMe root ext4 238G (/), boot vfat 512M; ~495 packages installed (pacman -Q).
- titan-23/oceanus: TODO audit (future).
### Control plane Pis (titan-0a/0b/0c)
- Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, containerd 2.0.5-k3s2.
- Storage: 477G SSD root (/dev/sda2 ext4), /boot/firmware vfat (/dev/sda1). fstab uses LABEL=writable and LABEL=system-boot.
- k3s server (control-plane taint expected); etcd snapshots not yet cataloged (TODO).
## k3s versions
- rpi5 workers/control-plane: k3s v1.33.3+k3s1 (crictl v1.31.0-k3s2)
- rpi4 nodes: k3s v1.31.5+k3s1 (crictl v1.31.0-k3s2)
- Jetson titan-20/21: k3s v1.33.3+k3s1 (per node info), crictl v1.31.0-k3s2

View File

@ -14,6 +14,7 @@ resources:
- guest-register-deployment.yaml
- guest-register-service.yaml
- atlasbot-deployment.yaml
- atlasbot-service.yaml
- wellknown.yaml
- atlasbot-rbac.yaml
- mas-secrets-ensure-rbac.yaml
@ -21,23 +22,24 @@ resources:
- mas-db-ensure-rbac.yaml
- synapse-signingkey-ensure-rbac.yaml
- vault-sync-deployment.yaml
- mas-admin-client-secret-ensure-job.yaml
- mas-db-ensure-job.yaml
- comms-secrets-ensure-job.yaml
- synapse-signingkey-ensure-job.yaml
- synapse-seeder-admin-ensure-job.yaml
- synapse-user-seed-job.yaml
- mas-local-users-ensure-job.yaml
- oneoffs/mas-admin-client-secret-ensure-job.yaml
- oneoffs/mas-db-ensure-job.yaml
- oneoffs/comms-secrets-ensure-job.yaml
- oneoffs/synapse-admin-ensure-job.yaml
- oneoffs/synapse-signingkey-ensure-job.yaml
- oneoffs/synapse-seeder-admin-ensure-job.yaml
- oneoffs/synapse-user-seed-job.yaml
- oneoffs/mas-local-users-ensure-job.yaml
- mas-deployment.yaml
- livekit-token-deployment.yaml
- livekit.yaml
- coturn.yaml
- seed-othrys-room.yaml
- guest-name-job.yaml
- othrys-kick-numeric-job.yaml
- oneoffs/othrys-kick-numeric-job.yaml
- pin-othrys-job.yaml
- reset-othrys-room-job.yaml
- bstein-force-leave-job.yaml
- oneoffs/bstein-force-leave-job.yaml
- livekit-ingress.yaml
- livekit-middlewares.yaml
- matrix-ingress.yaml
@ -73,5 +75,6 @@ configMapGenerator:
- INDEX.md=knowledge/INDEX.md
- atlas.json=knowledge/catalog/atlas.json
- atlas-summary.json=knowledge/catalog/atlas-summary.json
- metrics.json=knowledge/catalog/metrics.json
- runbooks.json=knowledge/catalog/runbooks.json
- atlas-http.mmd=knowledge/diagrams/atlas-http.mmd

View File

@ -72,7 +72,7 @@ data:
template: "{{ user.name }}"
email:
action: force
template: "{{ user.email }}"
template: "{{ user.mailu_email }}"
policy:
data:

View File

@ -1,10 +1,15 @@
# services/comms/bstein-force-leave-job.yaml
# services/comms/oneoffs/bstein-force-leave-job.yaml
# One-off job for comms/bstein-leave-rooms-12.
# Purpose: bstein leave rooms 12 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: bstein-leave-rooms-12
namespace: comms
spec:
suspend: true
backoffLimit: 0
template:
metadata:

View File

@ -1,10 +1,15 @@
# services/comms/comms-secrets-ensure-job.yaml
# services/comms/oneoffs/comms-secrets-ensure-job.yaml
# One-off job for comms/comms-secrets-ensure-7.
# Purpose: comms secrets ensure 7 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: comms-secrets-ensure-6
name: comms-secrets-ensure-7
namespace: comms
spec:
suspend: true
backoffLimit: 1
ttlSecondsAfterFinished: 3600
template:

View File

@ -1,4 +1,8 @@
# services/comms/mas-admin-client-secret-ensure-job.yaml
# services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml
# One-off job for comms/mas-admin-client-secret-writer.
# Purpose: mas admin client secret writer (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: v1
kind: ServiceAccount
metadata:
@ -41,6 +45,7 @@ metadata:
name: mas-admin-client-secret-ensure-11
namespace: comms
spec:
suspend: true
backoffLimit: 2
template:
spec:

View File

@ -1,10 +1,15 @@
# services/comms/mas-db-ensure-job.yaml
# services/comms/oneoffs/mas-db-ensure-job.yaml
# One-off job for comms/mas-db-ensure-22.
# Purpose: mas db ensure 22 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: mas-db-ensure-22
namespace: comms
spec:
suspend: true
backoffLimit: 1
ttlSecondsAfterFinished: 600
template:

View File

@ -1,10 +1,15 @@
# services/comms/mas-local-users-ensure-job.yaml
# services/comms/oneoffs/mas-local-users-ensure-job.yaml
# One-off job for comms/mas-local-users-ensure-18.
# Purpose: mas local users ensure 18 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: mas-local-users-ensure-15
name: mas-local-users-ensure-18
namespace: comms
spec:
suspend: true
backoffLimit: 1
ttlSecondsAfterFinished: 3600
template:

View File

@ -1,10 +1,15 @@
# services/comms/othrys-kick-numeric-job.yaml
# services/comms/oneoffs/othrys-kick-numeric-job.yaml
# One-off job for comms/othrys-kick-numeric-8.
# Purpose: othrys kick numeric 8 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: othrys-kick-numeric-8
namespace: comms
spec:
suspend: true
backoffLimit: 0
template:
metadata:

View File

@ -0,0 +1,219 @@
# services/comms/oneoffs/synapse-admin-ensure-job.yaml
# One-off job for comms/synapse-admin-ensure-3.
# Purpose: synapse admin ensure 3 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: synapse-admin-ensure-3
namespace: comms
spec:
suspend: true
backoffLimit: 0
ttlSecondsAfterFinished: 3600
template:
spec:
serviceAccountName: comms-secrets-ensure
restartPolicy: Never
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node-role.kubernetes.io/worker
operator: Exists
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
containers:
- name: ensure
image: python:3.11-slim
env:
- name: VAULT_ADDR
value: http://vault.vault.svc.cluster.local:8200
- name: VAULT_ROLE
value: comms-secrets
- name: SYNAPSE_ADMIN_URL
value: http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008
command:
- /bin/sh
- -c
- |
set -euo pipefail
pip install --no-cache-dir psycopg2-binary bcrypt
python - <<'PY'
import json
import os
import secrets
import string
import time
import urllib.error
import urllib.request
import bcrypt
import psycopg2
VAULT_ADDR = os.environ.get("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200").rstrip("/")
VAULT_ROLE = os.environ.get("VAULT_ROLE", "comms-secrets")
SA_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token"
PGHOST = "postgres-service.postgres.svc.cluster.local"
PGPORT = 5432
PGDATABASE = "synapse"
PGUSER = "synapse"
def log(msg: str) -> None:
print(msg, flush=True)
def request_json(url: str, payload: dict | None = None) -> dict:
data = None
headers = {"Content-Type": "application/json"}
if payload is not None:
data = json.dumps(payload).encode("utf-8")
req = urllib.request.Request(url, data=data, headers=headers, method="POST" if data else "GET")
with urllib.request.urlopen(req, timeout=30) as resp:
return json.loads(resp.read().decode("utf-8"))
def vault_login() -> str:
with open(SA_TOKEN_PATH, "r", encoding="utf-8") as f:
jwt = f.read().strip()
payload = {"jwt": jwt, "role": VAULT_ROLE}
resp = request_json(f"{VAULT_ADDR}/v1/auth/kubernetes/login", payload)
token = resp.get("auth", {}).get("client_token")
if not token:
raise RuntimeError("vault login failed")
return token
def vault_get(token: str, path: str) -> dict:
req = urllib.request.Request(
f"{VAULT_ADDR}/v1/kv/data/atlas/{path}",
headers={"X-Vault-Token": token},
)
try:
with urllib.request.urlopen(req, timeout=30) as resp:
payload = json.loads(resp.read().decode("utf-8"))
return payload.get("data", {}).get("data", {})
except urllib.error.HTTPError as exc:
if exc.code == 404:
return {}
raise
def vault_put(token: str, path: str, data: dict) -> None:
payload = {"data": data}
req = urllib.request.Request(
f"{VAULT_ADDR}/v1/kv/data/atlas/{path}",
data=json.dumps(payload).encode("utf-8"),
headers={"X-Vault-Token": token, "Content-Type": "application/json"},
method="POST",
)
with urllib.request.urlopen(req, timeout=30) as resp:
resp.read()
def random_password(length: int = 32) -> str:
alphabet = string.ascii_letters + string.digits
return "".join(secrets.choice(alphabet) for _ in range(length))
def ensure_admin_creds(token: str) -> dict:
data = vault_get(token, "comms/synapse-admin")
username = (data.get("username") or "").strip() or "synapse-admin"
password = (data.get("password") or "").strip()
if not password:
password = random_password()
data["username"] = username
data["password"] = password
vault_put(token, "comms/synapse-admin", data)
return data
def ensure_user(cur, cols, user_id, password, admin):
now_ms = int(time.time() * 1000)
values = {
"name": user_id,
"password_hash": bcrypt.hashpw(password.encode(), bcrypt.gensalt()).decode(),
"creation_ts": now_ms,
}
def add_flag(name, flag):
if name not in cols:
return
if cols[name]["type"] in ("smallint", "integer"):
values[name] = int(flag)
else:
values[name] = bool(flag)
add_flag("admin", admin)
add_flag("deactivated", False)
add_flag("shadow_banned", False)
add_flag("is_guest", False)
columns = list(values.keys())
placeholders = ", ".join(["%s"] * len(columns))
updates = ", ".join([f"{col}=EXCLUDED.{col}" for col in columns if col != "name"])
query = f"INSERT INTO users ({', '.join(columns)}) VALUES ({placeholders}) ON CONFLICT (name) DO UPDATE SET {updates};"
cur.execute(query, [values[c] for c in columns])
def get_cols(cur):
cur.execute(
"""
SELECT column_name, is_nullable, column_default, data_type
FROM information_schema.columns
WHERE table_schema = 'public' AND table_name = 'users'
"""
)
cols = {}
for name, is_nullable, default, data_type in cur.fetchall():
cols[name] = {
"nullable": is_nullable == "YES",
"default": default,
"type": data_type,
}
return cols
def ensure_access_token(cur, user_id, token_value):
cur.execute("SELECT COALESCE(MAX(id), 0) + 1 FROM access_tokens")
token_id = cur.fetchone()[0]
cur.execute(
"""
INSERT INTO access_tokens (id, user_id, token, device_id, valid_until_ms)
VALUES (%s, %s, %s, %s, NULL)
ON CONFLICT (token) DO NOTHING
""",
(token_id, user_id, token_value, "ariadne-admin"),
)
vault_token = vault_login()
admin_data = ensure_admin_creds(vault_token)
if admin_data.get("access_token"):
log("synapse admin token already present")
raise SystemExit(0)
synapse_db = vault_get(vault_token, "comms/synapse-db")
pg_password = synapse_db.get("POSTGRES_PASSWORD")
if not pg_password:
raise RuntimeError("synapse db password missing")
user_id = f"@{admin_data['username']}:live.bstein.dev"
conn = psycopg2.connect(
host=PGHOST,
port=PGPORT,
dbname=PGDATABASE,
user=PGUSER,
password=pg_password,
)
token_value = secrets.token_urlsafe(32)
try:
with conn:
with conn.cursor() as cur:
cols = get_cols(cur)
ensure_user(cur, cols, user_id, admin_data["password"], True)
ensure_access_token(cur, user_id, token_value)
finally:
conn.close()
admin_data["access_token"] = token_value
vault_put(vault_token, "comms/synapse-admin", admin_data)
log("synapse admin token stored")
PY
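The job above exports SYNAPSE_ADMIN_URL but only writes the minted token to kv/atlas/comms/synapse-admin; it never calls the admin API itself. A minimal follow-up sketch (not part of the job; it assumes the stored access_token is exported as ADMIN_TOKEN) that confirms the token authenticates against Synapse via the standard whoami endpoint:

# Hypothetical verification of the token minted above; ADMIN_TOKEN is assumed
# to hold the access_token stored at kv/atlas/comms/synapse-admin.
import json
import os
import urllib.request

SYNAPSE_ADMIN_URL = os.environ.get(
    "SYNAPSE_ADMIN_URL",
    "http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008",
)
req = urllib.request.Request(
    f"{SYNAPSE_ADMIN_URL}/_matrix/client/v3/account/whoami",
    headers={"Authorization": f"Bearer {os.environ['ADMIN_TOKEN']}"},
)
with urllib.request.urlopen(req, timeout=30) as resp:
    # A valid token should return {"user_id": "@synapse-admin:live.bstein.dev", ...}.
    print(json.loads(resp.read().decode("utf-8")))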

View File

@ -1,10 +1,15 @@
# services/comms/synapse-seeder-admin-ensure-job.yaml
# services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml
# One-off job for comms/synapse-seeder-admin-ensure-9.
# Purpose: synapse seeder admin ensure 9 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: synapse-seeder-admin-ensure-7
name: synapse-seeder-admin-ensure-9
namespace: comms
spec:
suspend: true
backoffLimit: 2
template:
metadata:
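The header comments above describe the shared one-off pattern: unsuspend, reconcile, re-suspend. A rough sketch of driving that from a workstation, assuming kubectl and flux access; the job and kustomization names are illustrative, not canonical, and Flux will re-apply suspend: true from git on its next reconcile. The GitOps-pure route is still to flip the field in the manifest and push.

# Illustrative only: run a suspended one-off Job by patching the live object.
# Assumes kubectl/flux are configured for the cluster; names below are examples.
import subprocess

def run_oneoff(namespace: str, job: str, kustomization: str) -> None:
    # Make sure the manifest in git has been applied first.
    subprocess.run(["flux", "reconcile", "kustomization", kustomization], check=True)
    # spec.suspend is one of the few mutable Job fields, so a merge patch works.
    subprocess.run(
        ["kubectl", "-n", namespace, "patch", "job", job,
         "--type=merge", "-p", '{"spec":{"suspend":false}}'],
        check=True,
    )
    subprocess.run(
        ["kubectl", "-n", namespace, "wait", "--for=condition=complete",
         f"job/{job}", "--timeout=600s"],
        check=True,
    )

run_oneoff("comms", "synapse-seeder-admin-ensure-9", "comms")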

View File

@ -1,10 +1,15 @@
# services/comms/synapse-signingkey-ensure-job.yaml
# services/comms/oneoffs/synapse-signingkey-ensure-job.yaml
# One-off job for comms/othrys-synapse-signingkey-ensure-7.
# Purpose: othrys synapse signingkey ensure 7 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: othrys-synapse-signingkey-ensure-7
namespace: comms
spec:
suspend: true
backoffLimit: 2
template:
spec:

View File

@ -1,10 +1,15 @@
# services/comms/synapse-user-seed-job.yaml
# services/comms/oneoffs/synapse-user-seed-job.yaml
# One-off job for comms/synapse-user-seed-8.
# Purpose: synapse user seed 8 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: synapse-user-seed-7
name: synapse-user-seed-8
namespace: comms
spec:
suspend: true
backoffLimit: 1
ttlSecondsAfterFinished: 3600
template:

File diff suppressed because it is too large


View File

@ -11,7 +11,7 @@ spec:
roleName: "comms"
objects: |
- objectName: "harbor-pull__dockerconfigjson"
secretPath: "kv/data/atlas/harbor-pull/comms"
secretPath: "kv/data/atlas/shared/harbor-pull"
secretKey: "dockerconfigjson"
secretObjects:
- secretName: harbor-regcred
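The pull-secret objects above now all point at the single kv/data/atlas/shared/harbor-pull path. A quick read of that path over the Vault HTTP API (KV v2, hence the /data/ segment), assuming a VAULT_TOKEN with read access, to confirm the dockerconfigjson key is present:

# Sanity-check the consolidated pull-secret path; VAULT_TOKEN is assumed to be set.
import json
import os
import urllib.request

VAULT_ADDR = "http://vault.vault.svc.cluster.local:8200"
req = urllib.request.Request(
    f"{VAULT_ADDR}/v1/kv/data/atlas/shared/harbor-pull",
    headers={"X-Vault-Token": os.environ["VAULT_TOKEN"]},
)
with urllib.request.urlopen(req, timeout=30) as resp:
    data = json.loads(resp.read().decode("utf-8"))["data"]["data"]
print("dockerconfigjson present:", "dockerconfigjson" in data)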

View File

@ -11,7 +11,7 @@ spec:
roleName: "crypto"
objects: |
- objectName: "harbor-pull__dockerconfigjson"
secretPath: "kv/data/atlas/harbor-pull/crypto"
secretPath: "kv/data/atlas/shared/harbor-pull"
secretKey: "dockerconfigjson"
secretObjects:
- secretName: harbor-regcred

View File

@ -90,6 +90,8 @@ spec:
value: openid
- name: ACTUAL_MULTIUSER
value: "true"
- name: ACTUAL_USER_CREATION_MODE
value: login
- name: ACTUAL_OPENID_DISCOVERY_URL
value: https://sso.bstein.dev/realms/atlas
- name: ACTUAL_OPENID_AUTHORIZATION_ENDPOINT
@ -128,6 +130,8 @@ spec:
value: openid
- name: ACTUAL_MULTIUSER
value: "true"
- name: ACTUAL_USER_CREATION_MODE
value: login
- name: ACTUAL_OPENID_DISCOVERY_URL
value: https://sso.bstein.dev/realms/atlas
- name: ACTUAL_OPENID_AUTHORIZATION_ENDPOINT

View File

@ -6,6 +6,7 @@ metadata:
namespace: finance
spec:
schedule: "0 3 * * *"
suspend: true
concurrencyPolicy: Forbid
successfulJobsHistoryLimit: 1
failedJobsHistoryLimit: 3

View File

@ -9,7 +9,7 @@ resources:
- finance-secrets-ensure-rbac.yaml
- actual-budget-data-pvc.yaml
- firefly-storage-pvc.yaml
- finance-secrets-ensure-job.yaml
- oneoffs/finance-secrets-ensure-job.yaml
- actual-budget-deployment.yaml
- firefly-deployment.yaml
- firefly-user-sync-cronjob.yaml

View File

@ -1,10 +1,15 @@
# services/finance/finance-secrets-ensure-job.yaml
# services/finance/oneoffs/finance-secrets-ensure-job.yaml
# One-off job for finance/finance-secrets-ensure-5.
# Purpose: finance secrets ensure 5 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: finance-secrets-ensure-5
namespace: finance
spec:
suspend: true
backoffLimit: 1
ttlSecondsAfterFinished: 3600
template:

View File

@ -29,3 +29,17 @@ subjects:
- kind: ServiceAccount
name: bstein-dev-home
namespace: bstein-dev-home
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: ariadne-firefly-user-sync
namespace: finance
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: bstein-dev-home-firefly-user-sync
subjects:
- kind: ServiceAccount
name: ariadne
namespace: maintenance

View File

@ -169,6 +169,8 @@ spec:
value: "trace"
- name: GITEA__service__REQUIRE_SIGNIN_VIEW
value: "false"
- name: GITEA__webhook__ALLOWED_HOST_LIST
value: "ci.bstein.dev"
- name: GITEA__server__PROXY_HEADERS
value: "X-Forwarded-For, X-Forwarded-Proto, X-Forwarded-Host"
- name: GITEA__session__COOKIE_SECURE

View File

@ -391,6 +391,16 @@ spec:
$patch: delete
- name: core-writable
emptyDir: {}
- target:
kind: Ingress
name: harbor-ingress
patch: |-
- op: replace
path: /spec/rules/0/http/paths/2/backend/service/name
value: harbor-registry
- op: replace
path: /spec/rules/0/http/paths/2/backend/service/port/number
value: 5000
- target:
kind: Deployment
name: harbor-jobservice

View File

@ -11,7 +11,7 @@ spec:
roleName: "harbor"
objects: |
- objectName: "harbor-pull__dockerconfigjson"
secretPath: "kv/data/atlas/harbor-pull/harbor"
secretPath: "kv/data/atlas/shared/harbor-pull"
secretKey: "dockerconfigjson"
secretObjects:
- secretName: harbor-regcred

View File

@ -8,7 +8,7 @@ rules:
- apiGroups: ["batch"]
resources: ["cronjobs"]
verbs: ["get"]
resourceNames: ["wger-user-sync"]
resourceNames: ["wger-user-sync", "wger-admin-ensure"]
- apiGroups: ["batch"]
resources: ["jobs"]
verbs: ["create", "get", "list", "watch"]
@ -29,3 +29,17 @@ subjects:
- kind: ServiceAccount
name: bstein-dev-home
namespace: bstein-dev-home
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: ariadne-wger-user-sync
namespace: health
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: bstein-dev-home-wger-user-sync
subjects:
- kind: ServiceAccount
name: ariadne
namespace: maintenance

View File

@ -8,6 +8,7 @@ metadata:
atlas.bstein.dev/glue: "true"
spec:
schedule: "15 3 * * *"
suspend: true
concurrencyPolicy: Forbid
successfulJobsHistoryLimit: 1
failedJobsHistoryLimit: 3

View File

@ -0,0 +1,13 @@
# services/jenkins/cache-pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: jenkins-cache-v2
namespace: jenkins
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 20Gi
storageClassName: astreae

View File

@ -18,7 +18,7 @@ data:
logoutFromOpenIdProvider: true
postLogoutRedirectUrl: "https://ci.bstein.dev"
sendScopesInTokenRequest: true
rootURLFromRequest: true
rootURLFromRequest: false
userNameField: "preferred_username"
fullNameFieldName: "name"
emailFieldName: "email"
@ -49,8 +49,15 @@ data:
jobs:
- script: |
pipelineJob('harbor-arm-build') {
triggers {
scm('H/5 * * * *')
properties {
pipelineTriggers {
triggers {
scmTrigger {
scmpoll_spec('H/5 * * * *')
ignorePostCommitHooks(false)
}
}
}
}
definition {
cpsScm {
@ -83,8 +90,15 @@ data:
}
}
pipelineJob('ci-demo') {
triggers {
scm('H/1 * * * *')
properties {
pipelineTriggers {
triggers {
scmTrigger {
scmpoll_spec('H/1 * * * *')
ignorePostCommitHooks(false)
}
}
}
}
definition {
cpsScm {
@ -102,8 +116,15 @@ data:
}
}
pipelineJob('bstein-dev-home') {
triggers {
scm('H/2 * * * *')
properties {
pipelineTriggers {
triggers {
scmTrigger {
scmpoll_spec('H/2 * * * *')
ignorePostCommitHooks(false)
}
}
}
}
definition {
cpsScm {
@ -120,9 +141,42 @@ data:
}
}
}
pipelineJob('ariadne') {
properties {
pipelineTriggers {
triggers {
scmTrigger {
scmpoll_spec('H/2 * * * *')
ignorePostCommitHooks(false)
}
}
}
}
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/ariadne.git')
credentials('gitea-pat')
}
branches('*/master')
}
}
scriptPath('Jenkinsfile')
}
}
}
pipelineJob('data-prepper') {
triggers {
scm('H/5 * * * *')
properties {
pipelineTriggers {
triggers {
scmTrigger {
scmpoll_spec('H/5 * * * *')
ignorePostCommitHooks(false)
}
}
}
}
definition {
cpsScm {
@ -139,24 +193,39 @@ data:
}
}
}
pipelineJob('titan-iac-quality-gate') {
triggers {
scm('H/5 * * * *')
}
definition {
cpsScm {
scm {
multibranchPipelineJob('titan-iac-quality-gate') {
branchSources {
branchSource {
source {
git {
remote {
url('https://scm.bstein.dev/bstein/titan-iac.git')
credentials('gitea-pat')
}
branches('*/feature/vault-consumption')
id('titan-iac-quality-gate')
remote('https://scm.bstein.dev/bstein/titan-iac.git')
credentialsId('gitea-pat')
}
}
}
}
factory {
workflowBranchProjectFactory {
scriptPath('ci/Jenkinsfile.titan-iac')
}
}
orphanedItemStrategy {
discardOldItems {
numToKeep(30)
}
}
triggers {
periodicFolderTrigger {
interval('12h')
}
}
configure { node ->
def webhookToken = System.getenv('TITAN_IAC_WEBHOOK_TOKEN') ?: ''
def triggers = node / 'triggers'
def webhook = triggers.appendNode('com.igalg.jenkins.plugins.mswt.trigger.ComputedFolderWebHookTrigger')
webhook.appendNode('token', webhookToken)
}
}
base.yaml: |
jenkins:
@ -189,6 +258,11 @@ data:
templates:
- name: "default"
namespace: "jenkins"
workspaceVolume:
dynamicPVC:
accessModes: "ReadWriteOnce"
requestsSize: "20Gi"
storageClassName: "astreae"
containers:
- name: "jnlp"
args: "^${computer.jnlpmac} ^${computer.name}"
@ -217,3 +291,6 @@ data:
crumbIssuer:
standard:
excludeClientIPFromCrumb: true
unclassified:
location:
url: "https://ci.bstein.dev/"

View File

@ -6,12 +6,17 @@ metadata:
namespace: jenkins
data:
plugins.txt: |
kubernetes
workflow-aggregator
git
pipeline-utility-steps
configuration-as-code
configuration-as-code-support
oic-auth
job-dsl
simple-theme-plugin
kubernetes:4416.v2ea_b_5372da_a_e
workflow-aggregator:608.v67378e9d3db_1
git:5.8.1
pipeline-utility-steps:2.20.0
configuration-as-code:2031.veb_a_fdda_b_3ffd
oic-auth:4.609.v9de140f63d01
job-dsl:1.93
simple-theme-plugin:230.v8b_fd91b_b_800c
workflow-multibranch:821.vc3b_4ea_780798
branch-api:2.1268.v044a_87612da_8
scm-api:724.v7d839074eb_5c
gitea:268.v75e47974c01d
gitea-checks:603.621.vc708da_fb_371d
multibranch-scan-webhook-trigger:1.0.11

View File

@ -22,23 +22,33 @@ spec:
vault.hashicorp.com/role: "jenkins"
vault.hashicorp.com/agent-inject-secret-jenkins-env: "kv/data/atlas/jenkins/jenkins-oidc"
vault.hashicorp.com/agent-inject-template-jenkins-env: |
{{- with secret "kv/data/atlas/jenkins/jenkins-oidc" -}}
{{ with secret "kv/data/atlas/jenkins/jenkins-oidc" }}
OIDC_CLIENT_ID={{ .Data.data.clientId }}
OIDC_CLIENT_SECRET={{ .Data.data.clientSecret }}
OIDC_AUTH_URL={{ .Data.data.authorizationUrl }}
OIDC_TOKEN_URL={{ .Data.data.tokenUrl }}
OIDC_USERINFO_URL={{ .Data.data.userInfoUrl }}
OIDC_LOGOUT_URL={{ .Data.data.logoutUrl }}
{{- end }}
{{- with secret "kv/data/atlas/jenkins/harbor-robot-creds" -}}
{{ end }}
{{ with secret "kv/data/atlas/jenkins/harbor-robot-creds" }}
HARBOR_ROBOT_USERNAME={{ .Data.data.username }}
HARBOR_ROBOT_PASSWORD={{ .Data.data.password }}
{{ end }}
{{ with secret "kv/data/atlas/shared/harbor-pull" }}
{{- if and .Data.data.username .Data.data.password }}
HARBOR_ROBOT_USERNAME={{ .Data.data.username }}
HARBOR_ROBOT_PASSWORD={{ .Data.data.password }}
{{- end }}
{{- with secret "kv/data/atlas/jenkins/gitea-pat" -}}
{{ end }}
{{ with secret "kv/data/atlas/jenkins/gitea-pat" }}
GITEA_PAT_USERNAME={{ .Data.data.username }}
GITEA_PAT_TOKEN={{ .Data.data.token }}
{{- end -}}
bstein.dev/restarted-at: "2026-01-19T00:25:00Z"
{{ end }}
{{ with secret "kv/data/atlas/jenkins/webhook-tokens" }}
TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }}
GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME={{ .Data.data.git_notify_bstein_dev_home }}
{{ end }}
bstein.dev/restarted-at: "2026-01-20T14:52:41Z"
spec:
serviceAccountName: jenkins
nodeSelector:
@ -98,7 +108,9 @@ spec:
containerPort: 50000
env:
- name: JAVA_OPTS
value: "-Xms512m -Xmx2048m"
value: "-Xms512m -Xmx2048m -Duser.timezone=America/Chicago"
- name: TZ
value: "America/Chicago"
- name: JENKINS_OPTS
value: "--webroot=/var/jenkins_cache/war"
- name: JENKINS_SLAVE_AGENT_PORT
@ -148,6 +160,8 @@ spec:
mountPath: /config/jcasc
- name: init-scripts
mountPath: /usr/share/jenkins/ref/init.groovy.d
- name: init-scripts
mountPath: /var/jenkins_home/init.groovy.d
- name: plugin-dir
mountPath: /usr/share/jenkins/ref/plugins
- name: tmp
@ -157,9 +171,11 @@ spec:
persistentVolumeClaim:
claimName: jenkins
- name: jenkins-cache
emptyDir: {}
persistentVolumeClaim:
claimName: jenkins-cache-v2
- name: plugin-dir
emptyDir: {}
persistentVolumeClaim:
claimName: jenkins-plugins-v2
- name: plugins
configMap:
name: jenkins-plugins
@ -170,4 +186,5 @@ spec:
configMap:
name: jenkins-init-scripts
- name: tmp
emptyDir: {}
emptyDir:
medium: Memory

View File

@ -5,9 +5,14 @@ namespace: jenkins
resources:
- namespace.yaml
- serviceaccount.yaml
- vault-serviceaccount.yaml
- pvc.yaml
- cache-pvc.yaml
- plugins-pvc.yaml
- configmap-jcasc.yaml
- configmap-plugins.yaml
- secretproviderclass.yaml
- vault-sync-deployment.yaml
- deployment.yaml
- service.yaml
- ingress.yaml
@ -16,6 +21,7 @@ configMapGenerator:
- name: jenkins-init-scripts
namespace: jenkins
files:
- git-notify-token.groovy=scripts/git-notify-token.groovy
- theme.groovy=scripts/theme.groovy
options:
disableNameSuffixHash: true

View File

@ -0,0 +1,13 @@
# services/jenkins/plugins-pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: jenkins-plugins-v2
namespace: jenkins
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 10Gi
storageClassName: astreae

View File

@ -0,0 +1,41 @@
import hudson.plugins.git.ApiTokenPropertyConfiguration
import hudson.Util
import java.nio.charset.StandardCharsets
import java.security.MessageDigest
def entries = [
[env: 'GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME', name: 'gitea-bstein-dev-home'],
]
entries.each { entry ->
def token = System.getenv(entry.env)
if (!token || token.trim().isEmpty()) {
println("Git notifyCommit token ${entry.env} missing; skipping")
return
}
try {
def config = ApiTokenPropertyConfiguration.get()
if (config.hasMatchingApiToken(token)) {
println("Git notifyCommit token ${entry.name} already configured")
return
}
def digest = MessageDigest.getInstance("SHA-256")
def hash = Util.toHexString(digest.digest(token.getBytes(StandardCharsets.US_ASCII)))
def field = ApiTokenPropertyConfiguration.class.getDeclaredField("apiTokens")
field.setAccessible(true)
def tokens = field.get(config)
def ctor = ApiTokenPropertyConfiguration.HashedApiToken.class.getDeclaredConstructor(String.class, String.class)
ctor.setAccessible(true)
tokens.add(ctor.newInstance(entry.name, hash))
config.save()
println("Added git notifyCommit access token ${entry.name}")
} catch (Throwable e) {
println("Failed to configure git notifyCommit token ${entry.name}: ${e.class.simpleName}: ${e.message}")
}
}
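The init script above registers a git-plugin access token named gitea-bstein-dev-home for /git/notifyCommit. A sketch of the ping a Gitea webhook would send, assuming the token equals GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME and that the repository URL below (inferred from the token name) matches the job's SCM remote:

# Illustrative notifyCommit ping; the repo URL is an assumption based on the token name.
import os
import urllib.parse
import urllib.request

params = urllib.parse.urlencode({
    "url": "https://scm.bstein.dev/bstein/bstein-dev-home.git",
    "token": os.environ["GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME"],
})
with urllib.request.urlopen(f"https://ci.bstein.dev/git/notifyCommit?{params}", timeout=30) as resp:
    print(resp.status, resp.read().decode("utf-8", "replace"))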

View File

@ -1,15 +1,137 @@
import jenkins.model.Jenkins
import org.codefirst.SimpleThemeDecorator
import org.jenkinsci.plugins.simpletheme.CssTextThemeElement
def instance = Jenkins.get()
def decorators = instance.getExtensionList(SimpleThemeDecorator.class)
if (decorators?.size() > 0) {
def theme = decorators[0]
theme.setCssUrl("https://jenkins-contrib-themes.github.io/jenkins-material-theme/dist/material-ocean.css")
def cssRules = """
:root,
.app-theme-picker__picker[data-theme=none] {
--background: #0f1216 !important;
--header-background: #141922 !important;
--header-border: #2b313b !important;
--white: #141922 !important;
--black: #e6e9ef !important;
--very-light-grey: #171b21 !important;
--light-grey: #202734 !important;
--medium-grey: #2b313b !important;
--dark-grey: #0b0f14 !important;
--text-color: #e6e9ef !important;
--text-color-secondary: #a6adba !important;
--card-background: #171b21 !important;
--card-border-color: #2b313b !important;
--pane-header-bg: #1f252d !important;
--pane-header-border-color: #2b313b !important;
--pane-border-color: #2b313b !important;
--pane-text-color: #e6e9ef !important;
--pane-header-text-color: #e6e9ef !important;
--link-color: #8fb7ff !important;
--link-color--hover: #b0ccff !important;
--link-dark-color: #e6e9ef !important;
--link-dark-color--hover: #b0ccff !important;
--input-color: #151a20 !important;
--input-border: #2b313b !important;
--input-border-hover: #3a424d !important;
--button-background: #232a33 !important;
--button-background--hover: #2b313b !important;
--button-background--active: #323b46 !important;
--item-background--hover: #232a33 !important;
--item-background--active: #2b313b !important;
--accent-color: #8fb7ff !important;
}
body,
#page-body,
#page-header,
#header,
#main-panel,
#main-panel-content,
#side-panel,
.top-sticker-inner,
.bottom-sticker-inner,
#breadcrumbBar,
#breadcrumbs {
background-color: var(--background) !important;
color: var(--text-color) !important;
}
.jenkins-card,
.jenkins-section,
.jenkins-section__item,
#main-panel .jenkins-card,
#main-panel .jenkins-section {
background-color: var(--card-background) !important;
color: var(--text-color) !important;
border-color: var(--card-border-color) !important;
}
table.pane,
table.pane td,
table.pane th,
#projectstatus td,
#projectstatus th {
background-color: var(--card-background) !important;
color: var(--text-color) !important;
}
table.pane tr:nth-child(even) td,
#projectstatus tr:hover td {
background-color: #1f252d !important;
}
input,
select,
textarea,
#search-box {
background-color: #151a20 !important;
color: var(--text-color) !important;
border-color: var(--input-border) !important;
}
a,
a:visited,
a:link {
color: var(--link-color) !important;
}
a:hover {
opacity: 0.85;
}
#side-panel .task-link,
#breadcrumbs a,
#breadcrumbs,
#projectstatus th a {
color: var(--text-color-secondary) !important;
}
.console-output,
.console-output pre,
pre,
code,
.CodeMirror {
background-color: #0c0f14 !important;
color: #d9dee7 !important;
}
#footer {
background-color: var(--background) !important;
color: var(--text-color-secondary) !important;
}
.jenkins_ver:after {
content: "atlas dark";
}
""".stripIndent().trim()
theme.setElements([new CssTextThemeElement(cssRules)])
theme.setCssUrl("")
theme.setCssRules(cssRules)
theme.setJsUrl("")
theme.setTheme("")
instance.save()
theme.save()
println("Applied simple-theme-plugin dark theme")
} else {
println("simple-theme-plugin not installed; skipping theme configuration")

View File

@ -0,0 +1,21 @@
# services/jenkins/secretproviderclass.yaml
apiVersion: secrets-store.csi.x-k8s.io/v1
kind: SecretProviderClass
metadata:
name: jenkins-vault
namespace: jenkins
spec:
provider: vault
parameters:
vaultAddress: "http://vault.vault.svc.cluster.local:8200"
roleName: "jenkins"
objects: |
- objectName: "harbor-pull__dockerconfigjson"
secretPath: "kv/data/atlas/shared/harbor-pull"
secretKey: "dockerconfigjson"
secretObjects:
- secretName: harbor-bstein-robot
type: kubernetes.io/dockerconfigjson
data:
- objectName: harbor-pull__dockerconfigjson
key: .dockerconfigjson

View File

@ -0,0 +1,6 @@
# services/jenkins/vault-serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: jenkins-vault-sync
namespace: jenkins

View File

@ -0,0 +1,37 @@
# services/jenkins/vault-sync-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: jenkins-vault-sync
namespace: jenkins
spec:
replicas: 1
selector:
matchLabels:
app: jenkins-vault-sync
template:
metadata:
labels:
app: jenkins-vault-sync
spec:
serviceAccountName: jenkins-vault-sync
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
containers:
- name: sync
image: alpine:3.20
command: ["/bin/sh", "-c"]
args:
- "sleep infinity"
volumeMounts:
- name: vault-secrets
mountPath: /vault/secrets
readOnly: true
volumes:
- name: vault-secrets
csi:
driver: secrets-store.csi.k8s.io
readOnly: true
volumeAttributes:
secretProviderClass: jenkins-vault

View File

@ -126,7 +126,7 @@ spec:
- name: KC_EVENTS_LISTENERS
value: jboss-logging,mailu-http
- name: KC_SPI_EVENTS_LISTENER_MAILU-HTTP_ENDPOINT
value: http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events
value: http://ariadne.maintenance.svc.cluster.local/events
ports:
- containerPort: 8080
name: http

View File

@ -10,21 +10,21 @@ resources:
- secretproviderclass.yaml
- vault-sync-deployment.yaml
- deployment.yaml
- realm-settings-job.yaml
- portal-admin-client-secret-ensure-job.yaml
- portal-e2e-client-job.yaml
- portal-e2e-target-client-job.yaml
- portal-e2e-token-exchange-permissions-job.yaml
- portal-e2e-token-exchange-test-job.yaml
- portal-e2e-execute-actions-email-test-job.yaml
- ldap-federation-job.yaml
- user-overrides-job.yaml
- mas-secrets-ensure-job.yaml
- synapse-oidc-secret-ensure-job.yaml
- logs-oidc-secret-ensure-job.yaml
- harbor-oidc-secret-ensure-job.yaml
- vault-oidc-secret-ensure-job.yaml
- actual-oidc-secret-ensure-job.yaml
- oneoffs/realm-settings-job.yaml
- oneoffs/portal-admin-client-secret-ensure-job.yaml
- oneoffs/portal-e2e-client-job.yaml
- oneoffs/portal-e2e-target-client-job.yaml
- oneoffs/portal-e2e-token-exchange-permissions-job.yaml
- oneoffs/portal-e2e-token-exchange-test-job.yaml
- oneoffs/portal-e2e-execute-actions-email-test-job.yaml
- oneoffs/ldap-federation-job.yaml
- oneoffs/user-overrides-job.yaml
- oneoffs/mas-secrets-ensure-job.yaml
- oneoffs/synapse-oidc-secret-ensure-job.yaml
- oneoffs/logs-oidc-secret-ensure-job.yaml
- oneoffs/harbor-oidc-secret-ensure-job.yaml
- oneoffs/vault-oidc-secret-ensure-job.yaml
- oneoffs/actual-oidc-secret-ensure-job.yaml
- service.yaml
- ingress.yaml
generatorOptions:

View File

@ -1,10 +1,15 @@
# services/keycloak/actual-oidc-secret-ensure-job.yaml
# services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml
# One-off job for sso/actual-oidc-secret-ensure-3.
# Purpose: actual oidc secret ensure 3 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: actual-oidc-secret-ensure-3
namespace: sso
spec:
suspend: true
backoffLimit: 0
ttlSecondsAfterFinished: 3600
template:

View File

@ -1,10 +1,15 @@
# services/keycloak/harbor-oidc-secret-ensure-job.yaml
# services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml
# One-off job for sso/harbor-oidc-secret-ensure-10.
# Purpose: harbor oidc secret ensure 10 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: harbor-oidc-secret-ensure-9
name: harbor-oidc-secret-ensure-10
namespace: sso
spec:
suspend: true
backoffLimit: 0
ttlSecondsAfterFinished: 3600
template:

View File

@ -1,10 +1,15 @@
# services/keycloak/ldap-federation-job.yaml
# services/keycloak/oneoffs/ldap-federation-job.yaml
# One-off job for sso/keycloak-ldap-federation-12.
# Purpose: keycloak ldap federation 12 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: keycloak-ldap-federation-11
name: keycloak-ldap-federation-12
namespace: sso
spec:
suspend: true
backoffLimit: 2
template:
metadata:
@ -325,6 +330,54 @@ spec:
if status not in (201, 204):
raise SystemExit(f"Unexpected group mapper create status: {status}")
def ensure_user_attr_mapper(name: str, ldap_attr: str, user_attr: str):
mapper = None
for c in components:
if c.get("name") == name and c.get("parentId") == ldap_component_id:
mapper = c
break
payload = {
"name": name,
"providerId": "user-attribute-ldap-mapper",
"providerType": "org.keycloak.storage.ldap.mappers.LDAPStorageMapper",
"parentId": ldap_component_id,
"config": {
"ldap.attribute": [ldap_attr],
"user.model.attribute": [user_attr],
"read.only": ["false"],
"always.read.value.from.ldap": ["false"],
"is.mandatory.in.ldap": ["false"],
},
}
if mapper:
payload["id"] = mapper["id"]
payload["parentId"] = mapper.get("parentId", payload["parentId"])
print(f"Updating LDAP user mapper: {payload['id']} ({name})")
status, _, _ = http_json(
"PUT",
f"{base_url}/admin/realms/{realm}/components/{payload['id']}",
token,
payload,
)
if status not in (200, 204):
raise SystemExit(f"Unexpected user mapper update status for {name}: {status}")
else:
print(f"Creating LDAP user mapper: {name}")
status, _, _ = http_json(
"POST",
f"{base_url}/admin/realms/{realm}/components",
token,
payload,
)
if status not in (201, 204):
raise SystemExit(f"Unexpected user mapper create status for {name}: {status}")
ensure_user_attr_mapper("openldap-email", "mail", "email")
ensure_user_attr_mapper("openldap-first-name", "givenName", "firstName")
ensure_user_attr_mapper("openldap-last-name", "sn", "lastName")
# Clean up duplicate LDAP federation providers and their child components (mappers, etc.).
# Keep only the canonical provider we updated/created above.
try:

View File

@ -1,10 +1,15 @@
# services/keycloak/logs-oidc-secret-ensure-job.yaml
# services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml
# One-off job for sso/logs-oidc-secret-ensure-10.
# Purpose: logs oidc secret ensure 10 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: logs-oidc-secret-ensure-10
namespace: sso
spec:
suspend: true
backoffLimit: 0
ttlSecondsAfterFinished: 3600
template:

View File

@ -1,4 +1,8 @@
# services/keycloak/mas-secrets-ensure-job.yaml
# services/keycloak/oneoffs/mas-secrets-ensure-job.yaml
# One-off job for sso/mas-secrets-ensure.
# Purpose: mas secrets ensure (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: v1
kind: ServiceAccount
metadata:
@ -13,6 +17,7 @@ metadata:
name: mas-secrets-ensure-21
namespace: sso
spec:
suspend: true
backoffLimit: 0
ttlSecondsAfterFinished: 3600
template:

View File

@ -1,10 +1,15 @@
# services/keycloak/portal-admin-client-secret-ensure-job.yaml
# services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml
# One-off job for sso/keycloak-portal-admin-secret-ensure-4.
# Purpose: keycloak portal admin secret ensure 4 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: keycloak-portal-admin-secret-ensure-4
namespace: sso
spec:
suspend: true
backoffLimit: 0
template:
metadata:

View File

@ -1,10 +1,15 @@
# services/keycloak/portal-e2e-client-job.yaml
# services/keycloak/oneoffs/portal-e2e-client-job.yaml
# One-off job for sso/keycloak-portal-e2e-client-8.
# Purpose: keycloak portal e2e client 8 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: keycloak-portal-e2e-client-8
namespace: sso
spec:
suspend: true
backoffLimit: 0
template:
metadata:

View File

@ -1,10 +1,15 @@
# services/keycloak/portal-e2e-execute-actions-email-test-job.yaml
# services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml
# One-off job for sso/keycloak-portal-e2e-execute-actions-email-14.
# Purpose: keycloak portal e2e execute actions email 14 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: keycloak-portal-e2e-execute-actions-email-14
namespace: sso
spec:
suspend: true
backoffLimit: 3
template:
metadata:

Some files were not shown because too many files have changed in this diff