Compare commits

main...feature/postgres-migration

No commits in common. "main" and "feature/postgres-migration" have entirely different histories.

402 changed files with 7785 additions and 33247 deletions
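Because the two branches share no merge base, every file is compared tree-to-tree rather than against a common ancestor, which accounts for the large totals above. A quick local check, sketched here on the assumption that both branches are fetched under the names shown:

# prints nothing and exits non-zero when the branches have no common ancestor
git merge-base main feature/postgres-migration || echo "no common ancestor"

# direct tree-to-tree comparison; the totals should match the 402-file summary above
git diff --stat main feature/postgres-migration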

2
.gitignore vendored
View File

@ -6,5 +6,3 @@ __pycache__/
*.py[cod]
.pytest_cache
.venv
.venv-ci
tmp/

77
Jenkinsfile vendored
View File

@ -1,77 +0,0 @@
// Mirror of ci/Jenkinsfile.titan-iac for multibranch discovery.
pipeline {
agent {
kubernetes {
defaultContainer 'python'
yaml """
apiVersion: v1
kind: Pod
spec:
nodeSelector:
hardware: rpi5
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
containers:
- name: python
image: python:3.12-slim
command:
- cat
tty: true
"""
}
}
environment {
PIP_DISABLE_PIP_VERSION_CHECK = '1'
PYTHONUNBUFFERED = '1'
}
stages {
stage('Checkout') {
steps {
checkout scm
}
}
stage('Install deps') {
steps {
sh 'pip install --no-cache-dir -r ci/requirements.txt'
}
}
stage('Glue tests') {
steps {
sh 'pytest -q ci/tests/glue'
}
}
stage('Resolve Flux branch') {
steps {
script {
env.FLUX_BRANCH = sh(
returnStdout: true,
script: "awk '/branch:/{print $2; exit}' clusters/atlas/flux-system/gotk-sync.yaml"
).trim()
if (!env.FLUX_BRANCH) {
error('Flux branch not found in gotk-sync.yaml')
}
echo "Flux branch: ${env.FLUX_BRANCH}"
}
}
}
stage('Promote') {
when {
expression {
def branch = env.BRANCH_NAME ?: (env.GIT_BRANCH ?: '').replaceFirst('origin/', '')
return env.FLUX_BRANCH && branch == env.FLUX_BRANCH
}
}
steps {
withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) {
sh '''
set +x
git config user.email "jenkins@bstein.dev"
git config user.name "jenkins"
git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git
git push origin HEAD:${FLUX_BRANCH}
'''
}
}
}
}
}

View File

@ -1,76 +0,0 @@
pipeline {
agent {
kubernetes {
defaultContainer 'python'
yaml """
apiVersion: v1
kind: Pod
spec:
nodeSelector:
hardware: rpi5
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
containers:
- name: python
image: python:3.12-slim
command:
- cat
tty: true
"""
}
}
environment {
PIP_DISABLE_PIP_VERSION_CHECK = '1'
PYTHONUNBUFFERED = '1'
}
stages {
stage('Checkout') {
steps {
checkout scm
}
}
stage('Install deps') {
steps {
sh 'pip install --no-cache-dir -r ci/requirements.txt'
}
}
stage('Glue tests') {
steps {
sh 'pytest -q ci/tests/glue'
}
}
stage('Resolve Flux branch') {
steps {
script {
env.FLUX_BRANCH = sh(
returnStdout: true,
script: "awk '/branch:/{print $2; exit}' clusters/atlas/flux-system/gotk-sync.yaml"
).trim()
if (!env.FLUX_BRANCH) {
error('Flux branch not found in gotk-sync.yaml')
}
echo "Flux branch: ${env.FLUX_BRANCH}"
}
}
}
stage('Promote') {
when {
expression {
def branch = env.BRANCH_NAME ?: (env.GIT_BRANCH ?: '').replaceFirst('origin/', '')
return env.FLUX_BRANCH && branch == env.FLUX_BRANCH
}
}
steps {
withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) {
sh '''
set +x
git config user.email "jenkins@bstein.dev"
git config user.name "jenkins"
git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git
git push origin HEAD:${FLUX_BRANCH}
'''
}
}
}
}
}

View File

@ -1,4 +0,0 @@
pytest==8.3.4
kubernetes==30.1.0
PyYAML==6.0.2
requests==2.32.3

View File

@ -1,16 +0,0 @@
max_success_age_hours: 48
allow_suspended:
- bstein-dev-home/vaultwarden-cred-sync
- comms/othrys-room-reset
- comms/pin-othrys-invite
- comms/seed-othrys-room
- finance/firefly-user-sync
- health/wger-admin-ensure
- health/wger-user-sync
- mailu-mailserver/mailu-sync-nightly
- nextcloud/nextcloud-mail-sync
ariadne_schedule_tasks:
- schedule.mailu_sync
- schedule.nextcloud_sync
- schedule.vaultwarden_sync
- schedule.wger_admin

View File

@ -1,46 +0,0 @@
from __future__ import annotations
from datetime import datetime, timezone
from pathlib import Path
import yaml
from kubernetes import client, config
CONFIG_PATH = Path(__file__).with_name("config.yaml")
def _load_config() -> dict:
with CONFIG_PATH.open("r", encoding="utf-8") as handle:
return yaml.safe_load(handle) or {}
def _load_kube():
try:
config.load_incluster_config()
except config.ConfigException:
config.load_kube_config()
def test_glue_cronjobs_recent_success():
cfg = _load_config()
max_age_hours = int(cfg.get("max_success_age_hours", 48))
allow_suspended = set(cfg.get("allow_suspended", []))
_load_kube()
batch = client.BatchV1Api()
cronjobs = batch.list_cron_job_for_all_namespaces(label_selector="atlas.bstein.dev/glue=true").items
assert cronjobs, "No glue cronjobs found with atlas.bstein.dev/glue=true"
now = datetime.now(timezone.utc)
for cronjob in cronjobs:
name = f"{cronjob.metadata.namespace}/{cronjob.metadata.name}"
if cronjob.spec.suspend:
assert name in allow_suspended, f"{name} is suspended but not in allow_suspended"
continue
last_success = cronjob.status.last_successful_time
assert last_success is not None, f"{name} has no lastSuccessfulTime"
age_hours = (now - last_success).total_seconds() / 3600
assert age_hours <= max_age_hours, f"{name} last success {age_hours:.1f}h ago"

View File

@ -1,48 +0,0 @@
from __future__ import annotations
import os
from pathlib import Path
import requests
import yaml
VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server:8428").rstrip("/")
CONFIG_PATH = Path(__file__).with_name("config.yaml")
def _load_config() -> dict:
with CONFIG_PATH.open("r", encoding="utf-8") as handle:
return yaml.safe_load(handle) or {}
def _query(promql: str) -> list[dict]:
response = requests.get(f"{VM_URL}/api/v1/query", params={"query": promql}, timeout=10)
response.raise_for_status()
payload = response.json()
return payload.get("data", {}).get("result", [])
def test_glue_metrics_present():
series = _query('kube_cronjob_labels{label_atlas_bstein_dev_glue="true"}')
assert series, "No glue cronjob label series found"
def test_glue_metrics_success_join():
query = (
"kube_cronjob_status_last_successful_time "
'and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue="true"}'
)
series = _query(query)
assert series, "No glue cronjob last success series found"
def test_ariadne_schedule_metrics_present():
cfg = _load_config()
expected = cfg.get("ariadne_schedule_tasks", [])
if not expected:
return
series = _query("ariadne_schedule_next_run_timestamp_seconds")
tasks = {item.get("metric", {}).get("task") for item in series}
missing = [task for task in expected if task not in tasks]
assert not missing, f"Missing Ariadne schedule metrics for: {', '.join(missing)}"

View File

@ -0,0 +1,13 @@
# clusters/atlas/applications/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../../services/crypto
- ../../services/gitea
- ../../services/jellyfin
- ../../services/comms
- ../../services/monitoring
- ../../services/logging
- ../../services/pegasus
- ../../services/vault
- ../../services/bstein-dev-home

View File

@ -1,17 +0,0 @@
# clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: bstein-dev-home-migrations
namespace: flux-system
spec:
interval: 10m
path: ./services/bstein-dev-home/oneoffs/migrations
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: bstein-dev-home
wait: false
suspend: true

View File

@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: bstein-dev-home
namespace: bstein-dev-home
namespace: flux-system
spec:
interval: 1m0s
sourceRef:
@ -13,14 +13,14 @@ spec:
git:
checkout:
ref:
branch: feature/ariadne
branch: main
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(bstein-dev-home): automated image update"
messageTemplate: "chore(bstein-dev-home): update images to {{range .Updated.Images}}{{.}}{{end}}"
push:
branch: feature/ariadne
branch: main
update:
strategy: Setters
path: services/bstein-dev-home

View File

@ -1,4 +1,4 @@
# clusters/atlas/flux-system/applications/comms/kustomization.yaml
# clusters/atlas/flux-system/applications/communication/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:

View File

@ -1,24 +0,0 @@
# clusters/atlas/flux-system/applications/finance/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: finance
namespace: flux-system
spec:
interval: 10m
path: ./services/finance
prune: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: finance
healthChecks:
- apiVersion: apps/v1
kind: Deployment
name: actual-budget
namespace: finance
- apiVersion: apps/v1
kind: Deployment
name: firefly
namespace: finance
wait: false

View File

@ -13,6 +13,11 @@ spec:
kind: GitRepository
name: flux-system
namespace: flux-system
healthChecks:
- apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
name: harbor
namespace: harbor
wait: false
dependsOn:
- name: core

View File

@ -1,25 +0,0 @@
# clusters/atlas/flux-system/applications/health/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: health
namespace: flux-system
spec:
interval: 10m
path: ./services/health
prune: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: health
dependsOn:
- name: keycloak
- name: postgres
- name: traefik
- name: vault
healthChecks:
- apiVersion: apps/v1
kind: Deployment
name: wger
namespace: health
wait: false

View File

@ -12,12 +12,10 @@ resources:
- pegasus/image-automation.yaml
- bstein-dev-home/kustomization.yaml
- bstein-dev-home/image-automation.yaml
- bstein-dev-home-migrations/kustomization.yaml
- harbor/kustomization.yaml
- harbor/image-automation.yaml
- jellyfin/kustomization.yaml
- xmr-miner/kustomization.yaml
- wallet-monero-temp/kustomization.yaml
- sui-metrics/kustomization.yaml
- openldap/kustomization.yaml
- keycloak/kustomization.yaml
@ -29,5 +27,3 @@ resources:
- nextcloud-mail-sync/kustomization.yaml
- outline/kustomization.yaml
- planka/kustomization.yaml
- finance/kustomization.yaml
- health/kustomization.yaml

View File

@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: pegasus
namespace: jellyfin
namespace: flux-system
spec:
interval: 1m0s
sourceRef:

View File

@ -1,19 +0,0 @@
# clusters/atlas/flux-system/applications/wallet-monero-temp/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: wallet-monero-temp
namespace: flux-system
spec:
interval: 10m
path: ./services/crypto/wallet-monero-temp
targetNamespace: crypto
prune: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
dependsOn:
- name: crypto
- name: xmr-miner
wait: true

View File

@ -1,4 +1,3 @@
# clusters/atlas/flux-system/gotk-components.yaml
---
# This manifest was generated by flux. DO NOT EDIT.
# Flux Version: v2.7.5

View File

@ -1,4 +1,3 @@
# clusters/atlas/flux-system/gotk-sync.yaml
# This manifest was generated by flux. DO NOT EDIT.
---
apiVersion: source.toolkit.fluxcd.io/v1
@ -9,7 +8,7 @@ metadata:
spec:
interval: 1m0s
ref:
branch: feature/ariadne
branch: feature/sso-hardening
secretRef:
name: flux-system-gitea
url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git

View File

@ -1,17 +0,0 @@
# clusters/atlas/flux-system/platform/cert-manager-cleanup/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: cert-manager-cleanup
namespace: flux-system
spec:
interval: 30m
path: ./infrastructure/cert-manager/cleanup
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
targetNamespace: cert-manager
wait: true

View File

@ -1,19 +0,0 @@
# clusters/atlas/flux-system/platform/cert-manager/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: cert-manager
namespace: flux-system
spec:
interval: 30m
path: ./infrastructure/cert-manager
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
targetNamespace: cert-manager
dependsOn:
- name: helm
wait: true

View File

@ -4,17 +4,12 @@ kind: Kustomization
resources:
- core/kustomization.yaml
- helm/kustomization.yaml
- cert-manager/kustomization.yaml
- metallb/kustomization.yaml
- traefik/kustomization.yaml
- gitops-ui/kustomization.yaml
- monitoring/kustomization.yaml
- logging/kustomization.yaml
- maintenance/kustomization.yaml
- maintenance/image-automation.yaml
- longhorn-adopt/kustomization.yaml
- longhorn/kustomization.yaml
- longhorn-ui/kustomization.yaml
- postgres/kustomization.yaml
- ../platform/vault-csi/kustomization.yaml
- ../platform/vault-injector/kustomization.yaml

View File

@ -1,17 +0,0 @@
# clusters/atlas/flux-system/platform/longhorn-adopt/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: longhorn-adopt
namespace: flux-system
spec:
interval: 30m
path: ./infrastructure/longhorn/adopt
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
targetNamespace: longhorn-system
wait: true

View File

@ -15,5 +15,4 @@ spec:
namespace: flux-system
dependsOn:
- name: core
- name: longhorn
wait: true

View File

@ -1,20 +0,0 @@
# clusters/atlas/flux-system/platform/longhorn/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: longhorn
namespace: flux-system
spec:
interval: 30m
path: ./infrastructure/longhorn/core
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
targetNamespace: longhorn-system
dependsOn:
- name: helm
- name: longhorn-adopt
wait: false

View File

@ -1,26 +0,0 @@
# clusters/atlas/flux-system/platform/maintenance/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: maintenance
namespace: maintenance
spec:
interval: 1m0s
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
git:
checkout:
ref:
branch: feature/ariadne
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(maintenance): automated image update"
push:
branch: feature/ariadne
update:
strategy: Setters
path: services/maintenance

View File

@ -8,7 +8,6 @@ spec:
interval: 10m
path: ./services/maintenance
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system

View File

@ -1,16 +0,0 @@
# clusters/atlas/flux-system/platform/vault-injector/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: vault-injector
namespace: flux-system
spec:
interval: 30m
path: ./infrastructure/vault-injector
targetNamespace: vault
prune: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
wait: true

View File

@ -0,0 +1,8 @@
# clusters/atlas/platform/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../../../infrastructure/modules/base
- ../../../infrastructure/modules/profiles/atlas-ha
- ../../../infrastructure/sources/cert-manager/letsencrypt.yaml
- ../../../infrastructure/metallb

View File

@ -1,5 +0,0 @@
FROM python:3.11-slim
ENV PIP_DISABLE_PIP_VERSION_CHECK=1
RUN pip install --no-cache-dir requests psycopg2-binary

View File

@ -1,9 +0,0 @@
FROM registry.bstein.dev/infra/harbor-core:v2.14.1-arm64
USER root
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
USER harbor
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/harbor/entrypoint.sh"]

View File

@ -1,9 +0,0 @@
FROM registry.bstein.dev/infra/harbor-jobservice:v2.14.1-arm64
USER root
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
USER harbor
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/harbor/entrypoint.sh"]

View File

@ -1,9 +0,0 @@
FROM registry.bstein.dev/infra/harbor-registry:v2.14.1-arm64
USER root
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
USER harbor
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/home/harbor/entrypoint.sh"]

View File

@ -1,9 +0,0 @@
FROM registry.bstein.dev/infra/harbor-registryctl:v2.14.1-arm64
USER root
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
USER harbor
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/home/harbor/start.sh"]

View File

@ -1,10 +0,0 @@
FROM ghcr.io/element-hq/lk-jwt-service:0.3.0 AS base
FROM alpine:3.20
RUN apk add --no-cache ca-certificates
COPY --from=base /lk-jwt-service /lk-jwt-service
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/lk-jwt-service"]

View File

@ -1,10 +0,0 @@
FROM quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 AS base
FROM alpine:3.20
RUN apk add --no-cache ca-certificates
COPY --from=base /bin/oauth2-proxy /bin/oauth2-proxy
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/bin/oauth2-proxy"]

View File

@ -1,10 +0,0 @@
FROM registry.bstein.dev/streaming/pegasus:1.2.32 AS base
FROM alpine:3.20
RUN apk add --no-cache ca-certificates
COPY --from=base /pegasus /pegasus
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/pegasus"]

View File

@ -1,34 +0,0 @@
#!/bin/sh
set -eu
if [ -n "${VAULT_ENV_FILE:-}" ]; then
if [ -f "${VAULT_ENV_FILE}" ]; then
# shellcheck disable=SC1090
. "${VAULT_ENV_FILE}"
else
echo "Vault env file not found: ${VAULT_ENV_FILE}" >&2
exit 1
fi
fi
if [ -n "${VAULT_COPY_FILES:-}" ]; then
old_ifs="$IFS"
IFS=','
for pair in ${VAULT_COPY_FILES}; do
src="${pair%%:*}"
dest="${pair#*:}"
if [ -z "${src}" ] || [ -z "${dest}" ]; then
echo "Vault copy entry malformed: ${pair}" >&2
exit 1
fi
if [ ! -f "${src}" ]; then
echo "Vault file not found: ${src}" >&2
exit 1
fi
mkdir -p "$(dirname "${dest}")"
cp "${src}" "${dest}"
done
IFS="$old_ifs"
fi
exec "$@"

View File

@ -1,40 +0,0 @@
# infrastructure/cert-manager/cleanup/cert-manager-cleanup-job.yaml
apiVersion: batch/v1
kind: Job
metadata:
name: cert-manager-cleanup-2
namespace: cert-manager
spec:
backoffLimit: 1
template:
spec:
serviceAccountName: cert-manager-cleanup
restartPolicy: Never
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node-role.kubernetes.io/worker
operator: Exists
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
containers:
- name: cleanup
image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
command: ["/usr/bin/env", "bash"]
args: ["/scripts/cert_manager_cleanup.sh"]
volumeMounts:
- name: script
mountPath: /scripts
readOnly: true
volumes:
- name: script
configMap:
name: cert-manager-cleanup-script
defaultMode: 0555

View File

@ -1,58 +0,0 @@
# infrastructure/cert-manager/cleanup/cert-manager-cleanup-rbac.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: cert-manager-cleanup
namespace: cert-manager
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: cert-manager-cleanup
rules:
- apiGroups: [""]
resources:
- pods
- services
- endpoints
- configmaps
- secrets
- serviceaccounts
verbs: ["get", "list", "watch", "delete"]
- apiGroups: ["apps"]
resources:
- deployments
- daemonsets
- statefulsets
- replicasets
verbs: ["get", "list", "watch", "delete"]
- apiGroups: ["batch"]
resources:
- jobs
- cronjobs
verbs: ["get", "list", "watch", "delete"]
- apiGroups: ["rbac.authorization.k8s.io"]
resources:
- roles
- rolebindings
- clusterroles
- clusterrolebindings
verbs: ["get", "list", "watch", "delete"]
- apiGroups: ["admissionregistration.k8s.io"]
resources:
- validatingwebhookconfigurations
- mutatingwebhookconfigurations
verbs: ["get", "list", "watch", "delete"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: cert-manager-cleanup
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: cert-manager-cleanup
subjects:
- kind: ServiceAccount
name: cert-manager-cleanup
namespace: cert-manager

View File

@ -1,15 +0,0 @@
# infrastructure/cert-manager/cleanup/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- cert-manager-cleanup-rbac.yaml
- cert-manager-cleanup-job.yaml
configMapGenerator:
- name: cert-manager-cleanup-script
namespace: cert-manager
files:
- cert_manager_cleanup.sh=scripts/cert_manager_cleanup.sh
options:
disableNameSuffixHash: true

View File

@ -1,5 +0,0 @@
# infrastructure/cert-manager/cleanup/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: cert-manager

View File

@ -1,37 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
namespace="cert-manager"
selectors=(
"app.kubernetes.io/name=cert-manager"
"app.kubernetes.io/instance=cert-manager"
"app.kubernetes.io/instance=certmanager-prod"
)
delete_namespaced() {
local selector="$1"
kubectl -n "${namespace}" delete deployment,daemonset,statefulset,replicaset \
--selector "${selector}" --ignore-not-found --wait=false
kubectl -n "${namespace}" delete pod,service,endpoints,serviceaccount,configmap,secret \
--selector "${selector}" --ignore-not-found --wait=false
kubectl -n "${namespace}" delete role,rolebinding \
--selector "${selector}" --ignore-not-found --wait=false
kubectl -n "${namespace}" delete job,cronjob \
--selector "${selector}" --ignore-not-found --wait=false
}
delete_cluster_scoped() {
local selector="$1"
kubectl delete clusterrole,clusterrolebinding \
--selector "${selector}" --ignore-not-found --wait=false
kubectl delete mutatingwebhookconfiguration,validatingwebhookconfiguration \
--selector "${selector}" --ignore-not-found --wait=false
}
for selector in "${selectors[@]}"; do
delete_namespaced "${selector}"
delete_cluster_scoped "${selector}"
done
kubectl delete mutatingwebhookconfiguration cert-manager-webhook --ignore-not-found --wait=false
kubectl delete validatingwebhookconfiguration cert-manager-webhook --ignore-not-found --wait=false

View File

@ -1,67 +0,0 @@
# infrastructure/cert-manager/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: cert-manager
namespace: cert-manager
spec:
interval: 30m
chart:
spec:
chart: cert-manager
version: v1.17.0
sourceRef:
kind: HelmRepository
name: jetstack
namespace: flux-system
install:
crds: CreateReplace
remediation: { retries: 3 }
timeout: 10m
upgrade:
crds: CreateReplace
remediation:
retries: 3
remediateLastFailure: true
cleanupOnFail: true
timeout: 10m
values:
installCRDs: true
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- rpi4
webhook:
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- rpi4
cainjector:
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- rpi4

View File

@ -1,6 +0,0 @@
# infrastructure/cert-manager/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- helmrelease.yaml

View File

@ -1,5 +0,0 @@
# infrastructure/cert-manager/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: cert-manager

View File

@ -1,47 +0,0 @@
# infrastructure/core/coredns-custom.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: coredns-custom
namespace: kube-system
data:
bstein-dev.server: |
bstein.dev:53 {
errors
cache 30
hosts {
192.168.22.9 alerts.bstein.dev
192.168.22.9 auth.bstein.dev
192.168.22.9 bstein.dev
10.43.6.87 budget.bstein.dev
192.168.22.9 call.live.bstein.dev
192.168.22.9 cd.bstein.dev
192.168.22.9 chat.ai.bstein.dev
192.168.22.9 ci.bstein.dev
192.168.22.9 cloud.bstein.dev
192.168.22.9 health.bstein.dev
192.168.22.9 kit.live.bstein.dev
192.168.22.9 live.bstein.dev
192.168.22.9 logs.bstein.dev
192.168.22.9 longhorn.bstein.dev
192.168.22.4 mail.bstein.dev
192.168.22.9 matrix.live.bstein.dev
192.168.22.9 metrics.bstein.dev
192.168.22.9 monero.bstein.dev
10.43.6.87 money.bstein.dev
192.168.22.9 notes.bstein.dev
192.168.22.9 office.bstein.dev
192.168.22.9 pegasus.bstein.dev
3.136.224.193 pm-bounces.bstein.dev
3.150.68.49 pm-bounces.bstein.dev
18.189.137.81 pm-bounces.bstein.dev
192.168.22.9 registry.bstein.dev
192.168.22.9 scm.bstein.dev
192.168.22.9 secret.bstein.dev
192.168.22.9 sso.bstein.dev
192.168.22.9 stream.bstein.dev
192.168.22.9 tasks.bstein.dev
192.168.22.9 vault.bstein.dev
fallthrough
}
}

View File

@ -1,141 +0,0 @@
# infrastructure/core/coredns-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: coredns
namespace: kube-system
labels:
k8s-app: kube-dns
kubernetes.io/name: CoreDNS
spec:
progressDeadlineSeconds: 600
replicas: 2
revisionHistoryLimit: 0
selector:
matchLabels:
k8s-app: kube-dns
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 25%
maxUnavailable: 1
template:
metadata:
labels:
k8s-app: kube-dns
spec:
containers:
- name: coredns
image: registry.bstein.dev/infra/coredns:1.12.1
imagePullPolicy: IfNotPresent
args:
- -conf
- /etc/coredns/Corefile
ports:
- containerPort: 53
name: dns
protocol: UDP
- containerPort: 53
name: dns-tcp
protocol: TCP
- containerPort: 9153
name: metrics
protocol: TCP
livenessProbe:
httpGet:
path: /health
port: 8080
scheme: HTTP
initialDelaySeconds: 60
periodSeconds: 10
timeoutSeconds: 1
successThreshold: 1
failureThreshold: 3
readinessProbe:
httpGet:
path: /ready
port: 8181
scheme: HTTP
periodSeconds: 2
timeoutSeconds: 1
successThreshold: 1
failureThreshold: 3
resources:
limits:
memory: 170Mi
requests:
cpu: 100m
memory: 70Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
add:
- NET_BIND_SERVICE
drop:
- all
readOnlyRootFilesystem: true
volumeMounts:
- name: config-volume
mountPath: /etc/coredns
readOnly: true
- name: custom-config-volume
mountPath: /etc/coredns/custom
readOnly: true
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- rpi4
- key: node-role.kubernetes.io/worker
operator: In
values:
- "true"
dnsPolicy: Default
nodeSelector:
kubernetes.io/os: linux
priorityClassName: system-cluster-critical
restartPolicy: Always
schedulerName: default-scheduler
serviceAccountName: coredns
tolerations:
- key: CriticalAddonsOnly
operator: Exists
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
- key: node-role.kubernetes.io/master
operator: Exists
effect: NoSchedule
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: DoNotSchedule
labelSelector:
matchLabels:
k8s-app: kube-dns
- maxSkew: 1
topologyKey: topology.kubernetes.io/zone
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
k8s-app: kube-dns
volumes:
- name: config-volume
configMap:
name: coredns
defaultMode: 420
items:
- key: Corefile
path: Corefile
- key: NodeHosts
path: NodeHosts
- name: custom-config-volume
configMap:
name: coredns-custom
optional: true
defaultMode: 420

View File

@ -4,8 +4,5 @@ kind: Kustomization
resources:
- ../modules/base
- ../modules/profiles/atlas-ha
- coredns-custom.yaml
- coredns-deployment.yaml
- ntp-sync-daemonset.yaml
- ../sources/cert-manager/letsencrypt.yaml
- ../sources/cert-manager/letsencrypt-prod.yaml

View File

@ -1,50 +0,0 @@
# infrastructure/core/ntp-sync-daemonset.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: ntp-sync
namespace: kube-system
labels:
app: ntp-sync
spec:
selector:
matchLabels:
app: ntp-sync
template:
metadata:
labels:
app: ntp-sync
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node-role.kubernetes.io/control-plane
operator: DoesNotExist
- key: node-role.kubernetes.io/master
operator: DoesNotExist
containers:
- name: ntp-sync
image: public.ecr.aws/docker/library/busybox:1.36.1
imagePullPolicy: IfNotPresent
command: ["/bin/sh", "-c"]
args:
- |
set -eu
while true; do
ntpd -q -p pool.ntp.org || true
sleep 300
done
securityContext:
capabilities:
add: ["SYS_TIME"]
runAsUser: 0
runAsGroup: 0
resources:
requests:
cpu: 10m
memory: 16Mi
limits:
cpu: 50m
memory: 64Mi

View File

@ -1,15 +0,0 @@
# infrastructure/longhorn/adopt/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- longhorn-adopt-rbac.yaml
- longhorn-helm-adopt-job.yaml
configMapGenerator:
- name: longhorn-helm-adopt-script
namespace: longhorn-system
files:
- longhorn_helm_adopt.sh=scripts/longhorn_helm_adopt.sh
options:
disableNameSuffixHash: true

View File

@ -1,56 +0,0 @@
# infrastructure/longhorn/adopt/longhorn-adopt-rbac.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: longhorn-helm-adopt
namespace: longhorn-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: longhorn-helm-adopt
rules:
- apiGroups: [""]
resources:
- configmaps
- services
- serviceaccounts
- secrets
verbs: ["get", "list", "watch", "patch", "update"]
- apiGroups: ["apps"]
resources:
- deployments
- daemonsets
verbs: ["get", "list", "watch", "patch", "update"]
- apiGroups: ["batch"]
resources:
- jobs
verbs: ["get", "list", "watch", "patch", "update"]
- apiGroups: ["rbac.authorization.k8s.io"]
resources:
- roles
- rolebindings
- clusterroles
- clusterrolebindings
verbs: ["get", "list", "watch", "patch", "update"]
- apiGroups: ["apiextensions.k8s.io"]
resources:
- customresourcedefinitions
verbs: ["get", "list", "watch", "patch", "update"]
- apiGroups: ["scheduling.k8s.io"]
resources:
- priorityclasses
verbs: ["get", "list", "watch", "patch", "update"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: longhorn-helm-adopt
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: longhorn-helm-adopt
subjects:
- kind: ServiceAccount
name: longhorn-helm-adopt
namespace: longhorn-system

View File

@ -1,40 +0,0 @@
# infrastructure/longhorn/adopt/longhorn-helm-adopt-job.yaml
apiVersion: batch/v1
kind: Job
metadata:
name: longhorn-helm-adopt-2
namespace: longhorn-system
spec:
backoffLimit: 1
template:
spec:
serviceAccountName: longhorn-helm-adopt
restartPolicy: Never
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node-role.kubernetes.io/worker
operator: Exists
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
containers:
- name: adopt
image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
command: ["/usr/bin/env", "bash"]
args: ["/scripts/longhorn_helm_adopt.sh"]
volumeMounts:
- name: script
mountPath: /scripts
readOnly: true
volumes:
- name: script
configMap:
name: longhorn-helm-adopt-script
defaultMode: 0555

View File

@ -1,5 +0,0 @@
# infrastructure/longhorn/adopt/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: longhorn-system

View File

@ -1,52 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
release_name="longhorn"
release_namespace="longhorn-system"
selector="app.kubernetes.io/instance=${release_name}"
annotate_and_label() {
local scope="$1"
local kind="$2"
if [ "${scope}" = "namespaced" ]; then
kubectl -n "${release_namespace}" annotate "${kind}" -l "${selector}" \
meta.helm.sh/release-name="${release_name}" \
meta.helm.sh/release-namespace="${release_namespace}" \
--overwrite >/dev/null 2>&1 || true
kubectl -n "${release_namespace}" label "${kind}" -l "${selector}" \
app.kubernetes.io/managed-by=Helm --overwrite >/dev/null 2>&1 || true
else
kubectl annotate "${kind}" -l "${selector}" \
meta.helm.sh/release-name="${release_name}" \
meta.helm.sh/release-namespace="${release_namespace}" \
--overwrite >/dev/null 2>&1 || true
kubectl label "${kind}" -l "${selector}" \
app.kubernetes.io/managed-by=Helm --overwrite >/dev/null 2>&1 || true
fi
}
namespaced_kinds=(
configmap
service
serviceaccount
deployment
daemonset
job
role
rolebinding
)
cluster_kinds=(
clusterrole
clusterrolebinding
customresourcedefinition
priorityclass
)
for kind in "${namespaced_kinds[@]}"; do
annotate_and_label "namespaced" "${kind}"
done
for kind in "${cluster_kinds[@]}"; do
annotate_and_label "cluster" "${kind}"
done

View File

@ -1,80 +0,0 @@
# infrastructure/longhorn/core/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: longhorn
namespace: longhorn-system
spec:
interval: 30m
chart:
spec:
chart: longhorn
version: 1.8.2
sourceRef:
kind: HelmRepository
name: longhorn
namespace: flux-system
install:
crds: Skip
remediation: { retries: 3 }
timeout: 15m
upgrade:
crds: Skip
remediation:
retries: 3
remediateLastFailure: true
cleanupOnFail: true
timeout: 15m
values:
service:
ui:
type: NodePort
nodePort: 30824
privateRegistry:
createSecret: false
registrySecret: longhorn-registry
image:
pullPolicy: Always
longhorn:
engine:
repository: registry.bstein.dev/infra/longhorn-engine
tag: v1.8.2
manager:
repository: registry.bstein.dev/infra/longhorn-manager
tag: v1.8.2
ui:
repository: registry.bstein.dev/infra/longhorn-ui
tag: v1.8.2
instanceManager:
repository: registry.bstein.dev/infra/longhorn-instance-manager
tag: v1.8.2
shareManager:
repository: registry.bstein.dev/infra/longhorn-share-manager
tag: v1.8.2
backingImageManager:
repository: registry.bstein.dev/infra/longhorn-backing-image-manager
tag: v1.8.2
supportBundleKit:
repository: registry.bstein.dev/infra/longhorn-support-bundle-kit
tag: v0.0.56
csi:
attacher:
repository: registry.bstein.dev/infra/longhorn-csi-attacher
tag: v4.9.0
provisioner:
repository: registry.bstein.dev/infra/longhorn-csi-provisioner
tag: v5.3.0
nodeDriverRegistrar:
repository: registry.bstein.dev/infra/longhorn-csi-node-driver-registrar
tag: v2.14.0
resizer:
repository: registry.bstein.dev/infra/longhorn-csi-resizer
tag: v1.13.2
snapshotter:
repository: registry.bstein.dev/infra/longhorn-csi-snapshotter
tag: v8.2.0
livenessProbe:
repository: registry.bstein.dev/infra/longhorn-livenessprobe
tag: v2.16.0
defaultSettings:
systemManagedPodsImagePullPolicy: Always

View File

@ -1,18 +0,0 @@
# infrastructure/longhorn/core/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- vault-serviceaccount.yaml
- secretproviderclass.yaml
- vault-sync-deployment.yaml
- helmrelease.yaml
- longhorn-settings-ensure-job.yaml
configMapGenerator:
- name: longhorn-settings-ensure-script
files:
- longhorn_settings_ensure.sh=scripts/longhorn_settings_ensure.sh
generatorOptions:
disableNameSuffixHash: true

View File

@ -1,36 +0,0 @@
# infrastructure/longhorn/core/longhorn-settings-ensure-job.yaml
apiVersion: batch/v1
kind: Job
metadata:
name: longhorn-settings-ensure-4
namespace: longhorn-system
spec:
backoffLimit: 0
ttlSecondsAfterFinished: 3600
template:
spec:
serviceAccountName: longhorn-service-account
restartPolicy: Never
volumes:
- name: longhorn-settings-ensure-script
configMap:
name: longhorn-settings-ensure-script
defaultMode: 0555
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
- key: node-role.kubernetes.io/worker
operator: Exists
containers:
- name: apply
image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
command: ["/scripts/longhorn_settings_ensure.sh"]
volumeMounts:
- name: longhorn-settings-ensure-script
mountPath: /scripts
readOnly: true

View File

@ -1,5 +0,0 @@
# infrastructure/longhorn/core/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: longhorn-system

View File

@ -1,42 +0,0 @@
#!/usr/bin/env sh
set -eu
# Longhorn blocks direct CR patches for some settings; use the internal API instead.
api_base="http://longhorn-backend.longhorn-system.svc:9500/v1/settings"
wait_for_api() {
attempts=30
while [ "${attempts}" -gt 0 ]; do
if curl -fsS "${api_base}" >/dev/null 2>&1; then
return 0
fi
attempts=$((attempts - 1))
sleep 2
done
echo "Longhorn API not ready after retries." >&2
return 1
}
update_setting() {
name="$1"
value="$2"
current="$(curl -fsS "${api_base}/${name}" || true)"
if echo "${current}" | grep -Fq "\"value\":\"${value}\""; then
echo "Setting ${name} already set."
return 0
fi
echo "Setting ${name} -> ${value}"
curl -fsS -X PUT \
-H "Content-Type: application/json" \
-d "{\"value\":\"${value}\"}" \
"${api_base}/${name}" >/dev/null
}
wait_for_api
update_setting default-engine-image "registry.bstein.dev/infra/longhorn-engine:v1.8.2"
update_setting default-instance-manager-image "registry.bstein.dev/infra/longhorn-instance-manager:v1.8.2"
update_setting default-backing-image-manager-image "registry.bstein.dev/infra/longhorn-backing-image-manager:v1.8.2"
update_setting support-bundle-manager-image "registry.bstein.dev/infra/longhorn-support-bundle-kit:v0.0.56"

View File

@ -1,21 +0,0 @@
# infrastructure/longhorn/core/secretproviderclass.yaml
apiVersion: secrets-store.csi.x-k8s.io/v1
kind: SecretProviderClass
metadata:
name: longhorn-vault
namespace: longhorn-system
spec:
provider: vault
parameters:
vaultAddress: "http://vault.vault.svc.cluster.local:8200"
roleName: "longhorn"
objects: |
- objectName: "harbor-pull__dockerconfigjson"
secretPath: "kv/data/atlas/shared/harbor-pull"
secretKey: "dockerconfigjson"
secretObjects:
- secretName: longhorn-registry
type: kubernetes.io/dockerconfigjson
data:
- objectName: harbor-pull__dockerconfigjson
key: .dockerconfigjson

View File

@ -1,6 +0,0 @@
# infrastructure/longhorn/core/vault-serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: longhorn-vault-sync
namespace: longhorn-system

View File

@ -1,45 +0,0 @@
# infrastructure/longhorn/core/vault-sync-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: longhorn-vault-sync
namespace: longhorn-system
spec:
replicas: 1
selector:
matchLabels:
app: longhorn-vault-sync
template:
metadata:
labels:
app: longhorn-vault-sync
spec:
serviceAccountName: longhorn-vault-sync
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 80
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5", "rpi4"]
containers:
- name: sync
image: alpine:3.20
command: ["/bin/sh", "-c"]
args:
- "sleep infinity"
volumeMounts:
- name: vault-secrets
mountPath: /vault/secrets
readOnly: true
volumes:
- name: vault-secrets
csi:
driver: secrets-store.csi.k8s.io
readOnly: true
volumeAttributes:
secretProviderClass: longhorn-vault

View File

@ -2,7 +2,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- serviceaccount.yaml
- oauth2-proxy-longhorn.yaml
- middleware.yaml
- ingress.yaml
- oauth2-proxy-longhorn.yaml

View File

@ -32,18 +32,7 @@ spec:
metadata:
labels:
app: oauth2-proxy-longhorn
annotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "longhorn"
vault.hashicorp.com/agent-inject-secret-oidc-config: "kv/data/atlas/longhorn/oauth2-proxy"
vault.hashicorp.com/agent-inject-template-oidc-config: |
{{- with secret "kv/data/atlas/longhorn/oauth2-proxy" -}}
client_id = "{{ .Data.data.client_id }}"
client_secret = "{{ .Data.data.client_secret }}"
cookie_secret = "{{ .Data.data.cookie_secret }}"
{{- end -}}
spec:
serviceAccountName: longhorn-vault
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
@ -61,7 +50,6 @@ spec:
imagePullPolicy: IfNotPresent
args:
- --provider=oidc
- --config=/vault/secrets/oidc-config
- --redirect-url=https://longhorn.bstein.dev/oauth2/callback
- --oidc-issuer-url=https://sso.bstein.dev/realms/atlas
- --scope=openid profile email groups
@ -81,6 +69,22 @@ spec:
- --skip-jwt-bearer-tokens=true
- --oidc-groups-claim=groups
- --cookie-domain=longhorn.bstein.dev
env:
- name: OAUTH2_PROXY_CLIENT_ID
valueFrom:
secretKeyRef:
name: oauth2-proxy-longhorn-oidc
key: client_id
- name: OAUTH2_PROXY_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: oauth2-proxy-longhorn-oidc
key: client_secret
- name: OAUTH2_PROXY_COOKIE_SECRET
valueFrom:
secretKeyRef:
name: oauth2-proxy-longhorn-oidc
key: cookie_secret
ports:
- containerPort: 4180
name: http

View File

@ -1,6 +0,0 @@
# infrastructure/longhorn/ui-ingress/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: longhorn-vault
namespace: longhorn-system

View File

@ -1,47 +0,0 @@
# infrastructure/metallb/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: metallb
namespace: metallb-system
spec:
interval: 30m
chart:
spec:
chart: metallb
version: 0.15.3
sourceRef:
kind: HelmRepository
name: metallb
namespace: flux-system
install:
crds: CreateReplace
remediation: { retries: 3 }
timeout: 10m
upgrade:
crds: CreateReplace
remediation:
retries: 3
remediateLastFailure: true
cleanupOnFail: true
timeout: 10m
values:
loadBalancerClass: metallb
prometheus:
metricsPort: 7472
controller:
logLevel: info
webhookMode: enabled
tlsMinVersion: VersionTLS12
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi4
- rpi5
speaker:
logLevel: info

View File

@ -3,5 +3,8 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- helmrelease.yaml
- metallb-rendered.yaml
- ippool.yaml
patchesStrategicMerge:
- patches/node-placement.yaml
- patches/speaker-loglevel.yaml

File diff suppressed because it is too large

View File

@ -0,0 +1,27 @@
# infrastructure/metallb/patches/node-placement.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: metallb-controller
namespace: metallb-system
spec:
template:
spec:
containers:
- name: controller
args:
- --port=7472
- --log-level=info
- --webhook-mode=enabled
- --tls-min-version=VersionTLS12
- --lb-class=metallb
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi4
- rpi5

View File

@ -0,0 +1,15 @@
# infrastructure/metallb/patches/speaker-loglevel.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: metallb-speaker
namespace: metallb-system
spec:
template:
spec:
containers:
- name: speaker
args:
- --port=7472
- --log-level=info
- --lb-class=metallb

View File

@ -1,24 +0,0 @@
# infrastructure/modules/base/storageclass/asteria-encrypted.yaml
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: asteria-encrypted
parameters:
diskSelector: asteria
fromBackup: ""
numberOfReplicas: "2"
staleReplicaTimeout: "30"
fsType: "ext4"
replicaAutoBalance: "least-effort"
dataLocality: "disabled"
encrypted: "true"
csi.storage.k8s.io/provisioner-secret-name: ${pvc.name}
csi.storage.k8s.io/provisioner-secret-namespace: ${pvc.namespace}
csi.storage.k8s.io/node-publish-secret-name: ${pvc.name}
csi.storage.k8s.io/node-publish-secret-namespace: ${pvc.namespace}
csi.storage.k8s.io/node-stage-secret-name: ${pvc.name}
csi.storage.k8s.io/node-stage-secret-namespace: ${pvc.namespace}
provisioner: driver.longhorn.io
reclaimPolicy: Retain
allowVolumeExpansion: true
volumeBindingMode: Immediate

View File

@ -3,5 +3,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- asteria.yaml
- asteria-encrypted.yaml
- astreae.yaml

View File

@ -11,5 +11,5 @@ spec:
roleName: "postgres"
objects: |
- objectName: "postgres_password"
secretPath: "kv/data/atlas/postgres/postgres-db"
secretPath: "kv/data/postgres"
secretKey: "POSTGRES_PASSWORD"

View File

@ -4,10 +4,6 @@ kind: Service
metadata:
name: postgres-service
namespace: postgres
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9187"
prometheus.io/path: "/metrics"
spec:
clusterIP: None
ports:
@ -15,9 +11,5 @@ spec:
port: 5432
protocol: TCP
targetPort: 5432
- name: metrics
port: 9187
protocol: TCP
targetPort: 9187
selector:
app: postgres

View File

@ -58,23 +58,6 @@ spec:
- name: vault-secrets
mountPath: /mnt/vault
readOnly: true
- name: postgres-exporter
image: quay.io/prometheuscommunity/postgres-exporter:v0.15.0
ports:
- name: metrics
containerPort: 9187
protocol: TCP
env:
- name: DATA_SOURCE_URI
value: "localhost:5432/postgres?sslmode=disable"
- name: DATA_SOURCE_USER
value: postgres
- name: DATA_SOURCE_PASS_FILE
value: /mnt/vault/postgres_password
volumeMounts:
- name: vault-secrets
mountPath: /mnt/vault
readOnly: true
volumes:
- name: vault-secrets
csi:

View File

@ -1,11 +1,10 @@
# infrastructure/sources/cert-manager/letsencrypt-prod.yaml
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
name: letsencrypt-prod
spec:
acme:
email: brad@bstein.dev
email: brad.stein@gmail.com
server: https://acme-v02.api.letsencrypt.org/directory
privateKeySecretRef:
name: letsencrypt-prod-account-key

View File

@ -1,11 +1,10 @@
# infrastructure/sources/cert-manager/letsencrypt.yaml
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
name: letsencrypt
spec:
acme:
email: brad@bstein.dev
email: brad.stein@gmail.com
server: https://acme-v02.api.letsencrypt.org/directory
privateKeySecretRef:
name: letsencrypt-account-key

View File

@ -1,9 +0,0 @@
# infrastructure/sources/helm/ananace.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: ananace
namespace: flux-system
spec:
interval: 1h
url: https://ananace.gitlab.io/charts

View File

@ -2,18 +2,15 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ananace.yaml
- fluent-bit.yaml
- grafana.yaml
- hashicorp.yaml
- jetstack.yaml
- jenkins.yaml
- mailu.yaml
- metallb.yaml
- opentelemetry.yaml
- opensearch.yaml
- harbor.yaml
- longhorn.yaml
- prometheus.yaml
- victoria-metrics.yaml
- secrets-store-csi.yaml

View File

@ -1,9 +0,0 @@
# infrastructure/sources/helm/longhorn.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: longhorn
namespace: flux-system
spec:
interval: 30m
url: https://charts.longhorn.io

View File

@ -1,9 +0,0 @@
# infrastructure/sources/helm/metallb.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: metallb
namespace: flux-system
spec:
interval: 1h
url: https://metallb.github.io/metallb

File diff suppressed because it is too large

View File

@ -27,8 +27,6 @@ items:
creationTimestamp: null
labels:
app: traefik
app.kubernetes.io/instance: traefik-kube-system
app.kubernetes.io/name: traefik
spec:
containers:
- args:

View File

@ -5,7 +5,6 @@ metadata:
name: traefik
namespace: flux-system
resources:
- crds.yaml
- deployment.yaml
- serviceaccount.yaml
- clusterrole.yaml

View File

@ -3,10 +3,9 @@ apiVersion: v1
kind: Service
metadata:
name: traefik
namespace: traefik
namespace: kube-system
annotations:
metallb.universe.tf/address-pool: communication-pool
metallb.universe.tf/allow-shared-ip: traefik
spec:
type: LoadBalancer
loadBalancerClass: metallb
@ -21,4 +20,5 @@ spec:
targetPort: websecure
protocol: TCP
selector:
app: traefik
app.kubernetes.io/instance: traefik-kube-system
app.kubernetes.io/name: traefik

View File

@ -17,5 +17,4 @@ spec:
values:
syncSecret:
enabled: true
enableSecretRotation: true
rotationPollInterval: 2m
enableSecretRotation: false

View File

@ -1,43 +0,0 @@
# infrastructure/vault-injector/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: vault-injector
namespace: vault
spec:
interval: 30m
chart:
spec:
chart: vault
version: 0.31.0
sourceRef:
kind: HelmRepository
name: hashicorp
namespace: flux-system
install:
remediation: { retries: 3 }
timeout: 10m
upgrade:
remediation:
retries: 3
remediateLastFailure: true
cleanupOnFail: true
timeout: 10m
values:
global:
externalVaultAddr: http://vault.vault.svc.cluster.local:8200
tlsDisable: true
server:
enabled: false
csi:
enabled: false
injector:
enabled: true
replicas: 1
agentImage:
repository: hashicorp/vault
tag: "1.17.6"
webhook:
failurePolicy: Ignore
nodeSelector:
node-role.kubernetes.io/worker: "true"

View File

@ -1,5 +0,0 @@
# infrastructure/vault-injector/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helmrelease.yaml

View File

@ -1,8 +1,8 @@
{
"counts": {
"helmrelease_host_hints": 19,
"http_endpoints": 45,
"services": 47,
"workloads": 74
"helmrelease_host_hints": 7,
"http_endpoints": 35,
"services": 44,
"workloads": 49
}
}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

View File

@ -17,11 +17,6 @@ flowchart LR
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend
wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"]
svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend
host_budget_bstein_dev["budget.bstein.dev"]
svc_finance_actual_budget["finance/actual-budget (Service)"]
host_budget_bstein_dev --> svc_finance_actual_budget
wl_finance_actual_budget["finance/actual-budget (Deployment)"]
svc_finance_actual_budget --> wl_finance_actual_budget
host_call_live_bstein_dev["call.live.bstein.dev"]
svc_comms_element_call["comms/element-call (Service)"]
host_call_live_bstein_dev --> svc_comms_element_call
@ -42,11 +37,6 @@ flowchart LR
host_cloud_bstein_dev --> svc_nextcloud_nextcloud
wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"]
svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud
host_health_bstein_dev["health.bstein.dev"]
svc_health_wger["health/wger (Service)"]
host_health_bstein_dev --> svc_health_wger
wl_health_wger["health/wger (Deployment)"]
svc_health_wger --> wl_health_wger
host_kit_live_bstein_dev["kit.live.bstein.dev"]
svc_comms_livekit_token_service["comms/livekit-token-service (Service)"]
host_kit_live_bstein_dev --> svc_comms_livekit_token_service
@ -57,22 +47,15 @@ flowchart LR
wl_comms_livekit["comms/livekit (Deployment)"]
svc_comms_livekit --> wl_comms_livekit
host_live_bstein_dev["live.bstein.dev"]
svc_comms_othrys_element_element_web["comms/othrys-element-element-web (Service)"]
host_live_bstein_dev --> svc_comms_othrys_element_element_web
wl_comms_othrys_element_element_web["comms/othrys-element-element-web (Deployment)"]
svc_comms_othrys_element_element_web --> wl_comms_othrys_element_element_web
host_live_bstein_dev --> svc_comms_matrix_wellknown
svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"]
host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
host_live_bstein_dev --> svc_comms_matrix_guest_register
wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
host_live_bstein_dev --> svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
host_logs_bstein_dev["logs.bstein.dev"]
svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"]
host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs
wl_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Deployment)"]
svc_logging_oauth2_proxy_logs --> wl_logging_oauth2_proxy_logs
wl_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Deployment)"]
svc_comms_othrys_synapse_matrix_synapse --> wl_comms_othrys_synapse_matrix_synapse
host_longhorn_bstein_dev["longhorn.bstein.dev"]
svc_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Service)"]
host_longhorn_bstein_dev --> svc_longhorn_system_oauth2_proxy_longhorn
@ -82,25 +65,21 @@ flowchart LR
svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"]
host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front
host_matrix_live_bstein_dev["matrix.live.bstein.dev"]
svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown
host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register
wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
host_monero_bstein_dev["monero.bstein.dev"]
svc_crypto_monerod["crypto/monerod (Service)"]
host_monero_bstein_dev --> svc_crypto_monerod
wl_crypto_monerod["crypto/monerod (Deployment)"]
svc_crypto_monerod --> wl_crypto_monerod
host_money_bstein_dev["money.bstein.dev"]
svc_finance_firefly["finance/firefly (Service)"]
host_money_bstein_dev --> svc_finance_firefly
wl_finance_firefly["finance/firefly (Deployment)"]
svc_finance_firefly --> wl_finance_firefly
host_notes_bstein_dev["notes.bstein.dev"]
svc_outline_outline["outline/outline (Service)"]
host_notes_bstein_dev --> svc_outline_outline
wl_outline_outline["outline/outline (Deployment)"]
svc_outline_outline --> wl_outline_outline
host_office_bstein_dev["office.bstein.dev"]
svc_nextcloud_collabora["nextcloud/collabora (Service)"]
host_office_bstein_dev --> svc_nextcloud_collabora
@ -131,11 +110,6 @@ flowchart LR
host_stream_bstein_dev --> svc_jellyfin_jellyfin
wl_jellyfin_jellyfin["jellyfin/jellyfin (Deployment)"]
svc_jellyfin_jellyfin --> wl_jellyfin_jellyfin
host_tasks_bstein_dev["tasks.bstein.dev"]
svc_planka_planka["planka/planka (Service)"]
host_tasks_bstein_dev --> svc_planka_planka
wl_planka_planka["planka/planka (Deployment)"]
svc_planka_planka --> wl_planka_planka
host_vault_bstein_dev["vault.bstein.dev"]
svc_vaultwarden_vaultwarden_service["vaultwarden/vaultwarden-service (Service)"]
host_vault_bstein_dev --> svc_vaultwarden_vaultwarden_service
@ -159,30 +133,23 @@ flowchart LR
wl_comms_livekit_token_service
svc_comms_livekit
wl_comms_livekit
svc_comms_othrys_element_element_web
wl_comms_othrys_element_element_web
svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register
wl_comms_matrix_guest_register
wl_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service
svc_comms_matrix_guest_register
wl_comms_matrix_guest_register
end
subgraph crypto[crypto]
svc_crypto_monerod
wl_crypto_monerod
end
subgraph finance[finance]
svc_finance_actual_budget
wl_finance_actual_budget
svc_finance_firefly
wl_finance_firefly
end
subgraph gitea[gitea]
svc_gitea_gitea
wl_gitea_gitea
end
subgraph health[health]
svc_health_wger
wl_health_wger
end
subgraph jellyfin[jellyfin]
svc_jellyfin_pegasus
wl_jellyfin_pegasus
@ -193,10 +160,6 @@ flowchart LR
svc_jenkins_jenkins
wl_jenkins_jenkins
end
subgraph logging[logging]
svc_logging_oauth2_proxy_logs
wl_logging_oauth2_proxy_logs
end
subgraph longhorn_system[longhorn-system]
svc_longhorn_system_oauth2_proxy_longhorn
wl_longhorn_system_oauth2_proxy_longhorn
@ -210,14 +173,6 @@ flowchart LR
svc_nextcloud_collabora
wl_nextcloud_collabora
end
subgraph outline[outline]
svc_outline_outline
wl_outline_outline
end
subgraph planka[planka]
svc_planka_planka
wl_planka_planka
end
subgraph sso[sso]
svc_sso_oauth2_proxy
wl_sso_oauth2_proxy

View File

@ -70,7 +70,6 @@ WORKER_NODES = [
"titan-13",
"titan-14",
"titan-15",
"titan-16",
"titan-17",
"titan-18",
"titan-19",
@ -86,17 +85,19 @@ WORKER_TOTAL = len(WORKER_NODES)
CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
WORKER_SUFFIX = f"/{WORKER_TOTAL}"
# Namespaces considered infrastructure (excluded from workload counts)
INFRA_PATTERNS = [
"kube-.*",
".*-system",
"traefik",
INFRA_NAMESPACES = [
"kube-system",
"longhorn-system",
"metallb-system",
"monitoring",
"logging",
"cert-manager",
"flux-system",
"traefik",
"maintenance",
"postgres",
]
INFRA_REGEX = f"^({'|'.join(INFRA_PATTERNS)})$"
INFRA_REGEX = f"^({'|'.join(INFRA_NAMESPACES)})$"
# Namespaces allowed on control plane without counting as workloads
CP_ALLOWED_NS = INFRA_REGEX
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
@ -208,66 +209,7 @@ def namespace_ram_raw(scope_var):
def namespace_gpu_usage_instant(scope_var):
return gpu_usage_by_namespace(scope_var)
def jetson_gpu_util_by_node():
return 'max by (node) (jetson_gr3d_freq_percent{node!=""})'
def dcgm_gpu_util_by_node():
dcgm_pod = 'label_replace(DCGM_FI_DEV_GPU_UTIL, "pod", "$1", "Hostname", "(.*)")'
dcgm_ns = 'label_replace(' + dcgm_pod + ', "namespace", "monitoring", "", "")'
return (
"avg by (node) ("
f"{dcgm_ns} * on(namespace,pod) group_left(node) "
'kube_pod_info{namespace="monitoring"}'
")"
)
def gpu_util_by_node():
return f"{dcgm_gpu_util_by_node()} or {jetson_gpu_util_by_node()}"
def gpu_util_by_hostname():
return 'label_replace(' + gpu_util_by_node() + ', "Hostname", "$1", "node", "(.*)")'
def gpu_node_labels():
return 'kube_node_labels{label_accelerator=~".+"} or kube_node_labels{label_jetson="true"}'
def gpu_requests_by_namespace_node(scope_var):
return (
"sum by (namespace,node) ("
f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} '
"* on(namespace,pod) group_left(node) kube_pod_info "
f"* on(node) group_left() ({gpu_node_labels()})"
")"
)
def gpu_usage_by_namespace(scope_var):
requests_by_ns = gpu_requests_by_namespace_node(scope_var)
total_by_node = f"sum by (node) ({requests_by_ns})"
return (
"sum by (namespace) ("
f"({requests_by_ns}) / clamp_min({total_by_node}, 1) "
f"* on(node) group_left() ({gpu_util_by_node()})"
")"
)
def jetson_gpu_usage_by_namespace(scope_var):
requests_by_ns = jetson_gpu_requests(scope_var)
total_by_node = f"sum by (node) ({requests_by_ns})"
return (
"sum by (namespace) ("
f"({requests_by_ns}) / clamp_min({total_by_node}, 1) "
f"* on(node) group_left() {jetson_gpu_util_by_node()}"
")"
)
return f"sum(DCGM_FI_DEV_GPU_UTIL{{{namespace_gpu_selector(scope_var)}}}) by (namespace)"
def namespace_share_expr(resource_expr):
@ -287,7 +229,7 @@ def namespace_gpu_share_expr(scope_var):
usage = namespace_gpu_usage_instant(scope_var)
total = f"(sum({usage}) or on() vector(0))"
share = f"100 * ({usage}) / clamp_min({total}, 1)"
idle = 'label_replace(vector(100), "namespace", "idle", "", "") * scalar(' + total + " == bool 0)"
idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)"
return f"({share}) or ({idle})"
@ -377,76 +319,6 @@ NAMESPACE_SCOPE_WORKLOAD = f'namespace!~"{INFRA_REGEX}"'
NAMESPACE_SCOPE_ALL = 'namespace=~".*"'
NAMESPACE_SCOPE_INFRA = f'namespace=~"{INFRA_REGEX}"'
NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"]
GLUE_LABEL = 'label_atlas_bstein_dev_glue="true"'
GLUE_JOBS = f"kube_cronjob_labels{{{GLUE_LABEL}}}"
GLUE_FILTER = f"and on(namespace,cronjob) {GLUE_JOBS}"
GLUE_LAST_SUCCESS = f"(kube_cronjob_status_last_successful_time {GLUE_FILTER})"
GLUE_LAST_SCHEDULE = f"(kube_cronjob_status_last_schedule_time {GLUE_FILTER})"
GLUE_SUSPENDED = f"(kube_cronjob_spec_suspend {GLUE_FILTER}) == 1"
GLUE_ACTIVE = f"(kube_cronjob_status_active {GLUE_FILTER})"
GLUE_LAST_SUCCESS_AGE = f"(time() - {GLUE_LAST_SUCCESS})"
GLUE_LAST_SCHEDULE_AGE = f"(time() - {GLUE_LAST_SCHEDULE})"
GLUE_LAST_SUCCESS_AGE_HOURS = f"({GLUE_LAST_SUCCESS_AGE}) / 3600"
GLUE_LAST_SCHEDULE_AGE_HOURS = f"({GLUE_LAST_SCHEDULE_AGE}) / 3600"
GLUE_STALE_WINDOW_SEC = 36 * 3600
GLUE_STALE = f"({GLUE_LAST_SUCCESS_AGE} > bool {GLUE_STALE_WINDOW_SEC})"
GLUE_MISSING = f"({GLUE_JOBS} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time)"
GLUE_STALE_ACTIVE = f"({GLUE_STALE} unless on(namespace,cronjob) {GLUE_SUSPENDED})"
GLUE_MISSING_ACTIVE = f"({GLUE_MISSING} unless on(namespace,cronjob) {GLUE_SUSPENDED})"
GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE})) or on() vector(0)"
GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE}) or on() vector(0)"
GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED}) or on() vector(0)"
ARIADNE_TASK_ERRORS_RANGE = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[$__range]))'
ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))'
ARIADNE_TASK_ERRORS_1H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[1h]))'
ARIADNE_TASK_ERRORS_30D = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[30d]))'
ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))'
ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))'
ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))'
ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))'
ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))'
ARIADNE_TASK_ATTEMPTS_SERIES = 'sum(increase(ariadne_task_runs_total[$__interval]))'
ARIADNE_TASK_FAILURES_SERIES = 'sum(increase(ariadne_task_runs_total{status="error"}[$__interval]))'
ARIADNE_TASK_WARNINGS_SERIES = (
'sum(increase(ariadne_task_runs_total{status!~"ok|error"}[$__interval])) or on() vector(0)'
)
ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600"
ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600"
ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS = (
"(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600"
)
ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = (
"(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600"
)
ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}'
ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}'
ARIADNE_TEST_SUCCESS_RATE = (
"100 * "
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[30d])) '
"/ clamp_min("
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[30d])), 1)'
)
ARIADNE_TEST_FAILURES_24H = (
'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))'
)
POSTGRES_CONN_USED = (
'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") '
'or label_replace(max(pg_settings_max_connections), "conn", "max", "__name__", ".*")'
)
POSTGRES_CONN_HOTTEST = 'topk(1, sum by (datname) (pg_stat_activity_count))'
ONEOFF_JOB_OWNER = (
'label_replace(kube_job_owner{owner_kind="CronJob"}, "owner_name", "$1", "job_name", "(.*)")'
)
ONEOFF_JOB_PODS = f'(kube_pod_owner{{owner_kind="Job"}} unless on(namespace, owner_name) {ONEOFF_JOB_OWNER})'
ONEOFF_JOB_POD_AGE_HOURS = (
'((time() - kube_pod_start_time{pod!=""}) / 3600) '
f'* on(namespace,pod) group_left(owner_name) {ONEOFF_JOB_PODS} '
'* on(namespace,pod) group_left(phase) '
'max by (namespace,pod,phase) (kube_pod_status_phase{phase=~"Running|Succeeded"})'
)
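# Rough reading of the one-off job expressions above: ONEOFF_JOB_PODS keeps only pods
# whose owning Job has no CronJob parent (i.e. ad-hoc Jobs), and ONEOFF_JOB_POD_AGE_HOURS
# reports their age in hours for pods still in the Running or Succeeded phase; the
# "One-off Job Pods (age hours)" bar gauges below consume this expression.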
GLUE_LAST_SUCCESS_RANGE_HOURS = f"(time() - max_over_time({GLUE_LAST_SUCCESS}[$__range])) / 3600"
GLUE_LAST_SCHEDULE_RANGE_HOURS = f"(time() - max_over_time({GLUE_LAST_SCHEDULE}[$__range])) / 3600"
GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
GPU_NODE_REGEX = "|".join(GPU_NODES)
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
@ -624,7 +496,6 @@ def timeseries_panel(
grid,
*,
unit="none",
max_value=None,
legend=None,
legend_display="table",
legend_placement="bottom",
@ -649,8 +520,6 @@ def timeseries_panel(
"tooltip": {"mode": "multi"},
},
}
if max_value is not None:
panel["fieldConfig"]["defaults"]["max"] = max_value
if legend:
panel["targets"][0]["legendFormat"] = legend
if legend_calcs:
@ -802,22 +671,13 @@ def bargauge_panel(
grid,
*,
unit="none",
legend=None,
links=None,
limit=None,
sort_order="desc",
thresholds=None,
decimals=None,
instant=False,
overrides=None,
):
"""Return a bar gauge panel with label-aware reduction."""
cleaned_expr = expr.strip()
if not cleaned_expr.startswith(("sort(", "sort_desc(")):
if sort_order == "desc":
expr = f"sort_desc({expr})"
elif sort_order == "asc":
expr = f"sort({expr})"
panel = {
"id": panel_id,
"type": "bargauge",
@ -825,12 +685,7 @@ def bargauge_panel(
"datasource": PROM_DS,
"gridPos": grid,
"targets": [
{
"expr": expr,
"refId": "A",
"legendFormat": legend or "{{node}}",
**({"instant": True} if instant else {}),
}
{"expr": expr, "refId": "A", "legendFormat": "{{node}}", **({"instant": True} if instant else {})}
],
"fieldConfig": {
"defaults": {
@ -860,8 +715,6 @@ def bargauge_panel(
},
},
}
if overrides:
panel["fieldConfig"]["overrides"].extend(overrides)
if decimals is not None:
panel["fieldConfig"]["defaults"]["decimals"] = decimals
if links:
@ -870,7 +723,7 @@ def bargauge_panel(
panel["transformations"] = [
{
"id": "sortBy",
"options": {"fields": ["Value"], "order": sort_order},
"options": {"fields": ["Value"], "order": "desc"},
}
]
if limit:
@ -910,15 +763,6 @@ def build_overview():
{"color": "red", "value": 3},
],
}
age_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 6},
{"color": "orange", "value": 24},
{"color": "red", "value": 48},
],
}
row1_stats = [
{
@ -1121,7 +965,7 @@ def build_overview():
30,
"Mail Sent (1d)",
'max(postmark_outbound_sent{window="1d"})',
{"h": 3, "w": 4, "x": 0, "y": 8},
{"h": 2, "w": 6, "x": 0, "y": 8},
unit="none",
links=link_to("atlas-mail"),
)
@ -1132,7 +976,7 @@ def build_overview():
"type": "stat",
"title": "Mail Bounces (1d)",
"datasource": PROM_DS,
"gridPos": {"h": 3, "w": 4, "x": 8, "y": 8},
"gridPos": {"h": 2, "w": 6, "x": 12, "y": 8},
"targets": [
{
"expr": 'max(postmark_outbound_bounce_rate{window="1d"})',
@ -1178,7 +1022,7 @@ def build_overview():
32,
"Mail Success Rate (1d)",
'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)',
{"h": 3, "w": 4, "x": 4, "y": 8},
{"h": 2, "w": 6, "x": 6, "y": 8},
unit="percent",
thresholds=mail_success_thresholds,
decimals=1,
@ -1190,38 +1034,13 @@ def build_overview():
33,
"Mail Limit Used (30d)",
"max(postmark_sending_limit_used_percent)",
{"h": 3, "w": 4, "x": 12, "y": 8},
{"h": 2, "w": 6, "x": 18, "y": 8},
unit="percent",
thresholds=mail_limit_thresholds,
decimals=1,
links=link_to("atlas-mail"),
)
)
panels.append(
stat_panel(
34,
"Postgres Connections Used",
POSTGRES_CONN_USED,
{"h": 3, "w": 4, "x": 16, "y": 8},
decimals=0,
text_mode="name_and_value",
legend="{{conn}}",
instant=True,
)
)
panels.append(
stat_panel(
35,
"Postgres Hottest Connections",
POSTGRES_CONN_HOTTEST,
{"h": 3, "w": 4, "x": 20, "y": 8},
unit="none",
decimals=0,
text_mode="name_and_value",
legend="{{datname}}",
instant=True,
)
)
storage_panels = [
(23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"),
@ -1235,104 +1054,13 @@ def build_overview():
panel_id,
title,
expr,
{"h": 3, "w": 6, "x": 6 * idx, "y": 11},
{"h": 6, "w": 6, "x": 6 * idx, "y": 10},
unit=unit,
thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
links=link_to("atlas-storage"),
)
)
panels.append(
bargauge_panel(
40,
"One-off Job Pods (age hours)",
ONEOFF_JOB_POD_AGE_HOURS,
{"h": 6, "w": 6, "x": 0, "y": 14},
unit="h",
instant=True,
legend="{{namespace}}/{{pod}}",
thresholds=age_thresholds,
limit=8,
decimals=2,
)
)
panels.append(
{
"id": 41,
"type": "timeseries",
"title": "Ariadne Attempts / Failures",
"datasource": PROM_DS,
"gridPos": {"h": 6, "w": 6, "x": 6, "y": 14},
"targets": [
{"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"},
{"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"},
],
"fieldConfig": {
"defaults": {"unit": "none"},
"overrides": [
{
"matcher": {"id": "byName", "options": "Attempts"},
"properties": [
{"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}}
],
},
{
"matcher": {"id": "byName", "options": "Failures"},
"properties": [
{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}
],
},
],
},
"options": {
"legend": {"displayMode": "table", "placement": "right"},
"tooltip": {"mode": "multi"},
},
}
)
panels.append(
timeseries_panel(
42,
"Ariadne Test Success Rate",
ARIADNE_TEST_SUCCESS_RATE,
{"h": 6, "w": 6, "x": 12, "y": 14},
unit="percent",
max_value=100,
legend=None,
legend_display="list",
)
)
panels.append(
bargauge_panel(
43,
"Tests with Failures (24h)",
ARIADNE_TEST_FAILURES_24H,
{"h": 6, "w": 6, "x": 18, "y": 14},
unit="none",
instant=True,
legend="{{result}}",
overrides=[
{
"matcher": {"id": "byName", "options": "error"},
"properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}}],
},
{
"matcher": {"id": "byName", "options": "failed"},
"properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}],
},
],
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 5},
{"color": "red", "value": 10},
],
},
)
)
cpu_scope = "$namespace_scope_cpu"
gpu_scope = "$namespace_scope_gpu"
ram_scope = "$namespace_scope_ram"
@ -1342,9 +1070,9 @@ def build_overview():
11,
"Namespace CPU Share",
namespace_cpu_share_expr(cpu_scope),
{"h": 9, "w": 8, "x": 0, "y": 20},
{"h": 9, "w": 8, "x": 0, "y": 16},
links=namespace_scope_links("namespace_scope_cpu"),
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
description="Values are normalized within the selected scope; use panel links to switch scope.",
)
)
panels.append(
@ -1352,9 +1080,9 @@ def build_overview():
12,
"Namespace GPU Share",
namespace_gpu_share_expr(gpu_scope),
{"h": 9, "w": 8, "x": 8, "y": 20},
{"h": 9, "w": 8, "x": 8, "y": 16},
links=namespace_scope_links("namespace_scope_gpu"),
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
description="Values are normalized within the selected scope; use panel links to switch scope.",
)
)
panels.append(
@ -1362,9 +1090,9 @@ def build_overview():
13,
"Namespace RAM Share",
namespace_ram_share_expr(ram_scope),
{"h": 9, "w": 8, "x": 16, "y": 20},
{"h": 9, "w": 8, "x": 16, "y": 16},
links=namespace_scope_links("namespace_scope_ram"),
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
description="Values are normalized within the selected scope; use panel links to switch scope.",
)
)
@ -1374,7 +1102,7 @@ def build_overview():
14,
"Worker Node CPU",
node_cpu_expr(worker_filter),
{"h": 12, "w": 12, "x": 0, "y": 36},
{"h": 12, "w": 12, "x": 0, "y": 32},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
@ -1388,7 +1116,7 @@ def build_overview():
15,
"Worker Node RAM",
node_mem_expr(worker_filter),
{"h": 12, "w": 12, "x": 12, "y": 36},
{"h": 12, "w": 12, "x": 12, "y": 32},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
@ -1403,7 +1131,7 @@ def build_overview():
16,
"Control plane CPU",
node_cpu_expr(CONTROL_ALL_REGEX),
{"h": 10, "w": 12, "x": 0, "y": 48},
{"h": 10, "w": 12, "x": 0, "y": 44},
unit="percent",
legend="{{node}}",
legend_display="table",
@ -1415,7 +1143,7 @@ def build_overview():
17,
"Control plane RAM",
node_mem_expr(CONTROL_ALL_REGEX),
{"h": 10, "w": 12, "x": 12, "y": 48},
{"h": 10, "w": 12, "x": 12, "y": 44},
unit="percent",
legend="{{node}}",
legend_display="table",
@ -1428,7 +1156,7 @@ def build_overview():
28,
"Node Pod Share",
'(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100',
{"h": 10, "w": 12, "x": 0, "y": 58},
{"h": 10, "w": 12, "x": 0, "y": 54},
)
)
panels.append(
@ -1436,7 +1164,7 @@ def build_overview():
29,
"Top Nodes by Pod Count",
'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))',
{"h": 10, "w": 12, "x": 12, "y": 58},
{"h": 10, "w": 12, "x": 12, "y": 54},
unit="none",
limit=12,
decimals=0,
@ -1458,7 +1186,7 @@ def build_overview():
18,
"Cluster Ingress Throughput",
NET_INGRESS_EXPR,
{"h": 7, "w": 8, "x": 0, "y": 29},
{"h": 7, "w": 8, "x": 0, "y": 25},
unit="Bps",
legend="Ingress (Traefik)",
legend_display="list",
@ -1471,7 +1199,7 @@ def build_overview():
19,
"Cluster Egress Throughput",
NET_EGRESS_EXPR,
{"h": 7, "w": 8, "x": 8, "y": 29},
{"h": 7, "w": 8, "x": 8, "y": 25},
unit="Bps",
legend="Egress (Traefik)",
legend_display="list",
@ -1484,7 +1212,7 @@ def build_overview():
20,
"Intra-Cluster Throughput",
NET_INTERNAL_EXPR,
{"h": 7, "w": 8, "x": 16, "y": 29},
{"h": 7, "w": 8, "x": 16, "y": 25},
unit="Bps",
legend="Internal traffic",
legend_display="list",
@ -1498,7 +1226,7 @@ def build_overview():
21,
"Root Filesystem Usage",
root_usage_expr(),
{"h": 16, "w": 12, "x": 0, "y": 68},
{"h": 16, "w": 12, "x": 0, "y": 64},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
@ -1513,7 +1241,7 @@ def build_overview():
22,
"Nodes Closest to Full Root Disks",
f"topk(12, {root_usage_expr()})",
{"h": 16, "w": 12, "x": 12, "y": 68},
{"h": 16, "w": 12, "x": 12, "y": 64},
unit="percent",
thresholds=PERCENT_THRESHOLDS,
links=link_to("atlas-storage"),
@ -1999,7 +1727,7 @@ def build_storage_dashboard():
stat_panel(
31,
"Maintenance Cron Freshness (s)",
'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"})',
'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob=~"image-sweeper|grafana-smtp-sync"})',
{"h": 4, "w": 12, "x": 12, "y": 44},
unit="s",
thresholds={
@ -2408,285 +2136,6 @@ def build_mail_dashboard():
}
def build_jobs_dashboard():
panels = []
age_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 6},
{"color": "orange", "value": 24},
{"color": "red", "value": 48},
],
}
recent_error_thresholds = {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": 1},
{"color": "yellow", "value": 6},
{"color": "green", "value": 24},
],
}
task_error_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 3},
{"color": "red", "value": 5},
],
}
panels.append(
bargauge_panel(
1,
"Ariadne Task Errors (range)",
ARIADNE_TASK_ERRORS_RANGE,
{"h": 7, "w": 8, "x": 0, "y": 0},
unit="none",
instant=True,
legend="{{task}}",
thresholds=task_error_thresholds,
)
)
panels.append(
{
"id": 2,
"type": "timeseries",
"title": "Ariadne Attempts / Failures",
"datasource": PROM_DS,
"gridPos": {"h": 7, "w": 8, "x": 8, "y": 0},
"targets": [
{"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"},
{"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"},
],
"fieldConfig": {
"defaults": {"unit": "none"},
"overrides": [
{
"matcher": {"id": "byName", "options": "Attempts"},
"properties": [
{"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}}
],
},
{
"matcher": {"id": "byName", "options": "Failures"},
"properties": [
{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}
],
},
],
},
"options": {
"legend": {"displayMode": "table", "placement": "right"},
"tooltip": {"mode": "multi"},
},
}
)
panels.append(
bargauge_panel(
3,
"One-off Job Pods (age hours)",
ONEOFF_JOB_POD_AGE_HOURS,
{"h": 7, "w": 8, "x": 16, "y": 0},
unit="h",
instant=True,
legend="{{namespace}}/{{pod}}",
thresholds=age_thresholds,
limit=12,
decimals=2,
)
)
panels.append(
stat_panel(
4,
"Glue Jobs Stale (>36h)",
GLUE_STALE_COUNT,
{"h": 4, "w": 4, "x": 0, "y": 7},
unit="none",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 3},
],
},
)
)
panels.append(
stat_panel(
5,
"Glue Jobs Missing Success",
GLUE_MISSING_COUNT,
{"h": 4, "w": 4, "x": 4, "y": 7},
unit="none",
)
)
panels.append(
stat_panel(
6,
"Glue Jobs Suspended",
GLUE_SUSPENDED_COUNT,
{"h": 4, "w": 4, "x": 8, "y": 7},
unit="none",
)
)
panels.append(
stat_panel(
7,
"Ariadne Task Errors (1h)",
ARIADNE_TASK_ERRORS_1H_TOTAL,
{"h": 4, "w": 4, "x": 12, "y": 7},
unit="none",
)
)
panels.append(
stat_panel(
8,
"Ariadne Task Errors (24h)",
ARIADNE_TASK_ERRORS_24H_TOTAL,
{"h": 4, "w": 4, "x": 16, "y": 7},
unit="none",
)
)
panels.append(
stat_panel(
9,
"Ariadne Task Runs (1h)",
ARIADNE_TASK_RUNS_1H_TOTAL,
{"h": 4, "w": 4, "x": 20, "y": 7},
unit="none",
)
)
panels.append(
bargauge_panel(
10,
"Ariadne Schedule Last Error (hours ago)",
ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS,
{"h": 6, "w": 12, "x": 0, "y": 17},
unit="h",
instant=True,
legend="{{task}}",
thresholds=recent_error_thresholds,
decimals=2,
)
)
panels.append(
bargauge_panel(
11,
"Ariadne Schedule Last Success (hours ago)",
ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS,
{"h": 6, "w": 12, "x": 12, "y": 17},
unit="h",
instant=True,
legend="{{task}}",
thresholds=age_thresholds,
decimals=2,
)
)
panels.append(
bargauge_panel(
12,
"Glue Jobs Last Success (hours ago)",
GLUE_LAST_SUCCESS_RANGE_HOURS,
{"h": 6, "w": 12, "x": 0, "y": 23},
unit="h",
instant=True,
legend="{{namespace}}/{{cronjob}}",
thresholds=age_thresholds,
decimals=2,
)
)
panels.append(
bargauge_panel(
13,
"Glue Jobs Last Schedule (hours ago)",
GLUE_LAST_SCHEDULE_RANGE_HOURS,
{"h": 6, "w": 12, "x": 12, "y": 23},
unit="h",
instant=True,
legend="{{namespace}}/{{cronjob}}",
thresholds=age_thresholds,
decimals=2,
)
)
panels.append(
bargauge_panel(
14,
"Ariadne Task Errors (1h)",
ARIADNE_TASK_ERRORS_1H,
{"h": 6, "w": 12, "x": 0, "y": 29},
unit="none",
instant=True,
legend="{{task}}",
thresholds=task_error_thresholds,
)
)
panels.append(
bargauge_panel(
15,
"Ariadne Task Errors (30d)",
ARIADNE_TASK_ERRORS_30D,
{"h": 6, "w": 12, "x": 12, "y": 29},
unit="none",
instant=True,
legend="{{task}}",
thresholds=task_error_thresholds,
)
)
panels.append(
bargauge_panel(
16,
"Ariadne Access Requests",
ARIADNE_ACCESS_REQUESTS,
{"h": 6, "w": 8, "x": 0, "y": 11},
unit="none",
instant=True,
legend="{{status}}",
)
)
panels.append(
stat_panel(
17,
"Ariadne CI Coverage (%)",
ARIADNE_CI_COVERAGE,
{"h": 6, "w": 4, "x": 8, "y": 11},
unit="percent",
decimals=1,
instant=True,
legend="{{branch}}",
)
)
panels.append(
table_panel(
18,
"Ariadne CI Tests (latest)",
ARIADNE_CI_TESTS,
{"h": 6, "w": 12, "x": 12, "y": 11},
unit="none",
transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}],
instant=True,
)
)
return {
"uid": "atlas-jobs",
"title": "Atlas Jobs",
"folderUid": PRIVATE_FOLDER,
"editable": True,
"panels": panels,
"time": {"from": "now-7d", "to": "now"},
"annotations": {"list": []},
"schemaVersion": 39,
"style": "dark",
"tags": ["atlas", "jobs", "glue"],
}
def build_gpu_dashboard():
panels = []
gpu_scope = "$namespace_scope_gpu"
@ -2697,7 +2146,7 @@ def build_gpu_dashboard():
namespace_gpu_share_expr(gpu_scope),
{"h": 8, "w": 12, "x": 0, "y": 0},
links=namespace_scope_links("namespace_scope_gpu"),
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
description="Values are normalized within the selected scope; use panel links to switch scope.",
)
)
panels.append(
@ -2716,7 +2165,7 @@ def build_gpu_dashboard():
timeseries_panel(
3,
"GPU Util by Node",
gpu_util_by_hostname(),
'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})',
{"h": 8, "w": 12, "x": 0, "y": 8},
unit="percent",
legend="{{Hostname}}",
@ -2780,10 +2229,6 @@ DASHBOARDS = {
"builder": build_mail_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-mail.yaml",
},
"atlas-jobs": {
"builder": build_jobs_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-jobs.yaml",
},
"atlas-gpu": {
"builder": build_gpu_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gpu.yaml",

View File

@ -20,13 +20,11 @@ import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
import shutil
from typing import Any, Iterable
import yaml
REPO_ROOT = Path(__file__).resolve().parents[1]
DASHBOARD_DIR = REPO_ROOT / "services" / "monitoring" / "dashboards"
CLUSTER_SCOPED_KINDS = {
"Namespace",
@ -62,70 +60,6 @@ def _run(cmd: list[str], *, cwd: Path) -> str:
return res.stdout
def _sync_tree(source: Path, dest: Path) -> None:
if dest.exists():
shutil.rmtree(dest)
shutil.copytree(source, dest)
def _iter_dashboard_panels(dashboard: dict[str, Any]) -> Iterable[dict[str, Any]]:
panels = dashboard.get("panels") if isinstance(dashboard.get("panels"), list) else []
for panel in panels:
if not isinstance(panel, dict):
continue
if panel.get("type") == "row" and isinstance(panel.get("panels"), list):
yield from _iter_dashboard_panels({"panels": panel.get("panels")})
continue
yield panel
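# Note on the generator above: Grafana row panels nest their children under a "panels"
# key of their own, so rows are expanded recursively and only leaf panels are yielded.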
def _extract_metrics_index(dashboard_dir: Path) -> list[dict[str, Any]]:
index: list[dict[str, Any]] = []
for path in sorted(dashboard_dir.glob("*.json")):
try:
data = json.loads(path.read_text(encoding="utf-8"))
except json.JSONDecodeError:
continue
if not isinstance(data, dict):
continue
dash_title = data.get("title") or path.stem
dash_tags = data.get("tags") or []
for panel in _iter_dashboard_panels(data):
targets = panel.get("targets")
if not isinstance(targets, list):
continue
exprs: list[str] = []
for target in targets:
if not isinstance(target, dict):
continue
expr = target.get("expr")
if isinstance(expr, str) and expr.strip():
exprs.append(expr.strip())
if not exprs:
continue
datasource = panel.get("datasource") or {}
if isinstance(datasource, dict):
ds_uid = datasource.get("uid")
ds_type = datasource.get("type")
else:
ds_uid = None
ds_type = None
index.append(
{
"dashboard": dash_title,
"panel_title": panel.get("title") or "",
"panel_id": panel.get("id"),
"panel_type": panel.get("type"),
"description": panel.get("description") or "",
"tags": dash_tags,
"datasource_uid": ds_uid,
"datasource_type": ds_type,
"exprs": exprs,
}
)
return index
def kustomize_build(path: Path) -> str:
rel = path.relative_to(REPO_ROOT)
try:
@ -538,11 +472,6 @@ def main() -> int:
action="store_true",
help="Write generated files (otherwise just print a summary).",
)
ap.add_argument(
"--sync-comms",
action="store_true",
help="Mirror rendered knowledge into services/comms/knowledge for atlasbot.",
)
args = ap.parse_args()
out_dir = REPO_ROOT / args.out
@ -575,11 +504,8 @@ def main() -> int:
summary_path = out_dir / "catalog" / "atlas-summary.json"
diagram_path = out_dir / "diagrams" / "atlas-http.mmd"
runbooks_json_path = out_dir / "catalog" / "runbooks.json"
metrics_json_path = out_dir / "catalog" / "metrics.json"
catalog_rel = catalog_path.relative_to(REPO_ROOT).as_posix()
catalog_path.write_text(
f"# {catalog_rel}\n"
"# Generated by scripts/knowledge_render_atlas.py (do not edit by hand)\n"
+ yaml.safe_dump(catalog, sort_keys=False),
encoding="utf-8",
@ -589,14 +515,9 @@ def main() -> int:
diagram_path.write_text(diagram, encoding="utf-8")
# Render runbooks into JSON for lightweight, dependency-free consumption in-cluster.
runbook_dirs = [
out_dir / "runbooks",
out_dir / "software",
]
runbooks_dir = out_dir / "runbooks"
runbooks: list[dict[str, Any]] = []
for runbooks_dir in runbook_dirs:
if not runbooks_dir.exists():
continue
if runbooks_dir.exists():
for md_file in sorted(runbooks_dir.glob("*.md")):
raw = md_file.read_text(encoding="utf-8")
fm: dict[str, Any] = {}
@ -620,22 +541,12 @@ def main() -> int:
}
)
runbooks_json_path.write_text(json.dumps(runbooks, indent=2, sort_keys=False) + "\n", encoding="utf-8")
metrics_index = _extract_metrics_index(DASHBOARD_DIR)
metrics_json_path.write_text(
json.dumps(metrics_index, indent=2, sort_keys=False) + "\n", encoding="utf-8"
)
print(f"Wrote {catalog_path.relative_to(REPO_ROOT)}")
print(f"Wrote {catalog_json_path.relative_to(REPO_ROOT)}")
print(f"Wrote {summary_path.relative_to(REPO_ROOT)}")
print(f"Wrote {diagram_path.relative_to(REPO_ROOT)}")
print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}")
print(f"Wrote {metrics_json_path.relative_to(REPO_ROOT)}")
if args.sync_comms:
comms_dir = REPO_ROOT / "services" / "comms" / "knowledge"
_sync_tree(out_dir, comms_dir)
print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {comms_dir.relative_to(REPO_ROOT)}")
return 0

View File

@ -7,8 +7,6 @@ test accounts created via the bstein-dev-home onboarding portal.
Targets (best-effort):
- Keycloak users in realm "atlas"
- Atlas portal Postgres rows (access_requests + dependent tables)
- Mailu mailboxes created for test users
- Nextcloud Mail accounts created for test users
- Vaultwarden users/invites created by the portal
Safety:
@ -58,19 +56,6 @@ class VaultwardenUser:
status: int
@dataclass(frozen=True)
class MailuUser:
email: str
localpart: str
domain: str
@dataclass(frozen=True)
class NextcloudMailAccount:
account_id: str
email: str
def _run(cmd: list[str], *, input_bytes: bytes | None = None) -> str:
proc = subprocess.run(
cmd,
@ -85,19 +70,6 @@ def _run(cmd: list[str], *, input_bytes: bytes | None = None) -> str:
return proc.stdout.decode("utf-8", errors="replace")
def _run_capture(cmd: list[str], *, input_bytes: bytes | None = None) -> tuple[int, str, str]:
proc = subprocess.run(
cmd,
input=input_bytes,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=False,
)
stdout = proc.stdout.decode("utf-8", errors="replace")
stderr = proc.stderr.decode("utf-8", errors="replace")
return proc.returncode, stdout, stderr
def _kubectl_get_secret_value(namespace: str, name: str, key: str) -> str:
raw_b64 = _run(
[
@ -138,21 +110,6 @@ def _kubectl_first_pod(namespace: str) -> str:
return pod_name
def _kubectl_exec(namespace: str, target: str, cmd: list[str]) -> tuple[int, str, str]:
return _run_capture(
[
"kubectl",
"-n",
namespace,
"exec",
"-i",
target,
"--",
*cmd,
]
)
def _validate_prefixes(prefixes: list[str]) -> list[str]:
cleaned: list[str] = []
for prefix in prefixes:
@ -230,62 +187,6 @@ def _keycloak_delete_user(server: str, realm: str, token: str, user_id: str) ->
raise
def _sql_quote(value: str) -> str:
return "'" + value.replace("'", "''") + "'"
def _psql_exec(db_name: str, sql: str, *, user: str = "postgres") -> str:
postgres_pod = _kubectl_first_pod("postgres")
return _run(
[
"kubectl",
"-n",
"postgres",
"exec",
"-i",
postgres_pod,
"--",
"psql",
"-U",
user,
"-d",
db_name,
"-c",
sql,
]
)
def _psql_tsv(db_name: str, sql: str, *, user: str = "postgres") -> list[list[str]]:
postgres_pod = _kubectl_first_pod("postgres")
out = _run(
[
"kubectl",
"-n",
"postgres",
"exec",
"-i",
postgres_pod,
"--",
"psql",
"-U",
user,
"-d",
db_name,
"-At",
"-F",
"\t",
"-c",
sql,
]
)
rows: list[list[str]] = []
for line in out.splitlines():
parts = line.split("\t")
rows.append(parts)
return rows
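# Note on the helper above: `psql -At -F "\t"` prints unaligned, tuples-only output with
# tab separators, so each stdout line splits cleanly into one row of column strings.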
def _psql_json(portal_db_url: str, sql: str) -> list[dict[str, Any]]:
postgres_pod = _kubectl_first_pod("postgres")
out = _run(
@ -355,89 +256,6 @@ def _portal_delete_requests(portal_db_url: str, prefixes: list[str]) -> int:
return int(match.group(1)) if match else 0
def _mailu_list_users(prefixes: list[str], domain: str, db_name: str, protected: set[str]) -> list[MailuUser]:
if not prefixes or not domain:
return []
clauses = " OR ".join([f"localpart LIKE '{p}%'" for p in prefixes])
sql = (
'SELECT email, localpart, domain_name '
'FROM "user" '
f"WHERE domain_name = {_sql_quote(domain)} AND ({clauses}) "
"ORDER BY email;"
)
rows = _psql_tsv(db_name, sql)
users: list[MailuUser] = []
for row in rows:
if len(row) < 3:
continue
email = row[0].strip()
if not email or email in protected:
continue
users.append(MailuUser(email=email, localpart=row[1].strip(), domain=row[2].strip()))
return users
def _mailu_delete_users(db_name: str, emails: list[str]) -> int:
if not emails:
return 0
email_list = ",".join(_sql_quote(e) for e in emails)
sql = f'DELETE FROM "user" WHERE email IN ({email_list});'
out = _psql_exec(db_name, sql)
match = re.search(r"DELETE\\s+(\\d+)", out)
return int(match.group(1)) if match else 0
_NEXTCLOUD_ACCOUNT_RE = re.compile(r"^Account\s+(\d+):")
_EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+")
def _nextcloud_exec(cmd: list[str]) -> tuple[int, str, str]:
namespace = os.getenv("NEXTCLOUD_NAMESPACE", "nextcloud").strip() or "nextcloud"
target = os.getenv("NEXTCLOUD_EXEC_TARGET", "deploy/nextcloud").strip() or "deploy/nextcloud"
return _kubectl_exec(namespace, target, cmd)
def _parse_nextcloud_mail_accounts(export_output: str) -> list[NextcloudMailAccount]:
accounts: list[NextcloudMailAccount] = []
current_id = ""
for line in export_output.splitlines():
line = line.strip()
if not line:
continue
match = _NEXTCLOUD_ACCOUNT_RE.match(line)
if match:
current_id = match.group(1)
continue
if not current_id or "@" not in line:
continue
email_match = _EMAIL_RE.search(line)
if not email_match:
continue
accounts.append(NextcloudMailAccount(account_id=current_id, email=email_match.group(0)))
current_id = ""
return accounts
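# Sketch of the export format the parser above expects (assumed shape, not confirmed by
# this repo): `occ mail:account:export <user>` prints blocks roughly like
#   Account 42:
#   - E-Mail: someone@example.com
# and each "Account <id>:" header is paired with the first following line that contains
# an email address; headers with no email line after them are skipped.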
def _nextcloud_list_mail_accounts(username: str) -> list[NextcloudMailAccount]:
occ_path = os.getenv("NEXTCLOUD_OCC_PATH", "/var/www/html/occ").strip() or "/var/www/html/occ"
rc, out, err = _nextcloud_exec(["php", occ_path, "mail:account:export", username])
if rc != 0:
message = (err or out).strip()
lowered = message.lower()
if any(token in lowered for token in ("not found", "does not exist", "no such user", "unknown user")):
return []
raise RuntimeError(f"nextcloud mail export failed for {username}: {message}")
return _parse_nextcloud_mail_accounts(out)
def _nextcloud_delete_mail_account(account_id: str) -> None:
occ_path = os.getenv("NEXTCLOUD_OCC_PATH", "/var/www/html/occ").strip() or "/var/www/html/occ"
rc, out, err = _nextcloud_exec(["php", occ_path, "mail:account:delete", "-q", account_id])
if rc != 0:
message = (err or out).strip()
raise RuntimeError(f"nextcloud mail delete failed for account {account_id}: {message}")
def _vaultwarden_admin_cookie(admin_token: str, base_url: str) -> str:
data = urllib.parse.urlencode({"token": admin_token}).encode("utf-8")
req = urllib.request.Request(f"{base_url}/admin", data=data, method="POST")
@ -538,8 +356,6 @@ def main() -> int:
),
)
parser.add_argument("--skip-keycloak", action="store_true", help="Skip Keycloak user deletion.")
parser.add_argument("--skip-mailu", action="store_true", help="Skip Mailu mailbox cleanup.")
parser.add_argument("--skip-nextcloud-mail", action="store_true", help="Skip Nextcloud Mail account cleanup.")
parser.add_argument("--skip-portal-db", action="store_true", help="Skip portal DB cleanup.")
parser.add_argument("--skip-vaultwarden", action="store_true", help="Skip Vaultwarden cleanup.")
parser.add_argument(
@ -548,18 +364,6 @@ def main() -> int:
default=[],
help="Keycloak usernames that must never be deleted (repeatable).",
)
parser.add_argument(
"--protect-mailu-email",
action="append",
default=[],
help="Mailu emails that must never be deleted (repeatable).",
)
parser.add_argument(
"--protect-nextcloud-username",
action="append",
default=[],
help="Nextcloud usernames that must never be touched (repeatable).",
)
parser.add_argument(
"--protect-vaultwarden-email",
action="append",
@ -572,11 +376,7 @@ def main() -> int:
apply = bool(args.apply)
expected_confirm = ",".join(prefixes)
protected_keycloak = {"bstein", "robotuser", *[u.strip() for u in args.protect_keycloak_username if u.strip()]}
protected_mailu = {e.strip() for e in args.protect_mailu_email if e.strip()}
protected_nextcloud = {u.strip() for u in args.protect_nextcloud_username if u.strip()}
protected_vaultwarden = {e.strip() for e in args.protect_vaultwarden_email if e.strip()}
mailu_domain = os.getenv("MAILU_DOMAIN", "bstein.dev").strip() or "bstein.dev"
mailu_db_name = os.getenv("MAILU_DB_NAME", "mailu").strip() or "mailu"
if apply and args.confirm != expected_confirm:
raise SystemExit(
@ -588,29 +388,23 @@ def main() -> int:
print("mode:", "APPLY (destructive)" if apply else "DRY RUN (no changes)")
if protected_keycloak:
print("protected keycloak usernames:", ", ".join(sorted(protected_keycloak)))
if protected_mailu:
print("protected mailu emails:", ", ".join(sorted(protected_mailu)))
if protected_nextcloud:
print("protected nextcloud usernames:", ", ".join(sorted(protected_nextcloud)))
if protected_vaultwarden:
print("protected vaultwarden emails:", ", ".join(sorted(protected_vaultwarden)))
print()
portal_requests: list[PortalRequestRow] = []
if not args.skip_portal_db:
portal_db_url = _kubectl_get_secret_value("bstein-dev-home", "atlas-portal-db", "PORTAL_DATABASE_URL")
portal_requests = _portal_list_requests(portal_db_url, prefixes)
print(f"Portal DB: {len(portal_requests)} access_requests matched")
for row in portal_requests[:50]:
requests = _portal_list_requests(portal_db_url, prefixes)
print(f"Portal DB: {len(requests)} access_requests matched")
for row in requests[:50]:
print(f" {row.request_code}\t{row.status}\t{row.username}")
if len(portal_requests) > 50:
print(f" ... and {len(portal_requests) - 50} more")
if apply and portal_requests:
if len(requests) > 50:
print(f" ... and {len(requests) - 50} more")
if apply and requests:
deleted = _portal_delete_requests(portal_db_url, prefixes)
print(f"Portal DB: deleted {deleted} access_requests (cascade removes tasks/steps/artifacts).")
print()
keycloak_users: list[KeycloakUser] = []
if not args.skip_keycloak:
kc_server = os.getenv("KEYCLOAK_PUBLIC_URL", "https://sso.bstein.dev").rstrip("/")
kc_realm = os.getenv("KEYCLOAK_REALM", "atlas")
@ -627,63 +421,18 @@ def main() -> int:
if user.username in protected_keycloak:
continue
found[user.user_id] = user
keycloak_users = list(found.values())
keycloak_users.sort(key=lambda u: u.username)
print(f"Keycloak: {len(keycloak_users)} users matched")
for user in keycloak_users[:50]:
users = list(found.values())
users.sort(key=lambda u: u.username)
print(f"Keycloak: {len(users)} users matched")
for user in users[:50]:
email = user.email or "-"
print(f" {user.username}\t{email}\t{user.user_id}")
if len(keycloak_users) > 50:
print(f" ... and {len(keycloak_users) - 50} more")
if apply and keycloak_users:
for user in keycloak_users:
if len(users) > 50:
print(f" ... and {len(users) - 50} more")
if apply and users:
for user in users:
_keycloak_delete_user(kc_server, kc_realm, token, user.user_id)
print(f"Keycloak: deleted {len(keycloak_users)} users.")
print()
if not args.skip_mailu:
mailu_users = _mailu_list_users(prefixes, mailu_domain, mailu_db_name, protected_mailu)
print(f"Mailu: {len(mailu_users)} mailboxes matched (domain={mailu_domain})")
for user in mailu_users[:50]:
print(f" {user.email}\t{user.localpart}\t{user.domain}")
if len(mailu_users) > 50:
print(f" ... and {len(mailu_users) - 50} more")
if apply and mailu_users:
deleted = _mailu_delete_users(mailu_db_name, [u.email for u in mailu_users])
print(f"Mailu: deleted {deleted} mailboxes.")
print()
if not args.skip_nextcloud_mail:
nextcloud_usernames = {row.username for row in portal_requests if row.username}
nextcloud_usernames.update({u.username for u in keycloak_users if u.username})
nextcloud_usernames = {u for u in nextcloud_usernames if _starts_with_any(u, prefixes)}
nextcloud_usernames = {u for u in nextcloud_usernames if u not in protected_nextcloud}
matches: list[tuple[str, NextcloudMailAccount]] = []
for username in sorted(nextcloud_usernames):
accounts = _nextcloud_list_mail_accounts(username)
for account in accounts:
email = account.email.strip()
if not email:
continue
if not email.lower().endswith(f"@{mailu_domain.lower()}"):
continue
localpart = email.split("@", 1)[0]
if not _starts_with_any(localpart, prefixes):
continue
if email in protected_mailu:
continue
matches.append((username, account))
print(f"Nextcloud Mail: {len(matches)} accounts matched")
for username, account in matches[:50]:
print(f" {username}\t{account.account_id}\t{account.email}")
if len(matches) > 50:
print(f" ... and {len(matches) - 50} more")
if apply and matches:
for _, account in matches:
_nextcloud_delete_mail_account(account.account_id)
print(f"Nextcloud Mail: deleted {len(matches)} accounts.")
print(f"Keycloak: deleted {len(users)} users.")
print()
if not args.skip_vaultwarden:

View File

@ -55,11 +55,11 @@ class _FakeResponse:
class _FakeSession:
def __init__(self, put_resp, get_resps):
def __init__(self, put_resp, get_resp):
self.put_resp = put_resp
self.get_resps = list(get_resps)
self.get_resp = get_resp
self.put_called = False
self.get_calls = 0
self.get_called = False
def post(self, *args, **kwargs):
return _FakeResponse({"access_token": "dummy"})
@ -69,26 +69,22 @@ class _FakeSession:
return self.put_resp
def get(self, *args, **kwargs):
self.get_calls += 1
if self.get_resps:
return self.get_resps.pop(0)
return _FakeResponse({})
self.get_called = True
return self.get_resp
def test_kc_update_attributes_succeeds(monkeypatch):
sync = load_sync_module(monkeypatch)
current_resp = _FakeResponse({"attributes": {}})
ok_resp = _FakeResponse({"attributes": {"mailu_app_password": ["abc"]}})
sync.SESSION = _FakeSession(_FakeResponse({}), [current_resp, ok_resp])
sync.SESSION = _FakeSession(_FakeResponse({}), ok_resp)
sync.kc_update_attributes("token", {"id": "u1", "username": "u1"}, {"mailu_app_password": "abc"})
assert sync.SESSION.put_called and sync.SESSION.get_calls == 2
assert sync.SESSION.put_called and sync.SESSION.get_called
def test_kc_update_attributes_raises_without_attribute(monkeypatch):
sync = load_sync_module(monkeypatch)
current_resp = _FakeResponse({"attributes": {}})
missing_attr_resp = _FakeResponse({"attributes": {}}, status=200)
sync.SESSION = _FakeSession(_FakeResponse({}), [current_resp, missing_attr_resp])
sync.SESSION = _FakeSession(_FakeResponse({}), missing_attr_resp)
with pytest.raises(Exception):
sync.kc_update_attributes("token", {"id": "u1", "username": "u1"}, {"mailu_app_password": "abc"})
@ -148,25 +144,9 @@ def test_main_generates_password_and_upserts(monkeypatch):
sync = load_sync_module(monkeypatch)
monkeypatch.setattr(sync.bcrypt_sha256, "hash", lambda password: f"hash:{password}")
users = [
{
"id": "u1",
"username": "user1",
"email": "user1@example.com",
"attributes": {"mailu_enabled": ["true"]},
},
{
"id": "u2",
"username": "user2",
"email": "user2@example.com",
"attributes": {"mailu_app_password": ["keepme"], "mailu_enabled": ["true"]},
},
{
"id": "u3",
"username": "user3",
"email": "user3@example.com",
"attributes": {"mailu_email": ["user3@example.com"]},
},
{"id": "u4", "username": "user4", "email": "user4@other.com", "attributes": {}},
{"id": "u1", "username": "user1", "email": "user1@example.com", "attributes": {}},
{"id": "u2", "username": "user2", "email": "user2@example.com", "attributes": {"mailu_app_password": ["keepme"]}},
{"id": "u3", "username": "user3", "email": "user3@other.com", "attributes": {}},
]
updated = []
@ -205,6 +185,6 @@ def test_main_generates_password_and_upserts(monkeypatch):
sync.main()
# Only mail-enabled users (or legacy users with a mailbox) are synced and backfilled.
# Always backfill mailu_email, even if Keycloak recovery email is external.
assert len(updated) == 3
assert conns and len(conns[0]._cursor.executions) == 3

View File

@ -20,9 +20,8 @@ spec:
labels:
app: ollama
annotations:
ai.bstein.dev/model: qwen2.5:14b-instruct-q4_0
ai.bstein.dev/gpu: GPU pool (titan-22/24)
ai.bstein.dev/restartedAt: "2026-01-26T12:00:00Z"
ai.bstein.dev/model: qwen2.5-coder:7b-instruct-q4_0
ai.bstein.dev/gpu: GPU pool (titan-20/21/22/24)
spec:
affinity:
nodeAffinity:
@ -32,6 +31,8 @@ spec:
- key: kubernetes.io/hostname
operator: In
values:
- titan-20
- titan-21
- titan-22
- titan-24
runtimeClassName: nvidia
@ -41,7 +42,7 @@ spec:
claimName: ollama-models
initContainers:
- name: warm-model
image: ollama/ollama@sha256:2c9595c555fd70a28363489ac03bd5bf9e7c5bdf2890373c3a830ffd7252ce6d
image: ollama/ollama:latest
env:
- name: OLLAMA_HOST
value: 0.0.0.0
@ -52,7 +53,7 @@ spec:
- name: OLLAMA_MODELS
value: /root/.ollama
- name: OLLAMA_MODEL
value: qwen2.5:14b-instruct-q4_0
value: qwen2.5-coder:7b-instruct-q4_0
command:
- /bin/sh
- -c
@ -67,14 +68,14 @@ spec:
mountPath: /root/.ollama
resources:
requests:
cpu: 500m
memory: 2Gi
cpu: 250m
memory: 1Gi
nvidia.com/gpu.shared: 1
limits:
nvidia.com/gpu.shared: 1
containers:
- name: ollama
image: ollama/ollama@sha256:2c9595c555fd70a28363489ac03bd5bf9e7c5bdf2890373c3a830ffd7252ce6d
image: ollama/ollama:latest
imagePullPolicy: IfNotPresent
ports:
- name: http
@ -95,10 +96,10 @@ spec:
mountPath: /root/.ollama
resources:
requests:
cpu: "4"
memory: 16Gi
cpu: "2"
memory: 8Gi
nvidia.com/gpu.shared: 1
limits:
cpu: "8"
memory: 24Gi
cpu: "4"
memory: 12Gi
nvidia.com/gpu.shared: 1

Some files were not shown because too many files have changed in this diff.