feature/sso-hardening #9

Merged
bstein merged 685 commits from feature/sso-hardening into main 2026-01-13 20:23:26 +00:00
325 changed files with 37644 additions and 1317 deletions

6
.gitignore vendored
View File

@ -1,2 +1,8 @@
*.md *.md
!README.md !README.md
!knowledge/**/*.md
!services/comms/knowledge/**/*.md
__pycache__/
*.py[cod]
.pytest_cache
.venv

View File

@ -5,8 +5,9 @@ resources:
- ../../services/crypto - ../../services/crypto
- ../../services/gitea - ../../services/gitea
- ../../services/jellyfin - ../../services/jellyfin
- ../../services/jitsi - ../../services/comms
- ../../services/monitoring - ../../services/monitoring
- ../../services/logging
- ../../services/pegasus - ../../services/pegasus
- ../../services/vault - ../../services/vault
- ../../services/bstein-dev-home - ../../services/bstein-dev-home

View File

@ -0,0 +1,23 @@
# clusters/atlas/flux-system/applications/ai-llm/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: ai-llm
namespace: flux-system
spec:
interval: 10m
path: ./services/ai-llm
targetNamespace: ai
prune: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
wait: true
healthChecks:
- apiVersion: apps/v1
kind: Deployment
name: ollama
namespace: ai
dependsOn:
- name: core

View File

@ -1,26 +0,0 @@
# clusters/atlas/flux-system/applications/ci-demo/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: ci-demo
namespace: flux-system
spec:
interval: 1m0s
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
git:
checkout:
ref:
branch: feature/ci-gitops
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(ci-demo): apply image updates"
push:
branch: feature/ci-gitops
update:
strategy: Setters
path: services/ci-demo

View File

@ -0,0 +1,17 @@
# clusters/atlas/flux-system/applications/communication/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: comms
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: flux-system
path: ./services/comms
targetNamespace: comms
timeout: 2m
dependsOn:
- name: traefik

View File

@ -15,5 +15,6 @@ spec:
namespace: flux-system namespace: flux-system
dependsOn: dependsOn:
- name: core - name: core
- name: openldap
wait: true wait: true
timeout: 5m timeout: 5m

View File

@ -16,8 +16,12 @@ spec:
- name: helm - name: helm
- name: traefik - name: traefik
healthChecks: healthChecks:
- apiVersion: helm.toolkit.fluxcd.io/v2 - apiVersion: apps/v1
kind: HelmRelease kind: Deployment
name: jenkins
namespace: jenkins
- apiVersion: v1
kind: Service
name: jenkins name: jenkins
namespace: jenkins namespace: jenkins
wait: false wait: false

View File

@ -4,7 +4,8 @@ kind: Kustomization
resources: resources:
- gitea/kustomization.yaml - gitea/kustomization.yaml
- vault/kustomization.yaml - vault/kustomization.yaml
- jitsi/kustomization.yaml - vaultwarden/kustomization.yaml
- comms/kustomization.yaml
- crypto/kustomization.yaml - crypto/kustomization.yaml
- monerod/kustomization.yaml - monerod/kustomization.yaml
- pegasus/kustomization.yaml - pegasus/kustomization.yaml
@ -16,9 +17,14 @@ resources:
- jellyfin/kustomization.yaml - jellyfin/kustomization.yaml
- xmr-miner/kustomization.yaml - xmr-miner/kustomization.yaml
- sui-metrics/kustomization.yaml - sui-metrics/kustomization.yaml
- openldap/kustomization.yaml
- keycloak/kustomization.yaml - keycloak/kustomization.yaml
- oauth2-proxy/kustomization.yaml - oauth2-proxy/kustomization.yaml
- mailu/kustomization.yaml - mailu/kustomization.yaml
- jenkins/kustomization.yaml - jenkins/kustomization.yaml
- ci-demo/kustomization.yaml - ai-llm/kustomization.yaml
- ci-demo/image-automation.yaml - nextcloud/kustomization.yaml
- nextcloud-mail-sync/kustomization.yaml
- postgres/kustomization.yaml
- outline/kustomization.yaml
- planka/kustomization.yaml

View File

@ -0,0 +1,17 @@
# clusters/atlas/flux-system/applications/nextcloud-mail-sync/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: nextcloud-mail-sync
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: flux-system
path: ./services/nextcloud-mail-sync
targetNamespace: nextcloud
timeout: 2m
dependsOn:
- name: keycloak

View File

@ -0,0 +1,16 @@
# clusters/atlas/flux-system/applications/nextcloud/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1beta2
kind: Kustomization
metadata:
name: nextcloud
namespace: flux-system
spec:
interval: 10m
path: ./services/nextcloud
targetNamespace: nextcloud
prune: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
wait: true

View File

@ -1,18 +1,18 @@
# clusters/atlas/flux-system/applications/jitsi/kustomization.yaml # clusters/atlas/flux-system/applications/openldap/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1 apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization kind: Kustomization
metadata: metadata:
name: jitsi name: openldap
namespace: flux-system namespace: flux-system
spec: spec:
interval: 10m interval: 10m
path: ./services/jitsi
targetNamespace: jitsi
prune: true prune: true
sourceRef: sourceRef:
kind: GitRepository kind: GitRepository
name: flux-system name: flux-system
namespace: flux-system namespace: flux-system
path: ./services/openldap
targetNamespace: sso
dependsOn: dependsOn:
- name: core - name: core
wait: true wait: true

View File

@ -0,0 +1,28 @@
# clusters/atlas/flux-system/applications/outline/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: outline
namespace: flux-system
spec:
interval: 10m
path: ./services/outline
prune: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: outline
dependsOn:
- name: keycloak
- name: mailu
- name: traefik
healthChecks:
- apiVersion: apps/v1
kind: Deployment
name: outline
namespace: outline
- apiVersion: v1
kind: Service
name: outline
namespace: outline
wait: false

View File

@ -0,0 +1,28 @@
# clusters/atlas/flux-system/applications/planka/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: planka
namespace: flux-system
spec:
interval: 10m
path: ./services/planka
prune: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: planka
dependsOn:
- name: keycloak
- name: mailu
- name: traefik
healthChecks:
- apiVersion: apps/v1
kind: Deployment
name: planka
namespace: planka
- apiVersion: v1
kind: Service
name: planka
namespace: planka
wait: false

View File

@ -0,0 +1,24 @@
# clusters/atlas/flux-system/applications/postgres/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: postgres
namespace: flux-system
spec:
interval: 10m
path: ./services/postgres
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: postgres
dependsOn:
- name: vault
- name: vault-csi
healthChecks:
- apiVersion: apps/v1
kind: StatefulSet
name: postgres
namespace: postgres
wait: true

View File

@ -0,0 +1,20 @@
# clusters/atlas/flux-system/applications/vaultwarden/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: vaultwarden
namespace: flux-system
spec:
interval: 10m
suspend: false
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
path: ./services/vaultwarden
targetNamespace: vaultwarden
prune: true
wait: true
dependsOn:
- name: helm
- name: traefik

View File

@ -8,7 +8,7 @@ metadata:
spec: spec:
interval: 1m0s interval: 1m0s
ref: ref:
branch: main branch: feature/sso-hardening
secretRef: secretRef:
name: flux-system-gitea name: flux-system-gitea
url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git

View File

@ -4,7 +4,11 @@ kind: Kustomization
resources: resources:
- core/kustomization.yaml - core/kustomization.yaml
- helm/kustomization.yaml - helm/kustomization.yaml
- metallb/kustomization.yaml
- traefik/kustomization.yaml - traefik/kustomization.yaml
- gitops-ui/kustomization.yaml - gitops-ui/kustomization.yaml
- monitoring/kustomization.yaml - monitoring/kustomization.yaml
- logging/kustomization.yaml
- maintenance/kustomization.yaml
- longhorn-ui/kustomization.yaml - longhorn-ui/kustomization.yaml
- ../platform/vault-csi/kustomization.yaml

View File

@ -0,0 +1,14 @@
# clusters/atlas/flux-system/platform/logging/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: logging
namespace: flux-system
spec:
interval: 10m
path: ./services/logging
prune: true
sourceRef:
kind: GitRepository
name: flux-system
wait: false

View File

@ -1,17 +1,14 @@
# clusters/atlas/flux-system/applications/ci-demo/kustomization.yaml # clusters/atlas/flux-system/platform/maintenance/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1 apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization kind: Kustomization
metadata: metadata:
name: ci-demo name: maintenance
namespace: flux-system namespace: flux-system
spec: spec:
interval: 10m interval: 10m
path: ./services/ci-demo path: ./services/maintenance
prune: true prune: true
sourceRef: sourceRef:
kind: GitRepository kind: GitRepository
name: flux-system name: flux-system
namespace: flux-system
dependsOn:
- name: core
wait: false wait: false

View File

@ -0,0 +1,16 @@
# clusters/atlas/flux-system/platform/metallb/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: metallb
namespace: flux-system
spec:
interval: 30m
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
path: ./infrastructure/metallb
prune: true
wait: true
targetNamespace: metallb-system

View File

@ -15,4 +15,5 @@ spec:
namespace: flux-system namespace: flux-system
dependsOn: dependsOn:
- name: core - name: core
- name: metallb
wait: true wait: true

View File

@ -0,0 +1,16 @@
# clusters/atlas/flux-system/platform/vault-csi/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: vault-csi
namespace: flux-system
spec:
interval: 30m
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
path: ./infrastructure/vault-csi
prune: true
wait: true
targetNamespace: kube-system

View File

@ -5,3 +5,4 @@ resources:
- ../../../infrastructure/modules/base - ../../../infrastructure/modules/base
- ../../../infrastructure/modules/profiles/atlas-ha - ../../../infrastructure/modules/profiles/atlas-ha
- ../../../infrastructure/sources/cert-manager/letsencrypt.yaml - ../../../infrastructure/sources/cert-manager/letsencrypt.yaml
- ../../../infrastructure/metallb

View File

@ -0,0 +1,16 @@
FROM --platform=$BUILDPLATFORM opensearchproject/data-prepper:2.8.0 AS source
FROM --platform=$TARGETPLATFORM eclipse-temurin:17-jre
ENV DATA_PREPPER_PATH=/usr/share/data-prepper
RUN useradd -u 10001 -M -U -d / -s /usr/sbin/nologin data_prepper \
&& mkdir -p /var/log/data-prepper
COPY --from=source /usr/share/data-prepper /usr/share/data-prepper
RUN chown -R 10001:10001 /usr/share/data-prepper /var/log/data-prepper
USER 10001
WORKDIR /usr/share/data-prepper
CMD ["bin/data-prepper"]

View File

@ -1,5 +1,18 @@
# hosts/roles/titan_jh/tasks/main.yaml # hosts/roles/titan_jh/tasks/main.yaml
--- ---
- name: Install node exporter
ansible.builtin.package:
name: prometheus-node-exporter
state: present
tags: ['jumphost', 'monitoring']
- name: Enable node exporter
ansible.builtin.service:
name: prometheus-node-exporter
enabled: true
state: started
tags: ['jumphost', 'monitoring']
- name: Placeholder for jumphost hardening - name: Placeholder for jumphost hardening
ansible.builtin.debug: ansible.builtin.debug:
msg: "Harden SSH, manage bastion tooling, and configure audit logging here." msg: "Harden SSH, manage bastion tooling, and configure audit logging here."

View File

@ -0,0 +1,20 @@
# infrastructure/metallb/ippool.yaml
apiVersion: metallb.io/v1beta1
kind: IPAddressPool
metadata:
name: communication-pool
namespace: metallb-system
spec:
addresses:
- 192.168.22.4-192.168.22.6
- 192.168.22.9-192.168.22.9
autoAssign: true
---
apiVersion: metallb.io/v1beta1
kind: L2Advertisement
metadata:
name: communication-adv
namespace: metallb-system
spec:
ipAddressPools:
- communication-pool

View File

@ -0,0 +1,10 @@
# infrastructure/metallb/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- metallb-rendered.yaml
- ippool.yaml
patchesStrategicMerge:
- patches/node-placement.yaml
- patches/speaker-loglevel.yaml

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,5 @@
# infrastructure/metallb/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: metallb-system

View File

@ -0,0 +1,27 @@
# infrastructure/metallb/patches/node-placement.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: metallb-controller
namespace: metallb-system
spec:
template:
spec:
containers:
- name: controller
args:
- --port=7472
- --log-level=info
- --webhook-mode=enabled
- --tls-min-version=VersionTLS12
- --lb-class=metallb
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi4
- rpi5

View File

@ -0,0 +1,15 @@
# infrastructure/metallb/patches/speaker-loglevel.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: metallb-speaker
namespace: metallb-system
spec:
template:
spec:
containers:
- name: speaker
args:
- --port=7472
- --log-level=info
- --lb-class=metallb

View File

@ -2,6 +2,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1 apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization kind: Kustomization
resources: resources:
- ../components/device-plugin-config
- ../components/device-plugin-jetson - ../components/device-plugin-jetson
- ../components/device-plugin-minipc - ../components/device-plugin-minipc
- ../components/device-plugin-tethys - ../components/device-plugin-tethys

View File

@ -0,0 +1,15 @@
# infrastructure/modules/profiles/components/device-plugin-config/configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: nvidia-device-plugin-config
namespace: kube-system
data:
config.yaml: |
version: v1
sharing:
timeSlicing:
renameByDefault: true
resources:
- name: nvidia.com/gpu
replicas: 4

View File

@ -0,0 +1,5 @@
# infrastructure/modules/profiles/components/device-plugin-config/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- configmap.yaml

View File

@ -30,7 +30,8 @@ spec:
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
args: args:
- "--fail-on-init-error=false" - "--fail-on-init-error=false"
- "--device-list-strategy=envvar,cdi" - "--device-list-strategy=envvar"
- "--config-file=/config/config.yaml"
securityContext: securityContext:
privileged: true privileged: true
env: env:
@ -41,7 +42,12 @@ spec:
volumeMounts: volumeMounts:
- name: device-plugin - name: device-plugin
mountPath: /var/lib/kubelet/device-plugins mountPath: /var/lib/kubelet/device-plugins
- name: config
mountPath: /config
volumes: volumes:
- name: device-plugin - name: device-plugin
hostPath: hostPath:
path: /var/lib/kubelet/device-plugins path: /var/lib/kubelet/device-plugins
- name: config
configMap:
name: nvidia-device-plugin-config

View File

@ -32,6 +32,7 @@ spec:
- "--fail-on-init-error=false" - "--fail-on-init-error=false"
- "--device-list-strategy=envvar" - "--device-list-strategy=envvar"
- "--mig-strategy=none" - "--mig-strategy=none"
- "--config-file=/config/config.yaml"
securityContext: securityContext:
privileged: true privileged: true
env: env:
@ -42,7 +43,12 @@ spec:
volumeMounts: volumeMounts:
- name: device-plugin - name: device-plugin
mountPath: /var/lib/kubelet/device-plugins mountPath: /var/lib/kubelet/device-plugins
- name: config
mountPath: /config
volumes: volumes:
- name: device-plugin - name: device-plugin
hostPath: hostPath:
path: /var/lib/kubelet/device-plugins path: /var/lib/kubelet/device-plugins
- name: config
configMap:
name: nvidia-device-plugin-config

View File

@ -33,6 +33,7 @@ spec:
- "--fail-on-init-error=false" - "--fail-on-init-error=false"
- "--device-list-strategy=envvar" - "--device-list-strategy=envvar"
- "--mig-strategy=none" - "--mig-strategy=none"
- "--config-file=/config/config.yaml"
securityContext: securityContext:
privileged: true privileged: true
env: env:
@ -43,7 +44,12 @@ spec:
volumeMounts: volumeMounts:
- name: device-plugin - name: device-plugin
mountPath: /var/lib/kubelet/device-plugins mountPath: /var/lib/kubelet/device-plugins
- name: config
mountPath: /config
volumes: volumes:
- name: device-plugin - name: device-plugin
hostPath: hostPath:
path: /var/lib/kubelet/device-plugins path: /var/lib/kubelet/device-plugins
- name: config
configMap:
name: nvidia-device-plugin-config

View File

@ -2,4 +2,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1 apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization kind: Kustomization
resources: resources:
- ../components/device-plugin-config
- ../components/device-plugin-tethys - ../components/device-plugin-tethys

View File

@ -0,0 +1,9 @@
# infrastructure/sources/helm/fluent-bit.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: fluent
namespace: flux-system
spec:
interval: 1h
url: https://fluent.github.io/helm-charts

View File

@ -2,11 +2,15 @@
apiVersion: kustomize.config.k8s.io/v1beta1 apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization kind: Kustomization
resources: resources:
- fluent-bit.yaml
- grafana.yaml - grafana.yaml
- hashicorp.yaml - hashicorp.yaml
- jetstack.yaml - jetstack.yaml
- jenkins.yaml - jenkins.yaml
- mailu.yaml - mailu.yaml
- opentelemetry.yaml
- opensearch.yaml
- harbor.yaml - harbor.yaml
- prometheus.yaml - prometheus.yaml
- victoria-metrics.yaml - victoria-metrics.yaml
- secrets-store-csi.yaml

View File

@ -0,0 +1,9 @@
# infrastructure/sources/helm/opensearch.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: opensearch
namespace: flux-system
spec:
interval: 1h
url: https://opensearch-project.github.io/helm-charts

View File

@ -0,0 +1,9 @@
# infrastructure/sources/helm/opentelemetry.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: opentelemetry
namespace: flux-system
spec:
interval: 1h
url: https://open-telemetry.github.io/opentelemetry-helm-charts

View File

@ -0,0 +1,9 @@
# infrastructure/sources/helm/secrets-store-csi.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: secrets-store-csi-driver
namespace: flux-system
spec:
interval: 1h
url: https://kubernetes-sigs.github.io/secrets-store-csi-driver/charts

View File

@ -71,9 +71,10 @@ rules:
- tlsoptions - tlsoptions
- tlsstores - tlsstores
- serverstransports - serverstransports
- serverstransporttcps
- traefikservices - traefikservices
- middlewaretcps
verbs: verbs:
- get - get
- list - list
- watch - watch

View File

@ -10,3 +10,4 @@ resources:
- clusterrole.yaml - clusterrole.yaml
- clusterrolebinding.yaml - clusterrolebinding.yaml
- service.yaml - service.yaml
- traefik-service-lb.yaml

View File

@ -0,0 +1,24 @@
# infrastructure/traefik/traefik-service-lb.yaml
apiVersion: v1
kind: Service
metadata:
name: traefik
namespace: kube-system
annotations:
metallb.universe.tf/address-pool: communication-pool
spec:
type: LoadBalancer
loadBalancerClass: metallb
loadBalancerIP: 192.168.22.9
ports:
- name: web
port: 80
targetPort: web
protocol: TCP
- name: websecure
port: 443
targetPort: websecure
protocol: TCP
selector:
app.kubernetes.io/instance: traefik-kube-system
app.kubernetes.io/name: traefik

View File

@ -0,0 +1,6 @@
# infrastructure/vault-csi/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- secrets-store-csi-driver.yaml
- vault-csi-provider.yaml

View File

@ -0,0 +1,20 @@
# infrastructure/vault-csi/secrets-store-csi-driver.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: secrets-store-csi-driver
namespace: kube-system
spec:
interval: 15m
chart:
spec:
chart: secrets-store-csi-driver
version: "~1.3.0"
sourceRef:
kind: HelmRepository
name: secrets-store-csi-driver
namespace: flux-system
values:
syncSecret:
enabled: true
enableSecretRotation: false

View File

@ -0,0 +1,111 @@
# infrastructure/vault-csi/vault-csi-provider.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: vault-csi-provider
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: vault-csi-provider-clusterrole
rules:
- apiGroups: [""]
resources: ["serviceaccounts/token"]
verbs: ["create"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: vault-csi-provider-clusterrolebinding
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: vault-csi-provider-clusterrole
subjects:
- kind: ServiceAccount
name: vault-csi-provider
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: vault-csi-provider-role
namespace: kube-system
rules:
- apiGroups: [""]
resources: ["secrets"]
verbs: ["get"]
resourceNames: ["vault-csi-provider-hmac-key"]
- apiGroups: [""]
resources: ["secrets"]
verbs: ["create"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: vault-csi-provider-rolebinding
namespace: kube-system
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: vault-csi-provider-role
subjects:
- kind: ServiceAccount
name: vault-csi-provider
namespace: kube-system
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: vault-csi-provider
namespace: kube-system
labels: { app.kubernetes.io/name: vault-csi-provider }
spec:
updateStrategy:
type: RollingUpdate
selector:
matchLabels: { app.kubernetes.io/name: vault-csi-provider }
template:
metadata:
labels: { app.kubernetes.io/name: vault-csi-provider }
spec:
serviceAccountName: vault-csi-provider
containers:
- name: provider-vault-installer
image: hashicorp/vault-csi-provider:1.7.0
imagePullPolicy: IfNotPresent
args:
- -endpoint=/provider/vault.sock
- -log-level=info
resources:
requests: { cpu: 50m, memory: 100Mi }
limits: { cpu: 50m, memory: 100Mi }
volumeMounts:
- { name: providervol, mountPath: "/provider" }
livenessProbe:
httpGet:
path: "/health/ready"
port: 8080
scheme: "HTTP"
failureThreshold: 2
initialDelaySeconds: 5
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 3
readinessProbe:
httpGet:
path: "/health/ready"
port: 8080
scheme: "HTTP"
failureThreshold: 2
initialDelaySeconds: 5
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 3
volumes:
- name: providervol
hostPath:
path: "/var/run/secrets-store-csi-providers"
nodeSelector:
kubernetes.io/os: linux

22
knowledge/INDEX.md Normal file
View File

@ -0,0 +1,22 @@
Atlas Knowledge Base (KB)
This folder is the source-of-truth “memory” for Atlas/Titan assistants (and for humans). It is designed to be:
- Accurate (grounded in GitOps + read-only cluster tools)
- Maintainable (small docs + deterministic generators)
- Safe (no secrets; refer to Secret/Vault paths by name only)
Layout
- `knowledge/runbooks/`: human-written docs (short, chunkable Markdown).
- `knowledge/catalog/`: generated machine-readable facts (YAML/JSON).
- `knowledge/diagrams/`: generated Mermaid diagrams (`.mmd`) derived from the catalog.
Regeneration
- Update manifests/docs, then regenerate generated artifacts:
- `python scripts/knowledge_render_atlas.py --write`
Authoring rules
- Never include secret values. Prefer `secretRef` names or Vault paths like `kv/atlas/...`.
- Prefer stable identifiers: Kubernetes `namespace/name`, DNS hostnames, Flux kustomization paths.
- Keep each runbook small; one topic per file; use headings.
- When in doubt, link to the exact file path in this repo that configures the behavior.

View File

@ -0,0 +1,8 @@
{
"counts": {
"helmrelease_host_hints": 7,
"http_endpoints": 35,
"services": 44,
"workloads": 49
}
}

2771
knowledge/catalog/atlas.json Normal file

File diff suppressed because it is too large Load Diff

1786
knowledge/catalog/atlas.yaml Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,89 @@
[
{
"path": "runbooks/ci-gitea-jenkins.md",
"title": "CI: Gitea \u2192 Jenkins pipeline",
"tags": [
"atlas",
"ci",
"gitea",
"jenkins"
],
"entrypoints": [
"scm.bstein.dev",
"ci.bstein.dev"
],
"source_paths": [
"services/gitea",
"services/jenkins",
"scripts/jenkins_cred_sync.sh",
"scripts/gitea_cred_sync.sh"
],
"body": "# CI: Gitea \u2192 Jenkins pipeline\n\n## What this is\nAtlas uses Gitea for source control and Jenkins for CI. Authentication is via Keycloak (SSO).\n\n## Where it is configured\n- Gitea manifests: `services/gitea/`\n- Jenkins manifests: `services/jenkins/`\n- Credential sync helpers: `scripts/gitea_cred_sync.sh`, `scripts/jenkins_cred_sync.sh`\n\n## What users do (typical flow)\n- Create a repo in Gitea.\n- Create/update a Jenkins job/pipeline that can fetch the repo.\n- Configure a webhook (or SCM polling) so pushes trigger builds.\n\n## Troubleshooting (common)\n- \u201cWebhook not firing\u201d: confirm ingress host, webhook URL, and Jenkins job is reachable.\n- \u201cAuth denied cloning\u201d: confirm Keycloak group membership and that Jenkins has a valid token/credential configured."
},
{
"path": "runbooks/comms-verify.md",
"title": "Othrys verification checklist",
"tags": [
"comms",
"matrix",
"element",
"livekit"
],
"entrypoints": [
"https://live.bstein.dev",
"https://matrix.live.bstein.dev"
],
"source_paths": [],
"body": "1) Guest join:\n- Open a private window and visit:\n `https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join`\n- Confirm the guest join flow works and the displayname becomes `<word>-<word>`.\n\n2) Keycloak login:\n- Log in from `https://live.bstein.dev` and confirm MAS -> Keycloak -> Element redirect.\n\n3) Video rooms:\n- Start an Element Call room and confirm audio/video with a second account.\n- Check that guests can read public rooms but cannot start calls.\n\n4) Well-known:\n- `https://live.bstein.dev/.well-known/matrix/client` returns JSON.\n- `https://matrix.live.bstein.dev/.well-known/matrix/client` returns JSON.\n\n5) TURN reachability:\n- Confirm `turn.live.bstein.dev:3478` and `turns:5349` are reachable from WAN."
},
{
"path": "runbooks/kb-authoring.md",
"title": "KB authoring: what to write (and what not to)",
"tags": [
"atlas",
"kb",
"runbooks"
],
"entrypoints": [],
"source_paths": [
"knowledge/runbooks",
"scripts/knowledge_render_atlas.py"
],
"body": "# KB authoring: what to write (and what not to)\n\n## The goal\nGive Atlas assistants enough grounded, Atlas-specific context to answer \u201chow do I\u2026?\u201d questions without guessing.\n\n## What to capture (high value)\n- User workflows: \u201cclick here, set X, expected result\u201d\n- Operator workflows: \u201cedit these files, reconcile this kustomization, verify with these commands\u201d\n- Wiring: \u201cthis host routes to this service; this service depends on Postgres/Vault/etc\u201d\n- Failure modes: exact error messages + the 2\u20135 checks that usually resolve them\n- Permissions: Keycloak groups/roles and what they unlock\n\n## What to avoid (low value / fluff)\n- Generic Kubernetes explanations (link to upstream docs instead)\n- Copy-pasting large manifests (prefer file paths + small snippets)\n- Anything that will drift quickly (render it from GitOps instead)\n- Any secret values (reference Secret/Vault locations by name only)\n\n## Document pattern (recommended)\nEach runbook should answer:\n- \u201cWhat is this?\u201d\n- \u201cWhat do users do?\u201d\n- \u201cWhat do operators change (where in Git)?\u201d\n- \u201cHow do we verify it works?\u201d\n- \u201cWhat breaks and how to debug it?\u201d"
},
{
"path": "runbooks/observability.md",
"title": "Observability: Grafana + VictoriaMetrics (how to query safely)",
"tags": [
"atlas",
"monitoring",
"grafana",
"victoriametrics"
],
"entrypoints": [
"metrics.bstein.dev",
"alerts.bstein.dev"
],
"source_paths": [
"services/monitoring"
],
"body": "# Observability: Grafana + VictoriaMetrics (how to query safely)\n\n## Where it is configured\n- `services/monitoring/helmrelease.yaml` (Grafana + Alertmanager + VM values)\n- `services/monitoring/grafana-dashboard-*.yaml` (dashboards and their PromQL)\n\n## Using metrics as a \u201ctool\u201d for Atlas assistants\nThe safest pattern is: map a small set of intents \u2192 fixed PromQL queries, then summarize results.\n\nExamples (intents)\n- \u201cIs the cluster healthy?\u201d \u2192 node readiness + pod restart rate\n- \u201cWhy is Element Call failing?\u201d \u2192 LiveKit/coturn pod restarts + synapse errors + ingress 5xx\n- \u201cIs Jenkins slow?\u201d \u2192 pod CPU/memory + HTTP latency metrics (if exported)\n\n## Why dashboards are not the KB\nDashboards are great references, but the assistant should query VictoriaMetrics directly for live answers and keep the\nKB focused on wiring, runbooks, and stable conventions."
},
{
"path": "runbooks/template.md",
"title": "<short title>",
"tags": [
"atlas",
"<service>",
"<topic>"
],
"entrypoints": [
"<hostnames if relevant>"
],
"source_paths": [
"services/<svc>",
"clusters/atlas/<...>"
],
"body": "# <Short title>\n\n## What this is\n\n## For users (how to)\n\n## For operators (where configured)\n\n## Troubleshooting (symptoms \u2192 checks)"
}
]

View File

@ -0,0 +1,189 @@
flowchart LR
host_auth_bstein_dev["auth.bstein.dev"]
svc_sso_oauth2_proxy["sso/oauth2-proxy (Service)"]
host_auth_bstein_dev --> svc_sso_oauth2_proxy
wl_sso_oauth2_proxy["sso/oauth2-proxy (Deployment)"]
svc_sso_oauth2_proxy --> wl_sso_oauth2_proxy
host_bstein_dev["bstein.dev"]
svc_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Service)"]
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_frontend
wl_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Deployment)"]
svc_bstein_dev_home_bstein_dev_home_frontend --> wl_bstein_dev_home_bstein_dev_home_frontend
svc_comms_matrix_wellknown["comms/matrix-wellknown (Service)"]
host_bstein_dev --> svc_comms_matrix_wellknown
wl_comms_matrix_wellknown["comms/matrix-wellknown (Deployment)"]
svc_comms_matrix_wellknown --> wl_comms_matrix_wellknown
svc_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Service)"]
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend
wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"]
svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend
host_call_live_bstein_dev["call.live.bstein.dev"]
svc_comms_element_call["comms/element-call (Service)"]
host_call_live_bstein_dev --> svc_comms_element_call
wl_comms_element_call["comms/element-call (Deployment)"]
svc_comms_element_call --> wl_comms_element_call
host_chat_ai_bstein_dev["chat.ai.bstein.dev"]
svc_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Service)"]
host_chat_ai_bstein_dev --> svc_bstein_dev_home_chat_ai_gateway
wl_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Deployment)"]
svc_bstein_dev_home_chat_ai_gateway --> wl_bstein_dev_home_chat_ai_gateway
host_ci_bstein_dev["ci.bstein.dev"]
svc_jenkins_jenkins["jenkins/jenkins (Service)"]
host_ci_bstein_dev --> svc_jenkins_jenkins
wl_jenkins_jenkins["jenkins/jenkins (Deployment)"]
svc_jenkins_jenkins --> wl_jenkins_jenkins
host_cloud_bstein_dev["cloud.bstein.dev"]
svc_nextcloud_nextcloud["nextcloud/nextcloud (Service)"]
host_cloud_bstein_dev --> svc_nextcloud_nextcloud
wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"]
svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud
host_kit_live_bstein_dev["kit.live.bstein.dev"]
svc_comms_livekit_token_service["comms/livekit-token-service (Service)"]
host_kit_live_bstein_dev --> svc_comms_livekit_token_service
wl_comms_livekit_token_service["comms/livekit-token-service (Deployment)"]
svc_comms_livekit_token_service --> wl_comms_livekit_token_service
svc_comms_livekit["comms/livekit (Service)"]
host_kit_live_bstein_dev --> svc_comms_livekit
wl_comms_livekit["comms/livekit (Deployment)"]
svc_comms_livekit --> wl_comms_livekit
host_live_bstein_dev["live.bstein.dev"]
svc_comms_othrys_element_element_web["comms/othrys-element-element-web (Service)"]
host_live_bstein_dev --> svc_comms_othrys_element_element_web
wl_comms_othrys_element_element_web["comms/othrys-element-element-web (Deployment)"]
svc_comms_othrys_element_element_web --> wl_comms_othrys_element_element_web
host_live_bstein_dev --> svc_comms_matrix_wellknown
svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"]
host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
wl_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Deployment)"]
svc_comms_othrys_synapse_matrix_synapse --> wl_comms_othrys_synapse_matrix_synapse
host_longhorn_bstein_dev["longhorn.bstein.dev"]
svc_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Service)"]
host_longhorn_bstein_dev --> svc_longhorn_system_oauth2_proxy_longhorn
wl_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Deployment)"]
svc_longhorn_system_oauth2_proxy_longhorn --> wl_longhorn_system_oauth2_proxy_longhorn
host_mail_bstein_dev["mail.bstein.dev"]
svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"]
host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front
host_matrix_live_bstein_dev["matrix.live.bstein.dev"]
svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown
host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register
wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
host_monero_bstein_dev["monero.bstein.dev"]
svc_crypto_monerod["crypto/monerod (Service)"]
host_monero_bstein_dev --> svc_crypto_monerod
wl_crypto_monerod["crypto/monerod (Deployment)"]
svc_crypto_monerod --> wl_crypto_monerod
host_office_bstein_dev["office.bstein.dev"]
svc_nextcloud_collabora["nextcloud/collabora (Service)"]
host_office_bstein_dev --> svc_nextcloud_collabora
wl_nextcloud_collabora["nextcloud/collabora (Deployment)"]
svc_nextcloud_collabora --> wl_nextcloud_collabora
host_pegasus_bstein_dev["pegasus.bstein.dev"]
svc_jellyfin_pegasus["jellyfin/pegasus (Service)"]
host_pegasus_bstein_dev --> svc_jellyfin_pegasus
wl_jellyfin_pegasus["jellyfin/pegasus (Deployment)"]
svc_jellyfin_pegasus --> wl_jellyfin_pegasus
host_scm_bstein_dev["scm.bstein.dev"]
svc_gitea_gitea["gitea/gitea (Service)"]
host_scm_bstein_dev --> svc_gitea_gitea
wl_gitea_gitea["gitea/gitea (Deployment)"]
svc_gitea_gitea --> wl_gitea_gitea
host_secret_bstein_dev["secret.bstein.dev"]
svc_vault_vault["vault/vault (Service)"]
host_secret_bstein_dev --> svc_vault_vault
wl_vault_vault["vault/vault (StatefulSet)"]
svc_vault_vault --> wl_vault_vault
host_sso_bstein_dev["sso.bstein.dev"]
svc_sso_keycloak["sso/keycloak (Service)"]
host_sso_bstein_dev --> svc_sso_keycloak
wl_sso_keycloak["sso/keycloak (Deployment)"]
svc_sso_keycloak --> wl_sso_keycloak
host_stream_bstein_dev["stream.bstein.dev"]
svc_jellyfin_jellyfin["jellyfin/jellyfin (Service)"]
host_stream_bstein_dev --> svc_jellyfin_jellyfin
wl_jellyfin_jellyfin["jellyfin/jellyfin (Deployment)"]
svc_jellyfin_jellyfin --> wl_jellyfin_jellyfin
host_vault_bstein_dev["vault.bstein.dev"]
svc_vaultwarden_vaultwarden_service["vaultwarden/vaultwarden-service (Service)"]
host_vault_bstein_dev --> svc_vaultwarden_vaultwarden_service
wl_vaultwarden_vaultwarden["vaultwarden/vaultwarden (Deployment)"]
svc_vaultwarden_vaultwarden_service --> wl_vaultwarden_vaultwarden
subgraph bstein_dev_home[bstein-dev-home]
svc_bstein_dev_home_bstein_dev_home_frontend
wl_bstein_dev_home_bstein_dev_home_frontend
svc_bstein_dev_home_bstein_dev_home_backend
wl_bstein_dev_home_bstein_dev_home_backend
svc_bstein_dev_home_chat_ai_gateway
wl_bstein_dev_home_chat_ai_gateway
end
subgraph comms[comms]
svc_comms_matrix_wellknown
wl_comms_matrix_wellknown
svc_comms_element_call
wl_comms_element_call
svc_comms_livekit_token_service
wl_comms_livekit_token_service
svc_comms_livekit
wl_comms_livekit
svc_comms_othrys_element_element_web
wl_comms_othrys_element_element_web
svc_comms_othrys_synapse_matrix_synapse
wl_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service
svc_comms_matrix_guest_register
wl_comms_matrix_guest_register
end
subgraph crypto[crypto]
svc_crypto_monerod
wl_crypto_monerod
end
subgraph gitea[gitea]
svc_gitea_gitea
wl_gitea_gitea
end
subgraph jellyfin[jellyfin]
svc_jellyfin_pegasus
wl_jellyfin_pegasus
svc_jellyfin_jellyfin
wl_jellyfin_jellyfin
end
subgraph jenkins[jenkins]
svc_jenkins_jenkins
wl_jenkins_jenkins
end
subgraph longhorn_system[longhorn-system]
svc_longhorn_system_oauth2_proxy_longhorn
wl_longhorn_system_oauth2_proxy_longhorn
end
subgraph mailu_mailserver[mailu-mailserver]
svc_mailu_mailserver_mailu_front
end
subgraph nextcloud[nextcloud]
svc_nextcloud_nextcloud
wl_nextcloud_nextcloud
svc_nextcloud_collabora
wl_nextcloud_collabora
end
subgraph sso[sso]
svc_sso_oauth2_proxy
wl_sso_oauth2_proxy
svc_sso_keycloak
wl_sso_keycloak
end
subgraph vault[vault]
svc_vault_vault
wl_vault_vault
end
subgraph vaultwarden[vaultwarden]
svc_vaultwarden_vaultwarden_service
wl_vaultwarden_vaultwarden
end

26
knowledge/metis.md Normal file
View File

@ -0,0 +1,26 @@
# Metis (node recovery)
## Node classes (current map)
- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)
- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)
- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks)
- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent)
- amd64 agents: titan-22/24 (Debian 13, k3s agent)
- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, future titan-20/21 (when added), plus any newcomers.
## Longhorn disk UUIDs (critical nodes)
- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4)
- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4)
- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4)
- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4)
## Metis repo (~/Development/metis)
- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`).
- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints).
- `AGENTS.md` in repo is untracked and holds raw notes.
## Next implementation steps
- Add per-class golden image refs and checksums (Harbor or file://) when ready.
- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths.
- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection.
- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited.

View File

@ -0,0 +1,27 @@
---
title: "CI: Gitea → Jenkins pipeline"
tags: ["atlas", "ci", "gitea", "jenkins"]
owners: ["brad"]
entrypoints: ["scm.bstein.dev", "ci.bstein.dev"]
source_paths: ["services/gitea", "services/jenkins", "scripts/jenkins_cred_sync.sh", "scripts/gitea_cred_sync.sh"]
---
# CI: Gitea → Jenkins pipeline
## What this is
Atlas uses Gitea for source control and Jenkins for CI. Authentication is via Keycloak (SSO).
## Where it is configured
- Gitea manifests: `services/gitea/`
- Jenkins manifests: `services/jenkins/`
- Credential sync helpers: `scripts/gitea_cred_sync.sh`, `scripts/jenkins_cred_sync.sh`
## What users do (typical flow)
- Create a repo in Gitea.
- Create/update a Jenkins job/pipeline that can fetch the repo.
- Configure a webhook (or SCM polling) so pushes trigger builds.
## Troubleshooting (common)
- “Webhook not firing”: confirm ingress host, webhook URL, and Jenkins job is reachable.
- “Auth denied cloning”: confirm Keycloak group membership and that Jenkins has a valid token/credential configured.

View File

@ -0,0 +1,30 @@
---
title: Othrys verification checklist
tags:
- comms
- matrix
- element
- livekit
entrypoints:
- https://live.bstein.dev
- https://matrix.live.bstein.dev
---
1) Guest join:
- Open a private window and visit:
`https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join`
- Confirm the guest join flow works and the displayname becomes `<word>-<word>`.
2) Keycloak login:
- Log in from `https://live.bstein.dev` and confirm MAS -> Keycloak -> Element redirect.
3) Video rooms:
- Start an Element Call room and confirm audio/video with a second account.
- Check that guests can read public rooms but cannot start calls.
4) Well-known:
- `https://live.bstein.dev/.well-known/matrix/client` returns JSON.
- `https://matrix.live.bstein.dev/.well-known/matrix/client` returns JSON.
5) TURN reachability:
- Confirm `turn.live.bstein.dev:3478` and `turns:5349` are reachable from WAN.

View File

@ -0,0 +1,34 @@
---
title: "KB authoring: what to write (and what not to)"
tags: ["atlas", "kb", "runbooks"]
owners: ["brad"]
entrypoints: []
source_paths: ["knowledge/runbooks", "scripts/knowledge_render_atlas.py"]
---
# KB authoring: what to write (and what not to)
## The goal
Give Atlas assistants enough grounded, Atlas-specific context to answer “how do I…?” questions without guessing.
## What to capture (high value)
- User workflows: “click here, set X, expected result”
- Operator workflows: “edit these files, reconcile this kustomization, verify with these commands”
- Wiring: “this host routes to this service; this service depends on Postgres/Vault/etc”
- Failure modes: exact error messages + the 2–5 checks that usually resolve them
- Permissions: Keycloak groups/roles and what they unlock
## What to avoid (low value / fluff)
- Generic Kubernetes explanations (link to upstream docs instead)
- Copy-pasting large manifests (prefer file paths + small snippets)
- Anything that will drift quickly (render it from GitOps instead)
- Any secret values (reference Secret/Vault locations by name only)
## Document pattern (recommended)
Each runbook should answer:
- “What is this?”
- “What do users do?”
- “What do operators change (where in Git)?”
- “How do we verify it works?”
- “What breaks and how to debug it?”

View File

@ -0,0 +1,26 @@
---
title: "Observability: Grafana + VictoriaMetrics (how to query safely)"
tags: ["atlas", "monitoring", "grafana", "victoriametrics"]
owners: ["brad"]
entrypoints: ["metrics.bstein.dev", "alerts.bstein.dev"]
source_paths: ["services/monitoring"]
---
# Observability: Grafana + VictoriaMetrics (how to query safely)
## Where it is configured
- `services/monitoring/helmrelease.yaml` (Grafana + Alertmanager + VM values)
- `services/monitoring/grafana-dashboard-*.yaml` (dashboards and their PromQL)
## Using metrics as a “tool” for Atlas assistants
The safest pattern is: map a small set of intents → fixed PromQL queries, then summarize results.
Examples (intents)
- “Is the cluster healthy?” → node readiness + pod restart rate
- “Why is Element Call failing?” → LiveKit/coturn pod restarts + synapse errors + ingress 5xx
- “Is Jenkins slow?” → pod CPU/memory + HTTP latency metrics (if exported)
## Why dashboards are not the KB
Dashboards are great references, but the assistant should query VictoriaMetrics directly for live answers and keep the
KB focused on wiring, runbooks, and stable conventions.

View File

@ -0,0 +1,18 @@
---
title: "<short title>"
tags: ["atlas", "<service>", "<topic>"]
owners: ["brad"]
entrypoints: ["<hostnames if relevant>"]
source_paths: ["services/<svc>", "clusters/atlas/<...>"]
---
# <Short title>
## What this is
## For users (how to)
## For operators (where configured)
## Troubleshooting (symptoms → checks)

View File

@ -0,0 +1,73 @@
# Metis (node recovery)
## Node classes (current map)
- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)
- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)
- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks)
- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent)
- amd64 agents: titan-22/24 (Debian 13, k3s agent)
- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, plus any newcomers.
### Jetson nodes (titan-20/21)
- Ubuntu 20.04.6 (Focal), kernel 5.10.104-tegra, CRI containerd 2.0.5-k3s2, arch arm64.
- Storage: NVMe 232G at / (ext4); onboard mmc partitions present but root on NVMe; 1.9T sda present (unused).
- k3s agent with drop-in 99-nofile.conf.
## Longhorn disk UUIDs (critical nodes)
- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4)
- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4)
- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4)
- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4)
## Metis repo (~/Development/metis)
- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`).
- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints).
- `AGENTS.md` in repo is untracked and holds raw notes.
## Next implementation steps
- Add per-class golden image refs and checksums (Harbor or file://) when ready.
- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths.
- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection.
- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited.
## Node OS/Kernel/CRI snapshot (Jan 2026)
- titan-04: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-05: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-06: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-07: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-08: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-09: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-0a: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-0b: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-0c: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-10: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-11: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-12: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
- titan-13: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
- titan-14: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
- titan-15: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
- titan-17: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
- titan-18: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
- titan-19: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
- titan-20: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64
- titan-21: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64
- titan-22: Debian 13 (trixie), kernel 6.12.41+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64
- titan-24: Debian 13 (trixie), kernel 6.12.57+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64
### External hosts
- titan-db: Ubuntu 24.10, kernel 6.11.0-1015-raspi, root on /dev/sda2 ext4 (465G), boot vfat /dev/sda1; PostgreSQL service enabled.
- titan-jh: Arch Linux ARM (rolling), kernel 6.18.4-2-rpi, NVMe root ext4 238G (/), boot vfat 512M; ~495 packages installed (pacman -Q).
- titan-23/oceanus: TODO audit (future).
### Control plane Pis (titan-0a/0b/0c)
- Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, containerd 2.0.5-k3s2.
- Storage: 477G SSD root (/dev/sda2 ext4), /boot/firmware vfat (/dev/sda1). fstab uses LABEL=writable and LABEL=system-boot.
- k3s server (control-plane taint expected); etcd snapshots not yet cataloged (TODO).
## k3s versions
- rpi5 workers/control-plane: k3s v1.33.3+k3s1 (crictl v1.31.0-k3s2)
- rpi4 nodes: k3s v1.31.5+k3s1 (crictl v1.31.0-k3s2)
- Jetson titan-20/21: k3s v1.33.3+k3s1 (per node info), crictl v1.31.0-k3s2

5
scripts/comms_sync_kb.sh Executable file
View File

@ -0,0 +1,5 @@
#!/usr/bin/env bash
# Sync the Atlas knowledge base: render it once into the default output
# location and once into the comms service bundle so both copies match.
set -euo pipefail

renderer=scripts/knowledge_render_atlas.py

python "$renderer" --write
python "$renderer" --write --out services/comms/knowledge

View File

@ -9,6 +9,7 @@ Usage:
import argparse import argparse
import json import json
import textwrap import textwrap
import urllib.parse
from pathlib import Path from pathlib import Path
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@ -45,12 +46,14 @@ PERCENT_THRESHOLDS = {
], ],
} }
NAMESPACE_CPU_WINDOW = "1m"
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Cluster metadata # Cluster metadata
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
CONTROL_PLANE_NODES = ["titan-0a", "titan-0b", "titan-0c"] CONTROL_PLANE_NODES = ["titan-0a", "titan-0b", "titan-0c"]
CONTROL_DEPENDENCIES = ["titan-db"] CONTROL_DEPENDENCIES = ["titan-db", "titan-jh"]
CONTROL_ALL = CONTROL_PLANE_NODES + CONTROL_DEPENDENCIES CONTROL_ALL = CONTROL_PLANE_NODES + CONTROL_DEPENDENCIES
WORKER_NODES = [ WORKER_NODES = [
"titan-04", "titan-04",
@ -61,11 +64,12 @@ WORKER_NODES = [
"titan-09", "titan-09",
"titan-10", "titan-10",
"titan-11", "titan-11",
"titan-20",
"titan-21",
"titan-12", "titan-12",
"titan-13", "titan-13",
"titan-14", "titan-14",
"titan-15", "titan-15",
"titan-16",
"titan-17", "titan-17",
"titan-18", "titan-18",
"titan-19", "titan-19",
@ -80,7 +84,22 @@ CONTROL_TOTAL = len(CONTROL_PLANE_NODES)
WORKER_TOTAL = len(WORKER_NODES) WORKER_TOTAL = len(WORKER_NODES)
CONTROL_SUFFIX = f"/{CONTROL_TOTAL}" CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
WORKER_SUFFIX = f"/{WORKER_TOTAL}" WORKER_SUFFIX = f"/{WORKER_TOTAL}"
CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system" # Namespaces considered infrastructure (excluded from workload counts)
INFRA_NAMESPACES = [
"kube-system",
"longhorn-system",
"metallb-system",
"monitoring",
"logging",
"cert-manager",
"flux-system",
"traefik",
"maintenance",
"postgres",
]
INFRA_REGEX = f"^({'|'.join(INFRA_NAMESPACES)})$"
# Namespaces allowed on control plane without counting as workloads
CP_ALLOWED_NS = INFRA_REGEX
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]" LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4] GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4]
CONTROL_WORKLOADS_EXPR = ( CONTROL_WORKLOADS_EXPR = (
@ -170,22 +189,48 @@ def node_io_expr(scope=""):
return scoped_node_expr(base, scope) return scoped_node_expr(base, scope)
def namespace_selector(scope_var):
    """Base PromQL label selector for container-level metrics.

    Filters out empty namespace/pod/container labels and the pause
    container ("POD"), then appends the caller-supplied scope matcher.
    """
    base = 'namespace!="",pod!="",container!="",container!="POD"'
    return base + "," + scope_var
def namespace_gpu_selector(scope_var):
    """PromQL label selector for GPU (DCGM) metrics plus the scope matcher.

    DCGM series carry no container label, so only namespace/pod are filtered.
    """
    return ",".join(['namespace!=""', 'pod!=""', scope_var])
def namespace_cpu_raw(scope_var):
    """Per-namespace CPU usage (cores) rated over NAMESPACE_CPU_WINDOW."""
    selector = namespace_selector(scope_var)
    return (
        f"sum(rate(container_cpu_usage_seconds_total{{{selector}}}"
        f"[{NAMESPACE_CPU_WINDOW}])) by (namespace)"
    )
def namespace_ram_raw(scope_var):
    """Per-namespace working-set memory (bytes), grouped by namespace."""
    selector = namespace_selector(scope_var)
    return "sum(container_memory_working_set_bytes{%s}) by (namespace)" % selector
def namespace_gpu_usage_instant(scope_var):
    """Instantaneous per-namespace GPU utilization from DCGM exporter series."""
    selector = namespace_gpu_selector(scope_var)
    return "sum(DCGM_FI_DEV_GPU_UTIL{" + selector + "}) by (namespace)"
def namespace_share_expr(resource_expr): def namespace_share_expr(resource_expr):
selected = f"( {resource_expr} ) and on(namespace) ( {NAMESPACE_TOP_FILTER} )" total = f"clamp_min(sum( {resource_expr} ), 1)"
total = f"clamp_min(sum( {selected} ), 1)" return f"100 * ( {resource_expr} ) / {total}"
return f"100 * ( {selected} ) / {total}"
def namespace_cpu_share_expr(): def namespace_cpu_share_expr(scope_var):
return namespace_share_expr(NAMESPACE_CPU_RAW) return namespace_share_expr(namespace_cpu_raw(scope_var))
def namespace_ram_share_expr(): def namespace_ram_share_expr(scope_var):
return namespace_share_expr(NAMESPACE_RAM_RAW) return namespace_share_expr(namespace_ram_raw(scope_var))
def namespace_gpu_share_expr(): def namespace_gpu_share_expr(scope_var):
return namespace_share_expr(NAMESPACE_GPU_RAW) usage = namespace_gpu_usage_instant(scope_var)
total = f"(sum({usage}) or on() vector(0))"
share = f"100 * ({usage}) / clamp_min({total}, 1)"
idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)"
return f"({share}) or ({idle})"
PROBLEM_PODS_EXPR = ( PROBLEM_PODS_EXPR = (
@ -270,46 +315,12 @@ STUCK_TABLE_EXPR = (
")" ")"
) )
NAMESPACE_CPU_RAW = ( NAMESPACE_SCOPE_WORKLOAD = f'namespace!~"{INFRA_REGEX}"'
'sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)' NAMESPACE_SCOPE_ALL = 'namespace=~".*"'
) NAMESPACE_SCOPE_INFRA = f'namespace=~"{INFRA_REGEX}"'
NAMESPACE_RAM_RAW = ( NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"]
'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)'
)
GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"] GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
GPU_NODE_REGEX = "|".join(GPU_NODES) GPU_NODE_REGEX = "|".join(GPU_NODES)
NAMESPACE_GPU_ALLOC = (
'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}'
' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
)
NAMESPACE_GPU_USAGE_SHARE = (
'sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))'
)
NAMESPACE_GPU_USAGE_INSTANT = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)'
NAMESPACE_GPU_RAW = (
"("
+ NAMESPACE_GPU_USAGE_SHARE
+ ") or on(namespace) ("
+ NAMESPACE_CPU_RAW
+ " * 0)"
)
NAMESPACE_GPU_WEIGHT = (
"("
+ NAMESPACE_GPU_ALLOC
+ ") or on(namespace) ("
+ NAMESPACE_CPU_RAW
+ " * 0)"
)
NAMESPACE_ACTIVITY_SCORE = (
"( "
+ NAMESPACE_CPU_RAW
+ " ) + ("
+ NAMESPACE_RAM_RAW
+ " / 1e9) + ("
+ NAMESPACE_GPU_WEIGHT
+ " * 100)"
)
NAMESPACE_TOP_FILTER = "(topk(10, " + NAMESPACE_ACTIVITY_SCORE + ") >= bool 0)"
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
TRAEFIK_NET_INGRESS = ( TRAEFIK_NET_INGRESS = (
'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))' 'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
@ -560,9 +571,9 @@ def table_panel(
return panel return panel
def pie_panel(panel_id, title, expr, grid): def pie_panel(panel_id, title, expr, grid, *, links=None, description=None):
"""Return a pie chart panel with readable namespace labels.""" """Return a pie chart panel with readable namespace labels."""
return { panel = {
"id": panel_id, "id": panel_id,
"type": "piechart", "type": "piechart",
"title": title, "title": title,
@ -586,6 +597,71 @@ def pie_panel(panel_id, title, expr, grid):
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
}, },
} }
if links:
panel["links"] = links
if description:
panel["description"] = description
return panel
def namespace_scope_variable(var_name, label):
    """Build a hidden Grafana "custom" template variable for namespace scoping.

    Parameters:
        var_name: template variable name (referenced as ``$<var_name>`` in
            panel expressions).
        label: human-readable label for the variable in the dashboard UI.

    Returns:
        A Grafana templating-list entry offering three namespace scopes
        (workload-only, all, infrastructure-only), defaulting to workload-only.
    """
    options = [
        {
            "text": "workload namespaces only",
            "value": NAMESPACE_SCOPE_WORKLOAD,
            "selected": True,
        },
        {"text": "all namespaces", "value": NAMESPACE_SCOPE_ALL, "selected": False},
        {
            "text": "infrastructure namespaces only",
            "value": NAMESPACE_SCOPE_INFRA,
            "selected": False,
        },
    ]
    # Grafana "custom" variables encode their options as "text : value" pairs
    # joined by commas. Derive the query from `options` instead of repeating
    # each text/value literal, so the two can never drift apart.
    query = ",".join(f"{opt['text']} : {opt['value']}" for opt in options)
    return {
        "name": var_name,
        "label": label,
        "type": "custom",
        "query": query,
        "current": {"text": options[0]["text"], "value": options[0]["value"], "selected": True},
        "options": options,
        "hide": 2,  # hidden from the variable bar; value switched via panel links
        "multi": False,
        "includeAll": False,
        "refresh": 1,
        "sort": 0,
        "skipUrlSync": False,
    }
def namespace_scope_links(var_name):
    """Panel links that switch ``var_name`` between the three namespace scopes.

    Each link sets only the named scope variable; every other variable in
    NAMESPACE_SCOPE_VARS is kept bound to its current dashboard value via
    the ``${var}`` placeholder.
    """

    def with_value(value):
        # URL-encode the chosen scope expression for this variable only.
        encoded = urllib.parse.quote(value, safe="")
        params = [
            f"var-{other}={encoded}" if other == var_name else f"var-{other}=${{{other}}}"
            for other in NAMESPACE_SCOPE_VARS
        ]
        return "?" + "&".join(params)

    return [
        {"title": "Workload namespaces only", "url": with_value(NAMESPACE_SCOPE_WORKLOAD), "targetBlank": False},
        {"title": "All namespaces", "url": with_value(NAMESPACE_SCOPE_ALL), "targetBlank": False},
        {
            "title": "Infrastructure namespaces only",
            "url": with_value(NAMESPACE_SCOPE_INFRA),
            "targetBlank": False,
        },
    ]
def bargauge_panel( def bargauge_panel(
@ -857,6 +933,115 @@ def build_overview():
) )
) )
mail_bounce_rate_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 5},
{"color": "orange", "value": 8},
{"color": "red", "value": 10},
],
}
mail_limit_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 70},
{"color": "orange", "value": 85},
{"color": "red", "value": 95},
],
}
mail_success_thresholds = {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": 90},
{"color": "yellow", "value": 95},
{"color": "green", "value": 98},
],
}
panels.append(
stat_panel(
30,
"Mail Sent (1d)",
'max(postmark_outbound_sent{window="1d"})',
{"h": 2, "w": 6, "x": 0, "y": 8},
unit="none",
links=link_to("atlas-mail"),
)
)
panels.append(
{
"id": 31,
"type": "stat",
"title": "Mail Bounces (1d)",
"datasource": PROM_DS,
"gridPos": {"h": 2, "w": 6, "x": 12, "y": 8},
"targets": [
{
"expr": 'max(postmark_outbound_bounce_rate{window="1d"})',
"refId": "A",
"legendFormat": "Rate",
},
{
"expr": 'max(postmark_outbound_bounced{window="1d"})',
"refId": "B",
"legendFormat": "Count",
},
],
"fieldConfig": {
"defaults": {
"color": {"mode": "thresholds"},
"custom": {"displayMode": "auto"},
"thresholds": mail_bounce_rate_thresholds,
"unit": "none",
},
"overrides": [
{
"matcher": {"id": "byName", "options": "Rate"},
"properties": [{"id": "unit", "value": "percent"}],
},
{
"matcher": {"id": "byName", "options": "Count"},
"properties": [{"id": "unit", "value": "none"}],
},
],
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
"textMode": "name_and_value",
},
"links": link_to("atlas-mail"),
}
)
panels.append(
stat_panel(
32,
"Mail Success Rate (1d)",
'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)',
{"h": 2, "w": 6, "x": 6, "y": 8},
unit="percent",
thresholds=mail_success_thresholds,
decimals=1,
links=link_to("atlas-mail"),
)
)
panels.append(
stat_panel(
33,
"Mail Limit Used (30d)",
"max(postmark_sending_limit_used_percent)",
{"h": 2, "w": 6, "x": 18, "y": 8},
unit="percent",
thresholds=mail_limit_thresholds,
decimals=1,
links=link_to("atlas-mail"),
)
)
storage_panels = [ storage_panels = [
(23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"), (23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"),
(24, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), "percent"), (24, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), "percent"),
@ -876,28 +1061,38 @@ def build_overview():
) )
) )
cpu_scope = "$namespace_scope_cpu"
gpu_scope = "$namespace_scope_gpu"
ram_scope = "$namespace_scope_ram"
panels.append( panels.append(
pie_panel( pie_panel(
11, 11,
"Namespace CPU Share", "Namespace CPU Share",
namespace_cpu_share_expr(), namespace_cpu_share_expr(cpu_scope),
{"h": 9, "w": 8, "x": 0, "y": 16}, {"h": 9, "w": 8, "x": 0, "y": 16},
links=namespace_scope_links("namespace_scope_cpu"),
description="Values are normalized within the selected scope; use panel links to switch scope.",
) )
) )
panels.append( panels.append(
pie_panel( pie_panel(
12, 12,
"Namespace GPU Share", "Namespace GPU Share",
namespace_gpu_share_expr(), namespace_gpu_share_expr(gpu_scope),
{"h": 9, "w": 8, "x": 8, "y": 16}, {"h": 9, "w": 8, "x": 8, "y": 16},
links=namespace_scope_links("namespace_scope_gpu"),
description="Values are normalized within the selected scope; use panel links to switch scope.",
) )
) )
panels.append( panels.append(
pie_panel( pie_panel(
13, 13,
"Namespace RAM Share", "Namespace RAM Share",
namespace_ram_share_expr(), namespace_ram_share_expr(ram_scope),
{"h": 9, "w": 8, "x": 16, "y": 16}, {"h": 9, "w": 8, "x": 16, "y": 16},
links=namespace_scope_links("namespace_scope_ram"),
description="Values are normalized within the selected scope; use panel links to switch scope.",
) )
) )
@ -1052,7 +1247,6 @@ def build_overview():
links=link_to("atlas-storage"), links=link_to("atlas-storage"),
) )
) )
return { return {
"uid": "atlas-overview", "uid": "atlas-overview",
"title": "Atlas Overview", "title": "Atlas Overview",
@ -1063,7 +1257,13 @@ def build_overview():
"schemaVersion": 39, "schemaVersion": 39,
"style": "dark", "style": "dark",
"tags": ["atlas", "overview"], "tags": ["atlas", "overview"],
"templating": {"list": []}, "templating": {
"list": [
namespace_scope_variable("namespace_scope_cpu", "CPU namespace filter"),
namespace_scope_variable("namespace_scope_gpu", "GPU namespace filter"),
namespace_scope_variable("namespace_scope_ram", "RAM namespace filter"),
]
},
"time": {"from": "now-1h", "to": "now"}, "time": {"from": "now-1h", "to": "now"},
"refresh": "1m", "refresh": "1m",
"links": [], "links": [],
@ -1513,6 +1713,33 @@ def build_storage_dashboard():
time_from="90d", time_from="90d",
) )
) )
panels.append(
stat_panel(
30,
"Maintenance Sweepers Ready",
'kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"} * 100',
{"h": 4, "w": 12, "x": 0, "y": 44},
unit="percent",
thresholds=PERCENT_THRESHOLDS,
)
)
panels.append(
stat_panel(
31,
"Maintenance Cron Freshness (s)",
'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob=~"image-sweeper|grafana-smtp-sync"})',
{"h": 4, "w": 12, "x": 12, "y": 44},
unit="s",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 3600},
{"color": "red", "value": 10800},
],
},
)
)
return { return {
"uid": "atlas-storage", "uid": "atlas-storage",
"title": "Atlas Storage", "title": "Atlas Storage",
@ -1702,21 +1929,231 @@ def build_network_dashboard():
} }
def build_mail_dashboard():
    """Assemble the private Atlas Mail dashboard (Postmark delivery metrics)."""
    # Bounce percentage: colours tighten as the rate climbs.
    bounce_rate_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "green", "value": None},
            {"color": "yellow", "value": 5},
            {"color": "orange", "value": 8},
            {"color": "red", "value": 10},
        ],
    }
    # Consumption of the 30-day sending quota.
    limit_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "green", "value": None},
            {"color": "yellow", "value": 70},
            {"color": "orange", "value": 85},
            {"color": "red", "value": 95},
        ],
    }
    # Delivery success: stays red until the rate clears 90/95/98 percent.
    success_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "red", "value": None},
            {"color": "orange", "value": 90},
            {"color": "yellow", "value": 95},
            {"color": "green", "value": 98},
        ],
    }
    # Leading single-stat tiles: (id, title, PromQL, gridPos, extra kwargs).
    leading_stats = [
        (
            1,
            "Sent (1d)",
            'max(postmark_outbound_sent{window="1d"})',
            {"h": 4, "w": 6, "x": 0, "y": 0},
            {"decimals": 0},
        ),
        (
            2,
            "Sent (7d)",
            'max(postmark_outbound_sent{window="7d"})',
            {"h": 4, "w": 6, "x": 6, "y": 0},
            {"decimals": 0},
        ),
    ]
    panels = [
        stat_panel(pid, title, expr, pos, **extra)
        for pid, title, expr, pos, extra in leading_stats
    ]
    # Bounce tile is hand-built: it mixes a percent target (rate) and a plain
    # count target, so per-series unit overrides are required.
    panels.append(
        {
            "id": 3,
            "type": "stat",
            "title": "Mail Bounces (1d)",
            "datasource": PROM_DS,
            "gridPos": {"h": 4, "w": 6, "x": 12, "y": 0},
            "targets": [
                {
                    "expr": 'max(postmark_outbound_bounce_rate{window="1d"})',
                    "refId": "A",
                    "legendFormat": "Rate",
                },
                {
                    "expr": 'max(postmark_outbound_bounced{window="1d"})',
                    "refId": "B",
                    "legendFormat": "Count",
                },
            ],
            "fieldConfig": {
                "defaults": {
                    "color": {"mode": "thresholds"},
                    "custom": {"displayMode": "auto"},
                    "thresholds": bounce_rate_thresholds,
                    "unit": "none",
                },
                "overrides": [
                    {
                        "matcher": {"id": "byName", "options": "Rate"},
                        "properties": [{"id": "unit", "value": "percent"}],
                    },
                    {
                        "matcher": {"id": "byName", "options": "Count"},
                        "properties": [{"id": "unit", "value": "none"}],
                    },
                ],
            },
            "options": {
                "colorMode": "value",
                "graphMode": "area",
                "justifyMode": "center",
                "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
                "textMode": "name_and_value",
            },
        }
    )
    # Remaining single-stat tiles.
    trailing_stats = [
        (
            4,
            "Success Rate (1d)",
            'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)',
            {"h": 4, "w": 6, "x": 18, "y": 0},
            {"unit": "percent", "thresholds": success_thresholds, "decimals": 1},
        ),
        (
            5,
            "Limit Used (30d)",
            "max(postmark_sending_limit_used_percent)",
            {"h": 4, "w": 6, "x": 0, "y": 4},
            {"thresholds": limit_thresholds, "unit": "percent", "decimals": 1},
        ),
        (
            6,
            "Send Limit (30d)",
            "max(postmark_sending_limit)",
            {"h": 4, "w": 6, "x": 6, "y": 4},
            {"decimals": 0},
        ),
        (
            7,
            "Last Success",
            "max(postmark_last_success_timestamp_seconds)",
            {"h": 4, "w": 6, "x": 12, "y": 4},
            {"unit": "dateTimeAsIso", "decimals": 0},
        ),
        (
            8,
            "Exporter Errors",
            "sum(postmark_request_errors_total)",
            {"h": 4, "w": 6, "x": 18, "y": 4},
            {"decimals": 0},
        ),
    ]
    panels.extend(
        stat_panel(pid, title, expr, pos, **extra)
        for pid, title, expr, pos, extra in trailing_stats
    )
    # Trend rows comparing the 1d and 7d windows side by side.
    trend_rows = [
        (
            13,
            "Bounce Rate (1d vs 7d)",
            "max by (window) (postmark_outbound_bounce_rate)",
            {"h": 8, "w": 12, "x": 0, "y": 12},
            "percent",
        ),
        (
            14,
            "Bounced (1d vs 7d)",
            "max by (window) (postmark_outbound_bounced)",
            {"h": 8, "w": 12, "x": 12, "y": 12},
            "none",
        ),
        (
            15,
            "Sent (1d vs 7d)",
            "max by (window) (postmark_outbound_sent)",
            {"h": 8, "w": 12, "x": 0, "y": 20},
            "none",
        ),
    ]
    for pid, title, expr, pos, unit in trend_rows:
        panels.append(
            timeseries_panel(
                pid,
                title,
                expr,
                pos,
                unit=unit,
                legend="{{window}}",
                legend_display="table",
                legend_placement="right",
            )
        )
    panels.append(
        timeseries_panel(
            16,
            "Exporter Errors",
            "sum(postmark_request_errors_total)",
            {"h": 8, "w": 12, "x": 12, "y": 20},
            unit="none",
        )
    )
    return {
        "uid": "atlas-mail",
        "title": "Atlas Mail",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-30d", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "mail"],
    }
def build_gpu_dashboard(): def build_gpu_dashboard():
panels = [] panels = []
gpu_scope = "$namespace_scope_gpu"
panels.append( panels.append(
pie_panel( pie_panel(
1, 1,
"Namespace GPU Share", "Namespace GPU Share",
namespace_gpu_share_expr(), namespace_gpu_share_expr(gpu_scope),
{"h": 8, "w": 12, "x": 0, "y": 0}, {"h": 8, "w": 12, "x": 0, "y": 0},
links=namespace_scope_links("namespace_scope_gpu"),
description="Values are normalized within the selected scope; use panel links to switch scope.",
) )
) )
panels.append( panels.append(
timeseries_panel( timeseries_panel(
2, 2,
"GPU Util by Namespace", "GPU Util by Namespace",
NAMESPACE_GPU_USAGE_INSTANT, namespace_gpu_usage_instant(gpu_scope),
{"h": 8, "w": 12, "x": 12, "y": 0}, {"h": 8, "w": 12, "x": 12, "y": 0},
unit="percent", unit="percent",
legend="{{namespace}}", legend="{{namespace}}",
@ -1757,6 +2194,13 @@ def build_gpu_dashboard():
"schemaVersion": 39, "schemaVersion": 39,
"style": "dark", "style": "dark",
"tags": ["atlas", "gpu"], "tags": ["atlas", "gpu"],
"templating": {
"list": [
namespace_scope_variable("namespace_scope_cpu", "CPU namespace filter"),
namespace_scope_variable("namespace_scope_gpu", "GPU namespace filter"),
namespace_scope_variable("namespace_scope_ram", "RAM namespace filter"),
]
},
} }
@ -1781,6 +2225,10 @@ DASHBOARDS = {
"builder": build_network_dashboard, "builder": build_network_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-network.yaml", "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-network.yaml",
}, },
"atlas-mail": {
"builder": build_mail_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-mail.yaml",
},
"atlas-gpu": { "atlas-gpu": {
"builder": build_gpu_dashboard, "builder": build_gpu_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gpu.yaml", "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gpu.yaml",

445
scripts/dashboards_render_logs.py Executable file
View File

@ -0,0 +1,445 @@
#!/usr/bin/env python3
"""Generate OpenSearch Dashboards saved objects and render them into ConfigMaps.
Usage:
scripts/dashboards_render_logs.py --build # rebuild NDJSON + ConfigMap
scripts/dashboards_render_logs.py # re-render ConfigMap from NDJSON
"""
from __future__ import annotations
import argparse
import json
import textwrap
from dataclasses import dataclass
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]
DASHBOARD_DIR = ROOT / "services" / "logging" / "dashboards"
NDJSON_PATH = DASHBOARD_DIR / "logs.ndjson"
CONFIG_PATH = ROOT / "services" / "logging" / "opensearch-dashboards-objects.yaml"
CONFIG_TEMPLATE = textwrap.dedent(
"""# {relative_path}
# Generated by scripts/dashboards_render_logs.py --build
apiVersion: v1
kind: ConfigMap
metadata:
name: opensearch-dashboards-objects
namespace: logging
data:
objects.ndjson: |
{payload}
"""
)
# Saved-object version stamped on every panel (OpenSearch Dashboards' Kibana-7.10 lineage).
DASHBOARD_VERSION = "7.10.0"
# Dashboard grid width in columns; full-width panels span all of them.
GRID_COLUMNS = 48
# Panel heights (grid rows) by panel role.
H_CHART = 10
H_ERRORS = 8
H_TABLE = 16
H_SEARCH = 18
# Terms tables: number of buckets requested and rows shown per page.
TABLE_SIZE = 15
TABLE_PER_PAGE = 15
# Wildcard substrings treated as "error-ish" when building error queries.
ERROR_TERMS = ("*error*", "*exception*", "*fail*")
@dataclass(frozen=True)
class AppSpec:
    """Declarative spec for one app's dashboard set of saved objects."""

    # URL-safe identifier; becomes the saved-object id prefix ("logs-<slug>").
    slug: str
    # Human-readable title used in visualization and dashboard names.
    title: str
    # KQL filter selecting this app's log documents.
    query: str
    # Index-pattern saved-object id the searches/visualizations reference.
    index_id: str = "kube-logs"
    # "kube" (container logs) or "journald" (host logs) — selects column set.
    kind: str = "kube"
def error_query(base: str | None = None) -> str:
    """Return a KQL clause matching error-ish lines, optionally ANDed with *base*."""
    clause = " or ".join(
        f'(log : "{term}" or message : "{term}")' for term in ERROR_TERMS
    )
    return f"({base}) and ({clause})" if base else f"({clause})"
def json_line(obj: dict) -> str:
    """Serialize *obj* as compact single-line JSON (NDJSON friendly)."""
    compact_separators = (",", ":")
    return json.dumps(obj, separators=compact_separators)
def search_source(query: str) -> dict:
    """Build the searchSourceJSON payload shared by every saved object."""
    return dict(
        query={"language": "kuery", "query": query},
        filter=[],
        indexRefName="kibanaSavedObjectMeta.searchSourceJSON.index",
    )
def index_pattern(object_id: str, title: str, time_field: str = "@timestamp") -> dict:
    """Index-pattern saved object with a time field for date histograms."""
    attributes = {"title": title, "timeFieldName": time_field}
    return {"type": "index-pattern", "id": object_id, "attributes": attributes}
def histogram_vis(object_id: str, title: str, query: str, index_id: str) -> dict:
    """Date-histogram visualization (doc count over @timestamp) for *query*."""
    aggs = [
        {"id": "1", "enabled": True, "type": "count", "schema": "metric"},
        {
            "id": "2",
            "enabled": True,
            "type": "date_histogram",
            "schema": "segment",
            "params": {"field": "@timestamp", "interval": "auto", "min_doc_count": 1},
        },
    ]
    vis_state = {
        "title": title,
        "type": "histogram",
        "aggs": aggs,
        "params": {"addTooltip": True, "addLegend": False, "scale": "linear", "interpolate": "linear"},
    }
    compact = (",", ":")
    reference = {
        "name": "kibanaSavedObjectMeta.searchSourceJSON.index",
        "type": "index-pattern",
        "id": index_id,
    }
    return {
        "type": "visualization",
        "id": object_id,
        "attributes": {
            "title": title,
            "visState": json.dumps(vis_state, separators=compact),
            "uiStateJSON": "{}",
            "description": "",
            "version": 1,
            "kibanaSavedObjectMeta": {
                "searchSourceJSON": json.dumps(search_source(query), separators=compact)
            },
        },
        "references": [reference],
    }
def table_vis(object_id: str, title: str, field: str, query: str, index_id: str) -> dict:
    """Terms-table visualization: doc counts bucketed by *field*, sorted desc."""
    aggs = [
        {"id": "1", "enabled": True, "type": "count", "schema": "metric"},
        {
            "id": "2",
            "enabled": True,
            "type": "terms",
            "schema": "bucket",
            "params": {"field": field, "size": TABLE_SIZE, "order": "desc", "orderBy": "1"},
        },
    ]
    vis_state = {
        "title": title,
        "type": "table",
        "aggs": aggs,
        "params": {
            "perPage": TABLE_PER_PAGE,
            "showPartialRows": False,
            "showMetricsAtAllLevels": False,
            "sort": {"columnIndex": 1, "direction": "desc"},
        },
    }
    compact = (",", ":")
    reference = {
        "name": "kibanaSavedObjectMeta.searchSourceJSON.index",
        "type": "index-pattern",
        "id": index_id,
    }
    return {
        "type": "visualization",
        "id": object_id,
        "attributes": {
            "title": title,
            "visState": json.dumps(vis_state, separators=compact),
            "uiStateJSON": "{}",
            "description": "",
            "version": 1,
            "kibanaSavedObjectMeta": {
                "searchSourceJSON": json.dumps(search_source(query), separators=compact)
            },
        },
        "references": [reference],
    }
def search_object(object_id: str, title: str, columns: list[str], query: str, index_id: str) -> dict:
    """Saved search showing *columns*, newest first, filtered by *query*."""
    attributes = {
        "title": title,
        "description": "",
        "columns": columns,
        "sort": [["@timestamp", "desc"]],
        "kibanaSavedObjectMeta": {
            "searchSourceJSON": json.dumps(search_source(query), separators=(",", ":"))
        },
    }
    reference = {
        "name": "kibanaSavedObjectMeta.searchSourceJSON.index",
        "type": "index-pattern",
        "id": index_id,
    }
    return {"type": "search", "id": object_id, "attributes": attributes, "references": [reference]}
def grid(x: int, y: int, w: int, h: int, i: int) -> dict:
    """Grid-data cell for a dashboard panel; the panel index is stringified."""
    cell = {"x": x, "y": y, "w": w, "h": h}
    cell["i"] = str(i)
    return cell
def panel(panel_id: str, panel_type: str, grid_data: dict, index: int) -> dict:
    """Dashboard panel entry referencing the saved object *panel_id*."""
    entry = {
        "panelIndex": str(index),
        "gridData": grid_data,
        "id": panel_id,
        "type": panel_type,
    }
    entry["version"] = DASHBOARD_VERSION
    entry["embeddableConfig"] = {}
    return entry
def full_width_panels(specs: list[tuple[str, str, int]]) -> list[dict]:
    """Stack panels vertically, each spanning the full grid width."""
    stacked: list[dict] = []
    offset = 0
    for idx, (panel_id, panel_type, height) in enumerate(specs, start=1):
        cell = grid(0, offset, GRID_COLUMNS, height, idx)
        stacked.append(panel(panel_id, panel_type, cell, idx))
        offset += height
    return stacked
def dashboard_object(object_id: str, title: str, panels: list[dict]) -> dict:
    """Dashboard saved object embedding *panels* (serialized as compact JSON)."""
    compact = (",", ":")
    options = {"useMargins": True, "hidePanelTitles": False}
    # The dashboard itself applies no filter; panels carry their own queries.
    empty_search = {"query": {"language": "kuery", "query": ""}, "filter": []}
    attributes = {
        "title": title,
        "description": "",
        "hits": 0,
        "panelsJSON": json.dumps(panels, separators=compact),
        "optionsJSON": json.dumps(options, separators=compact),
        "version": 1,
        "timeRestore": False,
        "kibanaSavedObjectMeta": {"searchSourceJSON": json.dumps(empty_search)},
    }
    return {"type": "dashboard", "id": object_id, "attributes": attributes}
def app_dashboard_objects(app: AppSpec) -> list[dict]:
    """Build one app's saved objects: histograms, terms tables, searches, dashboard.

    Object order matches the legacy layout: volume, errors, terms table(s),
    recent logs, recent errors, then the dashboard that stitches them together.
    """
    prefix = f"logs-{app.slug}"
    if app.kind == "journald":
        # Host logs: journald field names, single per-unit table.
        columns = ["@timestamp", "_HOSTNAME", "_SYSTEMD_UNIT", "MESSAGE"]
        tables = [(f"{prefix}-top-units", "Top units", "_SYSTEMD_UNIT.keyword")]
    else:
        # Container logs: kubernetes metadata, per-pod and per-container tables.
        columns = ["@timestamp", "kubernetes.pod_name", "kubernetes.container_name", "log", "message"]
        tables = [
            (f"{prefix}-top-pods", "Top pods", "kubernetes.pod_name.keyword"),
            (f"{prefix}-top-containers", "Top containers", "kubernetes.container_name.keyword"),
        ]
    objects = [
        histogram_vis(f"{prefix}-volume", f"{app.title} logs", app.query, app.index_id),
        histogram_vis(f"{prefix}-errors", f"{app.title} errors", error_query(app.query), app.index_id),
    ]
    objects.extend(
        table_vis(table_id, table_title, field, app.query, app.index_id)
        for table_id, table_title, field in tables
    )
    objects.append(search_object(f"{prefix}-recent", "Recent logs", columns, app.query, app.index_id))
    objects.append(
        search_object(
            f"{prefix}-recent-errors",
            "Recent errors",
            columns,
            error_query(app.query),
            app.index_id,
        )
    )
    # Panel layout mirrors the object order above.
    layout = [
        (f"{prefix}-volume", "visualization", H_CHART),
        (f"{prefix}-errors", "visualization", H_ERRORS),
    ]
    layout.extend((table_id, "visualization", H_TABLE) for table_id, _, _ in tables)
    layout.append((f"{prefix}-recent", "search", H_SEARCH))
    layout.append((f"{prefix}-recent-errors", "search", H_SEARCH))
    objects.append(dashboard_object(prefix, f"{app.title} Logs", full_width_panels(layout)))
    return objects
def overview_objects() -> list[dict]:
    """Cluster-wide saved objects for the Atlas Logs Overview dashboard."""
    idx = "kube-logs"
    # Terms tables: (object id, title, bucket field, KQL query).
    tables = [
        ("logs-overview-top-ns", "Top namespaces", "kubernetes.namespace_name.keyword", "*"),
        ("logs-overview-top-error-ns", "Top error namespaces", "kubernetes.namespace_name.keyword", error_query()),
        ("logs-overview-top-pods", "Top pods", "kubernetes.pod_name.keyword", "*"),
        ("logs-overview-top-nodes", "Top nodes", "kubernetes.node_name.keyword", "*"),
    ]
    objects = [
        histogram_vis("logs-overview-volume", "Logs per minute", "*", idx),
        histogram_vis("logs-overview-errors", "Errors per minute", error_query(), idx),
    ]
    objects.extend(
        table_vis(object_id, title, field, query, idx)
        for object_id, title, field, query in tables
    )
    objects.append(
        search_object(
            "logs-overview-recent-errors",
            "Recent errors",
            ["@timestamp", "kubernetes.namespace_name", "kubernetes.pod_name", "log", "message"],
            error_query(),
            idx,
        )
    )
    layout = [
        ("logs-overview-volume", "visualization", H_CHART),
        ("logs-overview-errors", "visualization", H_ERRORS),
        ("logs-overview-top-ns", "visualization", H_TABLE),
        ("logs-overview-top-error-ns", "visualization", H_TABLE),
        ("logs-overview-top-pods", "visualization", H_TABLE),
        ("logs-overview-top-nodes", "visualization", H_TABLE),
        ("logs-overview-recent-errors", "search", H_SEARCH),
    ]
    objects.append(dashboard_object("logs-overview", "Atlas Logs Overview", full_width_panels(layout)))
    return objects
def build_objects() -> list[dict]:
    """Assemble every saved object: index patterns, overview, per-app dashboards."""
    objects = [
        index_pattern("kube-logs", "kube-*"),
        index_pattern("journald-logs", "journald-*"),
    ]
    objects.extend(overview_objects())
    # One dashboard set per app. Queries are KQL; several apps share a
    # namespace (e.g. "jellyfin", "comms") and are split by label/container.
    apps = [
        AppSpec("bstein-dev-home", "bstein-dev-home", 'kubernetes.namespace_name: "bstein-dev-home"'),
        AppSpec(
            "pegasus",
            "pegasus",
            'kubernetes.namespace_name: "jellyfin" and kubernetes.labels.app: "pegasus"',
        ),
        AppSpec(
            "jellyfin",
            "jellyfin",
            'kubernetes.namespace_name: "jellyfin" and kubernetes.labels.app: "jellyfin"',
        ),
        AppSpec("vaultwarden", "vaultwarden", 'kubernetes.namespace_name: "vaultwarden"'),
        AppSpec("mailu", "mailu", 'kubernetes.namespace_name: "mailu-mailserver"'),
        AppSpec("nextcloud", "nextcloud", 'kubernetes.namespace_name: "nextcloud"'),
        AppSpec("gitea", "gitea", 'kubernetes.namespace_name: "gitea"'),
        AppSpec("jenkins", "jenkins", 'kubernetes.namespace_name: "jenkins"'),
        AppSpec("harbor", "harbor", 'kubernetes.namespace_name: "harbor"'),
        AppSpec("vault", "vault", 'kubernetes.namespace_name: "vault"'),
        AppSpec("keycloak", "keycloak", 'kubernetes.namespace_name: "sso"'),
        AppSpec("flux-system", "flux-system", 'kubernetes.namespace_name: "flux-system"'),
        AppSpec("comms", "comms", 'kubernetes.namespace_name: "comms"'),
        AppSpec(
            "element-web",
            "element-web",
            'kubernetes.namespace_name: "comms" and kubernetes.container_name: "element-web"',
        ),
        AppSpec(
            "element-call",
            "element-call",
            'kubernetes.namespace_name: "comms" and kubernetes.labels.app: "element-call"',
        ),
        AppSpec(
            "matrix-synapse",
            "matrix-synapse",
            'kubernetes.namespace_name: "comms" and kubernetes.container_name: "synapse"',
        ),
        AppSpec(
            "livekit",
            "livekit",
            'kubernetes.namespace_name: "comms" and kubernetes.labels.app: "livekit"',
        ),
        AppSpec(
            "coturn",
            "coturn",
            'kubernetes.namespace_name: "comms" and kubernetes.labels.app: "coturn"',
        ),
        # Host-level journald logs from the jump host, not a kube workload.
        AppSpec("lesavka", "lesavka", '_HOSTNAME: "titan-jh"', index_id="journald-logs", kind="journald"),
    ]
    for app in apps:
        objects.extend(app_dashboard_objects(app))
    return objects
def write_ndjson(objects: list[dict], path: Path) -> None:
    """Write saved objects as newline-delimited JSON, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    lines = [json_line(obj) for obj in objects]
    path.write_text("\n".join(lines) + "\n")
def render_configmap(ndjson_path: Path, output_path: Path) -> None:
    """Re-render the ConfigMap wrapper around the committed NDJSON payload."""
    raw_lines = ndjson_path.read_text().splitlines()
    # Indent each NDJSON line so it nests under the block scalar in the template.
    payload = "\n".join(" " + line for line in raw_lines)
    relative = output_path.relative_to(ROOT)
    output_path.write_text(CONFIG_TEMPLATE.format(relative_path=relative, payload=payload))
def main() -> None:
    """CLI entry point: optionally rebuild the NDJSON, then re-render the ConfigMap."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--build", action="store_true", help="Regenerate saved object NDJSON and ConfigMap")
    options = parser.parse_args()
    if options.build:
        write_ndjson(build_objects(), NDJSON_PATH)
    if not NDJSON_PATH.exists():
        raise SystemExit(f"Missing NDJSON file: {NDJSON_PATH}. Run with --build first.")
    render_configmap(NDJSON_PATH, CONFIG_PATH)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,554 @@
#!/usr/bin/env python3
"""Render Atlas knowledge artifacts from Flux + kustomize manifests.
Outputs (committed to git for stable diffs + RAG):
- knowledge/catalog/*.yaml
- knowledge/diagrams/*.mmd
This is intentionally conservative:
- never includes Secret objects
- never includes secret values
- keeps output deterministic (sorted)
"""
from __future__ import annotations
import argparse
import json
import re
import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Iterable
import yaml
# Repository root (two levels up from scripts/<this file>).
REPO_ROOT = Path(__file__).resolve().parents[1]
# Kinds that are cluster-scoped, so they never receive a defaulted namespace.
CLUSTER_SCOPED_KINDS = {
    "Namespace",
    "Node",
    "CustomResourceDefinition",
    "ClusterRole",
    "ClusterRoleBinding",
    "StorageClass",
    "PersistentVolume",
    "MutatingWebhookConfiguration",
    "ValidatingWebhookConfiguration",
    "APIService",
}
# Only these kinds are harvested into the knowledge catalog.
INCLUDED_KINDS = {
    "Namespace",
    "Deployment",
    "StatefulSet",
    "DaemonSet",
    "Service",
    "Ingress",
    "IngressRoute",  # traefik
    "HelmRelease",  # only to harvest ingress hostnames from values
}
def _run(cmd: list[str], *, cwd: Path) -> str:
res = subprocess.run(cmd, cwd=cwd, capture_output=True, text=True, check=False)
if res.returncode != 0:
raise RuntimeError(
f"Command failed ({res.returncode}): {' '.join(cmd)}\n{res.stderr.strip()}"
)
return res.stdout
def kustomize_build(path: Path) -> str:
    """Render *path* with kustomize, preferring ``kubectl kustomize``.

    Fallback order: plain ``kubectl kustomize``; the same invocation with load
    restrictions disabled (only when the error message indicates a restriction
    violation); finally the standalone ``kustomize`` binary, also unrestricted.
    Raises RuntimeError (via _run) if the last attempt fails too.
    """
    rel = path.relative_to(REPO_ROOT)
    try:
        return _run(["kubectl", "kustomize", str(rel)], cwd=REPO_ROOT)
    except Exception as e:
        msg = str(e)
        if "is not in or below" in msg:
            # Repo uses configMapGenerators that reference ../../scripts/*.py.
            # Kustomize load restriction must be disabled for a full render.
            try:
                return _run(
                    ["kubectl", "kustomize", "--load-restrictor=LoadRestrictionsNone", str(rel)],
                    cwd=REPO_ROOT,
                )
            except Exception:
                # Fall through to the standalone binary below.
                pass
        return _run(["kustomize", "build", "--load-restrictor=LoadRestrictionsNone", str(rel)], cwd=REPO_ROOT)
def _iter_docs(raw_yaml: str) -> Iterable[dict[str, Any]]:
    """Yield mapping documents from multi-doc YAML, flattening v1 ``List`` items."""
    for doc in yaml.safe_load_all(raw_yaml):
        if not isinstance(doc, dict):
            continue
        if doc.get("kind") == "List" and isinstance(doc.get("items"), list):
            # Unwrap v1 List objects into their member documents.
            yield from (item for item in doc["items"] if isinstance(item, dict))
        elif doc.get("kind"):
            yield doc
def _meta(doc: dict[str, Any]) -> tuple[str, str | None]:
md = doc.get("metadata") or {}
name = md.get("name") or ""
namespace = md.get("namespace")
return name, namespace
def _is_namespaced(doc: dict[str, Any]) -> bool:
    """True unless the manifest kind is a known cluster-scoped kind."""
    return (doc.get("kind") or "") not in CLUSTER_SCOPED_KINDS
@dataclass(frozen=True)
class FluxKustomization:
    """One Flux Kustomization CR: where it renders from and its default namespace."""

    # metadata.name of the Kustomization object.
    name: str
    # spec.path relative to the repo root (leading "./" stripped).
    path: str
    # spec.targetNamespace, applied to namespaced docs that lack a namespace.
    target_namespace: str | None
def find_flux_kustomizations() -> list[FluxKustomization]:
    """Find Flux Kustomization CRs under clusters/atlas/flux-system."""
    root = REPO_ROOT / "clusters" / "atlas" / "flux-system"
    items: list[FluxKustomization] = []
    for file in sorted(root.rglob("*.yaml")):
        raw = file.read_text()
        for doc in _iter_docs(raw):
            # Only Flux's Kustomization CRs, not kustomize.config.k8s.io files.
            if doc.get("kind") != "Kustomization":
                continue
            api = str(doc.get("apiVersion") or "")
            if not api.startswith("kustomize.toolkit.fluxcd.io/"):
                continue
            name, _ = _meta(doc)
            spec = doc.get("spec") or {}
            path = spec.get("path")
            # Skip entries without a usable spec.path.
            if not isinstance(path, str) or not path.strip():
                continue
            items.append(
                FluxKustomization(
                    name=name,
                    # NOTE(review): lstrip("./") strips any leading run of '.' and
                    # '/' characters, not just a "./" prefix — fine for paths like
                    # "./services/x"; confirm no spec.path starts with "../".
                    path=path.strip().lstrip("./"),
                    target_namespace=spec.get("targetNamespace"),
                )
            )
    # Deterministic output: sorted by Kustomization name.
    return sorted(items, key=lambda k: k.name)
def _safe_string_scan_for_hosts(value: Any) -> set[str]:
"""Best-effort host scan from HelmRelease values without chart rendering."""
hosts: set[str] = set()
if isinstance(value, str):
for m in re.finditer(r"(?i)([a-z0-9-]+(?:\.[a-z0-9-]+)+)", value):
host = m.group(1).lower()
if host.endswith("bstein.dev"):
hosts.add(host)
return hosts
if isinstance(value, list):
for item in value:
hosts |= _safe_string_scan_for_hosts(item)
return hosts
if isinstance(value, dict):
for item in value.values():
hosts |= _safe_string_scan_for_hosts(item)
return hosts
return hosts
def _service_ports(svc: dict[str, Any]) -> list[dict[str, Any]]:
spec = svc.get("spec") or {}
out: list[dict[str, Any]] = []
for p in spec.get("ports") or []:
if not isinstance(p, dict):
continue
out.append(
{
"name": p.get("name"),
"port": p.get("port"),
"targetPort": p.get("targetPort"),
"protocol": p.get("protocol", "TCP"),
}
)
return out
def _workload_labels(doc: dict[str, Any]) -> dict[str, str]:
tpl = (doc.get("spec") or {}).get("template") or {}
md = tpl.get("metadata") or {}
labels = md.get("labels") or {}
return {str(k): str(v) for k, v in labels.items()} if isinstance(labels, dict) else {}
def _service_selector(doc: dict[str, Any]) -> dict[str, str]:
spec = doc.get("spec") or {}
sel = spec.get("selector") or {}
return {str(k): str(v) for k, v in sel.items()} if isinstance(sel, dict) else {}
def _selector_matches(selector: dict[str, str], labels: dict[str, str]) -> bool:
if not selector:
return False
return all(labels.get(k) == v for k, v in selector.items())
def _sanitize_node_id(text: str) -> str:
return re.sub(r"[^a-zA-Z0-9_]", "_", text)
def extract_catalog(
    rendered: list[tuple[FluxKustomization, list[dict[str, Any]]]],
) -> tuple[dict[str, Any], dict[str, Any], str]:
    """Build knowledge catalog + mermaid diagram from rendered docs.

    Returns (catalog, summary, diagram): the catalog dict (sources, workloads,
    services, HTTP endpoints, HelmRelease host hints), a counts-only summary,
    and a mermaid ``flowchart LR`` string mapping host -> service -> workload.
    All outputs are deterministically sorted.
    """
    # Index workloads and services for mapping.
    workloads: dict[tuple[str, str], dict[str, Any]] = {}
    services: dict[tuple[str, str], dict[str, Any]] = {}
    ingresses: list[dict[str, Any]] = []
    ingressroutes: list[dict[str, Any]] = []
    helmrelease_hosts: dict[str, list[str]] = {}
    for src, docs in rendered:
        for doc in docs:
            kind = doc.get("kind")
            if kind not in INCLUDED_KINDS:
                continue
            # NOTE(review): defensive only — "Secret" is never in INCLUDED_KINDS,
            # so this guard is unreachable; kept as belt-and-braces.
            if kind == "Secret":
                continue
            name, namespace = _meta(doc)
            # Apply the Flux targetNamespace default to namespaced docs that
            # do not declare their own namespace (shallow copy before mutating).
            if _is_namespaced(doc) and not namespace and src.target_namespace:
                namespace = src.target_namespace
                doc = dict(doc)
                doc.setdefault("metadata", {})["namespace"] = namespace
            if kind in ("Deployment", "StatefulSet", "DaemonSet"):
                workloads[(namespace or "", name)] = {
                    "kind": kind,
                    "namespace": namespace or "",
                    "name": name,
                    "labels": _workload_labels(doc),
                    "serviceAccountName": ((doc.get("spec") or {}).get("template") or {})
                    .get("spec", {})
                    .get("serviceAccountName"),
                    "nodeSelector": ((doc.get("spec") or {}).get("template") or {})
                    .get("spec", {})
                    .get("nodeSelector", {}),
                    # Unique, sorted container images from the pod template.
                    "images": sorted(
                        {
                            c.get("image")
                            for c in (
                                (((doc.get("spec") or {}).get("template") or {}).get("spec") or {}).get(
                                    "containers"
                                )
                                or []
                            )
                            if isinstance(c, dict) and c.get("image")
                        }
                    ),
                }
            elif kind == "Service":
                services[(namespace or "", name)] = {
                    "namespace": namespace or "",
                    "name": name,
                    "type": (doc.get("spec") or {}).get("type", "ClusterIP"),
                    "selector": _service_selector(doc),
                    "ports": _service_ports(doc),
                }
            elif kind == "Ingress":
                ingresses.append({"source": src.name, "doc": doc})
            elif kind == "IngressRoute":
                ingressroutes.append({"source": src.name, "doc": doc})
            elif kind == "HelmRelease":
                # Hostnames are only scanned from values, never chart-rendered.
                spec = doc.get("spec") or {}
                vals = spec.get("values") or {}
                hosts = sorted(_safe_string_scan_for_hosts(vals))
                if hosts:
                    helmrelease_hosts[f"{src.name}:{namespace or ''}/{name}"] = hosts
    # Map services to workloads.
    service_to_workloads: dict[tuple[str, str], list[dict[str, str]]] = {}
    for (ns, svc_name), svc in services.items():
        selector = svc.get("selector") or {}
        matches: list[dict[str, str]] = []
        for (w_ns, w_name), w in workloads.items():
            # Selectors only match workloads in the same namespace.
            if w_ns != ns:
                continue
            if _selector_matches(selector, w.get("labels") or {}):
                matches.append({"kind": w["kind"], "name": w_name})
        service_to_workloads[(ns, svc_name)] = sorted(matches, key=lambda m: (m["kind"], m["name"]))
    # Extract HTTP endpoints.
    endpoints: list[dict[str, Any]] = []
    def add_endpoint(
        *,
        host: str,
        path: str,
        namespace: str,
        service: str,
        port: Any,
        source: str,
        kind: str,
        obj_name: str,
    ):
        # Closure appending one host/path -> backend record to `endpoints`.
        wk = service_to_workloads.get((namespace, service), [])
        endpoints.append(
            {
                "host": host,
                "path": path,
                "backend": {
                    "namespace": namespace,
                    "service": service,
                    "port": port,
                    "workloads": wk,
                },
                "via": {"kind": kind, "name": obj_name, "source": source},
            }
        )
    # networking.k8s.io Ingress rules.
    for item in ingresses:
        doc = item["doc"]
        source = item["source"]
        name, namespace = _meta(doc)
        namespace = namespace or ""
        spec = doc.get("spec") or {}
        for rule in spec.get("rules") or []:
            if not isinstance(rule, dict):
                continue
            host = (rule.get("host") or "").strip()
            http = rule.get("http") or {}
            for p in http.get("paths") or []:
                if not isinstance(p, dict):
                    continue
                backend = (p.get("backend") or {}).get("service") or {}
                svc_name = backend.get("name")
                # Port may be numeric or named; prefer the number.
                svc_port = (backend.get("port") or {}).get("number") or (backend.get("port") or {}).get("name")
                if not host or not svc_name:
                    continue
                add_endpoint(
                    host=host,
                    path=p.get("path") or "/",
                    namespace=namespace,
                    service=svc_name,
                    port=svc_port,
                    source=source,
                    kind="Ingress",
                    obj_name=name,
                )
    # Traefik IngressRoute: hosts/paths are embedded in the match expression.
    host_re = re.compile(r"Host\(`([^`]+)`\)")
    pathprefix_re = re.compile(r"PathPrefix\(`([^`]+)`\)")
    for item in ingressroutes:
        doc = item["doc"]
        source = item["source"]
        name, namespace = _meta(doc)
        namespace = namespace or ""
        spec = doc.get("spec") or {}
        for route in spec.get("routes") or []:
            if not isinstance(route, dict):
                continue
            match = route.get("match") or ""
            hosts = host_re.findall(match)
            # No PathPrefix in the match means the route covers "/".
            pathprefixes = pathprefix_re.findall(match) or ["/"]
            for svc in route.get("services") or []:
                if not isinstance(svc, dict):
                    continue
                svc_name = svc.get("name")
                svc_port = svc.get("port")
                if not svc_name:
                    continue
                # Cartesian product: every host x every path prefix.
                for host in hosts:
                    for pp in pathprefixes:
                        add_endpoint(
                            host=host,
                            path=pp,
                            namespace=namespace,
                            service=svc_name,
                            port=svc_port,
                            source=source,
                            kind="IngressRoute",
                            obj_name=name,
                        )
    # Deterministic ordering for stable diffs.
    endpoints = sorted(
        endpoints,
        key=lambda e: (
            e["host"],
            e["path"],
            e["backend"]["namespace"],
            e["backend"]["service"],
        ),
    )
    catalog = {
        "cluster": "atlas",
        "sources": [
            {"name": k.name, "path": k.path, "targetNamespace": k.target_namespace}
            for k, _ in rendered
        ],
        "workloads": sorted(
            list(workloads.values()),
            key=lambda w: (w["namespace"], w["kind"], w["name"]),
        ),
        "services": sorted(
            list(services.values()),
            key=lambda s: (s["namespace"], s["name"]),
        ),
        "http_endpoints": endpoints,
        "helmrelease_host_hints": {k: v for k, v in sorted(helmrelease_hosts.items())},
    }
    # Mermaid diagram: host -> service -> workload (grouped by namespace).
    ns_nodes: dict[str, list[str]] = {}
    lines: list[str] = ["flowchart LR"]
    edges: set[tuple[str, str]] = set()
    def ensure_ns_node(ns: str, node_id: str):
        # Register node_id under its namespace exactly once (order preserved).
        ns_nodes.setdefault(ns, [])
        if node_id not in ns_nodes[ns]:
            ns_nodes[ns].append(node_id)
    host_nodes: dict[str, str] = {}
    for ep in endpoints:
        host = ep["host"]
        host_id = host_nodes.get(host)
        if not host_id:
            # First sighting of this host: declare its node.
            host_id = f"host_{_sanitize_node_id(host)}"
            host_nodes[host] = host_id
            lines.append(f' {host_id}["{host}"]')
        ns = ep["backend"]["namespace"]
        svc = ep["backend"]["service"]
        svc_id = f"svc_{_sanitize_node_id(ns)}_{_sanitize_node_id(svc)}"
        if svc_id not in ns_nodes.get(ns, []):
            lines.append(f' {svc_id}["{ns}/{svc} (Service)"]')
        ensure_ns_node(ns, svc_id)
        # Deduplicate edges; each pair is drawn once.
        if (host_id, svc_id) not in edges:
            edges.add((host_id, svc_id))
            lines.append(f" {host_id} --> {svc_id}")
        for w in ep["backend"]["workloads"]:
            w_id = f"wl_{_sanitize_node_id(ns)}_{_sanitize_node_id(w['name'])}"
            if w_id not in ns_nodes.get(ns, []):
                lines.append(f' {w_id}["{ns}/{w["name"]} ({w["kind"]})"]')
            ensure_ns_node(ns, w_id)
            if (svc_id, w_id) not in edges:
                edges.add((svc_id, w_id))
                lines.append(f" {svc_id} --> {w_id}")
    # Wrap namespace subgraphs at the end for stability (sorted namespaces).
    if ns_nodes:
        lines.append("")
        for ns in sorted(ns_nodes.keys()):
            lines.append(f" subgraph { _sanitize_node_id(ns) }[{ns}]")
            for node_id in ns_nodes[ns]:
                lines.append(f" {node_id}")
            lines.append(" end")
    diagram = "\n".join(lines).rstrip() + "\n"
    summary = {
        "counts": {
            "workloads": len(workloads),
            "services": len(services),
            "http_endpoints": len(endpoints),
            "helmrelease_host_hints": sum(len(v) for v in helmrelease_hosts.values()),
        }
    }
    return catalog, summary, diagram
def main() -> int:
    """Render the Atlas knowledge catalog, diagrams, and runbook JSON.

    Without --write, prints the summary counts JSON and exits. With --write,
    emits catalog YAML/JSON, a summary, a Mermaid diagram, and runbooks.json
    under the --out directory.

    Returns a process exit code (0 on success, 2 when no Flux Kustomizations
    are found).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", default="knowledge", help="Output base directory (default: knowledge/)")
    ap.add_argument(
        "--write",
        action="store_true",
        help="Write generated files (otherwise just print a summary).",
    )
    args = ap.parse_args()
    out_dir = REPO_ROOT / args.out
    flux = find_flux_kustomizations()
    if not flux:
        print("No Flux Kustomizations found under clusters/atlas/flux-system.", file=sys.stderr)
        return 2
    rendered: list[tuple[FluxKustomization, list[dict[str, Any]]]] = []
    for k in flux:
        path = REPO_ROOT / k.path
        # Skip Kustomizations whose path does not exist in the working tree.
        if not path.exists():
            continue
        raw = kustomize_build(path)
        # Secrets are excluded from the rendered catalog.
        docs = [d for d in _iter_docs(raw) if d.get("kind") != "Secret"]
        rendered.append((k, docs))
    # Stable ordering keeps generated output diffable across runs.
    rendered = sorted(rendered, key=lambda item: item[0].name)
    catalog, summary, diagram = extract_catalog(rendered)
    if not args.write:
        print(json.dumps(summary, indent=2, sort_keys=True))
        return 0
    (out_dir / "catalog").mkdir(parents=True, exist_ok=True)
    (out_dir / "diagrams").mkdir(parents=True, exist_ok=True)
    catalog_path = out_dir / "catalog" / "atlas.yaml"
    catalog_json_path = out_dir / "catalog" / "atlas.json"
    summary_path = out_dir / "catalog" / "atlas-summary.json"
    diagram_path = out_dir / "diagrams" / "atlas-http.mmd"
    runbooks_json_path = out_dir / "catalog" / "runbooks.json"
    catalog_path.write_text(
        "# Generated by scripts/knowledge_render_atlas.py (do not edit by hand)\n"
        + yaml.safe_dump(catalog, sort_keys=False),
        encoding="utf-8",
    )
    catalog_json_path.write_text(json.dumps(catalog, indent=2, sort_keys=False) + "\n", encoding="utf-8")
    summary_path.write_text(json.dumps(summary, indent=2, sort_keys=True) + "\n", encoding="utf-8")
    diagram_path.write_text(diagram, encoding="utf-8")
    # Render runbooks into JSON for lightweight, dependency-free consumption in-cluster.
    runbooks_dir = out_dir / "runbooks"
    runbooks: list[dict[str, Any]] = []
    if runbooks_dir.exists():
        for md_file in sorted(runbooks_dir.glob("*.md")):
            raw = md_file.read_text(encoding="utf-8")
            fm: dict[str, Any] = {}
            body = raw
            # Parse optional YAML frontmatter delimited by --- markers; on any
            # parse failure, fall back to treating the whole file as body.
            if raw.startswith("---\n"):
                try:
                    _, rest = raw.split("---\n", 1)
                    fm_raw, body = rest.split("\n---\n", 1)
                    fm = yaml.safe_load(fm_raw) or {}
                except Exception:
                    fm = {}
                    body = raw
            runbooks.append(
                {
                    "path": str(md_file.relative_to(out_dir)),
                    "title": fm.get("title") or md_file.stem,
                    "tags": fm.get("tags") or [],
                    "entrypoints": fm.get("entrypoints") or [],
                    "source_paths": fm.get("source_paths") or [],
                    "body": body.strip(),
                }
            )
    runbooks_json_path.write_text(json.dumps(runbooks, indent=2, sort_keys=False) + "\n", encoding="utf-8")
    print(f"Wrote {catalog_path.relative_to(REPO_ROOT)}")
    print(f"Wrote {catalog_json_path.relative_to(REPO_ROOT)}")
    print(f"Wrote {summary_path.relative_to(REPO_ROOT)}")
    print(f"Wrote {diagram_path.relative_to(REPO_ROOT)}")
    print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}")
    return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@ -0,0 +1,313 @@
#!/usr/bin/env python3
"""Generate OpenSearch Observability seed objects and render them into ConfigMaps.
Usage:
scripts/logging_render_observability.py --build # rebuild JSON + ConfigMap
scripts/logging_render_observability.py # re-render ConfigMap from JSON
"""
from __future__ import annotations
import argparse
import json
import textwrap
from dataclasses import dataclass
from pathlib import Path
# Repository root (this script lives in scripts/, one level below root).
ROOT = Path(__file__).resolve().parents[1]
# Source-of-truth JSON payloads live next to the logging service manifests.
OBS_DIR = ROOT / "services" / "logging" / "observability"
APPS_PATH = OBS_DIR / "applications.json"
QUERIES_PATH = OBS_DIR / "saved_queries.json"
VIS_PATH = OBS_DIR / "saved_visualizations.json"
# Rendered ConfigMap manifest consumed by the logging kustomization.
CONFIG_PATH = ROOT / "services" / "logging" / "opensearch-observability-objects.yaml"
CONFIG_TEMPLATE = textwrap.dedent(
"""# {relative_path}
# Generated by scripts/logging_render_observability.py --build
apiVersion: v1
kind: ConfigMap
metadata:
name: opensearch-observability-objects
namespace: logging
data:
applications.json: |
{applications}
saved_queries.json: |
{queries}
saved_visualizations.json: |
{visualizations}
"""
)
# Default selections attached to every saved query/visualization payload.
DEFAULT_RANGE = {"start": "now-24h", "end": "now", "text": ""}
DEFAULT_TIMESTAMP = {"name": "@timestamp", "type": "timestamp"}
DEFAULT_FIELDS = {"text": "", "tokens": []}
@dataclass(frozen=True)
class AppSpec:
    """One Observability 'application' seed: a name plus its base PPL query."""
    name: str
    base_query: str
    # "kube" or "journald" — selects which error filter is appended in build_objects().
    kind: str = "kube"
    description: str = ""
@dataclass(frozen=True)
class QuerySpec:
    """A named saved-query seed (PPL query string plus metadata)."""
    name: str
    query: str
    description: str = ""
@dataclass(frozen=True)
class VisualizationSpec:
    """A named saved-visualization seed (PPL query, chart type, metadata)."""
    name: str
    query: str
    # Chart type string, e.g. "line" or "bar", as used by build_objects().
    vis_type: str
    description: str = ""
def source_query(index: str, where: str | None = None) -> str:
query = f"source = {index}"
if where:
query += f" | where {where}"
return query
def error_filter(fields: list[str]) -> str:
    """Join per-field PPL ``match(..., 'error|exception|fail')`` predicates with ``or``."""
    return " or ".join(f"match({name}, 'error|exception|fail')" for name in fields)
def saved_query(spec: QuerySpec) -> dict:
    """Render a QuerySpec into a saved-query payload with the default selections."""
    payload = {
        "name": spec.name,
        "description": spec.description,
        "query": spec.query,
    }
    payload["selected_date_range"] = DEFAULT_RANGE
    payload["selected_timestamp"] = DEFAULT_TIMESTAMP
    payload["selected_fields"] = DEFAULT_FIELDS
    return payload
def saved_visualization(spec: VisualizationSpec) -> dict:
    """Render a VisualizationSpec into a saved-visualization payload with defaults."""
    payload = {
        "name": spec.name,
        "description": spec.description,
        "query": spec.query,
        "type": spec.vis_type,
    }
    payload["selected_date_range"] = DEFAULT_RANGE
    payload["selected_timestamp"] = DEFAULT_TIMESTAMP
    payload["selected_fields"] = DEFAULT_FIELDS
    return payload
def build_objects() -> tuple[list[dict], list[dict], list[dict]]:
    """Build the (applications, saved_queries, saved_visualizations) payloads.

    Applications are seeded from a fixed list of AppSpecs; each app also gets
    a "<name> logs" and "<name> errors" saved query. The error clause depends
    on the app kind: journald apps filter on MESSAGE, kube apps on log/message.
    """
    kube_error = error_filter(["log", "message"])
    journald_error = error_filter(["MESSAGE"])
    # One AppSpec per service; kube apps scope by namespace (and sometimes
    # container/label), journald apps scope by host.
    apps = [
        AppSpec("bstein-dev-home", source_query("kube-*", "kubernetes.namespace_name = 'bstein-dev-home'")),
        AppSpec(
            "pegasus",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'jellyfin' and kubernetes.labels.app = 'pegasus'",
            ),
        ),
        AppSpec(
            "jellyfin",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'jellyfin' and kubernetes.labels.app = 'jellyfin'",
            ),
        ),
        AppSpec("vaultwarden", source_query("kube-*", "kubernetes.namespace_name = 'vaultwarden'")),
        AppSpec("mailu", source_query("kube-*", "kubernetes.namespace_name = 'mailu-mailserver'")),
        AppSpec("nextcloud", source_query("kube-*", "kubernetes.namespace_name = 'nextcloud'")),
        AppSpec("gitea", source_query("kube-*", "kubernetes.namespace_name = 'gitea'")),
        AppSpec("jenkins", source_query("kube-*", "kubernetes.namespace_name = 'jenkins'")),
        AppSpec("harbor", source_query("kube-*", "kubernetes.namespace_name = 'harbor'")),
        AppSpec("vault", source_query("kube-*", "kubernetes.namespace_name = 'vault'")),
        AppSpec("keycloak", source_query("kube-*", "kubernetes.namespace_name = 'sso'")),
        AppSpec("flux-system", source_query("kube-*", "kubernetes.namespace_name = 'flux-system'")),
        AppSpec("comms", source_query("kube-*", "kubernetes.namespace_name = 'comms'")),
        AppSpec(
            "element-web",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'comms' and kubernetes.container_name = 'element-web'",
            ),
        ),
        AppSpec(
            "element-call",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'element-call'",
            ),
        ),
        AppSpec(
            "matrix-synapse",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'comms' and kubernetes.container_name = 'synapse'",
            ),
        ),
        AppSpec(
            "livekit",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'livekit'",
            ),
        ),
        AppSpec(
            "coturn",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'coturn'",
            ),
        ),
        AppSpec(
            "lesavka",
            source_query("journald-*", "_HOSTNAME = 'titan-jh'"),
            kind="journald",
        ),
    ]
    # Application objects; traceGroups reuses the app name.
    applications = [
        {
            "name": app.name,
            "description": app.description,
            "baseQuery": app.base_query,
            "servicesEntities": [],
            "traceGroups": [app.name],
        }
        for app in apps
    ]
    # Index-wide queries first, then per-app logs/errors pairs.
    queries = [
        saved_query(QuerySpec("kube logs", source_query("kube-*"))),
        saved_query(QuerySpec("kube errors", f"{source_query('kube-*')} | where {kube_error}")),
        saved_query(QuerySpec("journald logs", source_query("journald-*"))),
        saved_query(QuerySpec("journald errors", f"{source_query('journald-*')} | where {journald_error}")),
    ]
    for app in apps:
        query_base = app.base_query
        error_clause = journald_error if app.kind == "journald" else kube_error
        queries.append(saved_query(QuerySpec(f"{app.name} logs", query_base)))
        queries.append(saved_query(QuerySpec(f"{app.name} errors", f"{query_base} | where {error_clause}")))
    # Fixed set of overview charts (hourly rates plus top-N breakdowns).
    visualizations = [
        saved_visualization(
            VisualizationSpec(
                "[Kube] Logs per hour",
                "source = kube-* | stats count() as log_count by span(`@timestamp`, 1h)",
                "line",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Kube] Errors per hour",
                f"source = kube-* | where {kube_error} | stats count() as error_count by span(`@timestamp`, 1h)",
                "line",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Kube] Top namespaces",
                "source = kube-* | stats count() as log_count by kubernetes.namespace_name | sort - log_count",
                "bar",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Kube] Top error namespaces",
                f"source = kube-* | where {kube_error} | stats count() as error_count by kubernetes.namespace_name | sort - error_count",
                "bar",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Kube] Top pods",
                "source = kube-* | stats count() as log_count by kubernetes.pod_name | sort - log_count",
                "bar",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Kube] Top error pods",
                f"source = kube-* | where {kube_error} | stats count() as error_count by kubernetes.pod_name | sort - error_count",
                "bar",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Kube] Top nodes",
                "source = kube-* | stats count() as log_count by kubernetes.node_name | sort - log_count",
                "bar",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Journald] Top units",
                "source = journald-* | stats count() as log_count by _SYSTEMD_UNIT | sort - log_count",
                "bar",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Journald] Top error units",
                f"source = journald-* | where {journald_error} | stats count() as error_count by _SYSTEMD_UNIT | sort - error_count",
                "bar",
            )
        ),
    ]
    return applications, queries, visualizations
def write_json(payload: list[dict], path: Path) -> None:
    """Write ``payload`` as pretty-printed JSON (2-space indent, trailing newline).

    Creates parent directories as needed. Encodes explicitly as UTF-8 so the
    output does not depend on the process locale (previously used the default
    locale encoding).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
def render_configmap(apps_path: Path, queries_path: Path, vis_path: Path, output_path: Path) -> None:
    """Render the three JSON payloads into the ConfigMap manifest at ``output_path``.

    Each payload is indented four spaces (via indent_payload) so it nests under
    the ConfigMap ``data`` keys; the template header records the manifest's
    repo-relative path.
    """
    relative_path = output_path.relative_to(ROOT)
    applications = indent_payload(apps_path)
    queries = indent_payload(queries_path)
    visualizations = indent_payload(vis_path)
    # Write explicitly as UTF-8 so output does not depend on the process locale.
    output_path.write_text(
        CONFIG_TEMPLATE.format(
            relative_path=relative_path,
            applications=applications,
            queries=queries,
            visualizations=visualizations,
        ),
        encoding="utf-8",
    )
def indent_payload(path: Path) -> str:
    """Return the file's text with every line indented four spaces.

    Reads explicitly as UTF-8 (previously locale-dependent). The trailing
    newline is dropped by splitlines(); the template supplies it.
    """
    lines = path.read_text(encoding="utf-8").splitlines()
    return "\n".join("    " + line for line in lines)
def main() -> None:
    """CLI entry point: optionally rebuild the JSON payloads, then render the ConfigMap."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--build", action="store_true", help="Regenerate JSON payloads and ConfigMap")
    options = parser.parse_args()
    if options.build:
        apps, queries, vis = build_objects()
        for payload, target in ((apps, APPS_PATH), (queries, QUERIES_PATH), (vis, VIS_PATH)):
            write_json(payload, target)
    # All three payloads must exist before rendering the ConfigMap.
    if not all(p.exists() for p in (APPS_PATH, QUERIES_PATH, VIS_PATH)):
        raise SystemExit("Missing observability JSON payloads. Run with --build first.")
    render_configmap(APPS_PATH, QUERIES_PATH, VIS_PATH, CONFIG_PATH)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,149 @@
#!/usr/bin/env python3
import datetime as dt
import os
import time
from dataclasses import dataclass
import requests
from prometheus_client import Gauge, Info, start_http_server
@dataclass(frozen=True)
class Window:
    """A reporting window: metric label plus how many days back from today it spans."""
    label: str
    days: int
# Rolling windows reported per poll; "today" is a same-day (0 days back) window.
WINDOWS = [
    Window("today", 0),
    Window("1d", 1),
    Window("7d", 7),
    Window("30d", 30),
]
# Configuration via environment variables.
API_BASE = os.environ.get("POSTMARK_API_BASE", "https://api.postmarkapp.com").rstrip("/")
POLL_INTERVAL_SECONDS = int(os.environ.get("POLL_INTERVAL_SECONDS", "60"))
LISTEN_ADDRESS = os.environ.get("LISTEN_ADDRESS", "0.0.0.0")
LISTEN_PORT = int(os.environ.get("LISTEN_PORT", "8000"))
PRIMARY_TOKEN = os.environ.get("POSTMARK_SERVER_TOKEN", "").strip()
FALLBACK_TOKEN = os.environ.get("POSTMARK_SERVER_TOKEN_FALLBACK", "").strip()
# Which window label feeds the sending-limit gauges (must match a Window label).
LIMIT_WINDOW = os.environ.get("POSTMARK_SENDING_LIMIT_WINDOW", "30d").strip()
LIMIT_RAW = os.environ.get("POSTMARK_SENDING_LIMIT", "").strip()
try:
    # Empty or non-numeric limit degrades to 0.0, treated as "no limit configured".
    SENDING_LIMIT = float(LIMIT_RAW) if LIMIT_RAW else 0.0
except ValueError:
    SENDING_LIMIT = 0.0
# Static exporter metadata exposed as an Info metric.
EXPORTER_INFO = Info("postmark_exporter", "Exporter build info")
EXPORTER_INFO.info(
    {
        "api_base": API_BASE,
        "windows": ",".join(window.label for window in WINDOWS),
    }
)
# Health metrics updated by the poll loop in main().
POSTMARK_API_UP = Gauge("postmark_api_up", "Whether Postmark API is reachable (1) or not (0)")
POSTMARK_LAST_SUCCESS = Gauge(
    "postmark_last_success_timestamp_seconds",
    "Unix timestamp of the last successful Postmark stats refresh",
)
# NOTE(review): despite the _total suffix this is a Gauge used as a
# monotonically increasing error count (via .inc()); a Counter would be the
# conventional type — confirm before changing, as the metric name is exposed.
POSTMARK_REQUEST_ERRORS = Gauge(
    "postmark_request_errors_total",
    "Total Postmark stats request errors since exporter start",
)
# Per-window outbound stats, labelled by the Window label ("today", "1d", ...).
POSTMARK_OUTBOUND_SENT = Gauge(
    "postmark_outbound_sent",
    "Outbound emails sent within the selected window",
    labelnames=("window",),
)
POSTMARK_OUTBOUND_BOUNCED = Gauge(
    "postmark_outbound_bounced",
    "Outbound emails bounced within the selected window",
    labelnames=("window",),
)
POSTMARK_OUTBOUND_BOUNCE_RATE = Gauge(
    "postmark_outbound_bounce_rate",
    "Outbound bounce rate percentage within the selected window",
    labelnames=("window",),
)
# Sending-limit gauges fed from the LIMIT_WINDOW window.
POSTMARK_SENDING_LIMIT_GAUGE = Gauge(
    "postmark_sending_limit",
    "Configured Postmark sending limit for the active account",
)
POSTMARK_SENDING_LIMIT_USED = Gauge(
    "postmark_sending_limit_used",
    "Messages sent within the configured send limit window",
)
POSTMARK_SENDING_LIMIT_USED_PERCENT = Gauge(
    "postmark_sending_limit_used_percent",
    "Percent of the configured send limit used within the limit window",
)
def fetch_outbound_stats(token: str, window: Window) -> dict:
    """Fetch Postmark outbound stats for [today - window.days, today].

    Raises requests.HTTPError on non-2xx responses.
    """
    end = dt.date.today()
    start = end - dt.timedelta(days=window.days)
    response = requests.get(
        f"{API_BASE}/stats/outbound",
        headers={
            "Accept": "application/json",
            "X-Postmark-Server-Token": token,
        },
        params={"fromdate": start.isoformat(), "todate": end.isoformat()},
        timeout=15,
    )
    response.raise_for_status()
    return response.json()
def update_metrics(token: str) -> None:
    """Refresh the per-window outbound gauges and the sending-limit gauges."""
    sent_by_window: dict = {}
    for window in WINDOWS:
        stats = fetch_outbound_stats(token, window)
        sent = int(stats.get("Sent", 0) or 0)
        bounced = int(stats.get("Bounced", 0) or 0)
        sent_by_window[window.label] = sent
        POSTMARK_OUTBOUND_SENT.labels(window=window.label).set(sent)
        POSTMARK_OUTBOUND_BOUNCED.labels(window=window.label).set(bounced)
        # Guard against division by zero when nothing was sent.
        bounce_rate = (bounced / sent * 100.0) if sent else 0.0
        POSTMARK_OUTBOUND_BOUNCE_RATE.labels(window=window.label).set(bounce_rate)
    POSTMARK_SENDING_LIMIT_GAUGE.set(SENDING_LIMIT)
    used = sent_by_window.get(LIMIT_WINDOW, 0)
    POSTMARK_SENDING_LIMIT_USED.set(used)
    percent = (used / SENDING_LIMIT * 100.0) if SENDING_LIMIT else 0.0
    POSTMARK_SENDING_LIMIT_USED_PERCENT.set(percent)
def main() -> None:
    """Start the metrics HTTP server and poll Postmark forever.

    Token handling: the primary token is used as long as it works; the
    exporter rotates to the next configured token only after a failed
    refresh. (Previously the index advanced on every poll, so the fallback
    token was exercised on alternate polls even when the primary was healthy.)
    """
    if not PRIMARY_TOKEN and not FALLBACK_TOKEN:
        raise SystemExit("POSTMARK_SERVER_TOKEN or POSTMARK_SERVER_TOKEN_FALLBACK is required")
    start_http_server(LISTEN_PORT, addr=LISTEN_ADDRESS)
    tokens = [token for token in (PRIMARY_TOKEN, FALLBACK_TOKEN) if token]
    token_index = 0
    while True:
        token = tokens[token_index % len(tokens)]
        try:
            update_metrics(token)
            POSTMARK_API_UP.set(1)
            POSTMARK_LAST_SUCCESS.set(time.time())
        except Exception as exc:  # noqa: BLE001
            POSTMARK_API_UP.set(0)
            POSTMARK_REQUEST_ERRORS.inc()
            # Rotate to the next token only after a failure.
            token_index += 1
            print(f"postmark_exporter: refresh failed: {exc}", flush=True)
        time.sleep(POLL_INTERVAL_SECONDS)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,35 @@
#!/usr/bin/env python3
from pathlib import Path
def indent(text: str, spaces: int) -> str:
    """Prefix each non-blank line of ``text`` with ``spaces`` spaces.

    Lines that are empty (newline only) are left untouched so blank lines in
    the output carry no trailing whitespace.
    """
    pad = " " * spaces
    pieces = []
    for line in text.splitlines(keepends=True):
        pieces.append(pad + line if line.strip("\n") else line)
    return "".join(pieces)
def main() -> None:
    """Embed the exporter script into its ConfigMap manifest and write it out."""
    root = Path(__file__).resolve().parents[1]
    source = root / "scripts" / "monitoring_postmark_exporter.py"
    target = root / "services" / "monitoring" / "postmark-exporter-script.yaml"
    payload = source.read_text(encoding="utf-8")
    # Ensure the embedded script ends with a newline inside the block scalar.
    if not payload.endswith("\n"):
        payload += "\n"
    # NOTE(review): no metadata.namespace on this ConfigMap — presumably the
    # namespace is supplied by the kustomization at apply time; confirm.
    header = (
        "# services/monitoring/postmark-exporter-script.yaml\n"
        "apiVersion: v1\n"
        "kind: ConfigMap\n"
        "metadata:\n"
        "  name: postmark-exporter-script\n"
        "data:\n"
        "  monitoring_postmark_exporter.py: |\n"
    )
    target.write_text(header + indent(payload, 4), encoding="utf-8")
if __name__ == "__main__":
main()

View File

@ -1,49 +0,0 @@
#!/bin/bash
set -euo pipefail
KC_BASE="${KC_BASE:?}"
KC_REALM="${KC_REALM:?}"
KC_ADMIN_USER="${KC_ADMIN_USER:?}"
KC_ADMIN_PASS="${KC_ADMIN_PASS:?}"
if ! command -v jq >/dev/null 2>&1; then
apt-get update && apt-get install -y jq curl >/dev/null
fi
account_exists() {
# Skip if the account email is already present in the mail app.
runuser -u www-data -- php occ mail:account:list 2>/dev/null | grep -Fq " ${1}" || \
runuser -u www-data -- php occ mail:account:list 2>/dev/null | grep -Fq "${1} "
}
token=$(
curl -s -d "grant_type=password" \
-d "client_id=admin-cli" \
-d "username=${KC_ADMIN_USER}" \
-d "password=${KC_ADMIN_PASS}" \
"${KC_BASE}/realms/master/protocol/openid-connect/token" | jq -r '.access_token'
)
if [[ -z "${token}" || "${token}" == "null" ]]; then
echo "Failed to obtain admin token"
exit 1
fi
users=$(curl -s -H "Authorization: Bearer ${token}" \
"${KC_BASE}/admin/realms/${KC_REALM}/users?max=2000")
echo "${users}" | jq -c '.[]' | while read -r user; do
username=$(echo "${user}" | jq -r '.username')
email=$(echo "${user}" | jq -r '.email // empty')
app_pw=$(echo "${user}" | jq -r '.attributes.mailu_app_password[0] // empty')
[[ -z "${email}" || -z "${app_pw}" ]] && continue
if account_exists "${email}"; then
echo "Skipping ${email}, already exists"
continue
fi
echo "Syncing ${email}"
runuser -u www-data -- php occ mail:account:create \
"${username}" "${username}" "${email}" \
mail.bstein.dev 993 ssl "${email}" "${app_pw}" \
mail.bstein.dev 587 tls "${email}" "${app_pw}" login || true
done

View File

@ -1,65 +0,0 @@
#!/bin/bash
set -euo pipefail
NC_URL="${NC_URL:-https://cloud.bstein.dev}"
ADMIN_USER="${ADMIN_USER:?}"
ADMIN_PASS="${ADMIN_PASS:?}"
export DEBIAN_FRONTEND=noninteractive
apt-get update -qq
apt-get install -y -qq curl jq >/dev/null
run_occ() {
runuser -u www-data -- php occ "$@"
}
log() { echo "[$(date -Is)] $*"; }
log "Applying Atlas theming"
run_occ theming:config name "Atlas Cloud"
run_occ theming:config slogan "Unified access to Atlas services"
run_occ theming:config url "https://cloud.bstein.dev"
run_occ theming:config color "#0f172a"
run_occ theming:config disable-user-theming yes
log "Setting default quota to 200 GB"
run_occ config:app:set files default_quota --value "200 GB"
API_BASE="${NC_URL}/ocs/v2.php/apps/external/api/v1"
AUTH=(-u "${ADMIN_USER}:${ADMIN_PASS}" -H "OCS-APIRequest: true")
log "Removing existing external links"
existing=$(curl -sf "${AUTH[@]}" "${API_BASE}?format=json" | jq -r '.ocs.data[].id // empty')
for id in ${existing}; do
curl -sf "${AUTH[@]}" -X DELETE "${API_BASE}/sites/${id}?format=json" >/dev/null || true
done
SITES=(
"Vaultwarden|https://vault.bstein.dev"
"Jellyfin|https://stream.bstein.dev"
"Gitea|https://scm.bstein.dev"
"Jenkins|https://ci.bstein.dev"
"Harbor|https://registry.bstein.dev"
"Vault|https://secret.bstein.dev"
"Jitsi|https://meet.bstein.dev"
"Grafana|https://metrics.bstein.dev"
"Chat LLM|https://chat.ai.bstein.dev"
"Vision|https://draw.ai.bstein.dev"
"STT/TTS|https://talk.ai.bstein.dev"
)
log "Seeding external links"
for entry in "${SITES[@]}"; do
IFS="|" read -r name url <<<"${entry}"
curl -sf "${AUTH[@]}" -X POST "${API_BASE}/sites?format=json" \
-d "name=${name}" \
-d "url=${url}" \
-d "lang=" \
-d "type=link" \
-d "device=" \
-d "icon=" \
-d "groups[]=" \
-d "redirect=1" >/dev/null
done
log "Maintenance run completed"

View File

@ -0,0 +1,509 @@
#!/usr/bin/env python3
"""Clean up Atlas test users and portal requests (manual-only).
Default behavior is DRY RUN. This script is intended for operators to clean up
test accounts created via the bstein-dev-home onboarding portal.
Targets (best-effort):
- Keycloak users in realm "atlas"
- Atlas portal Postgres rows (access_requests + dependent tables)
- Vaultwarden users/invites created by the portal
Safety:
- Requires an explicit username prefix (e.g. "test-")
- Dry-run unless --apply is set
- --apply requires an explicit --confirm guard
- Validates prefixes to a conservative charset
"""
from __future__ import annotations

import argparse
import base64
import json
import os
import re
import subprocess
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from dataclasses import dataclass
from typing import Any, Iterable
_SAFE_PREFIX_RE = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9._-]{0,63}$")
@dataclass(frozen=True)
class KeycloakUser:
    """A Keycloak user record (id, username, email) from the admin users API."""
    user_id: str
    username: str
    email: str
@dataclass(frozen=True)
class PortalRequestRow:
    """One row from the portal's access_requests table."""
    request_code: str
    username: str
    status: str
@dataclass(frozen=True)
class VaultwardenUser:
    """A Vaultwarden user from /admin/users; status is the raw _status int (-1 if absent)."""
    user_id: str
    email: str
    status: int
def _run(cmd: list[str], *, input_bytes: bytes | None = None) -> str:
proc = subprocess.run(
cmd,
input=input_bytes,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=False,
)
if proc.returncode != 0:
stderr = proc.stderr.decode("utf-8", errors="replace").strip()
raise RuntimeError(f"command failed ({proc.returncode}): {' '.join(cmd)}\n{stderr}")
return proc.stdout.decode("utf-8", errors="replace")
def _kubectl_get_secret_value(namespace: str, name: str, key: str) -> str:
    """Fetch one key of a Kubernetes Secret via kubectl and base64-decode it."""
    cmd = [
        "kubectl",
        "-n",
        namespace,
        "get",
        "secret",
        name,
        "-o",
        f"jsonpath={{.data.{key}}}",
    ]
    encoded = _run(cmd).strip()
    if not encoded:
        raise RuntimeError(f"secret {namespace}/{name} key {key} is empty")
    return base64.b64decode(encoded).decode("utf-8").strip()
def _kubectl_first_pod(namespace: str) -> str:
    """Return the name of the first pod listed in ``namespace``."""
    listing = json.loads(_run(["kubectl", "-n", namespace, "get", "pods", "-o", "json"]))
    pods = listing.get("items") or []
    if not isinstance(pods, list) or not pods:
        raise RuntimeError(f"no pods found in namespace {namespace}")
    name = pods[0].get("metadata", {}).get("name")
    if not isinstance(name, str) or not name:
        raise RuntimeError(f"unexpected pod list in namespace {namespace}")
    return name
def _validate_prefixes(prefixes: list[str]) -> list[str]:
    """Strip, drop empties, and validate each prefix against _SAFE_PREFIX_RE.

    Exits via SystemExit on any invalid prefix or when none remain.
    """
    kept: list[str] = []
    for raw in prefixes:
        candidate = raw.strip()
        if not candidate:
            continue
        if _SAFE_PREFIX_RE.match(candidate) is None:
            raise SystemExit(
                f"invalid prefix '{candidate}': must match {_SAFE_PREFIX_RE.pattern} (alnum plus ._-)"
            )
        kept.append(candidate)
    if not kept:
        raise SystemExit("at least one --prefix is required")
    return kept
def _starts_with_any(value: str, prefixes: Iterable[str]) -> bool:
return any(value.startswith(p) for p in prefixes)
def _keycloak_token(server: str, realm: str, client_id: str, client_secret: str) -> str:
    """Obtain a client-credentials access token from Keycloak's token endpoint."""
    form = {
        "grant_type": "client_credentials",
        "client_id": client_id,
        "client_secret": client_secret,
    }
    request = urllib.request.Request(
        f"{server}/realms/{realm}/protocol/openid-connect/token",
        data=urllib.parse.urlencode(form).encode("utf-8"),
        method="POST",
    )
    request.add_header("Content-Type", "application/x-www-form-urlencoded")
    with urllib.request.urlopen(request, timeout=15) as resp:
        body = json.loads(resp.read().decode("utf-8"))
    token = body.get("access_token")
    if isinstance(token, str) and token:
        return token
    raise RuntimeError("failed to obtain keycloak access token")
def _keycloak_list_users(server: str, realm: str, token: str, search: str) -> list[KeycloakUser]:
    """List up to 1000 Keycloak users in ``realm`` matching ``search``.

    Entries without a non-empty string id, or with a non-string username,
    are skipped; a missing email becomes "".
    """
    query = urllib.parse.urlencode({"max": "1000", "search": search})
    req = urllib.request.Request(f"{server}/admin/realms/{realm}/users?{query}", method="GET")
    req.add_header("Authorization", f"Bearer {token}")
    with urllib.request.urlopen(req, timeout=30) as resp:
        payload = json.loads(resp.read().decode("utf-8"))
    if not isinstance(payload, list):
        raise RuntimeError("unexpected keycloak users response")
    users: list[KeycloakUser] = []
    for item in payload:
        if not isinstance(item, dict):
            continue
        user_id = item.get("id")
        username = item.get("username") or ""
        email = item.get("email") or ""
        if not isinstance(user_id, str) or not user_id:
            continue
        if not isinstance(username, str):
            continue
        users.append(KeycloakUser(user_id=user_id, username=username, email=str(email)))
    return users
def _keycloak_delete_user(server: str, realm: str, token: str, user_id: str) -> None:
    """Delete a Keycloak user by id; a 404 (already gone) is treated as success.

    NOTE(review): relies on urllib.error being importable as an attribute of
    urllib — works because urllib.request imports it internally, but an
    explicit `import urllib.error` at module level would be safer.
    """
    req = urllib.request.Request(f"{server}/admin/realms/{realm}/users/{user_id}", method="DELETE")
    req.add_header("Authorization", f"Bearer {token}")
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            _ = resp.read()
    except urllib.error.HTTPError as exc:
        if exc.code == 404:
            return
        raise
def _psql_json(portal_db_url: str, sql: str) -> list[dict[str, Any]]:
    """Run ``sql`` through psql (tuples-only, tab-separated) in the postgres pod.

    Returns one {"cols": [str, ...]} dict per output line — despite the name,
    this does not use psql's JSON output; callers index into "cols".
    """
    postgres_pod = _kubectl_first_pod("postgres")
    out = _run(
        [
            "kubectl",
            "-n",
            "postgres",
            "exec",
            "-i",
            postgres_pod,
            "--",
            "psql",
            portal_db_url,
            # -At: unaligned, tuples-only; -F "\t": tab field separator.
            "-At",
            "-F",
            "\t",
            "-c",
            sql,
        ]
    )
    rows: list[dict[str, Any]] = []
    for line in out.splitlines():
        parts = line.split("\t")
        rows.append({"cols": parts})
    return rows
def _portal_list_requests(portal_db_url: str, prefixes: list[str]) -> list[PortalRequestRow]:
    """List access_requests rows whose username starts with any of ``prefixes``.

    ``prefixes`` must already have passed _validate_prefixes (conservative
    charset), which keeps the interpolated LIKE clauses safe. Rows with fewer
    than 3 columns are skipped.
    """
    clauses = " OR ".join([f"username LIKE '{p}%'" for p in prefixes])
    sql = (
        "SELECT request_code, username, status "
        "FROM access_requests "
        f"WHERE {clauses} "
        "ORDER BY created_at DESC;"
    )
    raw_rows = _psql_json(portal_db_url, sql)
    parsed: list[PortalRequestRow] = []
    for row in raw_rows:
        cols = row.get("cols") or []
        if len(cols) < 3:
            continue
        parsed.append(PortalRequestRow(request_code=cols[0], username=cols[1], status=cols[2]))
    return parsed
def _portal_delete_requests(portal_db_url: str, prefixes: list[str]) -> int:
    """Delete access_requests rows whose username starts with any of ``prefixes``.

    ``prefixes`` must already have passed _validate_prefixes (conservative
    charset), which keeps the interpolated LIKE clauses safe.

    Returns the number of rows psql reports deleted (0 if the count cannot
    be parsed from psql's output).
    """
    clauses = " OR ".join([f"username LIKE '{p}%'" for p in prefixes])
    sql = f"DELETE FROM access_requests WHERE {clauses};"
    postgres_pod = _kubectl_first_pod("postgres")
    out = _run(
        [
            "kubectl",
            "-n",
            "postgres",
            "exec",
            "-i",
            postgres_pod,
            "--",
            "psql",
            portal_db_url,
            "-c",
            sql,
        ]
    )
    # psql prints "DELETE <n>". The previous pattern used "\\s"/"\\d" inside a
    # raw string, which matched literal backslashes and therefore never matched,
    # so this function always reported 0 deletions.
    match = re.search(r"DELETE\s+(\d+)", out)
    return int(match.group(1)) if match else 0
def _vaultwarden_admin_cookie(admin_token: str, base_url: str) -> str:
    """Log in to the Vaultwarden admin page and return the session cookie.

    Posts the admin token to /admin and returns the first name=value pair
    from Set-Cookie. HTTP 429 is surfaced as a RuntimeError; other HTTP
    errors propagate.
    """
    data = urllib.parse.urlencode({"token": admin_token}).encode("utf-8")
    req = urllib.request.Request(f"{base_url}/admin", data=data, method="POST")
    req.add_header("Content-Type", "application/x-www-form-urlencoded")
    try:
        with urllib.request.urlopen(req, timeout=10) as resp:
            set_cookie = resp.headers.get("Set-Cookie") or ""
    except urllib.error.HTTPError as exc:
        if exc.code == 429:
            raise RuntimeError("vaultwarden admin rate limited (HTTP 429)") from exc
        raise
    # Keep only "name=value", dropping cookie attributes (Path, HttpOnly, ...).
    cookie = set_cookie.split(";", 1)[0].strip()
    if not cookie:
        raise RuntimeError("vaultwarden admin cookie missing")
    return cookie
def _vaultwarden_list_users(base_url: str, cookie: str) -> list[VaultwardenUser]:
    """List Vaultwarden users via the admin API using a session cookie.

    Entries missing a non-empty string id or email are skipped; a non-int
    "_status" becomes -1. HTTP 429 is surfaced as a RuntimeError.
    """
    req = urllib.request.Request(f"{base_url}/admin/users", method="GET")
    req.add_header("Cookie", cookie)
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            payload = json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        if exc.code == 429:
            raise RuntimeError("vaultwarden admin rate limited (HTTP 429)") from exc
        raise
    if not isinstance(payload, list):
        raise RuntimeError("unexpected vaultwarden /admin/users response")
    users: list[VaultwardenUser] = []
    for item in payload:
        if not isinstance(item, dict):
            continue
        user_id = item.get("id")
        email = item.get("email")
        status = item.get("_status")
        if not isinstance(user_id, str) or not user_id:
            continue
        if not isinstance(email, str) or not email:
            continue
        if not isinstance(status, int):
            status = -1
        users.append(VaultwardenUser(user_id=user_id, email=email, status=status))
    return users
def _vaultwarden_delete_user(base_url: str, cookie: str, user_id: str) -> None:
    """Delete a Vaultwarden user via the admin API; 404 (already gone) is success."""
    request = urllib.request.Request(f"{base_url}/admin/users/{user_id}", method="DELETE")
    request.add_header("Cookie", cookie)
    try:
        with urllib.request.urlopen(request, timeout=30) as resp:
            resp.read()
    except urllib.error.HTTPError as exc:
        if exc.code == 404:
            return
        if exc.code == 429:
            raise RuntimeError("vaultwarden admin rate limited (HTTP 429)") from exc
        raise
def _port_forward(namespace: str, target: str, local_port: int, remote_port: int) -> subprocess.Popen[bytes]:
    """Start a background `kubectl port-forward` bound to 127.0.0.1.

    Returns the Popen handle; the caller is responsible for terminating it.
    """
    # Keep stdout/stderr muted to avoid leaking internal details in output.
    return subprocess.Popen(
        [
            "kubectl",
            "-n",
            namespace,
            "port-forward",
            target,
            f"{local_port}:{remote_port}",
            "--address",
            "127.0.0.1",
        ],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
def main() -> int:
    """CLI entry point: dry-run/apply cleanup across portal DB, Keycloak, Vaultwarden.

    Returns 0 on success, 1 if the Vaultwarden phase fails. Deletions only
    happen with --apply plus an exact --confirm guard.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--prefix",
        action="append",
        default=[],
        help="Username prefix to match (repeatable). Example: --prefix test-",
    )
    parser.add_argument(
        "--apply",
        action="store_true",
        help="Actually delete; otherwise dry-run only.",
    )
    parser.add_argument(
        "--confirm",
        default="",
        help=(
            "Required when using --apply. Must exactly equal the comma-separated "
            "sorted prefix list (e.g. 'atlas-,bob-,e2e-,test-')."
        ),
    )
    parser.add_argument("--skip-keycloak", action="store_true", help="Skip Keycloak user deletion.")
    parser.add_argument("--skip-portal-db", action="store_true", help="Skip portal DB cleanup.")
    parser.add_argument("--skip-vaultwarden", action="store_true", help="Skip Vaultwarden cleanup.")
    parser.add_argument(
        "--protect-keycloak-username",
        action="append",
        default=[],
        help="Keycloak usernames that must never be deleted (repeatable).",
    )
    parser.add_argument(
        "--protect-vaultwarden-email",
        action="append",
        default=[],
        help="Vaultwarden emails that must never be deleted (repeatable).",
    )
    args = parser.parse_args()
    prefixes = sorted(set(_validate_prefixes(args.prefix)))
    apply = bool(args.apply)
    # The confirmation string is the canonical (sorted, comma-joined) prefix list.
    expected_confirm = ",".join(prefixes)
    # Built-in protected accounts are always included, regardless of flags.
    protected_keycloak = {"bstein", "robotuser", *[u.strip() for u in args.protect_keycloak_username if u.strip()]}
    protected_vaultwarden = {e.strip() for e in args.protect_vaultwarden_email if e.strip()}
    if apply and args.confirm != expected_confirm:
        raise SystemExit(
            f"refusing to apply without --confirm '{expected_confirm}' (got '{args.confirm}')"
        )
    print("Atlas test-user cleanup")
    print("prefixes:", expected_confirm)
    print("mode:", "APPLY (destructive)" if apply else "DRY RUN (no changes)")
    if protected_keycloak:
        print("protected keycloak usernames:", ", ".join(sorted(protected_keycloak)))
    if protected_vaultwarden:
        print("protected vaultwarden emails:", ", ".join(sorted(protected_vaultwarden)))
    print()
    # --- Phase 1: portal database (access_requests; FK cascade removes children) ---
    if not args.skip_portal_db:
        portal_db_url = _kubectl_get_secret_value("bstein-dev-home", "atlas-portal-db", "PORTAL_DATABASE_URL")
        requests = _portal_list_requests(portal_db_url, prefixes)
        print(f"Portal DB: {len(requests)} access_requests matched")
        for row in requests[:50]:
            print(f" {row.request_code}\t{row.status}\t{row.username}")
        if len(requests) > 50:
            print(f" ... and {len(requests) - 50} more")
        if apply and requests:
            deleted = _portal_delete_requests(portal_db_url, prefixes)
            print(f"Portal DB: deleted {deleted} access_requests (cascade removes tasks/steps/artifacts).")
    print()
    # --- Phase 2: Keycloak (admin client credentials come from a k8s secret) ---
    if not args.skip_keycloak:
        kc_server = os.getenv("KEYCLOAK_PUBLIC_URL", "https://sso.bstein.dev").rstrip("/")
        kc_realm = os.getenv("KEYCLOAK_REALM", "atlas")
        kc_client_id = os.getenv("KEYCLOAK_ADMIN_CLIENT_ID", "bstein-dev-home-admin")
        kc_client_secret = _kubectl_get_secret_value(
            "bstein-dev-home", "bstein-dev-home-keycloak-admin", "client_secret"
        )
        token = _keycloak_token(kc_server, kc_realm, kc_client_id, kc_client_secret)
        # Keycloak search can return fuzzy matches; re-check the prefix strictly
        # and de-duplicate across prefixes by user id.
        found: dict[str, KeycloakUser] = {}
        for prefix in prefixes:
            for user in _keycloak_list_users(kc_server, kc_realm, token, prefix):
                if not _starts_with_any(user.username, prefixes):
                    continue
                if user.username in protected_keycloak:
                    continue
                found[user.user_id] = user
        users = list(found.values())
        users.sort(key=lambda u: u.username)
        print(f"Keycloak: {len(users)} users matched")
        for user in users[:50]:
            email = user.email or "-"
            print(f" {user.username}\t{email}\t{user.user_id}")
        if len(users) > 50:
            print(f" ... and {len(users) - 50} more")
        if apply and users:
            for user in users:
                _keycloak_delete_user(kc_server, kc_realm, token, user.user_id)
            print(f"Keycloak: deleted {len(users)} users.")
    print()
    # --- Phase 3: Vaultwarden (admin API reached through a local port-forward) ---
    if not args.skip_vaultwarden:
        pf = _port_forward("vaultwarden", "svc/vaultwarden-service", 18081, 80)
        try:
            # wait briefly for the port-forward to come up
            # NOTE(review): an HTTPError from '/' (e.g. 404) also lands in this
            # except and keeps retrying even though the tunnel is up — confirm
            # the service answers 2xx on '/' or treat HTTPError as "ready".
            for _ in range(30):
                try:
                    urllib.request.urlopen("http://127.0.0.1:18081/", timeout=1).read(1)
                    break
                except Exception:
                    time.sleep(0.2)
            admin_token = _kubectl_get_secret_value("vaultwarden", "vaultwarden-admin", "ADMIN_TOKEN")
            base_url = "http://127.0.0.1:18081"
            try:
                # Both login and listing are retried with exponential backoff
                # (capped at 60s) because the admin endpoint rate limits.
                cookie = ""
                for attempt in range(7):
                    try:
                        cookie = _vaultwarden_admin_cookie(admin_token, base_url)
                        break
                    except RuntimeError as exc:
                        if "rate limited" in str(exc).lower():
                            time.sleep(min(60.0, 2.0**attempt))
                            continue
                        raise
                if not cookie:
                    raise RuntimeError("vaultwarden admin login repeatedly rate limited")
                users: list[VaultwardenUser] = []
                for attempt in range(7):
                    try:
                        users = _vaultwarden_list_users(base_url, cookie)
                        break
                    except RuntimeError as exc:
                        if "rate limited" in str(exc).lower():
                            time.sleep(min(60.0, 2.0**attempt))
                            continue
                        raise
                if not users:
                    raise RuntimeError("vaultwarden user list unavailable (possibly rate limited)")
            except RuntimeError as exc:
                print(f"Vaultwarden: ERROR: {exc}")
                print()
                return 1
            # Prefix matching is done on the email local-part (before '@').
            matched: list[VaultwardenUser] = []
            for user in users:
                local = user.email.split("@", 1)[0]
                if _starts_with_any(local, prefixes):
                    if user.email in protected_vaultwarden:
                        continue
                    matched.append(user)
            matched.sort(key=lambda u: u.email)
            print(f"Vaultwarden: {len(matched)} users matched")
            for user in matched[:50]:
                print(f" {user.email}\tstatus={user.status}\t{user.user_id}")
            if len(matched) > 50:
                print(f" ... and {len(matched) - 50} more")
            if apply and matched:
                for user in matched:
                    _vaultwarden_delete_user(base_url, cookie, user.user_id)
                print(f"Vaultwarden: deleted {len(matched)} users.")
            print()
        finally:
            # Always tear down the port-forward, even on error paths.
            pf.terminate()
            try:
                pf.wait(timeout=3)
            except Exception:
                pf.kill()
    return 0
if __name__ == "__main__":
    raise SystemExit(main())

276
scripts/test_user_cleanup.py Executable file
View File

@ -0,0 +1,276 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import sys
from collections import defaultdict
from dataclasses import dataclass
from typing import Any, Iterable
from urllib.parse import quote
import httpx
from atlas_portal import db, settings
from atlas_portal.keycloak import admin_client
@dataclass(frozen=True)
class KeycloakUser:
    """Minimal view of a Keycloak user as returned by the admin REST API."""

    id: str  # Keycloak user UUID
    username: str  # login name; prefix matching happens on this field
@dataclass(frozen=True)
class PortalRequest:
    """One row from the portal's access_requests table."""

    request_code: str  # public identifier of the request
    username: str  # requesting username; prefix matching happens here
    status: str  # workflow status string as stored in the DB
def _dedupe_by_id(users: Iterable[KeycloakUser]) -> list[KeycloakUser]:
seen: set[str] = set()
out: list[KeycloakUser] = []
for user in users:
if user.id in seen:
continue
seen.add(user.id)
out.append(user)
return out
def _iter_keycloak_users_for_prefix(prefix: str, max_results: int) -> list[KeycloakUser]:
    """List Keycloak users whose username starts with `prefix`.

    Uses the admin REST search endpoint (capped at `max_results`) and then
    re-filters strictly client-side. Service accounts are always excluded.
    Raises RuntimeError when the admin client is not configured.
    """
    client = admin_client()
    if not client.ready():
        raise RuntimeError("keycloak admin client not configured in this environment")
    url = f"{settings.KEYCLOAK_ADMIN_URL}/admin/realms/{settings.KEYCLOAK_REALM}/users"
    # Keycloak can return false positives for search; we do a strict prefix match client-side.
    params = {"search": prefix, "max": str(max_results), "briefRepresentation": "true"}
    with httpx.Client(timeout=settings.HTTP_CHECK_TIMEOUT_SEC) as http:
        resp = http.get(url, params=params, headers=client.headers())
        resp.raise_for_status()
        payload = resp.json()
    if not isinstance(payload, list):
        return []
    found: list[KeycloakUser] = []
    for item in payload:
        # Skip malformed entries instead of failing the whole listing.
        if not isinstance(item, dict):
            continue
        username = item.get("username")
        user_id = item.get("id")
        if not isinstance(username, str) or not isinstance(user_id, str):
            continue
        if not username.startswith(prefix):
            continue
        if username.startswith("service-account-"):
            continue
        found.append(KeycloakUser(id=user_id, username=username))
    return found
def _find_keycloak_users(prefixes: list[str], max_results: int, protected: set[str]) -> list[KeycloakUser]:
    """Collect users for every prefix, de-duplicate by id, drop protected names."""
    collected: list[KeycloakUser] = []
    for prefix in prefixes:
        collected += _iter_keycloak_users_for_prefix(prefix, max_results=max_results)
    return [
        user
        for user in _dedupe_by_id(collected)
        if user.username not in protected
    ]
def _delete_keycloak_users(users: list[KeycloakUser]) -> None:
    """Delete the given Keycloak users via the admin REST API.

    No-op on an empty list. A 404 per user is treated as success; any other
    HTTP error aborts via raise_for_status. Raises RuntimeError when the
    admin client is not configured.
    """
    if not users:
        return
    client = admin_client()
    if not client.ready():
        raise RuntimeError("keycloak admin client not configured in this environment")
    base = f"{settings.KEYCLOAK_ADMIN_URL}/admin/realms/{settings.KEYCLOAK_REALM}/users"
    with httpx.Client(timeout=settings.HTTP_CHECK_TIMEOUT_SEC) as http:
        for user in users:
            # quote() guards against ids containing URL-special characters.
            url = f"{base}/{quote(user.id, safe='')}"
            resp = http.delete(url, headers=client.headers())
            # Deleting a non-existent user is treated as success for idempotency.
            if resp.status_code == 404:
                continue
            resp.raise_for_status()
def _find_portal_requests(prefixes: list[str], max_results: int) -> list[PortalRequest]:
    """Query access_requests rows whose username matches any prefix.

    Returns an empty list when the DB is not configured. Results are capped
    at `max_results` per prefix, newest first.
    """
    if not db.configured():
        return []
    # Prefixes become LIKE patterns; values are passed as bind parameters.
    like_prefixes = [f"{prefix}%" for prefix in prefixes]
    rows: list[dict[str, Any]] = []
    with db.connect() as conn:
        for like in like_prefixes:
            cursor = conn.execute(
                """
                SELECT request_code, username, status
                FROM access_requests
                WHERE username LIKE %s
                ORDER BY created_at DESC
                LIMIT %s
                """,
                (like, max_results),
            )
            batch = cursor.fetchall()
            # assumes a dict-row cursor factory; tuple rows are skipped — TODO confirm
            if isinstance(batch, list):
                rows.extend([r for r in batch if isinstance(r, dict)])
    out: list[PortalRequest] = []
    for row in rows:
        request_code = row.get("request_code")
        username = row.get("username")
        status = row.get("status")
        if not isinstance(request_code, str) or not isinstance(username, str) or not isinstance(status, str):
            continue
        out.append(PortalRequest(request_code=request_code, username=username, status=status))
    return out
def _delete_portal_requests(prefixes: list[str]) -> int:
    """Delete access_requests rows matching any prefix; return rows deleted.

    Returns 0 when the DB is not configured. LIKE values are bound
    parameters, so user input never reaches the SQL text.
    """
    if not db.configured():
        return 0
    like_prefixes = [f"{prefix}%" for prefix in prefixes]
    deleted = 0
    with db.connect() as conn:
        for like in like_prefixes:
            cursor = conn.execute("DELETE FROM access_requests WHERE username LIKE %s", (like,))
            # rowcount may be None on some drivers; count that as zero.
            deleted += cursor.rowcount or 0
    return deleted
def _summarize_portal_requests(rows: list[PortalRequest]) -> dict[str, int]:
counts: dict[str, int] = defaultdict(int)
for row in rows:
counts[row.status] += 1
return dict(counts)
def _parse_args(argv: list[str]) -> argparse.Namespace:
parser = argparse.ArgumentParser(
prog="test_user_cleanup",
description=(
"Manual-only cleanup for test users/requests. "
"This script is intended to be run inside the bstein-dev-home backend container."
),
)
parser.add_argument(
"--prefix",
action="append",
required=True,
help="Username prefix to target (repeatable). Example: --prefix test-",
)
parser.add_argument(
"--max",
type=int,
default=500,
help="Maximum users/requests to enumerate per prefix (default: 500).",
)
parser.add_argument(
"--apply",
action="store_true",
help="Apply deletions (default is dry-run). Requires --confirm.",
)
parser.add_argument(
"--confirm",
default="",
help="Required when using --apply. Must exactly equal the comma-separated prefix list.",
)
parser.add_argument(
"--skip-keycloak",
action="store_true",
help="Skip deleting Keycloak users.",
)
parser.add_argument(
"--skip-portal",
action="store_true",
help="Skip deleting portal (DB) access requests.",
)
parser.add_argument(
"--protect",
action="append",
default=[],
help="Extra usernames to never delete (repeatable).",
)
parser.add_argument(
"--verbose",
action="store_true",
help="List matched usernames/request codes.",
)
return parser.parse_args(argv)
def main(argv: list[str]) -> int:
    """CLI entry point: dry-run/apply cleanup of Keycloak users and portal requests.

    Returns 0 on success, 2 on invalid arguments or a failed --confirm guard.
    """
    args = _parse_args(argv)
    prefixes = sorted({p.strip() for p in args.prefix if p.strip()})
    if not prefixes:
        print("error: no valid --prefix values provided", file=sys.stderr)
        return 2
    # The confirmation string is the canonical (sorted, comma-joined) prefix list.
    expected_confirm = ",".join(prefixes)
    # Built-in protected accounts are always included, regardless of flags.
    protected = {"bstein", "robotuser", *[p.strip() for p in args.protect if p.strip()]}
    if args.apply and args.confirm != expected_confirm:
        print(
            f"error: refusing to apply without --confirm '{expected_confirm}' (got '{args.confirm}')",
            file=sys.stderr,
        )
        return 2
    # Enumerate first so a dry run and an apply print the same report.
    keycloak_users: list[KeycloakUser] = []
    portal_requests: list[PortalRequest] = []
    if not args.skip_keycloak:
        keycloak_users = _find_keycloak_users(prefixes, max_results=args.max, protected=protected)
    if not args.skip_portal:
        portal_requests = _find_portal_requests(prefixes, max_results=args.max)
    print(f"prefixes: {expected_confirm}")
    print(f"mode: {'APPLY' if args.apply else 'DRY-RUN'}")
    if protected:
        print(f"protected usernames: {', '.join(sorted(protected))}")
    if not args.skip_keycloak:
        print(f"keycloak users matched: {len(keycloak_users)}")
        if args.verbose and keycloak_users:
            for user in sorted(keycloak_users, key=lambda u: u.username):
                print(f" - {user.username}")
    if not args.skip_portal:
        print(f"portal requests matched: {len(portal_requests)}")
        if portal_requests:
            summary = _summarize_portal_requests(portal_requests)
            summary_str = ", ".join(f"{k}={v}" for k, v in sorted(summary.items()))
            print(f" statuses: {summary_str}")
        if args.verbose and portal_requests:
            # Verbose listing is capped at 50 rows to keep output readable.
            for req in portal_requests[: min(50, len(portal_requests))]:
                print(f" - {req.request_code} ({req.status})")
            if len(portal_requests) > 50:
                print(f" ... and {len(portal_requests) - 50} more")
    if not args.apply:
        print("dry-run complete (no changes made)")
        return 0
    # Destructive phase: portal rows first, then Keycloak accounts.
    if not args.skip_portal:
        deleted = _delete_portal_requests(prefixes)
        print(f"deleted portal requests: {deleted}")
    if not args.skip_keycloak:
        _delete_keycloak_users(keycloak_users)
        print(f"deleted keycloak users: {len(keycloak_users)}")
    print("done")
    return 0
if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))

18
scripts/test_user_cleanup.sh Executable file
View File

@ -0,0 +1,18 @@
#!/usr/bin/env bash
set -euo pipefail
# Manual-only helper to run `scripts/test_user_cleanup.py` inside the portal backend container.
#
# Usage (dry-run):
# scripts/test_user_cleanup.sh --prefix test-
#
# Usage (apply):
# scripts/test_user_cleanup.sh --prefix test- --apply --confirm test-
NS="${PORTAL_NAMESPACE:-bstein-dev-home}"
TARGET="${PORTAL_BACKEND_EXEC_TARGET:-deploy/bstein-dev-home-backend}"
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
# Feed the script to the in-container interpreter on stdin; flags pass through.
kubectl -n "${NS}" exec -i "${TARGET}" -- python - "$@" < "${SCRIPT_DIR}/test_user_cleanup.py"

View File

@ -0,0 +1,318 @@
#!/usr/bin/env python3
"""Clean up Vaultwarden test users and invites (manual-only).
This script deletes Vaultwarden rows directly from the Postgres database. It is
intended only for removing test fallout (e.g. e2e-*, test-*) and is deliberately
conservative:
- Requires one or more explicit email prefixes (repeatable).
- Dry-run by default; --apply requires an exact --confirm guard.
- Refuses to delete any user with dependent data in Vaultwarden tables.
- Supports a protected email allowlist to prevent catastrophic mistakes.
Example (dry-run):
scripts/test_vaultwarden_user_cleanup.py --prefix e2e-
Example (apply):
scripts/test_vaultwarden_user_cleanup.py --prefix e2e- --apply --confirm e2e-
"""
from __future__ import annotations
import argparse
import json
import re
import subprocess
import sys
from dataclasses import dataclass
from typing import Iterable, Sequence
# Prefixes must be short alphanumeric-ish tokens; this keeps them safe to
# interpolate into SQL (see _sql_or_email_prefixes).
_SAFE_PREFIX_RE = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9._-]{0,63}$")
# Loose UUID shape check for ids coming back from psql output.
_UUID_RE = re.compile(r"^[0-9a-fA-F-]{32,36}$")
@dataclass(frozen=True)
class VaultwardenUser:
    """One candidate row from the Vaultwarden users table."""

    uuid: str  # users.uuid primary key
    email: str  # account email; prefix matching happens on this field
    dependent_rows: int  # count of rows referencing this user in other tables
def _run(cmd: Sequence[str], *, input_bytes: bytes | None = None) -> str:
proc = subprocess.run(
list(cmd),
input=input_bytes,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=False,
)
if proc.returncode != 0:
stderr = proc.stderr.decode("utf-8", errors="replace").strip()
raise RuntimeError(f"command failed ({proc.returncode}): {' '.join(cmd)}\n{stderr}")
return proc.stdout.decode("utf-8", errors="replace")
def _kubectl_first_pod(namespace: str) -> str:
    """Return the name of the first pod listed in the given namespace.

    Raises RuntimeError when the namespace has no pods or the listing is
    malformed.
    """
    listing = json.loads(_run(["kubectl", "-n", namespace, "get", "pods", "-o", "json"]))
    pods = listing.get("items") or []
    if not isinstance(pods, list) or not pods:
        raise RuntimeError(f"no pods found in namespace {namespace}")
    pod_name = pods[0].get("metadata", {}).get("name")
    if isinstance(pod_name, str) and pod_name:
        return pod_name
    raise RuntimeError(f"unexpected pod list in namespace {namespace}")
def _psql(sql: str) -> str:
    """Run `sql` against the vaultwarden DB via psql in the first postgres pod.

    Output is unaligned tuples-only (-At) with tab field separators, which
    _parse_rows understands. Returns raw psql stdout.
    """
    pod = _kubectl_first_pod("postgres")
    return _run(
        [
            "kubectl",
            "-n",
            "postgres",
            "exec",
            "-i",
            pod,
            "--",
            "psql",
            "-U",
            "postgres",
            "-d",
            "vaultwarden",
            "-At",
            "-F",
            "\t",
            "-c",
            sql,
        ]
    )
def _validate_prefixes(prefixes: Iterable[str]) -> list[str]:
    """Normalise, validate, and sort the user-supplied prefixes.

    Blank entries are dropped; any remaining prefix must match the safe
    charset and end with '-'. Raises SystemExit on violations or when no
    prefix survives. Returns a sorted, de-duplicated list.
    """
    accepted: set[str] = set()
    for raw in prefixes:
        candidate = raw.strip()
        if not candidate:
            continue
        if not _SAFE_PREFIX_RE.match(candidate):
            raise SystemExit(
                f"invalid prefix '{candidate}': must match {_SAFE_PREFIX_RE.pattern} (alnum plus ._-)"
            )
        if not candidate.endswith("-"):
            raise SystemExit(f"refusing prefix '{candidate}': must end with '-' for safety")
        accepted.add(candidate)
    if not accepted:
        raise SystemExit("at least one --prefix is required")
    return sorted(accepted)
def _parse_rows(tsv: str) -> list[list[str]]:
rows: list[list[str]] = []
for line in tsv.splitlines():
line = line.strip()
if not line:
continue
rows.append(line.split("\t"))
return rows
def _sql_or_email_prefixes(prefixes: list[str]) -> str:
# prefixes validated to safe charset; safe to interpolate.
clauses = [f"email LIKE '{p}%'" for p in prefixes]
return " OR ".join(clauses) if clauses else "FALSE"
def _sql_quote(value: str) -> str:
return "'" + value.replace("'", "''") + "'"
def _sql_text_array(values: Iterable[str]) -> str:
    """Render values as a Postgres text[] literal (each element SQL-quoted)."""
    joined = ",".join(_sql_quote(value) for value in values)
    return f"ARRAY[{joined}]::text[]"
def _list_users(prefixes: list[str], protected: set[str]) -> list[VaultwardenUser]:
    """List enabled Vaultwarden users matching any email prefix.

    Each candidate is annotated with a count of dependent rows across the
    related tables so callers can refuse to delete accounts with data.
    Protected emails are excluded in SQL. Prefixes are interpolated directly
    but have been restricted to a SQL-safe charset by _validate_prefixes.
    """
    clause = _sql_or_email_prefixes(prefixes)
    sql = f"""
    WITH candidates AS (
      SELECT uuid, email
      FROM users
      WHERE enabled
        AND ({clause})
        AND email <> ALL({_sql_text_array(sorted(protected))})
    )
    SELECT
      candidates.uuid,
      candidates.email,
      (
        (SELECT COUNT(*) FROM auth_requests WHERE user_uuid = candidates.uuid) +
        (SELECT COUNT(*) FROM ciphers WHERE user_uuid = candidates.uuid) +
        (SELECT COUNT(*) FROM devices WHERE user_uuid = candidates.uuid) +
        (SELECT COUNT(*) FROM emergency_access WHERE grantor_uuid = candidates.uuid OR grantee_uuid = candidates.uuid) +
        (SELECT COUNT(*) FROM favorites WHERE user_uuid = candidates.uuid) +
        (SELECT COUNT(*) FROM folders WHERE user_uuid = candidates.uuid) +
        (SELECT COUNT(*) FROM sends WHERE user_uuid = candidates.uuid) +
        (SELECT COUNT(*) FROM twofactor WHERE user_uuid = candidates.uuid) +
        (SELECT COUNT(*) FROM twofactor_incomplete WHERE user_uuid = candidates.uuid) +
        (SELECT COUNT(*) FROM users_collections WHERE user_uuid = candidates.uuid) +
        (SELECT COUNT(*) FROM users_organizations WHERE user_uuid = candidates.uuid)
      ) AS dependent_rows
    FROM candidates
    ORDER BY candidates.email;
    """
    out = _psql(sql)
    users: list[VaultwardenUser] = []
    for row in _parse_rows(out):
        # Defensive parsing of psql TSV output: drop malformed rows.
        if len(row) < 3:
            continue
        uuid, email, dep_raw = row[0].strip(), row[1].strip(), row[2].strip()
        if not uuid or not email:
            continue
        if not _UUID_RE.match(uuid):
            continue
        try:
            dep = int(dep_raw)
        except ValueError:
            dep = 0
        users.append(VaultwardenUser(uuid=uuid, email=email, dependent_rows=dep))
    return users
def _list_invitations(prefixes: list[str], protected: set[str]) -> list[str]:
    """List invitation emails matching any prefix, excluding protected ones.

    Prefixes are interpolated directly but restricted to a SQL-safe charset
    by _validate_prefixes; protected emails are quoted via _sql_text_array.
    """
    clause = _sql_or_email_prefixes(prefixes)
    protected_clause = ""
    if protected:
        protected_clause = f"AND email <> ALL({_sql_text_array(sorted(protected))})"
    sql = f"SELECT email FROM invitations WHERE ({clause}) {protected_clause} ORDER BY email;"
    out = _psql(sql)
    invites: list[str] = []
    for row in _parse_rows(out):
        if not row:
            continue
        email = row[0].strip()
        if email:
            invites.append(email)
    return invites
def _delete_invitations(emails: list[str]) -> int:
    """Delete the given invitation rows; return the number psql reports deleted.

    Emails are SQL-quoted via _sql_quote. The count is parsed from psql's
    "DELETE n" command tag; 0 is returned if the tag is absent.
    """
    if not emails:
        return 0
    email_list = ",".join(_sql_quote(e) for e in emails)
    sql = f"DELETE FROM invitations WHERE email IN ({email_list});"
    out = _psql(sql)
    match = re.search(r"DELETE\s+(\d+)", out)
    return int(match.group(1)) if match else 0
def _delete_users(uuids: list[str]) -> int:
    """Delete users rows by uuid; return the number psql reports deleted.

    UUIDs are SQL-quoted via _sql_quote (and shape-checked upstream by
    _UUID_RE). The count comes from psql's "DELETE n" command tag.
    """
    if not uuids:
        return 0
    uuid_list = ",".join(_sql_quote(u) for u in uuids)
    sql = f"DELETE FROM users WHERE uuid IN ({uuid_list});"
    out = _psql(sql)
    match = re.search(r"DELETE\s+(\d+)", out)
    return int(match.group(1)) if match else 0
def _parse_args(argv: list[str]) -> argparse.Namespace:
parser = argparse.ArgumentParser(
prog="test_vaultwarden_user_cleanup",
description="Manual-only cleanup for Vaultwarden test users/invites (DB-level).",
)
parser.add_argument(
"--prefix",
action="append",
required=True,
help="Email prefix to target (repeatable). Example: --prefix e2e-",
)
parser.add_argument(
"--apply",
action="store_true",
help="Apply deletions (default is dry-run). Requires --confirm.",
)
parser.add_argument(
"--confirm",
default="",
help="Required when using --apply. Must exactly equal the comma-separated prefix list.",
)
parser.add_argument(
"--protect-email",
action="append",
default=[],
help="Vaultwarden emails that must never be deleted (repeatable).",
)
parser.add_argument(
"--verbose",
action="store_true",
help="List matched emails (and invitation emails).",
)
return parser.parse_args(argv)
def main(argv: list[str]) -> int:
    """CLI entry point: dry-run/apply Vaultwarden DB cleanup of users and invites.

    Returns 0 on success, 2 on a failed --confirm guard or when any matched
    user still has dependent rows (the script refuses to delete those).
    """
    args = _parse_args(argv)
    prefixes = _validate_prefixes(args.prefix)
    # The confirmation string is the canonical (sorted, comma-joined) prefix list.
    expected_confirm = ",".join(prefixes)
    protected = {e.strip() for e in args.protect_email if e.strip()}
    # Hard-coded allowlist of real accounts that must never be touched.
    protected |= {
        "brad@bstein.dev",
        "edstein87@outlook.com",
        "indifox8@gmail.com",
        "mgs.stein@gmail.com",
        "patriot87@gmail.com",
    }
    if args.apply and args.confirm != expected_confirm:
        print(
            f"error: refusing to apply without --confirm '{expected_confirm}' (got '{args.confirm}')",
            file=sys.stderr,
        )
        return 2
    # Enumerate first so a dry run and an apply print the same report.
    users = _list_users(prefixes, protected=protected)
    invites = _list_invitations(prefixes, protected=protected)
    print(f"prefixes: {expected_confirm}")
    print(f"mode: {'APPLY' if args.apply else 'DRY-RUN'}")
    if protected:
        print(f"protected emails: {', '.join(sorted(protected))}")
    print(f"vaultwarden users matched: {len(users)}")
    print(f"vaultwarden invitations matched: {len(invites)}")
    if args.verbose:
        # Verbose listings are capped at 100 entries each.
        for user in users[: min(100, len(users))]:
            print(f" user: {user.email} (deps={user.dependent_rows})")
        if len(users) > 100:
            print(f" ... and {len(users) - 100} more users")
        for email in invites[: min(100, len(invites))]:
            print(f" invite: {email}")
        if len(invites) > 100:
            print(f" ... and {len(invites) - 100} more invitations")
    # Safety valve: never delete an account that still owns data.
    unsafe = [u for u in users if u.dependent_rows > 0]
    if unsafe:
        print("refusing to delete users with dependent data:", file=sys.stderr)
        for user in unsafe[: min(50, len(unsafe))]:
            print(f" - {user.email} deps={user.dependent_rows}", file=sys.stderr)
        if len(unsafe) > 50:
            print(f" ... and {len(unsafe) - 50} more", file=sys.stderr)
        return 2
    if not args.apply:
        print("dry-run complete (no changes made)")
        return 0
    deleted_invites = _delete_invitations(invites)
    deleted_users = _delete_users([u.uuid for u in users])
    print(f"deleted vaultwarden invitations: {deleted_invites}")
    print(f"deleted vaultwarden users: {deleted_users}")
    print("done")
    return 0
if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))

View File

@ -0,0 +1,15 @@
#!/usr/bin/env bash
set -euo pipefail
# Manual-only helper to clean Vaultwarden test users and invites from Postgres.
#
# Usage (dry-run):
# scripts/test_vaultwarden_user_cleanup.sh --prefix e2e-
#
# Usage (apply):
# scripts/test_vaultwarden_user_cleanup.sh --prefix e2e- --apply --confirm e2e-
here="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
# All flags are forwarded verbatim to the Python implementation.
python3 "${here}/test_vaultwarden_user_cleanup.py" "$@"

View File

@ -20,7 +20,13 @@ def load_sync_module(monkeypatch):
} }
for k, v in env.items(): for k, v in env.items():
monkeypatch.setenv(k, v) monkeypatch.setenv(k, v)
module_path = pathlib.Path(__file__).resolve().parents[1] / "mailu_sync.py" module_path = (
pathlib.Path(__file__).resolve().parents[2]
/ "services"
/ "mailu"
/ "scripts"
/ "mailu_sync.py"
)
spec = importlib.util.spec_from_file_location("mailu_sync_testmod", module_path) spec = importlib.util.spec_from_file_location("mailu_sync_testmod", module_path)
module = importlib.util.module_from_spec(spec) module = importlib.util.module_from_spec(spec)
assert spec.loader is not None assert spec.loader is not None
@ -102,7 +108,8 @@ def test_kc_get_users_paginates(monkeypatch):
sync.SESSION = _PagedSession() sync.SESSION = _PagedSession()
users = sync.kc_get_users("tok") users = sync.kc_get_users("tok")
assert [u["id"] for u in users] == ["u1", "u2"] assert [u["id"] for u in users] == ["u1", "u2"]
assert sync.SESSION.calls == 2 # Pagination stops when results < page size.
assert sync.SESSION.calls == 1
def test_ensure_mailu_user_skips_foreign_domain(monkeypatch): def test_ensure_mailu_user_skips_foreign_domain(monkeypatch):
@ -119,6 +126,7 @@ def test_ensure_mailu_user_skips_foreign_domain(monkeypatch):
def test_ensure_mailu_user_upserts(monkeypatch): def test_ensure_mailu_user_upserts(monkeypatch):
sync = load_sync_module(monkeypatch) sync = load_sync_module(monkeypatch)
monkeypatch.setattr(sync.bcrypt_sha256, "hash", lambda password: f"hash:{password}")
captured = {} captured = {}
class _Cursor: class _Cursor:
@ -134,6 +142,7 @@ def test_ensure_mailu_user_upserts(monkeypatch):
def test_main_generates_password_and_upserts(monkeypatch): def test_main_generates_password_and_upserts(monkeypatch):
sync = load_sync_module(monkeypatch) sync = load_sync_module(monkeypatch)
monkeypatch.setattr(sync.bcrypt_sha256, "hash", lambda password: f"hash:{password}")
users = [ users = [
{"id": "u1", "username": "user1", "email": "user1@example.com", "attributes": {}}, {"id": "u1", "username": "user1", "email": "user1@example.com", "attributes": {}},
{"id": "u2", "username": "user2", "email": "user2@example.com", "attributes": {"mailu_app_password": ["keepme"]}}, {"id": "u2", "username": "user2", "email": "user2@example.com", "attributes": {"mailu_app_password": ["keepme"]}},
@ -176,6 +185,6 @@ def test_main_generates_password_and_upserts(monkeypatch):
sync.main() sync.main()
# Should attempt two inserts (third user skipped due to domain mismatch) # Always backfill mailu_email, even if Keycloak recovery email is external.
assert len(updated) == 1 # only one missing attr was backfilled assert len(updated) == 3
assert conns and len(conns[0]._cursor.executions) == 2 assert conns and len(conns[0]._cursor.executions) == 3

View File

@ -0,0 +1,105 @@
# services/ai-llm/deployment.yaml
# Single-replica Ollama server pinned to the GPU node pool; the model is
# pre-pulled into the shared PVC by an init container so the main container
# starts warm.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ollama
  namespace: ai
spec:
  replicas: 1
  revisionHistoryLimit: 2
  strategy:
    type: RollingUpdate
    rollingUpdate:
      # RWO model volume: the old pod must release the PVC before the new one starts.
      maxSurge: 0
      maxUnavailable: 1
  selector:
    matchLabels:
      app: ollama
  template:
    metadata:
      labels:
        app: ollama
      annotations:
        # Quoted to keep annotation values unambiguous strings for YAML tooling.
        ai.bstein.dev/model: "qwen2.5-coder:7b-instruct-q4_0"
        ai.bstein.dev/gpu: "GPU pool (titan-20/21/22/24)"
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/hostname
                    operator: In
                    values:
                      - titan-20
                      - titan-21
                      - titan-22
                      - titan-24
      runtimeClassName: nvidia
      volumes:
        - name: models
          persistentVolumeClaim:
            claimName: ollama-models
      initContainers:
        - name: warm-model
          image: ollama/ollama:latest
          env:
            - name: OLLAMA_HOST
              value: "0.0.0.0"
            - name: NVIDIA_VISIBLE_DEVICES
              value: "all"
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: "compute,utility"
            - name: OLLAMA_MODELS
              value: "/root/.ollama"
            - name: OLLAMA_MODEL
              value: "qwen2.5-coder:7b-instruct-q4_0"
          command:
            - /bin/sh
            - -c
            - |
              set -e
              ollama serve >/tmp/ollama.log 2>&1 &
              # Poll the API instead of a fixed sleep: slow nodes may need longer
              # than 6s, and a premature pull would fail the init container.
              i=0
              until ollama list >/dev/null 2>&1; do
                i=$((i + 1))
                if [ "$i" -ge 60 ]; then
                  echo "ollama serve did not become ready" >&2
                  exit 1
                fi
                sleep 1
              done
              ollama pull "${OLLAMA_MODEL}"
              pkill ollama || true
          volumeMounts:
            - name: models
              mountPath: /root/.ollama
          resources:
            requests:
              cpu: 250m
              memory: 1Gi
              nvidia.com/gpu.shared: 1
            limits:
              nvidia.com/gpu.shared: 1
      containers:
        - name: ollama
          image: ollama/ollama:latest
          imagePullPolicy: IfNotPresent
          ports:
            - name: http
              containerPort: 11434
          env:
            - name: OLLAMA_HOST
              value: "0.0.0.0"
            - name: OLLAMA_KEEP_ALIVE
              value: "6h"
            - name: OLLAMA_MODELS
              value: "/root/.ollama"
            - name: NVIDIA_VISIBLE_DEVICES
              value: "all"
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: "compute,utility"
          volumeMounts:
            - name: models
              mountPath: /root/.ollama
          resources:
            requests:
              cpu: "2"
              memory: 8Gi
              nvidia.com/gpu.shared: 1
            limits:
              cpu: "4"
              memory: 12Gi
              nvidia.com/gpu.shared: 1

View File

@ -0,0 +1,9 @@
# services/ai-llm/kustomization.yaml
# Assembles the Ollama stack: namespace first, then storage, workload, service.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: ai
resources:
  - namespace.yaml
  - pvc.yaml
  - deployment.yaml
  - service.yaml

View File

@ -0,0 +1,5 @@
# services/ai-llm/namespace.yaml
# Namespace for the AI/LLM workloads (Ollama et al.).
apiVersion: v1
kind: Namespace
metadata:
  name: ai

13
services/ai-llm/pvc.yaml Normal file
View File

@ -0,0 +1,13 @@
# services/ai-llm/pvc.yaml
# Model cache shared by the warm-model init container and the server container.
# ReadWriteOnce: only a single ollama pod can mount it at a time (see the
# deployment's maxSurge: 0 rollout strategy).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: ollama-models
  namespace: ai
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 30Gi
  storageClassName: astreae

View File

@ -0,0 +1,14 @@
# services/ai-llm/service.yaml
# Cluster-internal endpoint for the Ollama API (default port 11434).
apiVersion: v1
kind: Service
metadata:
  name: ollama
  namespace: ai
spec:
  type: ClusterIP
  selector:
    app: ollama
  ports:
    - name: http
      port: 11434
      targetPort: 11434

View File

@ -5,7 +5,7 @@ metadata:
name: bstein-dev-home-backend name: bstein-dev-home-backend
namespace: bstein-dev-home namespace: bstein-dev-home
spec: spec:
replicas: 2 replicas: 1
revisionHistoryLimit: 3 revisionHistoryLimit: 3
selector: selector:
matchLabels: matchLabels:
@ -15,6 +15,8 @@ spec:
labels: labels:
app: bstein-dev-home-backend app: bstein-dev-home-backend
spec: spec:
automountServiceAccountToken: true
serviceAccountName: bstein-dev-home
nodeSelector: nodeSelector:
kubernetes.io/arch: arm64 kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true" node-role.kubernetes.io/worker: "true"
@ -22,8 +24,73 @@ spec:
- name: harbor-bstein-robot - name: harbor-bstein-robot
containers: containers:
- name: backend - name: backend
image: registry.bstein.dev/bstein/bstein-dev-home-backend:latest image: registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"}
imagePullPolicy: Always imagePullPolicy: Always
command: ["gunicorn"]
args:
- -b
- 0.0.0.0:8080
- --workers
- "2"
- --timeout
- "180"
- app:app
env:
- name: AI_CHAT_API
value: http://ollama.ai.svc.cluster.local:11434
- name: AI_CHAT_MODEL
value: qwen2.5-coder:7b-instruct-q4_0
- name: AI_CHAT_TIMEOUT_SEC
value: "60"
- name: AI_NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: AI_NODE_GPU_MAP
value: |
{"titan-20": "Jetson Xavier (edge GPU)", "titan-21": "Jetson Xavier (edge GPU)", "titan-22": "RTX 3050 8GB (local GPU)", "titan-24": "RTX 3080 8GB (local GPU)"}
- name: KEYCLOAK_ENABLED
value: "true"
- name: KEYCLOAK_URL
value: https://sso.bstein.dev
- name: KEYCLOAK_REALM
value: atlas
- name: KEYCLOAK_CLIENT_ID
value: bstein-dev-home
- name: KEYCLOAK_ISSUER
value: https://sso.bstein.dev/realms/atlas
- name: KEYCLOAK_JWKS_URL
value: http://keycloak.sso.svc.cluster.local/realms/atlas/protocol/openid-connect/certs
- name: KEYCLOAK_ADMIN_URL
value: http://keycloak.sso.svc.cluster.local
- name: KEYCLOAK_ADMIN_REALM
value: atlas
- name: KEYCLOAK_ADMIN_CLIENT_ID
value: bstein-dev-home-admin
- name: KEYCLOAK_ADMIN_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: bstein-dev-home-keycloak-admin
key: client_secret
- name: ACCOUNT_ALLOWED_GROUPS
value: ""
- name: PORTAL_DATABASE_URL
valueFrom:
secretKeyRef:
name: atlas-portal-db
key: PORTAL_DATABASE_URL
- name: HTTP_CHECK_TIMEOUT_SEC
value: "2"
- name: ACCESS_REQUEST_SUBMIT_RATE_LIMIT
value: "30"
- name: ACCESS_REQUEST_SUBMIT_RATE_WINDOW_SEC
value: "3600"
- name: ACCESS_REQUEST_STATUS_RATE_LIMIT
value: "120"
- name: ACCESS_REQUEST_STATUS_RATE_WINDOW_SEC
value: "60"
- name: ACCESS_REQUEST_INTERNAL_EMAIL_ALLOWLIST
value: robotuser@bstein.dev
ports: ports:
- name: http - name: http
containerPort: 8080 containerPort: 8080
@ -33,16 +100,18 @@ spec:
port: http port: http
initialDelaySeconds: 2 initialDelaySeconds: 2
periodSeconds: 5 periodSeconds: 5
timeoutSeconds: 3
livenessProbe: livenessProbe:
httpGet: httpGet:
path: /api/healthz path: /api/healthz
port: http port: http
initialDelaySeconds: 10 initialDelaySeconds: 10
periodSeconds: 10 periodSeconds: 10
timeoutSeconds: 3
resources: resources:
requests: requests:
cpu: 50m cpu: 100m
memory: 64Mi memory: 128Mi
limits: limits:
cpu: 300m cpu: 500m
memory: 256Mi memory: 512Mi

View File

@ -0,0 +1,69 @@
# services/bstein-dev-home/chat-ai-gateway-deployment.yaml
# Thin auth gateway in front of the portal backend's /api/chat endpoint.
# The gateway code is mounted from the chat-ai-gateway ConfigMap and run
# directly with the stock python:3.11-slim image (no custom build).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: chat-ai-gateway
  namespace: bstein-dev-home
spec:
  replicas: 1
  revisionHistoryLimit: 2
  selector:
    matchLabels:
      app: chat-ai-gateway
  template:
    metadata:
      labels:
        app: chat-ai-gateway
    spec:
      nodeSelector:
        kubernetes.io/arch: arm64
        node-role.kubernetes.io/worker: "true"
      containers:
        - name: gateway
          image: python:3.11-slim
          command: ["/bin/sh","-c"]
          args:
            - python /app/gateway.py
          env:
            - name: UPSTREAM_URL
              value: http://bstein-dev-home-backend/api/chat
            # Per-client API keys come from the runtime secret.
            - name: CHAT_KEY_MATRIX
              valueFrom:
                secretKeyRef:
                  name: chat-ai-keys-runtime
                  key: matrix
            - name: CHAT_KEY_HOMEPAGE
              valueFrom:
                secretKeyRef:
                  name: chat-ai-keys-runtime
                  key: homepage
          ports:
            - name: http
              containerPort: 8080
          readinessProbe:
            httpGet:
              path: /healthz
              port: http
            initialDelaySeconds: 2
            periodSeconds: 5
          livenessProbe:
            httpGet:
              path: /healthz
              port: http
            initialDelaySeconds: 10
            periodSeconds: 10
          resources:
            requests:
              cpu: 20m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 256Mi
          volumeMounts:
            # Mount only gateway.py from the ConfigMap, not the whole directory.
            - name: code
              mountPath: /app/gateway.py
              subPath: gateway.py
      volumes:
        - name: code
          configMap:
            name: chat-ai-gateway

View File

@ -0,0 +1,13 @@
# services/bstein-dev-home/chat-ai-gateway-service.yaml
# ClusterIP service in front of the chat-ai-gateway pods; the
# chat.ai.bstein.dev ingress rule targets this service on port 80.
apiVersion: v1
kind: Service
metadata:
  name: chat-ai-gateway
  namespace: bstein-dev-home
spec:
  selector:
    app: chat-ai-gateway
  ports:
    - name: http
      port: 80
      targetPort: 8080

View File

@ -5,7 +5,7 @@ metadata:
name: bstein-dev-home-frontend name: bstein-dev-home-frontend
namespace: bstein-dev-home namespace: bstein-dev-home
spec: spec:
replicas: 2 replicas: 1
revisionHistoryLimit: 3 revisionHistoryLimit: 3
selector: selector:
matchLabels: matchLabels:
@ -22,7 +22,7 @@ spec:
- name: harbor-bstein-robot - name: harbor-bstein-robot
containers: containers:
- name: frontend - name: frontend
image: registry.bstein.dev/bstein/bstein-dev-home-frontend:latest image: registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"}
imagePullPolicy: Always imagePullPolicy: Always
ports: ports:
- name: http - name: http

View File

@ -11,7 +11,7 @@ metadata:
cert-manager.io/cluster-issuer: letsencrypt cert-manager.io/cluster-issuer: letsencrypt
spec: spec:
tls: tls:
- hosts: [ "bstein.dev" ] - hosts: [ "bstein.dev", "chat.ai.bstein.dev" ]
secretName: bstein-dev-home-tls secretName: bstein-dev-home-tls
rules: rules:
- host: bstein.dev - host: bstein.dev
@ -29,3 +29,12 @@ spec:
service: service:
name: bstein-dev-home-frontend name: bstein-dev-home-frontend
port: { number: 80 } port: { number: 80 }
- host: chat.ai.bstein.dev
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: chat-ai-gateway
port: { number: 80 }

View File

@ -5,13 +5,38 @@ namespace: bstein-dev-home
resources: resources:
- namespace.yaml - namespace.yaml
- image.yaml - image.yaml
- rbac.yaml
- portal-e2e-client-secret-sync-rbac.yaml
- chat-ai-gateway-deployment.yaml
- chat-ai-gateway-service.yaml
- frontend-deployment.yaml - frontend-deployment.yaml
- frontend-service.yaml - frontend-service.yaml
- backend-deployment.yaml - backend-deployment.yaml
- backend-service.yaml - backend-service.yaml
- vaultwarden-cred-sync-cronjob.yaml
- portal-onboarding-e2e-test-job.yaml
- ingress.yaml - ingress.yaml
images: images:
- name: registry.bstein.dev/bstein/bstein-dev-home-frontend - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
newTag: 0.1.1-0 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"} newTag: registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"}
- name: registry.bstein.dev/bstein/bstein-dev-home-backend - name: registry.bstein.dev/bstein/bstein-dev-home-backend
newTag: 0.1.1-0 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"} newTag: "0.1.1-92" # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
configMapGenerator:
- name: chat-ai-gateway
namespace: bstein-dev-home
files:
- gateway.py=scripts/gateway.py
options:
disableNameSuffixHash: true
- name: vaultwarden-cred-sync-script
namespace: bstein-dev-home
files:
- vaultwarden_cred_sync.py=scripts/vaultwarden_cred_sync.py
options:
disableNameSuffixHash: true
- name: portal-onboarding-e2e-tests
namespace: bstein-dev-home
files:
- test_portal_onboarding_flow.py=scripts/test_portal_onboarding_flow.py
options:
disableNameSuffixHash: true

View File

@ -0,0 +1,24 @@
# services/bstein-dev-home/portal-e2e-client-secret-sync-rbac.yaml
# Allows the portal-e2e-client-secret-sync ServiceAccount (which lives in the
# sso namespace) to create/update secrets in this namespace — presumably to
# materialize the portal-e2e-client secret consumed by the e2e test Job
# (TODO confirm against the sso-side sync workload).
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: portal-e2e-client-secret-sync-target
  namespace: bstein-dev-home
rules:
  - apiGroups: [""]
    resources: ["secrets"]
    verbs: ["get", "create", "patch", "update"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: portal-e2e-client-secret-sync-target
  namespace: bstein-dev-home
subjects:
  # Cross-namespace subject: the syncing ServiceAccount lives in "sso".
  - kind: ServiceAccount
    name: portal-e2e-client-secret-sync
    namespace: sso
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: portal-e2e-client-secret-sync-target

View File

@ -0,0 +1,66 @@
# services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
# One-shot end-to-end test of portal onboarding: submit an access request,
# pick up the verification email over IMAP, approve as admin via Keycloak
# token exchange, then poll until provisioning completes.
apiVersion: batch/v1
kind: Job
metadata:
  # Jobs are immutable once created; bump the numeric suffix to re-run.
  name: portal-onboarding-e2e-test-11
  namespace: bstein-dev-home
spec:
  backoffLimit: 0
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: test
          image: python:3.11-slim
          env:
            - name: PORTAL_BASE_URL
              value: http://bstein-dev-home-backend.bstein-dev-home.svc.cluster.local
            - name: KEYCLOAK_ADMIN_URL
              value: https://sso.bstein.dev
            - name: KEYCLOAK_REALM
              value: atlas
            - name: KEYCLOAK_ADMIN_CLIENT_ID
              value: bstein-dev-home-admin
            - name: KEYCLOAK_ADMIN_CLIENT_SECRET
              valueFrom:
                secretKeyRef:
                  name: bstein-dev-home-keycloak-admin
                  key: client_secret
            - name: PORTAL_E2E_CLIENT_ID
              valueFrom:
                secretKeyRef:
                  name: portal-e2e-client
                  key: client_id
            - name: PORTAL_E2E_CLIENT_SECRET
              valueFrom:
                secretKeyRef:
                  name: portal-e2e-client
                  key: client_secret
            - name: PORTAL_TARGET_CLIENT_ID
              value: bstein-dev-home
            - name: E2E_PORTAL_ADMIN_USERNAME
              value: bstein
            - name: E2E_USERNAME_PREFIX
              value: e2e-portal
            - name: E2E_CONTACT_EMAIL
              value: robotuser@bstein.dev
            - name: E2E_IMAP_KEYCLOAK_USERNAME
              value: robotuser
            - name: E2E_DEADLINE_SECONDS
              value: "600"
            - name: E2E_POLL_SECONDS
              value: "10"
          command: ["/bin/sh", "-c"]
          args:
            - |
              # BUGFIX: /bin/sh in python:*-slim is dash, which rejects
              # "set -o pipefail" ("Illegal option -o") and would fail the Job
              # before the test even starts. There is no pipeline in this
              # script, so plain -eu provides the same safety.
              set -eu
              python /scripts/test_portal_onboarding_flow.py
          volumeMounts:
            - name: tests
              mountPath: /scripts
              readOnly: true
      volumes:
        - name: tests
          configMap:
            name: portal-onboarding-e2e-tests
            defaultMode: 0555

View File

@ -0,0 +1,108 @@
# services/bstein-dev-home/rbac.yaml
# Identity for the portal backend plus the minimal cross-namespace grants it
# needs: read pods cluster-wide, read the vaultwarden-admin secret, and
# trigger the nextcloud-mail-sync CronJob.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: bstein-dev-home
  namespace: bstein-dev-home
---
# Cluster-wide read-only access to pods ("ai-reader" — name suggests it backs
# the portal's AI/cluster views; confirm against the backend code).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: bstein-dev-home-ai-reader
rules:
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "list", "watch"]
    # NOTE(review): an empty resourceNames list is equivalent to omitting the
    # field entirely (no per-name restriction).
    resourceNames: []
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: bstein-dev-home-ai-reader
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: bstein-dev-home-ai-reader
subjects:
  - kind: ServiceAccount
    name: bstein-dev-home
    namespace: bstein-dev-home
---
# Read the "vaultwarden-admin" secret in any namespace (ClusterRole scope).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: bstein-dev-home-vaultwarden-admin-secret-reader
rules:
  - apiGroups: [""]
    resources: ["secrets"]
    verbs: ["get"]
    resourceNames: ["vaultwarden-admin"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: bstein-dev-home-vaultwarden-admin-secret-reader
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: bstein-dev-home-vaultwarden-admin-secret-reader
subjects:
  - kind: ServiceAccount
    name: bstein-dev-home
    namespace: bstein-dev-home
---
# Namespace-scoped variant: read the same secret inside "vaultwarden".
# NOTE(review): overlaps with the ClusterRole grant above.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: bstein-dev-home-vaultwarden-admin-token-reader
  namespace: vaultwarden
rules:
  - apiGroups: [""]
    resources: ["secrets"]
    verbs: ["get"]
    resourceNames: ["vaultwarden-admin"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: bstein-dev-home-vaultwarden-admin-token-reader
  namespace: vaultwarden
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: bstein-dev-home-vaultwarden-admin-token-reader
subjects:
  - kind: ServiceAccount
    name: bstein-dev-home
    namespace: bstein-dev-home
---
# Let the portal trigger mail sync: read the CronJob, spawn Jobs from it,
# and observe the resulting pods in the nextcloud namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: bstein-dev-home-nextcloud-mail-sync
  namespace: nextcloud
rules:
  - apiGroups: ["batch"]
    resources: ["cronjobs"]
    verbs: ["get"]
    resourceNames: ["nextcloud-mail-sync"]
  - apiGroups: ["batch"]
    resources: ["jobs"]
    verbs: ["create", "get", "list", "watch"]
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: bstein-dev-home-nextcloud-mail-sync
  namespace: nextcloud
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: bstein-dev-home-nextcloud-mail-sync
subjects:
  - kind: ServiceAccount
    name: bstein-dev-home
    namespace: bstein-dev-home

View File

@ -0,0 +1,70 @@
import json
import os
from http.server import BaseHTTPRequestHandler, HTTPServer
from urllib import request, error
UPSTREAM = os.environ.get("UPSTREAM_URL", "http://bstein-dev-home-backend/api/chat")
KEY_MATRIX = os.environ.get("CHAT_KEY_MATRIX", "")
KEY_HOMEPAGE = os.environ.get("CHAT_KEY_HOMEPAGE", "")
ALLOWED = {k for k in (KEY_MATRIX, KEY_HOMEPAGE) if k}
class Handler(BaseHTTPRequestHandler):
    """Authenticating reverse proxy for the chat backend.

    GET /healthz (or /) answers probes with ``{"ok": true}``; POST / requires
    an ``x-api-key`` header matching one of the configured keys and forwards
    the request body to UPSTREAM unchanged, relaying the upstream response.
    """

    def _send_json(self, code: int, payload: dict):
        # Serialize once so Content-Length matches the exact bytes written.
        body = json.dumps(payload).encode()
        self.send_response(code)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def do_GET(self):  # noqa: N802
        if self.path in ("/healthz", "/"):
            return self._send_json(200, {"ok": True})
        return self._send_json(404, {"error": "not_found"})

    def do_POST(self):  # noqa: N802
        if self.path != "/":
            return self._send_json(404, {"error": "not_found"})
        key = self.headers.get("x-api-key", "")
        if not key or key not in ALLOWED:
            return self._send_json(401, {"error": "unauthorized"})
        length = int(self.headers.get("content-length", "0") or "0")
        raw = self.rfile.read(length) if length else b"{}"
        try:
            upstream_req = request.Request(
                UPSTREAM,
                data=raw,
                headers={"Content-Type": "application/json"},
                method="POST",
            )
            with request.urlopen(upstream_req, timeout=90) as resp:
                data = resp.read()
                self.send_response(resp.status)
                for k, v in resp.headers.items():
                    # urlopen has already buffered (and de-chunked) the body, so
                    # drop framing headers: Content-Length is recomputed below,
                    # and BUGFIX: a stale "Transfer-Encoding: chunked" from the
                    # upstream must not be forwarded alongside a plain body.
                    # Connection/Server/Date are hop-by-hop or regenerated.
                    if k.lower() in ("content-length", "transfer-encoding", "connection", "server", "date"):
                        continue
                    self.send_header(k, v)
                self.send_header("Content-Length", str(len(data)))
                self.end_headers()
                self.wfile.write(data)
        except error.HTTPError as e:
            # Upstream returned 4xx/5xx: relay status and body (assumed JSON).
            data = e.read() if hasattr(e, "read") else b""
            self.send_response(e.code)
            self.send_header("Content-Type", "application/json")
            self.send_header("Content-Length", str(len(data)))
            self.end_headers()
            self.wfile.write(data)
        except Exception:
            # Network failure / timeout reaching the upstream.
            return self._send_json(502, {"error": "bad_gateway"})
def main():
    """Serve the proxy on PORT (default 8080) until the process is killed."""
    listen_port = int(os.environ.get("PORT", "8080"))
    server = HTTPServer(("0.0.0.0", listen_port), Handler)
    server.serve_forever()


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,428 @@
#!/usr/bin/env python3
import email
import http.client
import imaplib
import json
import os
import re
import ssl
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
def _env(name: str, default: str | None = None) -> str:
value = os.environ.get(name, default)
if value is None or value == "":
raise SystemExit(f"missing required env var: {name}")
return value
def _post_json(url: str, payload: dict, timeout_s: int = 30) -> dict:
    """POST a JSON payload; return the decoded JSON response ({} if empty).

    Aborts with SystemExit on any HTTP error status, including the body text.
    """
    req = urllib.request.Request(
        url,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout_s) as resp:
            text = resp.read().decode()
    except urllib.error.HTTPError as exc:
        detail = exc.read().decode(errors="replace")
        raise SystemExit(f"HTTP {exc.code} from {url}: {detail}")
    return json.loads(text) if text else {}
def _post_form(url: str, data: dict[str, str], timeout_s: int = 30) -> dict:
    """POST an urlencoded form; return the decoded JSON response ({} if empty)."""
    req = urllib.request.Request(
        url,
        data=urllib.parse.urlencode(data).encode(),
        headers={"Content-Type": "application/x-www-form-urlencoded"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout_s) as resp:
            text = resp.read().decode()
    except urllib.error.HTTPError as exc:
        detail = exc.read().decode(errors="replace")
        raise SystemExit(f"HTTP {exc.code} from {url}: {detail}")
    return json.loads(text) if text else {}
def _get_json(url: str, headers: dict[str, str] | None = None, timeout_s: int = 30) -> object:
    """GET a URL; return the decoded JSON body, or None when the body is empty."""
    req = urllib.request.Request(url, headers=headers or {}, method="GET")
    try:
        with urllib.request.urlopen(req, timeout=timeout_s) as resp:
            text = resp.read().decode()
    except urllib.error.HTTPError as exc:
        detail = exc.read().decode(errors="replace")
        raise SystemExit(f"HTTP {exc.code} from {url}: {detail}")
    return json.loads(text) if text else None
def _request_json(
    method: str,
    url: str,
    token: str,
    payload: dict | None = None,
    timeout_s: int = 30,
) -> dict:
    """Send a bearer-authenticated request; return decoded JSON ({} if empty).

    When ``payload`` is given it is sent as a JSON body; otherwise no body.
    """
    headers = {"Authorization": f"Bearer {token}"}
    body = None
    if payload is not None:
        body = json.dumps(payload).encode()
        headers["Content-Type"] = "application/json"
    req = urllib.request.Request(url, data=body, headers=headers, method=method)
    try:
        with urllib.request.urlopen(req, timeout=timeout_s) as resp:
            text = resp.read().decode()
    except urllib.error.HTTPError as exc:
        detail = exc.read().decode(errors="replace")
        raise SystemExit(f"HTTP {exc.code} from {url}: {detail}")
    return json.loads(text) if text else {}
def _keycloak_client_token(keycloak_base: str, realm: str, client_id: str, client_secret: str) -> str:
    """Fetch a client-credentials access token from the Keycloak realm."""
    url = f"{keycloak_base.rstrip('/')}/realms/{realm}/protocol/openid-connect/token"
    form = {
        "grant_type": "client_credentials",
        "client_id": client_id,
        "client_secret": client_secret,
    }
    response = _post_form(url, form, timeout_s=20)
    access_token = response.get("access_token")
    if not isinstance(access_token, str) or not access_token:
        raise SystemExit("keycloak token response missing access_token")
    return access_token
def _keycloak_token_exchange(
    *,
    keycloak_base: str,
    realm: str,
    client_id: str,
    client_secret: str,
    subject_token: str,
    requested_subject: str,
    audience: str,
) -> str:
    """Exchange a token to impersonate ``requested_subject`` (RFC 8693 grant)."""
    url = f"{keycloak_base.rstrip('/')}/realms/{realm}/protocol/openid-connect/token"
    form = {
        "grant_type": "urn:ietf:params:oauth:grant-type:token-exchange",
        "client_id": client_id,
        "client_secret": client_secret,
        "subject_token": subject_token,
        "requested_subject": requested_subject,
        "audience": audience,
    }
    response = _post_form(url, form, timeout_s=20)
    exchanged = response.get("access_token")
    if not isinstance(exchanged, str) or not exchanged:
        raise SystemExit("keycloak token exchange response missing access_token")
    return exchanged
def _keycloak_find_user(keycloak_base: str, realm: str, token: str, username: str) -> dict | None:
    """Look up a Keycloak user by exact username; None when not found."""
    query = urllib.parse.urlencode({"username": username, "exact": "true", "max": "1"})
    url = f"{keycloak_base.rstrip('/')}/admin/realms/{realm}/users?{query}"
    users = _get_json(url, headers={"Authorization": f"Bearer {token}"}, timeout_s=20)
    if isinstance(users, list) and users and isinstance(users[0], dict):
        return users[0]
    return None
def _keycloak_get_user(keycloak_base: str, realm: str, token: str, user_id: str) -> dict:
    """Fetch one user's full admin representation (includes attributes)."""
    encoded = urllib.parse.quote(user_id, safe="")
    url = f"{keycloak_base.rstrip('/')}/admin/realms/{realm}/users/{encoded}"
    payload = _get_json(url, headers={"Authorization": f"Bearer {token}"}, timeout_s=20)
    if not isinstance(payload, dict):
        raise SystemExit("unexpected keycloak user payload")
    return payload
def _extract_attr(attributes: object, key: str) -> str:
if not isinstance(attributes, dict):
return ""
value = attributes.get(key)
if isinstance(value, list) and value and isinstance(value[0], str):
return value[0]
if isinstance(value, str):
return value
return ""
def _imap_wait_for_verify_token(
    *,
    host: str,
    port: int,
    username: str,
    password: str,
    request_code: str,
    deadline_sec: int,
) -> str:
    """Poll an IMAP inbox for the verification email and return its token.

    Searches INBOX for messages containing ``request_code``, extracts the
    first text/plain body, locates the verification URL and returns its
    ``verify`` query parameter. Raises SystemExit when the deadline passes.
    """
    # The in-cluster IMAP endpoint presents a certificate we cannot validate
    # here, so verification is deliberately disabled. This is the public-API
    # equivalent of the private ssl._create_unverified_context() used before.
    ssl_context = ssl.create_default_context()
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE
    deadline_at = time.monotonic() + deadline_sec
    with imaplib.IMAP4_SSL(host, port, ssl_context=ssl_context) as client:
        client.login(username, password)
        client.select("INBOX")
        while time.monotonic() < deadline_at:
            status, data = client.search(None, "TEXT", request_code)
            if status == "OK" and data and data[0]:
                ids = data[0].split()
                # Newest matching message last.
                msg_id = ids[-1]
                fetch_status, msg_data = client.fetch(msg_id, "(RFC822)")
                if fetch_status != "OK" or not msg_data:
                    time.sleep(2)
                    continue
                raw = msg_data[0][1] if isinstance(msg_data[0], tuple) and len(msg_data[0]) > 1 else None
                if not isinstance(raw, (bytes, bytearray)):
                    time.sleep(2)
                    continue
                message = email.message_from_bytes(raw)
                # Prefer the first text/plain part; fall back to a flat body.
                body = None
                if message.is_multipart():
                    for part in message.walk():
                        if part.get_content_type() == "text/plain":
                            payload = part.get_payload(decode=True)
                            if isinstance(payload, (bytes, bytearray)):
                                body = payload.decode(errors="replace")
                                break
                else:
                    payload = message.get_payload(decode=True)
                    if isinstance(payload, (bytes, bytearray)):
                        body = payload.decode(errors="replace")
                if not body:
                    time.sleep(2)
                    continue
                # First try a line that is exactly the verification URL.
                url = None
                for line in body.splitlines():
                    candidate = line.strip()
                    if "verify=" in candidate and candidate.startswith("http"):
                        url = candidate
                        break
                if not url:
                    # BUGFIX: the pattern previously read r"https?://\\S+verify=\\S+";
                    # inside a raw string "\\S" is an escaped backslash followed by
                    # "S" (matches a literal backslash), so the fallback never
                    # matched real URLs. "\S" (any non-space) is intended.
                    match = re.search(r"https?://\S+verify=\S+", body)
                    url = match.group(0) if match else None
                if not url:
                    time.sleep(2)
                    continue
                parsed = urllib.parse.urlparse(url)
                query = urllib.parse.parse_qs(parsed.query)
                token = query.get("verify", [""])[0]
                if isinstance(token, str) and token:
                    return token
            time.sleep(2)
    raise SystemExit("verification email not found before deadline")
def main() -> int:
    """Run the portal onboarding flow end-to-end and verify the result.

    Stages: read config from env -> resolve the robot mailbox credentials from
    Keycloak attributes -> submit an access request -> confirm it via the
    emailed verify token -> approve it as the portal admin (token exchange) ->
    poll until provisioning finishes -> assert the created Keycloak user's
    state. Returns 0 on success; aborts with SystemExit on any failure.
    """
    # --- configuration (required vars abort when missing) ---
    portal_base = _env("PORTAL_BASE_URL").rstrip("/")
    keycloak_base = _env("KEYCLOAK_ADMIN_URL").rstrip("/")
    realm = _env("KEYCLOAK_REALM", "atlas")
    kc_admin_client_id = _env("KEYCLOAK_ADMIN_CLIENT_ID")
    kc_admin_client_secret = _env("KEYCLOAK_ADMIN_CLIENT_SECRET")
    portal_e2e_client_id = _env("PORTAL_E2E_CLIENT_ID")
    portal_e2e_client_secret = _env("PORTAL_E2E_CLIENT_SECRET")
    portal_target_client_id = os.environ.get("PORTAL_TARGET_CLIENT_ID", "bstein-dev-home").strip() or "bstein-dev-home"
    portal_admin_username = os.environ.get("E2E_PORTAL_ADMIN_USERNAME", "bstein").strip() or "bstein"
    contact_email = os.environ.get("E2E_CONTACT_EMAIL", "robotuser@bstein.dev").strip()
    if not contact_email:
        raise SystemExit("E2E_CONTACT_EMAIL must not be empty")
    imap_host = os.environ.get("E2E_IMAP_HOST", "mailu-front.mailu-mailserver.svc.cluster.local").strip()
    imap_port = int(os.environ.get("E2E_IMAP_PORT", "993"))
    imap_keycloak_username = os.environ.get("E2E_IMAP_KEYCLOAK_USERNAME", "robotuser").strip()
    imap_wait_sec = int(os.environ.get("E2E_IMAP_WAIT_SECONDS", "90"))
    # --- admin token + robot mailbox credentials (stored as KC attributes) ---
    try:
        token = _keycloak_client_token(keycloak_base, realm, kc_admin_client_id, kc_admin_client_secret)
    except SystemExit as exc:
        raise SystemExit(f"failed to fetch keycloak token for admin client {kc_admin_client_id!r}: {exc}")
    mailbox_user = _keycloak_find_user(keycloak_base, realm, token, imap_keycloak_username)
    if not mailbox_user:
        raise SystemExit(f"unable to locate Keycloak mailbox user {imap_keycloak_username!r}")
    mailbox_user_id = mailbox_user.get("id")
    if not isinstance(mailbox_user_id, str) or not mailbox_user_id:
        raise SystemExit("mailbox user missing id")
    mailbox_full = _keycloak_get_user(keycloak_base, realm, token, mailbox_user_id)
    mailbox_attrs = mailbox_full.get("attributes")
    mailu_email = _extract_attr(mailbox_attrs, "mailu_email")
    if not mailu_email:
        mailu_email = contact_email
    mailu_password = _extract_attr(mailbox_attrs, "mailu_app_password")
    if not mailu_password:
        raise SystemExit(f"Keycloak user {imap_keycloak_username!r} missing mailu_app_password attribute")
    # --- submit the access request (retried; transient resets tolerated) ---
    username_prefix = os.environ.get("E2E_USERNAME_PREFIX", "e2e-user")
    now = int(time.time())
    # Timestamp suffix keeps each run's username unique.
    username = f"{username_prefix}-{now}"
    submit_url = f"{portal_base}/api/access/request"
    submit_payload = {"username": username, "email": contact_email, "note": "portal onboarding e2e"}
    submit = None
    for attempt in range(1, 6):
        try:
            submit = _post_json(submit_url, submit_payload, timeout_s=20)
            break
        except (http.client.RemoteDisconnected, TimeoutError, urllib.error.URLError) as exc:
            if attempt == 5:
                raise SystemExit(f"portal submit failed after {attempt} attempts: {exc}")
            time.sleep(2)
    if not isinstance(submit, dict):
        raise SystemExit("portal submit did not return json")
    request_code = submit.get("request_code")
    if not isinstance(request_code, str) or not request_code:
        raise SystemExit(f"request submit did not return request_code: {submit}")
    # --- email verification: pull the token from the robot inbox ---
    verify_token = _imap_wait_for_verify_token(
        host=imap_host,
        port=imap_port,
        username=mailu_email,
        password=mailu_password,
        request_code=request_code,
        deadline_sec=imap_wait_sec,
    )
    verify_resp = _post_json(
        f"{portal_base}/api/access/request/verify",
        {"request_code": request_code, "token": verify_token},
        timeout_s=30,
    )
    if not isinstance(verify_resp, dict) or verify_resp.get("ok") is not True:
        raise SystemExit(f"unexpected verify response: {verify_resp}")
    # --- approve as the portal admin via Keycloak token exchange ---
    portal_admin = _keycloak_find_user(keycloak_base, realm, token, portal_admin_username)
    if not portal_admin:
        raise SystemExit(f"unable to locate portal admin user {portal_admin_username!r} via Keycloak admin API")
    portal_admin_user_id = portal_admin.get("id")
    if not isinstance(portal_admin_user_id, str) or not portal_admin_user_id:
        raise SystemExit("portal admin user missing id")
    try:
        e2e_subject_token = _keycloak_client_token(keycloak_base, realm, portal_e2e_client_id, portal_e2e_client_secret)
    except SystemExit as exc:
        raise SystemExit(f"failed to fetch keycloak token for E2E client {portal_e2e_client_id!r}: {exc}")
    try:
        portal_bearer = _keycloak_token_exchange(
            keycloak_base=keycloak_base,
            realm=realm,
            client_id=portal_e2e_client_id,
            client_secret=portal_e2e_client_secret,
            subject_token=e2e_subject_token,
            requested_subject=portal_admin_user_id,
            audience=portal_target_client_id,
        )
    except SystemExit as exc:
        raise SystemExit(f"failed to exchange token for portal approval as {portal_admin_username!r}: {exc}")
    approve_url = f"{portal_base}/api/admin/access/requests/{urllib.parse.quote(username, safe='')}/approve"
    approve_timeout_s = int(os.environ.get("E2E_APPROVE_TIMEOUT_SECONDS", "180"))
    approve_attempts = int(os.environ.get("E2E_APPROVE_ATTEMPTS", "3"))
    approve_resp = None
    approve_error = None
    for attempt in range(1, approve_attempts + 1):
        try:
            approve_resp = _request_json("POST", approve_url, portal_bearer, payload=None, timeout_s=approve_timeout_s)
            approve_error = None
            break
        except (http.client.RemoteDisconnected, TimeoutError, urllib.error.URLError) as exc:
            approve_error = str(exc)
            if attempt == approve_attempts:
                break
            time.sleep(3)
    # Approval may time out even though the server accepted it; fall through
    # to status polling rather than failing hard in that case.
    if approve_resp is None:
        print(
            "WARNING: portal approval request did not return a response; "
            f"continuing to poll status (last_error={approve_error})"
        )
    elif not isinstance(approve_resp, dict) or approve_resp.get("ok") is not True:
        raise SystemExit(f"unexpected approval response: {approve_resp}")
    # --- poll the public status endpoint until provisioning finishes ---
    status_url = f"{portal_base}/api/access/request/status"
    deadline_s = int(os.environ.get("E2E_DEADLINE_SECONDS", "600"))
    interval_s = int(os.environ.get("E2E_POLL_SECONDS", "10"))
    deadline_at = time.monotonic() + deadline_s
    last_status = None
    last_error = None
    while True:
        try:
            status_payload = _post_json(status_url, {"request_code": request_code}, timeout_s=60)
            last_error = None
        except (http.client.RemoteDisconnected, TimeoutError, urllib.error.URLError) as exc:
            last_error = str(exc)
            if time.monotonic() >= deadline_at:
                raise SystemExit(f"timed out waiting for provisioning to finish (last error={last_error})")
            time.sleep(interval_s)
            continue
        status = status_payload.get("status")
        if isinstance(status, str):
            last_status = status
            if status in ("awaiting_onboarding", "ready"):
                break
            if status in ("denied", "unknown"):
                raise SystemExit(f"request transitioned to unexpected terminal status: {status_payload}")
        if time.monotonic() >= deadline_at:
            suffix = f" (last error={last_error})" if last_error else ""
            raise SystemExit(f"timed out waiting for provisioning to finish (last status={last_status}){suffix}")
        time.sleep(interval_s)
    # Refresh admin token (it may expire during the provisioning wait).
    token = _keycloak_client_token(keycloak_base, realm, kc_admin_client_id, kc_admin_client_secret)
    # --- assert the provisioned Keycloak user's final state ---
    user = _keycloak_find_user(keycloak_base, realm, token, username)
    if not user:
        raise SystemExit("expected Keycloak user was not created")
    user_id = user.get("id")
    if not isinstance(user_id, str) or not user_id:
        raise SystemExit("created user missing id")
    full = _keycloak_get_user(keycloak_base, realm, token, user_id)
    required_actions = full.get("requiredActions") or []
    required: set[str] = set()
    if isinstance(required_actions, list):
        required = {a for a in required_actions if isinstance(a, str)}
    unexpected = sorted(required.intersection({"UPDATE_PASSWORD", "VERIFY_EMAIL", "CONFIGURE_TOTP"}))
    if unexpected:
        raise SystemExit(
            "Keycloak user should not require actions at first login "
            f"(Vaultwarden-first onboarding): unexpected requiredActions={unexpected} full={sorted(required)}"
        )
    email_verified = full.get("emailVerified")
    if email_verified is not True:
        raise SystemExit(f"Keycloak user should have emailVerified=true: emailVerified={email_verified!r}")
    kc_email = full.get("email")
    if isinstance(kc_email, str) and contact_email and kc_email != contact_email:
        raise SystemExit(f"Keycloak user email mismatch: expected {contact_email!r} got {kc_email!r}")
    print(f"PASS: onboarding provisioning completed for {request_code} ({username})")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View File

@ -0,0 +1,193 @@
#!/usr/bin/env python3
from __future__ import annotations
import sys
import time
from typing import Any, Iterable
import httpx
from atlas_portal import settings
from atlas_portal.keycloak import admin_client
from atlas_portal.vaultwarden import invite_user
VAULTWARDEN_EMAIL_ATTR = "vaultwarden_email"
VAULTWARDEN_STATUS_ATTR = "vaultwarden_status"
VAULTWARDEN_SYNCED_AT_ATTR = "vaultwarden_synced_at"
def _iter_keycloak_users(page_size: int = 200) -> Iterable[dict[str, Any]]:
    """Stream every realm user from the Keycloak admin API, page by page.

    Yields raw user representations (dicts). Raises RuntimeError when the
    shared admin client is not configured. Iteration stops on the first
    empty or short page.
    """
    client = admin_client()
    if not client.ready():
        raise RuntimeError("keycloak admin client not configured")
    url = f"{settings.KEYCLOAK_ADMIN_URL}/admin/realms/{settings.KEYCLOAK_REALM}/users"
    first = 0
    while True:
        # headers() is re-read each page (presumably token refresh — confirm
        # against the admin_client implementation).
        headers = client.headers()
        # We need attributes for idempotency (vaultwarden_status/vaultwarden_email). Keycloak defaults to a
        # brief representation which may omit these.
        params = {"first": str(first), "max": str(page_size), "briefRepresentation": "false"}
        with httpx.Client(timeout=settings.HTTP_CHECK_TIMEOUT_SEC) as http:
            resp = http.get(url, params=params, headers=headers)
            resp.raise_for_status()
            payload = resp.json()
        if not isinstance(payload, list) or not payload:
            return
        for item in payload:
            if isinstance(item, dict):
                yield item
        # A short page means we've reached the end.
        if len(payload) < page_size:
            return
        first += page_size
def _extract_attr(attrs: Any, key: str) -> str:
if not isinstance(attrs, dict):
return ""
raw = attrs.get(key)
if isinstance(raw, list):
for item in raw:
if isinstance(item, str) and item.strip():
return item.strip()
return ""
if isinstance(raw, str) and raw.strip():
return raw.strip()
return ""
def _vaultwarden_email_for_user(user: dict[str, Any]) -> str:
    """Choose the email address to invite into Vaultwarden, or "" when undecidable.

    Preference order: the explicit vaultwarden_email attribute, then the
    mailu_email attribute, then the Keycloak email when it is already on the
    Mailu domain.
    """
    raw_username = user.get("username")
    username = raw_username.strip() if isinstance(raw_username, str) else ""
    if not username:
        return ""
    attrs = user.get("attributes")
    for attr_key in (VAULTWARDEN_EMAIL_ATTR, "mailu_email"):
        candidate = _extract_attr(attrs, attr_key)
        if candidate:
            return candidate
    raw_email = user.get("email")
    kc_email = raw_email.strip() if isinstance(raw_email, str) else ""
    if kc_email and kc_email.lower().endswith(f"@{settings.MAILU_DOMAIN.lower()}"):
        return kc_email
    # Don't guess an internal mailbox address until Mailu sync has run and stored mailu_email.
    # This avoids spamming Vaultwarden invites that can never be delivered (unknown recipient).
    return ""
def _set_user_attribute_if_missing(username: str, user: dict[str, Any], key: str, value: str) -> None:
    """Write the attribute only when non-blank and not already set in Keycloak."""
    cleaned = (value or "").strip()
    if not cleaned:
        return
    if _extract_attr(user.get("attributes"), key):
        return
    admin_client().set_user_attribute(username, key, cleaned)
def _set_user_attribute(username: str, key: str, value: str) -> None:
    """Unconditionally write a non-blank attribute value (blank values are ignored)."""
    cleaned = (value or "").strip()
    if cleaned:
        admin_client().set_user_attribute(username, key, cleaned)
def main() -> int:
    """Sync Keycloak users into Vaultwarden by inviting each resolvable email.

    Skips disabled users, service accounts, users without a resolvable email,
    and users already marked invited/present via Keycloak attributes (keeps
    the cron run idempotent). Returns 0 on success, 2 when any invite failed.
    """
    processed = 0
    created = 0
    skipped = 0
    failures = 0
    for user in _iter_keycloak_users():
        username = (user.get("username") if isinstance(user.get("username"), str) else "") or ""
        username = username.strip()
        if not username:
            skipped += 1
            continue
        enabled = user.get("enabled")
        if enabled is False:
            skipped += 1
            continue
        # Service accounts never get Vaultwarden mailboxes.
        if user.get("serviceAccountClientId") or username.startswith("service-account-"):
            skipped += 1
            continue
        # Fetch the full user payload so we can reliably read attributes (and skip re-invites).
        user_id = (user.get("id") if isinstance(user.get("id"), str) else "") or ""
        user_id = user_id.strip()
        full_user = user
        if user_id:
            try:
                full_user = admin_client().get_user(user_id)
            except Exception:
                # Best-effort: fall back to the (possibly brief) listing payload.
                full_user = user
        current_status = _extract_attr(full_user.get("attributes"), VAULTWARDEN_STATUS_ATTR)
        current_synced_at = _extract_attr(full_user.get("attributes"), VAULTWARDEN_SYNCED_AT_ATTR)
        email = _vaultwarden_email_for_user(full_user)
        if not email:
            print(f"skip {username}: missing email", file=sys.stderr)
            skipped += 1
            continue
        # Record the resolved email for future runs; non-fatal if it fails.
        try:
            _set_user_attribute_if_missing(username, full_user, VAULTWARDEN_EMAIL_ATTR, email)
        except Exception:
            pass
        # If we've already successfully invited or confirmed presence, do not re-invite on every cron run.
        # Vaultwarden returns 409 for "already exists", which is idempotent but noisy and can trigger rate limits.
        if current_status in {"invited", "already_present"}:
            if not current_synced_at:
                try:
                    _set_user_attribute(
                        username,
                        VAULTWARDEN_SYNCED_AT_ATTR,
                        time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
                    )
                except Exception:
                    pass
            skipped += 1
            continue
        processed += 1
        result = invite_user(email)
        if result.ok:
            created += 1
            print(f"ok {username}: {result.status}")
            # Persist the outcome; attribute writes are best-effort.
            try:
                _set_user_attribute(username, VAULTWARDEN_STATUS_ATTR, result.status)
                _set_user_attribute(username, VAULTWARDEN_SYNCED_AT_ATTR, time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()))
            except Exception:
                pass
        else:
            failures += 1
            print(f"err {username}: {result.status} {result.detail}", file=sys.stderr)
            try:
                _set_user_attribute(username, VAULTWARDEN_STATUS_ATTR, result.status)
                _set_user_attribute(username, VAULTWARDEN_SYNCED_AT_ATTR, time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()))
            except Exception:
                pass
    print(
        f"done processed={processed} created_or_present={created} skipped={skipped} failures={failures}",
        file=sys.stderr,
    )
    return 0 if failures == 0 else 2


if __name__ == "__main__":
    raise SystemExit(main())

View File

@ -0,0 +1,59 @@
# services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml
# Every 15 minutes, run vaultwarden_cred_sync.py (mounted from a ConfigMap)
# on the backend image; PYTHONPATH=/app makes the backend's atlas_portal
# package importable by the script.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: vaultwarden-cred-sync
  namespace: bstein-dev-home
spec:
  schedule: "*/15 * * * *"
  # Never overlap runs; keep little success history, more failures for debugging.
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 1
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      backoffLimit: 0
      template:
        spec:
          serviceAccountName: bstein-dev-home
          restartPolicy: Never
          nodeSelector:
            kubernetes.io/arch: arm64
            node-role.kubernetes.io/worker: "true"
          imagePullSecrets:
            - name: harbor-bstein-robot
          containers:
            - name: sync
              # Tag kept by Flux image automation (marker without ":tag"
              # rewrites the full image reference on this line).
              image: registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"}
              imagePullPolicy: Always
              command:
                - python
                - /scripts/vaultwarden_cred_sync.py
              env:
                - name: PYTHONPATH
                  value: /app
                - name: KEYCLOAK_ENABLED
                  value: "true"
                - name: KEYCLOAK_REALM
                  value: atlas
                - name: KEYCLOAK_ADMIN_URL
                  value: http://keycloak.sso.svc.cluster.local
                - name: KEYCLOAK_ADMIN_REALM
                  value: atlas
                - name: KEYCLOAK_ADMIN_CLIENT_ID
                  value: bstein-dev-home-admin
                - name: KEYCLOAK_ADMIN_CLIENT_SECRET
                  valueFrom:
                    secretKeyRef:
                      name: bstein-dev-home-keycloak-admin
                      key: client_secret
                - name: HTTP_CHECK_TIMEOUT_SEC
                  value: "20"
              volumeMounts:
                - name: vaultwarden-cred-sync-script
                  mountPath: /scripts
                  readOnly: true
          volumes:
            - name: vaultwarden-cred-sync-script
              configMap:
                name: vaultwarden-cred-sync-script
                defaultMode: 0555

View File

@ -1,31 +0,0 @@
# services/ci-demo/deployment.yaml
# Minimal demo workload used to exercise the CI/GitOps image-update pipeline.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ci-demo
  namespace: ci-demo
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: ci-demo
  template:
    metadata:
      labels:
        app.kubernetes.io/name: ci-demo
    spec:
      # Pin to Raspberry Pi 4 nodes.
      nodeSelector:
        hardware: rpi4
      containers:
        - name: ci-demo
          # NOTE(review): ":latest" defeats image-pinning; the sibling
          # kustomization.yaml overrides the tag via its images transformer.
          image: registry.bstein.dev/infra/ci-demo:latest
          ports:
            - name: http
              containerPort: 8080
          readinessProbe:
            httpGet:
              path: /
              port: http
            initialDelaySeconds: 2
            periodSeconds: 5

View File

@ -1,24 +0,0 @@
# services/ci-demo/image.yaml
# Flux image automation objects for the ci-demo workload: scan the registry
# every minute and select the newest tag matching v0.0.0-<build>.
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageRepository
metadata:
  name: ci-demo
  namespace: flux-system
spec:
  image: registry.bstein.dev/infra/ci-demo
  interval: 1m0s
---
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImagePolicy
metadata:
  name: ci-demo
  namespace: flux-system
spec:
  imageRepositoryRef:
    name: ci-demo
  filterTags:
    # Tags look like v0.0.0-42; capture the semver part (without the "v").
    pattern: '^v(?P<version>0\.0\.0-\d+)$'
    extract: '$version'
  policy:
    semver:
      # Include prerelease-style builds (0.0.0-N) in the ordering.
      range: ">=0.0.0-0"

View File

@ -1,11 +0,0 @@
# services/ci-demo/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - image.yaml
  - deployment.yaml
  - service.yaml
images:
  - name: registry.bstein.dev/infra/ci-demo
    # Fix: newTag must be a bare tag, not a full image reference. The setter
    # marker needs the ":tag" suffix so Flux writes only the tag here;
    # without it Flux writes "<name>:<tag>", producing an invalid transform.
    newTag: v0.0.0-3 # {"$imagepolicy": "flux-system:ci-demo:tag"}

View File

@ -1,6 +0,0 @@
# services/ci-demo/namespace.yaml
# Dedicated namespace for the ci-demo workload.
apiVersion: v1
kind: Namespace
metadata:
  name: ci-demo

31
services/comms/NOTES.md Normal file
View File

@ -0,0 +1,31 @@
# services/comms/NOTES.md
Purpose: Matrix + Element + LiveKit stack for Othrys (live.bstein.dev).
## Core flow
- Matrix Authentication Service (MAS) handles login/SSO and issues Matrix access tokens.
- Synapse is the homeserver; MAS fronts login, Synapse serves client/server APIs.
- Element Web provides the main UI; Element Call embeds LiveKit for group video.
- LiveKit handles SFU media; Coturn provides TURN for NAT traversal.
- matrix-guest-register provisions MAS guest accounts and performs MAS password login to mint device-bound guest tokens (no Keycloak).
## Operational jobs
- mas-db-ensure-job: ensures MAS database role/database + secret in comms.
- comms-secrets-ensure-job: creates runtime secrets (TURN, LiveKit, Synapse, atlasbot).
- synapse-signingkey-ensure-job: ensures Synapse signing key secret.
- synapse-seeder-admin-ensure-job: ensures Synapse admin user exists.
- synapse-user-seed-job: seeds atlasbot + othrys-seeder users/passwords.
- mas-local-users-ensure-job: ensures MAS local users exist (seeder/bot).
- seed-othrys-room: (suspended) creates Othrys + joins locals.
- reset-othrys-room: suspended CronJob for a manual room reset + pin invite.
- pin-othrys-invite: (suspended) pin invite message if missing.
- guest-name-randomizer: renames numeric/guest users to adj-noun names.
- bstein-force-leave: one-off cleanup job that forces a room leave.
## Manual re-runs
- Unsuspend a CronJob only when needed; re-suspend after completion.
## Ports
- Traefik (HTTPS) via LB on 192.168.22.9.
- Coturn LB on 192.168.22.5 (3478/5349 + UDP range).
- LiveKit LB on 192.168.22.6 (7880/7881/7882/7883).

Some files were not shown because too many files have changed in this diff Show More