feature/sso-hardening #9
6
.gitignore
vendored
6
.gitignore
vendored
@ -1,2 +1,8 @@
|
||||
*.md
|
||||
!README.md
|
||||
!knowledge/**/*.md
|
||||
!services/comms/knowledge/**/*.md
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
.pytest_cache
|
||||
.venv
|
||||
|
||||
@ -5,8 +5,9 @@ resources:
|
||||
- ../../services/crypto
|
||||
- ../../services/gitea
|
||||
- ../../services/jellyfin
|
||||
- ../../services/jitsi
|
||||
- ../../services/comms
|
||||
- ../../services/monitoring
|
||||
- ../../services/logging
|
||||
- ../../services/pegasus
|
||||
- ../../services/vault
|
||||
- ../../services/bstein-dev-home
|
||||
|
||||
@ -0,0 +1,23 @@
|
||||
# clusters/atlas/flux-system/applications/ai-llm/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: ai-llm
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
path: ./services/ai-llm
|
||||
targetNamespace: ai
|
||||
prune: true
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
namespace: flux-system
|
||||
wait: true
|
||||
healthChecks:
|
||||
- apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: ollama
|
||||
namespace: ai
|
||||
dependsOn:
|
||||
- name: core
|
||||
@ -1,26 +0,0 @@
|
||||
# clusters/atlas/flux-system/applications/ci-demo/image-automation.yaml
|
||||
apiVersion: image.toolkit.fluxcd.io/v1
|
||||
kind: ImageUpdateAutomation
|
||||
metadata:
|
||||
name: ci-demo
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 1m0s
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
namespace: flux-system
|
||||
git:
|
||||
checkout:
|
||||
ref:
|
||||
branch: feature/ci-gitops
|
||||
commit:
|
||||
author:
|
||||
email: ops@bstein.dev
|
||||
name: flux-bot
|
||||
messageTemplate: "chore(ci-demo): apply image updates"
|
||||
push:
|
||||
branch: feature/ci-gitops
|
||||
update:
|
||||
strategy: Setters
|
||||
path: services/ci-demo
|
||||
@ -0,0 +1,17 @@
|
||||
# clusters/atlas/flux-system/applications/communication/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: comms
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
prune: true
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
path: ./services/comms
|
||||
targetNamespace: comms
|
||||
timeout: 2m
|
||||
dependsOn:
|
||||
- name: traefik
|
||||
@ -15,5 +15,6 @@ spec:
|
||||
namespace: flux-system
|
||||
dependsOn:
|
||||
- name: core
|
||||
- name: openldap
|
||||
wait: true
|
||||
timeout: 5m
|
||||
|
||||
@ -16,8 +16,12 @@ spec:
|
||||
- name: helm
|
||||
- name: traefik
|
||||
healthChecks:
|
||||
- apiVersion: helm.toolkit.fluxcd.io/v2
|
||||
kind: HelmRelease
|
||||
- apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: jenkins
|
||||
namespace: jenkins
|
||||
- apiVersion: v1
|
||||
kind: Service
|
||||
name: jenkins
|
||||
namespace: jenkins
|
||||
wait: false
|
||||
|
||||
@ -4,7 +4,8 @@ kind: Kustomization
|
||||
resources:
|
||||
- gitea/kustomization.yaml
|
||||
- vault/kustomization.yaml
|
||||
- jitsi/kustomization.yaml
|
||||
- vaultwarden/kustomization.yaml
|
||||
- comms/kustomization.yaml
|
||||
- crypto/kustomization.yaml
|
||||
- monerod/kustomization.yaml
|
||||
- pegasus/kustomization.yaml
|
||||
@ -16,9 +17,14 @@ resources:
|
||||
- jellyfin/kustomization.yaml
|
||||
- xmr-miner/kustomization.yaml
|
||||
- sui-metrics/kustomization.yaml
|
||||
- openldap/kustomization.yaml
|
||||
- keycloak/kustomization.yaml
|
||||
- oauth2-proxy/kustomization.yaml
|
||||
- mailu/kustomization.yaml
|
||||
- jenkins/kustomization.yaml
|
||||
- ci-demo/kustomization.yaml
|
||||
- ci-demo/image-automation.yaml
|
||||
- ai-llm/kustomization.yaml
|
||||
- nextcloud/kustomization.yaml
|
||||
- nextcloud-mail-sync/kustomization.yaml
|
||||
- postgres/kustomization.yaml
|
||||
- outline/kustomization.yaml
|
||||
- planka/kustomization.yaml
|
||||
|
||||
@ -0,0 +1,17 @@
|
||||
# clusters/atlas/flux-system/applications/nextcloud-mail-sync/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: nextcloud-mail-sync
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
prune: true
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
path: ./services/nextcloud-mail-sync
|
||||
targetNamespace: nextcloud
|
||||
timeout: 2m
|
||||
dependsOn:
|
||||
- name: keycloak
|
||||
@ -0,0 +1,16 @@
|
||||
# clusters/atlas/flux-system/applications/nextcloud/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1beta2
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: nextcloud
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
path: ./services/nextcloud
|
||||
targetNamespace: nextcloud
|
||||
prune: true
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
namespace: flux-system
|
||||
wait: true
|
||||
@ -1,18 +1,18 @@
|
||||
# clusters/atlas/flux-system/applications/jitsi/kustomization.yaml
|
||||
# clusters/atlas/flux-system/applications/openldap/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: jitsi
|
||||
name: openldap
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
path: ./services/jitsi
|
||||
targetNamespace: jitsi
|
||||
prune: true
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
namespace: flux-system
|
||||
path: ./services/openldap
|
||||
targetNamespace: sso
|
||||
dependsOn:
|
||||
- name: core
|
||||
wait: true
|
||||
@ -0,0 +1,28 @@
|
||||
# clusters/atlas/flux-system/applications/outline/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: outline
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
path: ./services/outline
|
||||
prune: true
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
targetNamespace: outline
|
||||
dependsOn:
|
||||
- name: keycloak
|
||||
- name: mailu
|
||||
- name: traefik
|
||||
healthChecks:
|
||||
- apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: outline
|
||||
namespace: outline
|
||||
- apiVersion: v1
|
||||
kind: Service
|
||||
name: outline
|
||||
namespace: outline
|
||||
wait: false
|
||||
@ -0,0 +1,28 @@
|
||||
# clusters/atlas/flux-system/applications/planka/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: planka
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
path: ./services/planka
|
||||
prune: true
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
targetNamespace: planka
|
||||
dependsOn:
|
||||
- name: keycloak
|
||||
- name: mailu
|
||||
- name: traefik
|
||||
healthChecks:
|
||||
- apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: planka
|
||||
namespace: planka
|
||||
- apiVersion: v1
|
||||
kind: Service
|
||||
name: planka
|
||||
namespace: planka
|
||||
wait: false
|
||||
@ -0,0 +1,24 @@
|
||||
# clusters/atlas/flux-system/applications/postgres/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: postgres
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
path: ./services/postgres
|
||||
prune: true
|
||||
force: true
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
targetNamespace: postgres
|
||||
dependsOn:
|
||||
- name: vault
|
||||
- name: vault-csi
|
||||
healthChecks:
|
||||
- apiVersion: apps/v1
|
||||
kind: StatefulSet
|
||||
name: postgres
|
||||
namespace: postgres
|
||||
wait: true
|
||||
@ -0,0 +1,20 @@
|
||||
# clusters/atlas/flux-system/applications/vaultwarden/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: vaultwarden
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
suspend: false
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
namespace: flux-system
|
||||
path: ./services/vaultwarden
|
||||
targetNamespace: vaultwarden
|
||||
prune: true
|
||||
wait: true
|
||||
dependsOn:
|
||||
- name: helm
|
||||
- name: traefik
|
||||
@ -8,7 +8,7 @@ metadata:
|
||||
spec:
|
||||
interval: 1m0s
|
||||
ref:
|
||||
branch: main
|
||||
branch: feature/sso-hardening
|
||||
secretRef:
|
||||
name: flux-system-gitea
|
||||
url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
|
||||
|
||||
@ -4,7 +4,11 @@ kind: Kustomization
|
||||
resources:
|
||||
- core/kustomization.yaml
|
||||
- helm/kustomization.yaml
|
||||
- metallb/kustomization.yaml
|
||||
- traefik/kustomization.yaml
|
||||
- gitops-ui/kustomization.yaml
|
||||
- monitoring/kustomization.yaml
|
||||
- logging/kustomization.yaml
|
||||
- maintenance/kustomization.yaml
|
||||
- longhorn-ui/kustomization.yaml
|
||||
- ../platform/vault-csi/kustomization.yaml
|
||||
|
||||
@ -0,0 +1,14 @@
|
||||
# clusters/atlas/flux-system/platform/logging/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: logging
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
path: ./services/logging
|
||||
prune: true
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
wait: false
|
||||
@ -1,17 +1,14 @@
|
||||
# clusters/atlas/flux-system/applications/ci-demo/kustomization.yaml
|
||||
# clusters/atlas/flux-system/platform/maintenance/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: ci-demo
|
||||
name: maintenance
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
path: ./services/ci-demo
|
||||
path: ./services/maintenance
|
||||
prune: true
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
namespace: flux-system
|
||||
dependsOn:
|
||||
- name: core
|
||||
wait: false
|
||||
@ -0,0 +1,16 @@
|
||||
# clusters/atlas/flux-system/platform/metallb/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: metallb
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 30m
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
namespace: flux-system
|
||||
path: ./infrastructure/metallb
|
||||
prune: true
|
||||
wait: true
|
||||
targetNamespace: metallb-system
|
||||
@ -15,4 +15,5 @@ spec:
|
||||
namespace: flux-system
|
||||
dependsOn:
|
||||
- name: core
|
||||
- name: metallb
|
||||
wait: true
|
||||
|
||||
@ -0,0 +1,16 @@
|
||||
# clusters/atlas/flux-system/platform/vault-csi/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: vault-csi
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 30m
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
namespace: flux-system
|
||||
path: ./infrastructure/vault-csi
|
||||
prune: true
|
||||
wait: true
|
||||
targetNamespace: kube-system
|
||||
@ -5,3 +5,4 @@ resources:
|
||||
- ../../../infrastructure/modules/base
|
||||
- ../../../infrastructure/modules/profiles/atlas-ha
|
||||
- ../../../infrastructure/sources/cert-manager/letsencrypt.yaml
|
||||
- ../../../infrastructure/metallb
|
||||
|
||||
16
dockerfiles/Dockerfile.data-prepper
Normal file
16
dockerfiles/Dockerfile.data-prepper
Normal file
@ -0,0 +1,16 @@
|
||||
FROM --platform=$BUILDPLATFORM opensearchproject/data-prepper:2.8.0 AS source
|
||||
|
||||
FROM --platform=$TARGETPLATFORM eclipse-temurin:17-jre
|
||||
|
||||
ENV DATA_PREPPER_PATH=/usr/share/data-prepper
|
||||
|
||||
RUN useradd -u 10001 -M -U -d / -s /usr/sbin/nologin data_prepper \
|
||||
&& mkdir -p /var/log/data-prepper
|
||||
|
||||
COPY --from=source /usr/share/data-prepper /usr/share/data-prepper
|
||||
|
||||
RUN chown -R 10001:10001 /usr/share/data-prepper /var/log/data-prepper
|
||||
|
||||
USER 10001
|
||||
WORKDIR /usr/share/data-prepper
|
||||
CMD ["bin/data-prepper"]
|
||||
@ -1,5 +1,18 @@
|
||||
# hosts/roles/titan_jh/tasks/main.yaml
|
||||
---
|
||||
- name: Install node exporter
|
||||
ansible.builtin.package:
|
||||
name: prometheus-node-exporter
|
||||
state: present
|
||||
tags: ['jumphost', 'monitoring']
|
||||
|
||||
- name: Enable node exporter
|
||||
ansible.builtin.service:
|
||||
name: prometheus-node-exporter
|
||||
enabled: true
|
||||
state: started
|
||||
tags: ['jumphost', 'monitoring']
|
||||
|
||||
- name: Placeholder for jumphost hardening
|
||||
ansible.builtin.debug:
|
||||
msg: "Harden SSH, manage bastion tooling, and configure audit logging here."
|
||||
|
||||
20
infrastructure/metallb/ippool.yaml
Normal file
20
infrastructure/metallb/ippool.yaml
Normal file
@ -0,0 +1,20 @@
|
||||
# infrastructure/metallb/ippool.yaml
|
||||
apiVersion: metallb.io/v1beta1
|
||||
kind: IPAddressPool
|
||||
metadata:
|
||||
name: communication-pool
|
||||
namespace: metallb-system
|
||||
spec:
|
||||
addresses:
|
||||
- 192.168.22.4-192.168.22.6
|
||||
- 192.168.22.9-192.168.22.9
|
||||
autoAssign: true
|
||||
---
|
||||
apiVersion: metallb.io/v1beta1
|
||||
kind: L2Advertisement
|
||||
metadata:
|
||||
name: communication-adv
|
||||
namespace: metallb-system
|
||||
spec:
|
||||
ipAddressPools:
|
||||
- communication-pool
|
||||
10
infrastructure/metallb/kustomization.yaml
Normal file
10
infrastructure/metallb/kustomization.yaml
Normal file
@ -0,0 +1,10 @@
|
||||
# infrastructure/metallb/kustomization.yaml
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- namespace.yaml
|
||||
- metallb-rendered.yaml
|
||||
- ippool.yaml
|
||||
patchesStrategicMerge:
|
||||
- patches/node-placement.yaml
|
||||
- patches/speaker-loglevel.yaml
|
||||
2411
infrastructure/metallb/metallb-rendered.yaml
Normal file
2411
infrastructure/metallb/metallb-rendered.yaml
Normal file
File diff suppressed because it is too large
Load Diff
5
infrastructure/metallb/namespace.yaml
Normal file
5
infrastructure/metallb/namespace.yaml
Normal file
@ -0,0 +1,5 @@
|
||||
# infrastructure/metallb/namespace.yaml
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: metallb-system
|
||||
27
infrastructure/metallb/patches/node-placement.yaml
Normal file
27
infrastructure/metallb/patches/node-placement.yaml
Normal file
@ -0,0 +1,27 @@
|
||||
# infrastructure/metallb/patches/node-placement.yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: metallb-controller
|
||||
namespace: metallb-system
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: controller
|
||||
args:
|
||||
- --port=7472
|
||||
- --log-level=info
|
||||
- --webhook-mode=enabled
|
||||
- --tls-min-version=VersionTLS12
|
||||
- --lb-class=metallb
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: hardware
|
||||
operator: In
|
||||
values:
|
||||
- rpi4
|
||||
- rpi5
|
||||
15
infrastructure/metallb/patches/speaker-loglevel.yaml
Normal file
15
infrastructure/metallb/patches/speaker-loglevel.yaml
Normal file
@ -0,0 +1,15 @@
|
||||
# infrastructure/metallb/patches/speaker-loglevel.yaml
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: metallb-speaker
|
||||
namespace: metallb-system
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: speaker
|
||||
args:
|
||||
- --port=7472
|
||||
- --log-level=info
|
||||
- --lb-class=metallb
|
||||
@ -2,6 +2,7 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- ../components/device-plugin-config
|
||||
- ../components/device-plugin-jetson
|
||||
- ../components/device-plugin-minipc
|
||||
- ../components/device-plugin-tethys
|
||||
|
||||
@ -0,0 +1,15 @@
|
||||
# infrastructure/modules/profiles/components/device-plugin-config/configmap.yaml
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: nvidia-device-plugin-config
|
||||
namespace: kube-system
|
||||
data:
|
||||
config.yaml: |
|
||||
version: v1
|
||||
sharing:
|
||||
timeSlicing:
|
||||
renameByDefault: true
|
||||
resources:
|
||||
- name: nvidia.com/gpu
|
||||
replicas: 4
|
||||
@ -0,0 +1,5 @@
|
||||
# infrastructure/modules/profiles/components/device-plugin-config/kustomization.yaml
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- configmap.yaml
|
||||
@ -30,7 +30,8 @@ spec:
|
||||
imagePullPolicy: IfNotPresent
|
||||
args:
|
||||
- "--fail-on-init-error=false"
|
||||
- "--device-list-strategy=envvar,cdi"
|
||||
- "--device-list-strategy=envvar"
|
||||
- "--config-file=/config/config.yaml"
|
||||
securityContext:
|
||||
privileged: true
|
||||
env:
|
||||
@ -41,7 +42,12 @@ spec:
|
||||
volumeMounts:
|
||||
- name: device-plugin
|
||||
mountPath: /var/lib/kubelet/device-plugins
|
||||
- name: config
|
||||
mountPath: /config
|
||||
volumes:
|
||||
- name: device-plugin
|
||||
hostPath:
|
||||
path: /var/lib/kubelet/device-plugins
|
||||
- name: config
|
||||
configMap:
|
||||
name: nvidia-device-plugin-config
|
||||
|
||||
@ -32,6 +32,7 @@ spec:
|
||||
- "--fail-on-init-error=false"
|
||||
- "--device-list-strategy=envvar"
|
||||
- "--mig-strategy=none"
|
||||
- "--config-file=/config/config.yaml"
|
||||
securityContext:
|
||||
privileged: true
|
||||
env:
|
||||
@ -42,7 +43,12 @@ spec:
|
||||
volumeMounts:
|
||||
- name: device-plugin
|
||||
mountPath: /var/lib/kubelet/device-plugins
|
||||
- name: config
|
||||
mountPath: /config
|
||||
volumes:
|
||||
- name: device-plugin
|
||||
hostPath:
|
||||
path: /var/lib/kubelet/device-plugins
|
||||
- name: config
|
||||
configMap:
|
||||
name: nvidia-device-plugin-config
|
||||
|
||||
@ -33,6 +33,7 @@ spec:
|
||||
- "--fail-on-init-error=false"
|
||||
- "--device-list-strategy=envvar"
|
||||
- "--mig-strategy=none"
|
||||
- "--config-file=/config/config.yaml"
|
||||
securityContext:
|
||||
privileged: true
|
||||
env:
|
||||
@ -43,7 +44,12 @@ spec:
|
||||
volumeMounts:
|
||||
- name: device-plugin
|
||||
mountPath: /var/lib/kubelet/device-plugins
|
||||
- name: config
|
||||
mountPath: /config
|
||||
volumes:
|
||||
- name: device-plugin
|
||||
hostPath:
|
||||
path: /var/lib/kubelet/device-plugins
|
||||
- name: config
|
||||
configMap:
|
||||
name: nvidia-device-plugin-config
|
||||
|
||||
@ -2,4 +2,5 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- ../components/device-plugin-config
|
||||
- ../components/device-plugin-tethys
|
||||
|
||||
9
infrastructure/sources/helm/fluent-bit.yaml
Normal file
9
infrastructure/sources/helm/fluent-bit.yaml
Normal file
@ -0,0 +1,9 @@
|
||||
# infrastructure/sources/helm/fluent-bit.yaml
|
||||
apiVersion: source.toolkit.fluxcd.io/v1
|
||||
kind: HelmRepository
|
||||
metadata:
|
||||
name: fluent
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 1h
|
||||
url: https://fluent.github.io/helm-charts
|
||||
@ -2,11 +2,15 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- fluent-bit.yaml
|
||||
- grafana.yaml
|
||||
- hashicorp.yaml
|
||||
- jetstack.yaml
|
||||
- jenkins.yaml
|
||||
- mailu.yaml
|
||||
- opentelemetry.yaml
|
||||
- opensearch.yaml
|
||||
- harbor.yaml
|
||||
- prometheus.yaml
|
||||
- victoria-metrics.yaml
|
||||
- secrets-store-csi.yaml
|
||||
|
||||
9
infrastructure/sources/helm/opensearch.yaml
Normal file
9
infrastructure/sources/helm/opensearch.yaml
Normal file
@ -0,0 +1,9 @@
|
||||
# infrastructure/sources/helm/opensearch.yaml
|
||||
apiVersion: source.toolkit.fluxcd.io/v1
|
||||
kind: HelmRepository
|
||||
metadata:
|
||||
name: opensearch
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 1h
|
||||
url: https://opensearch-project.github.io/helm-charts
|
||||
9
infrastructure/sources/helm/opentelemetry.yaml
Normal file
9
infrastructure/sources/helm/opentelemetry.yaml
Normal file
@ -0,0 +1,9 @@
|
||||
# infrastructure/sources/helm/opentelemetry.yaml
|
||||
apiVersion: source.toolkit.fluxcd.io/v1
|
||||
kind: HelmRepository
|
||||
metadata:
|
||||
name: opentelemetry
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 1h
|
||||
url: https://open-telemetry.github.io/opentelemetry-helm-charts
|
||||
9
infrastructure/sources/helm/secrets-store-csi.yaml
Normal file
9
infrastructure/sources/helm/secrets-store-csi.yaml
Normal file
@ -0,0 +1,9 @@
|
||||
# infrastructure/sources/helm/secrets-store-csi.yaml
|
||||
apiVersion: source.toolkit.fluxcd.io/v1
|
||||
kind: HelmRepository
|
||||
metadata:
|
||||
name: secrets-store-csi-driver
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 1h
|
||||
url: https://kubernetes-sigs.github.io/secrets-store-csi-driver/charts
|
||||
@ -71,9 +71,10 @@ rules:
|
||||
- tlsoptions
|
||||
- tlsstores
|
||||
- serverstransports
|
||||
- serverstransporttcps
|
||||
- traefikservices
|
||||
- middlewaretcps
|
||||
verbs:
|
||||
- get
|
||||
- list
|
||||
- watch
|
||||
|
||||
|
||||
@ -10,3 +10,4 @@ resources:
|
||||
- clusterrole.yaml
|
||||
- clusterrolebinding.yaml
|
||||
- service.yaml
|
||||
- traefik-service-lb.yaml
|
||||
|
||||
24
infrastructure/traefik/traefik-service-lb.yaml
Normal file
24
infrastructure/traefik/traefik-service-lb.yaml
Normal file
@ -0,0 +1,24 @@
|
||||
# infrastructure/traefik/traefik-service-lb.yaml
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: traefik
|
||||
namespace: kube-system
|
||||
annotations:
|
||||
metallb.universe.tf/address-pool: communication-pool
|
||||
spec:
|
||||
type: LoadBalancer
|
||||
loadBalancerClass: metallb
|
||||
loadBalancerIP: 192.168.22.9
|
||||
ports:
|
||||
- name: web
|
||||
port: 80
|
||||
targetPort: web
|
||||
protocol: TCP
|
||||
- name: websecure
|
||||
port: 443
|
||||
targetPort: websecure
|
||||
protocol: TCP
|
||||
selector:
|
||||
app.kubernetes.io/instance: traefik-kube-system
|
||||
app.kubernetes.io/name: traefik
|
||||
6
infrastructure/vault-csi/kustomization.yaml
Normal file
6
infrastructure/vault-csi/kustomization.yaml
Normal file
@ -0,0 +1,6 @@
|
||||
# infrastructure/vault-csi/kustomization.yaml
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- secrets-store-csi-driver.yaml
|
||||
- vault-csi-provider.yaml
|
||||
20
infrastructure/vault-csi/secrets-store-csi-driver.yaml
Normal file
20
infrastructure/vault-csi/secrets-store-csi-driver.yaml
Normal file
@ -0,0 +1,20 @@
|
||||
# infrastructure/vault-csi/secrets-store-csi-driver.yaml
|
||||
apiVersion: helm.toolkit.fluxcd.io/v2
|
||||
kind: HelmRelease
|
||||
metadata:
|
||||
name: secrets-store-csi-driver
|
||||
namespace: kube-system
|
||||
spec:
|
||||
interval: 15m
|
||||
chart:
|
||||
spec:
|
||||
chart: secrets-store-csi-driver
|
||||
version: "~1.3.0"
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: secrets-store-csi-driver
|
||||
namespace: flux-system
|
||||
values:
|
||||
syncSecret:
|
||||
enabled: true
|
||||
enableSecretRotation: false
|
||||
111
infrastructure/vault-csi/vault-csi-provider.yaml
Normal file
111
infrastructure/vault-csi/vault-csi-provider.yaml
Normal file
@ -0,0 +1,111 @@
|
||||
# infrastructure/vault-csi/vault-csi-provider.yaml
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: vault-csi-provider
|
||||
namespace: kube-system
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: vault-csi-provider-clusterrole
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["serviceaccounts/token"]
|
||||
verbs: ["create"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: vault-csi-provider-clusterrolebinding
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: vault-csi-provider-clusterrole
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: vault-csi-provider
|
||||
namespace: kube-system
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
name: vault-csi-provider-role
|
||||
namespace: kube-system
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["secrets"]
|
||||
verbs: ["get"]
|
||||
resourceNames: ["vault-csi-provider-hmac-key"]
|
||||
- apiGroups: [""]
|
||||
resources: ["secrets"]
|
||||
verbs: ["create"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: vault-csi-provider-rolebinding
|
||||
namespace: kube-system
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: Role
|
||||
name: vault-csi-provider-role
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: vault-csi-provider
|
||||
namespace: kube-system
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: vault-csi-provider
|
||||
namespace: kube-system
|
||||
labels: { app.kubernetes.io/name: vault-csi-provider }
|
||||
spec:
|
||||
updateStrategy:
|
||||
type: RollingUpdate
|
||||
selector:
|
||||
matchLabels: { app.kubernetes.io/name: vault-csi-provider }
|
||||
template:
|
||||
metadata:
|
||||
labels: { app.kubernetes.io/name: vault-csi-provider }
|
||||
spec:
|
||||
serviceAccountName: vault-csi-provider
|
||||
containers:
|
||||
- name: provider-vault-installer
|
||||
image: hashicorp/vault-csi-provider:1.7.0
|
||||
imagePullPolicy: IfNotPresent
|
||||
args:
|
||||
- -endpoint=/provider/vault.sock
|
||||
- -log-level=info
|
||||
resources:
|
||||
requests: { cpu: 50m, memory: 100Mi }
|
||||
limits: { cpu: 50m, memory: 100Mi }
|
||||
volumeMounts:
|
||||
- { name: providervol, mountPath: "/provider" }
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: "/health/ready"
|
||||
port: 8080
|
||||
scheme: "HTTP"
|
||||
failureThreshold: 2
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
successThreshold: 1
|
||||
timeoutSeconds: 3
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: "/health/ready"
|
||||
port: 8080
|
||||
scheme: "HTTP"
|
||||
failureThreshold: 2
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
successThreshold: 1
|
||||
timeoutSeconds: 3
|
||||
volumes:
|
||||
- name: providervol
|
||||
hostPath:
|
||||
path: "/var/run/secrets-store-csi-providers"
|
||||
nodeSelector:
|
||||
kubernetes.io/os: linux
|
||||
22
knowledge/INDEX.md
Normal file
22
knowledge/INDEX.md
Normal file
@ -0,0 +1,22 @@
|
||||
Atlas Knowledge Base (KB)
|
||||
|
||||
This folder is the source-of-truth “memory” for Atlas/Titan assistants (and for humans). It is designed to be:
|
||||
- Accurate (grounded in GitOps + read-only cluster tools)
|
||||
- Maintainable (small docs + deterministic generators)
|
||||
- Safe (no secrets; refer to Secret/Vault paths by name only)
|
||||
|
||||
Layout
|
||||
- `knowledge/runbooks/`: human-written docs (short, chunkable Markdown).
|
||||
- `knowledge/catalog/`: generated machine-readable facts (YAML/JSON).
|
||||
- `knowledge/diagrams/`: generated Mermaid diagrams (`.mmd`) derived from the catalog.
|
||||
|
||||
Regeneration
|
||||
- Update manifests/docs, then regenerate generated artifacts:
|
||||
- `python scripts/knowledge_render_atlas.py --write`
|
||||
|
||||
Authoring rules
|
||||
- Never include secret values. Prefer `secretRef` names or Vault paths like `kv/atlas/...`.
|
||||
- Prefer stable identifiers: Kubernetes `namespace/name`, DNS hostnames, Flux kustomization paths.
|
||||
- Keep each runbook small; one topic per file; use headings.
|
||||
- When in doubt, link to the exact file path in this repo that configures the behavior.
|
||||
|
||||
8
knowledge/catalog/atlas-summary.json
Normal file
8
knowledge/catalog/atlas-summary.json
Normal file
@ -0,0 +1,8 @@
|
||||
{
|
||||
"counts": {
|
||||
"helmrelease_host_hints": 7,
|
||||
"http_endpoints": 35,
|
||||
"services": 44,
|
||||
"workloads": 49
|
||||
}
|
||||
}
|
||||
2771
knowledge/catalog/atlas.json
Normal file
2771
knowledge/catalog/atlas.json
Normal file
File diff suppressed because it is too large
Load Diff
1786
knowledge/catalog/atlas.yaml
Normal file
1786
knowledge/catalog/atlas.yaml
Normal file
File diff suppressed because it is too large
Load Diff
89
knowledge/catalog/runbooks.json
Normal file
89
knowledge/catalog/runbooks.json
Normal file
@ -0,0 +1,89 @@
|
||||
[
|
||||
{
|
||||
"path": "runbooks/ci-gitea-jenkins.md",
|
||||
"title": "CI: Gitea \u2192 Jenkins pipeline",
|
||||
"tags": [
|
||||
"atlas",
|
||||
"ci",
|
||||
"gitea",
|
||||
"jenkins"
|
||||
],
|
||||
"entrypoints": [
|
||||
"scm.bstein.dev",
|
||||
"ci.bstein.dev"
|
||||
],
|
||||
"source_paths": [
|
||||
"services/gitea",
|
||||
"services/jenkins",
|
||||
"scripts/jenkins_cred_sync.sh",
|
||||
"scripts/gitea_cred_sync.sh"
|
||||
],
|
||||
"body": "# CI: Gitea \u2192 Jenkins pipeline\n\n## What this is\nAtlas uses Gitea for source control and Jenkins for CI. Authentication is via Keycloak (SSO).\n\n## Where it is configured\n- Gitea manifests: `services/gitea/`\n- Jenkins manifests: `services/jenkins/`\n- Credential sync helpers: `scripts/gitea_cred_sync.sh`, `scripts/jenkins_cred_sync.sh`\n\n## What users do (typical flow)\n- Create a repo in Gitea.\n- Create/update a Jenkins job/pipeline that can fetch the repo.\n- Configure a webhook (or SCM polling) so pushes trigger builds.\n\n## Troubleshooting (common)\n- \u201cWebhook not firing\u201d: confirm ingress host, webhook URL, and Jenkins job is reachable.\n- \u201cAuth denied cloning\u201d: confirm Keycloak group membership and that Jenkins has a valid token/credential configured."
|
||||
},
|
||||
{
|
||||
"path": "runbooks/comms-verify.md",
|
||||
"title": "Othrys verification checklist",
|
||||
"tags": [
|
||||
"comms",
|
||||
"matrix",
|
||||
"element",
|
||||
"livekit"
|
||||
],
|
||||
"entrypoints": [
|
||||
"https://live.bstein.dev",
|
||||
"https://matrix.live.bstein.dev"
|
||||
],
|
||||
"source_paths": [],
|
||||
"body": "1) Guest join:\n- Open a private window and visit:\n `https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join`\n- Confirm the guest join flow works and the displayname becomes `<word>-<word>`.\n\n2) Keycloak login:\n- Log in from `https://live.bstein.dev` and confirm MAS -> Keycloak -> Element redirect.\n\n3) Video rooms:\n- Start an Element Call room and confirm audio/video with a second account.\n- Check that guests can read public rooms but cannot start calls.\n\n4) Well-known:\n- `https://live.bstein.dev/.well-known/matrix/client` returns JSON.\n- `https://matrix.live.bstein.dev/.well-known/matrix/client` returns JSON.\n\n5) TURN reachability:\n- Confirm `turn.live.bstein.dev:3478` and `turns:5349` are reachable from WAN."
|
||||
},
|
||||
{
|
||||
"path": "runbooks/kb-authoring.md",
|
||||
"title": "KB authoring: what to write (and what not to)",
|
||||
"tags": [
|
||||
"atlas",
|
||||
"kb",
|
||||
"runbooks"
|
||||
],
|
||||
"entrypoints": [],
|
||||
"source_paths": [
|
||||
"knowledge/runbooks",
|
||||
"scripts/knowledge_render_atlas.py"
|
||||
],
|
||||
"body": "# KB authoring: what to write (and what not to)\n\n## The goal\nGive Atlas assistants enough grounded, Atlas-specific context to answer \u201chow do I\u2026?\u201d questions without guessing.\n\n## What to capture (high value)\n- User workflows: \u201cclick here, set X, expected result\u201d\n- Operator workflows: \u201cedit these files, reconcile this kustomization, verify with these commands\u201d\n- Wiring: \u201cthis host routes to this service; this service depends on Postgres/Vault/etc\u201d\n- Failure modes: exact error messages + the 2\u20135 checks that usually resolve them\n- Permissions: Keycloak groups/roles and what they unlock\n\n## What to avoid (low value / fluff)\n- Generic Kubernetes explanations (link to upstream docs instead)\n- Copy-pasting large manifests (prefer file paths + small snippets)\n- Anything that will drift quickly (render it from GitOps instead)\n- Any secret values (reference Secret/Vault locations by name only)\n\n## Document pattern (recommended)\nEach runbook should answer:\n- \u201cWhat is this?\u201d\n- \u201cWhat do users do?\u201d\n- \u201cWhat do operators change (where in Git)?\u201d\n- \u201cHow do we verify it works?\u201d\n- \u201cWhat breaks and how to debug it?\u201d"
|
||||
},
|
||||
{
|
||||
"path": "runbooks/observability.md",
|
||||
"title": "Observability: Grafana + VictoriaMetrics (how to query safely)",
|
||||
"tags": [
|
||||
"atlas",
|
||||
"monitoring",
|
||||
"grafana",
|
||||
"victoriametrics"
|
||||
],
|
||||
"entrypoints": [
|
||||
"metrics.bstein.dev",
|
||||
"alerts.bstein.dev"
|
||||
],
|
||||
"source_paths": [
|
||||
"services/monitoring"
|
||||
],
|
||||
"body": "# Observability: Grafana + VictoriaMetrics (how to query safely)\n\n## Where it is configured\n- `services/monitoring/helmrelease.yaml` (Grafana + Alertmanager + VM values)\n- `services/monitoring/grafana-dashboard-*.yaml` (dashboards and their PromQL)\n\n## Using metrics as a \u201ctool\u201d for Atlas assistants\nThe safest pattern is: map a small set of intents \u2192 fixed PromQL queries, then summarize results.\n\nExamples (intents)\n- \u201cIs the cluster healthy?\u201d \u2192 node readiness + pod restart rate\n- \u201cWhy is Element Call failing?\u201d \u2192 LiveKit/coturn pod restarts + synapse errors + ingress 5xx\n- \u201cIs Jenkins slow?\u201d \u2192 pod CPU/memory + HTTP latency metrics (if exported)\n\n## Why dashboards are not the KB\nDashboards are great references, but the assistant should query VictoriaMetrics directly for live answers and keep the\nKB focused on wiring, runbooks, and stable conventions."
|
||||
},
|
||||
{
|
||||
"path": "runbooks/template.md",
|
||||
"title": "<short title>",
|
||||
"tags": [
|
||||
"atlas",
|
||||
"<service>",
|
||||
"<topic>"
|
||||
],
|
||||
"entrypoints": [
|
||||
"<hostnames if relevant>"
|
||||
],
|
||||
"source_paths": [
|
||||
"services/<svc>",
|
||||
"clusters/atlas/<...>"
|
||||
],
|
||||
"body": "# <Short title>\n\n## What this is\n\n## For users (how to)\n\n## For operators (where configured)\n\n## Troubleshooting (symptoms \u2192 checks)"
|
||||
}
|
||||
]
|
||||
189
knowledge/diagrams/atlas-http.mmd
Normal file
189
knowledge/diagrams/atlas-http.mmd
Normal file
@ -0,0 +1,189 @@
|
||||
flowchart LR
|
||||
host_auth_bstein_dev["auth.bstein.dev"]
|
||||
svc_sso_oauth2_proxy["sso/oauth2-proxy (Service)"]
|
||||
host_auth_bstein_dev --> svc_sso_oauth2_proxy
|
||||
wl_sso_oauth2_proxy["sso/oauth2-proxy (Deployment)"]
|
||||
svc_sso_oauth2_proxy --> wl_sso_oauth2_proxy
|
||||
host_bstein_dev["bstein.dev"]
|
||||
svc_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Service)"]
|
||||
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_frontend
|
||||
wl_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Deployment)"]
|
||||
svc_bstein_dev_home_bstein_dev_home_frontend --> wl_bstein_dev_home_bstein_dev_home_frontend
|
||||
svc_comms_matrix_wellknown["comms/matrix-wellknown (Service)"]
|
||||
host_bstein_dev --> svc_comms_matrix_wellknown
|
||||
wl_comms_matrix_wellknown["comms/matrix-wellknown (Deployment)"]
|
||||
svc_comms_matrix_wellknown --> wl_comms_matrix_wellknown
|
||||
svc_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Service)"]
|
||||
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend
|
||||
wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"]
|
||||
svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend
|
||||
host_call_live_bstein_dev["call.live.bstein.dev"]
|
||||
svc_comms_element_call["comms/element-call (Service)"]
|
||||
host_call_live_bstein_dev --> svc_comms_element_call
|
||||
wl_comms_element_call["comms/element-call (Deployment)"]
|
||||
svc_comms_element_call --> wl_comms_element_call
|
||||
host_chat_ai_bstein_dev["chat.ai.bstein.dev"]
|
||||
svc_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Service)"]
|
||||
host_chat_ai_bstein_dev --> svc_bstein_dev_home_chat_ai_gateway
|
||||
wl_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Deployment)"]
|
||||
svc_bstein_dev_home_chat_ai_gateway --> wl_bstein_dev_home_chat_ai_gateway
|
||||
host_ci_bstein_dev["ci.bstein.dev"]
|
||||
svc_jenkins_jenkins["jenkins/jenkins (Service)"]
|
||||
host_ci_bstein_dev --> svc_jenkins_jenkins
|
||||
wl_jenkins_jenkins["jenkins/jenkins (Deployment)"]
|
||||
svc_jenkins_jenkins --> wl_jenkins_jenkins
|
||||
host_cloud_bstein_dev["cloud.bstein.dev"]
|
||||
svc_nextcloud_nextcloud["nextcloud/nextcloud (Service)"]
|
||||
host_cloud_bstein_dev --> svc_nextcloud_nextcloud
|
||||
wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"]
|
||||
svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud
|
||||
host_kit_live_bstein_dev["kit.live.bstein.dev"]
|
||||
svc_comms_livekit_token_service["comms/livekit-token-service (Service)"]
|
||||
host_kit_live_bstein_dev --> svc_comms_livekit_token_service
|
||||
wl_comms_livekit_token_service["comms/livekit-token-service (Deployment)"]
|
||||
svc_comms_livekit_token_service --> wl_comms_livekit_token_service
|
||||
svc_comms_livekit["comms/livekit (Service)"]
|
||||
host_kit_live_bstein_dev --> svc_comms_livekit
|
||||
wl_comms_livekit["comms/livekit (Deployment)"]
|
||||
svc_comms_livekit --> wl_comms_livekit
|
||||
host_live_bstein_dev["live.bstein.dev"]
|
||||
svc_comms_othrys_element_element_web["comms/othrys-element-element-web (Service)"]
|
||||
host_live_bstein_dev --> svc_comms_othrys_element_element_web
|
||||
wl_comms_othrys_element_element_web["comms/othrys-element-element-web (Deployment)"]
|
||||
svc_comms_othrys_element_element_web --> wl_comms_othrys_element_element_web
|
||||
host_live_bstein_dev --> svc_comms_matrix_wellknown
|
||||
svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"]
|
||||
host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
|
||||
wl_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Deployment)"]
|
||||
svc_comms_othrys_synapse_matrix_synapse --> wl_comms_othrys_synapse_matrix_synapse
|
||||
host_longhorn_bstein_dev["longhorn.bstein.dev"]
|
||||
svc_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Service)"]
|
||||
host_longhorn_bstein_dev --> svc_longhorn_system_oauth2_proxy_longhorn
|
||||
wl_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Deployment)"]
|
||||
svc_longhorn_system_oauth2_proxy_longhorn --> wl_longhorn_system_oauth2_proxy_longhorn
|
||||
host_mail_bstein_dev["mail.bstein.dev"]
|
||||
svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"]
|
||||
host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front
|
||||
host_matrix_live_bstein_dev["matrix.live.bstein.dev"]
|
||||
svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
|
||||
host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service
|
||||
wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
|
||||
svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
|
||||
host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown
|
||||
host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
|
||||
svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
|
||||
host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register
|
||||
wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
|
||||
svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
|
||||
host_monero_bstein_dev["monero.bstein.dev"]
|
||||
svc_crypto_monerod["crypto/monerod (Service)"]
|
||||
host_monero_bstein_dev --> svc_crypto_monerod
|
||||
wl_crypto_monerod["crypto/monerod (Deployment)"]
|
||||
svc_crypto_monerod --> wl_crypto_monerod
|
||||
host_office_bstein_dev["office.bstein.dev"]
|
||||
svc_nextcloud_collabora["nextcloud/collabora (Service)"]
|
||||
host_office_bstein_dev --> svc_nextcloud_collabora
|
||||
wl_nextcloud_collabora["nextcloud/collabora (Deployment)"]
|
||||
svc_nextcloud_collabora --> wl_nextcloud_collabora
|
||||
host_pegasus_bstein_dev["pegasus.bstein.dev"]
|
||||
svc_jellyfin_pegasus["jellyfin/pegasus (Service)"]
|
||||
host_pegasus_bstein_dev --> svc_jellyfin_pegasus
|
||||
wl_jellyfin_pegasus["jellyfin/pegasus (Deployment)"]
|
||||
svc_jellyfin_pegasus --> wl_jellyfin_pegasus
|
||||
host_scm_bstein_dev["scm.bstein.dev"]
|
||||
svc_gitea_gitea["gitea/gitea (Service)"]
|
||||
host_scm_bstein_dev --> svc_gitea_gitea
|
||||
wl_gitea_gitea["gitea/gitea (Deployment)"]
|
||||
svc_gitea_gitea --> wl_gitea_gitea
|
||||
host_secret_bstein_dev["secret.bstein.dev"]
|
||||
svc_vault_vault["vault/vault (Service)"]
|
||||
host_secret_bstein_dev --> svc_vault_vault
|
||||
wl_vault_vault["vault/vault (StatefulSet)"]
|
||||
svc_vault_vault --> wl_vault_vault
|
||||
host_sso_bstein_dev["sso.bstein.dev"]
|
||||
svc_sso_keycloak["sso/keycloak (Service)"]
|
||||
host_sso_bstein_dev --> svc_sso_keycloak
|
||||
wl_sso_keycloak["sso/keycloak (Deployment)"]
|
||||
svc_sso_keycloak --> wl_sso_keycloak
|
||||
host_stream_bstein_dev["stream.bstein.dev"]
|
||||
svc_jellyfin_jellyfin["jellyfin/jellyfin (Service)"]
|
||||
host_stream_bstein_dev --> svc_jellyfin_jellyfin
|
||||
wl_jellyfin_jellyfin["jellyfin/jellyfin (Deployment)"]
|
||||
svc_jellyfin_jellyfin --> wl_jellyfin_jellyfin
|
||||
host_vault_bstein_dev["vault.bstein.dev"]
|
||||
svc_vaultwarden_vaultwarden_service["vaultwarden/vaultwarden-service (Service)"]
|
||||
host_vault_bstein_dev --> svc_vaultwarden_vaultwarden_service
|
||||
wl_vaultwarden_vaultwarden["vaultwarden/vaultwarden (Deployment)"]
|
||||
svc_vaultwarden_vaultwarden_service --> wl_vaultwarden_vaultwarden
|
||||
|
||||
subgraph bstein_dev_home[bstein-dev-home]
|
||||
svc_bstein_dev_home_bstein_dev_home_frontend
|
||||
wl_bstein_dev_home_bstein_dev_home_frontend
|
||||
svc_bstein_dev_home_bstein_dev_home_backend
|
||||
wl_bstein_dev_home_bstein_dev_home_backend
|
||||
svc_bstein_dev_home_chat_ai_gateway
|
||||
wl_bstein_dev_home_chat_ai_gateway
|
||||
end
|
||||
subgraph comms[comms]
|
||||
svc_comms_matrix_wellknown
|
||||
wl_comms_matrix_wellknown
|
||||
svc_comms_element_call
|
||||
wl_comms_element_call
|
||||
svc_comms_livekit_token_service
|
||||
wl_comms_livekit_token_service
|
||||
svc_comms_livekit
|
||||
wl_comms_livekit
|
||||
svc_comms_othrys_element_element_web
|
||||
wl_comms_othrys_element_element_web
|
||||
svc_comms_othrys_synapse_matrix_synapse
|
||||
wl_comms_othrys_synapse_matrix_synapse
|
||||
svc_comms_matrix_authentication_service
|
||||
wl_comms_matrix_authentication_service
|
||||
svc_comms_matrix_guest_register
|
||||
wl_comms_matrix_guest_register
|
||||
end
|
||||
subgraph crypto[crypto]
|
||||
svc_crypto_monerod
|
||||
wl_crypto_monerod
|
||||
end
|
||||
subgraph gitea[gitea]
|
||||
svc_gitea_gitea
|
||||
wl_gitea_gitea
|
||||
end
|
||||
subgraph jellyfin[jellyfin]
|
||||
svc_jellyfin_pegasus
|
||||
wl_jellyfin_pegasus
|
||||
svc_jellyfin_jellyfin
|
||||
wl_jellyfin_jellyfin
|
||||
end
|
||||
subgraph jenkins[jenkins]
|
||||
svc_jenkins_jenkins
|
||||
wl_jenkins_jenkins
|
||||
end
|
||||
subgraph longhorn_system[longhorn-system]
|
||||
svc_longhorn_system_oauth2_proxy_longhorn
|
||||
wl_longhorn_system_oauth2_proxy_longhorn
|
||||
end
|
||||
subgraph mailu_mailserver[mailu-mailserver]
|
||||
svc_mailu_mailserver_mailu_front
|
||||
end
|
||||
subgraph nextcloud[nextcloud]
|
||||
svc_nextcloud_nextcloud
|
||||
wl_nextcloud_nextcloud
|
||||
svc_nextcloud_collabora
|
||||
wl_nextcloud_collabora
|
||||
end
|
||||
subgraph sso[sso]
|
||||
svc_sso_oauth2_proxy
|
||||
wl_sso_oauth2_proxy
|
||||
svc_sso_keycloak
|
||||
wl_sso_keycloak
|
||||
end
|
||||
subgraph vault[vault]
|
||||
svc_vault_vault
|
||||
wl_vault_vault
|
||||
end
|
||||
subgraph vaultwarden[vaultwarden]
|
||||
svc_vaultwarden_vaultwarden_service
|
||||
wl_vaultwarden_vaultwarden
|
||||
end
|
||||
26
knowledge/metis.md
Normal file
26
knowledge/metis.md
Normal file
@ -0,0 +1,26 @@
|
||||
# Metis (node recovery)
|
||||
|
||||
## Node classes (current map)
|
||||
- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)
|
||||
- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)
|
||||
- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks)
|
||||
- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent)
|
||||
- amd64 agents: titan-22/24 (Debian 13, k3s agent)
|
||||
- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, future titan-20/21 (when added), plus any newcomers.
|
||||
|
||||
## Longhorn disk UUIDs (critical nodes)
|
||||
- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4)
|
||||
- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4)
|
||||
- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4)
|
||||
- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4)
|
||||
|
||||
## Metis repo (~/Development/metis)
|
||||
- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`).
|
||||
- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints).
|
||||
- `AGENTS.md` in repo is untracked and holds raw notes.
|
||||
|
||||
## Next implementation steps
|
||||
- Add per-class golden image refs and checksums (Harbor or file://) when ready.
|
||||
- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths.
|
||||
- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection.
|
||||
- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited.
|
||||
27
knowledge/runbooks/ci-gitea-jenkins.md
Normal file
27
knowledge/runbooks/ci-gitea-jenkins.md
Normal file
@ -0,0 +1,27 @@
|
||||
---
|
||||
title: "CI: Gitea → Jenkins pipeline"
|
||||
tags: ["atlas", "ci", "gitea", "jenkins"]
|
||||
owners: ["brad"]
|
||||
entrypoints: ["scm.bstein.dev", "ci.bstein.dev"]
|
||||
source_paths: ["services/gitea", "services/jenkins", "scripts/jenkins_cred_sync.sh", "scripts/gitea_cred_sync.sh"]
|
||||
---
|
||||
|
||||
# CI: Gitea → Jenkins pipeline
|
||||
|
||||
## What this is
|
||||
Atlas uses Gitea for source control and Jenkins for CI. Authentication is via Keycloak (SSO).
|
||||
|
||||
## Where it is configured
|
||||
- Gitea manifests: `services/gitea/`
|
||||
- Jenkins manifests: `services/jenkins/`
|
||||
- Credential sync helpers: `scripts/gitea_cred_sync.sh`, `scripts/jenkins_cred_sync.sh`
|
||||
|
||||
## What users do (typical flow)
|
||||
- Create a repo in Gitea.
|
||||
- Create/update a Jenkins job/pipeline that can fetch the repo.
|
||||
- Configure a webhook (or SCM polling) so pushes trigger builds.
|
||||
|
||||
## Troubleshooting (common)
|
||||
- “Webhook not firing”: confirm ingress host, webhook URL, and Jenkins job is reachable.
|
||||
- “Auth denied cloning”: confirm Keycloak group membership and that Jenkins has a valid token/credential configured.
|
||||
|
||||
30
knowledge/runbooks/comms-verify.md
Normal file
30
knowledge/runbooks/comms-verify.md
Normal file
@ -0,0 +1,30 @@
|
||||
---
|
||||
title: Othrys verification checklist
|
||||
tags:
|
||||
- comms
|
||||
- matrix
|
||||
- element
|
||||
- livekit
|
||||
entrypoints:
|
||||
- https://live.bstein.dev
|
||||
- https://matrix.live.bstein.dev
|
||||
---
|
||||
|
||||
1) Guest join:
|
||||
- Open a private window and visit:
|
||||
`https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join`
|
||||
- Confirm the guest join flow works and the displayname becomes `<word>-<word>`.
|
||||
|
||||
2) Keycloak login:
|
||||
- Log in from `https://live.bstein.dev` and confirm MAS -> Keycloak -> Element redirect.
|
||||
|
||||
3) Video rooms:
|
||||
- Start an Element Call room and confirm audio/video with a second account.
|
||||
- Check that guests can read public rooms but cannot start calls.
|
||||
|
||||
4) Well-known:
|
||||
- `https://live.bstein.dev/.well-known/matrix/client` returns JSON.
|
||||
- `https://matrix.live.bstein.dev/.well-known/matrix/client` returns JSON.
|
||||
|
||||
5) TURN reachability:
|
||||
- Confirm `turn.live.bstein.dev:3478` and `turns:5349` are reachable from WAN.
|
||||
34
knowledge/runbooks/kb-authoring.md
Normal file
34
knowledge/runbooks/kb-authoring.md
Normal file
@ -0,0 +1,34 @@
|
||||
---
|
||||
title: "KB authoring: what to write (and what not to)"
|
||||
tags: ["atlas", "kb", "runbooks"]
|
||||
owners: ["brad"]
|
||||
entrypoints: []
|
||||
source_paths: ["knowledge/runbooks", "scripts/knowledge_render_atlas.py"]
|
||||
---
|
||||
|
||||
# KB authoring: what to write (and what not to)
|
||||
|
||||
## The goal
|
||||
Give Atlas assistants enough grounded, Atlas-specific context to answer “how do I…?” questions without guessing.
|
||||
|
||||
## What to capture (high value)
|
||||
- User workflows: “click here, set X, expected result”
|
||||
- Operator workflows: “edit these files, reconcile this kustomization, verify with these commands”
|
||||
- Wiring: “this host routes to this service; this service depends on Postgres/Vault/etc”
|
||||
- Failure modes: exact error messages + the 2–5 checks that usually resolve them
|
||||
- Permissions: Keycloak groups/roles and what they unlock
|
||||
|
||||
## What to avoid (low value / fluff)
|
||||
- Generic Kubernetes explanations (link to upstream docs instead)
|
||||
- Copy-pasting large manifests (prefer file paths + small snippets)
|
||||
- Anything that will drift quickly (render it from GitOps instead)
|
||||
- Any secret values (reference Secret/Vault locations by name only)
|
||||
|
||||
## Document pattern (recommended)
|
||||
Each runbook should answer:
|
||||
- “What is this?”
|
||||
- “What do users do?”
|
||||
- “What do operators change (where in Git)?”
|
||||
- “How do we verify it works?”
|
||||
- “What breaks and how to debug it?”
|
||||
|
||||
26
knowledge/runbooks/observability.md
Normal file
26
knowledge/runbooks/observability.md
Normal file
@ -0,0 +1,26 @@
|
||||
---
|
||||
title: "Observability: Grafana + VictoriaMetrics (how to query safely)"
|
||||
tags: ["atlas", "monitoring", "grafana", "victoriametrics"]
|
||||
owners: ["brad"]
|
||||
entrypoints: ["metrics.bstein.dev", "alerts.bstein.dev"]
|
||||
source_paths: ["services/monitoring"]
|
||||
---
|
||||
|
||||
# Observability: Grafana + VictoriaMetrics (how to query safely)
|
||||
|
||||
## Where it is configured
|
||||
- `services/monitoring/helmrelease.yaml` (Grafana + Alertmanager + VM values)
|
||||
- `services/monitoring/grafana-dashboard-*.yaml` (dashboards and their PromQL)
|
||||
|
||||
## Using metrics as a “tool” for Atlas assistants
|
||||
The safest pattern is: map a small set of intents → fixed PromQL queries, then summarize results.
|
||||
|
||||
Examples (intents)
|
||||
- “Is the cluster healthy?” → node readiness + pod restart rate
|
||||
- “Why is Element Call failing?” → LiveKit/coturn pod restarts + synapse errors + ingress 5xx
|
||||
- “Is Jenkins slow?” → pod CPU/memory + HTTP latency metrics (if exported)
|
||||
|
||||
## Why dashboards are not the KB
|
||||
Dashboards are great references, but the assistant should query VictoriaMetrics directly for live answers and keep the
|
||||
KB focused on wiring, runbooks, and stable conventions.
|
||||
|
||||
18
knowledge/runbooks/template.md
Normal file
18
knowledge/runbooks/template.md
Normal file
@ -0,0 +1,18 @@
|
||||
---
|
||||
title: "<short title>"
|
||||
tags: ["atlas", "<service>", "<topic>"]
|
||||
owners: ["brad"]
|
||||
entrypoints: ["<hostnames if relevant>"]
|
||||
source_paths: ["services/<svc>", "clusters/atlas/<...>"]
|
||||
---
|
||||
|
||||
# <Short title>
|
||||
|
||||
## What this is
|
||||
|
||||
## For users (how to)
|
||||
|
||||
## For operators (where configured)
|
||||
|
||||
## Troubleshooting (symptoms → checks)
|
||||
|
||||
73
knowledge/software/metis.md
Normal file
73
knowledge/software/metis.md
Normal file
@ -0,0 +1,73 @@
|
||||
# Metis (node recovery)
|
||||
|
||||
## Node classes (current map)
|
||||
- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)
|
||||
- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)
|
||||
- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks)
|
||||
- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent)
|
||||
- amd64 agents: titan-22/24 (Debian 13, k3s agent)
|
||||
- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, plus any newcomers.
|
||||
|
||||
### Jetson nodes (titan-20/21)
|
||||
- Ubuntu 20.04.6 (Focal), kernel 5.10.104-tegra, CRI containerd 2.0.5-k3s2, arch arm64.
|
||||
- Storage: NVMe 232G at / (ext4); onboard mmc partitions present but root on NVMe; 1.9T sda present (unused).
|
||||
- k3s agent with drop-in 99-nofile.conf.
|
||||
|
||||
## Longhorn disk UUIDs (critical nodes)
|
||||
- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4)
|
||||
- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4)
|
||||
- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4)
|
||||
- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4)
|
||||
|
||||
## Metis repo (~/Development/metis)
|
||||
- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`).
|
||||
- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints).
|
||||
- `AGENTS.md` in repo is untracked and holds raw notes.
|
||||
|
||||
## Next implementation steps
|
||||
- Add per-class golden image refs and checksums (Harbor or file://) when ready.
|
||||
- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths.
|
||||
- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection.
|
||||
- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited.
|
||||
|
||||
## Node OS/Kernel/CRI snapshot (Jan 2026)
|
||||
- titan-04: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-05: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-06: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-07: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-08: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-09: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-0a: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-0b: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-0c: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-10: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-11: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-12: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
|
||||
- titan-13: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
|
||||
- titan-14: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
|
||||
- titan-15: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
|
||||
- titan-17: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
|
||||
- titan-18: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
|
||||
- titan-19: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
|
||||
- titan-20: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-21: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-22: Debian 13 (trixie), kernel 6.12.41+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64
|
||||
- titan-24: Debian 13 (trixie), kernel 6.12.57+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64
|
||||
|
||||
|
||||
### External hosts
|
||||
- titan-db: Ubuntu 24.10, kernel 6.11.0-1015-raspi, root on /dev/sda2 ext4 (465G), boot vfat /dev/sda1; PostgreSQL service enabled.
|
||||
- titan-jh: Arch Linux ARM (rolling), kernel 6.18.4-2-rpi, NVMe root ext4 238G (/), boot vfat 512M; ~495 packages installed (pacman -Q).
|
||||
- titan-23/oceanus: TODO audit (future).
|
||||
|
||||
|
||||
### Control plane Pis (titan-0a/0b/0c)
|
||||
- Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, containerd 2.0.5-k3s2.
|
||||
- Storage: 477G SSD root (/dev/sda2 ext4), /boot/firmware vfat (/dev/sda1). fstab uses LABEL=writable and LABEL=system-boot.
|
||||
- k3s server (control-plane taint expected); etcd snapshots not yet cataloged (TODO).
|
||||
|
||||
|
||||
## k3s versions
|
||||
- rpi5 workers/control-plane: k3s v1.33.3+k3s1 (crictl v1.31.0-k3s2)
|
||||
- rpi4 nodes: k3s v1.31.5+k3s1 (crictl v1.31.0-k3s2)
|
||||
- Jetson titan-20/21: k3s v1.33.3+k3s1 (per node info), crictl v1.31.0-k3s2
|
||||
5
scripts/comms_sync_kb.sh
Executable file
5
scripts/comms_sync_kb.sh
Executable file
@ -0,0 +1,5 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
python scripts/knowledge_render_atlas.py --write
|
||||
python scripts/knowledge_render_atlas.py --write --out services/comms/knowledge
|
||||
@ -9,6 +9,7 @@ Usage:
|
||||
import argparse
|
||||
import json
|
||||
import textwrap
|
||||
import urllib.parse
|
||||
from pathlib import Path
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@ -45,12 +46,14 @@ PERCENT_THRESHOLDS = {
|
||||
],
|
||||
}
|
||||
|
||||
NAMESPACE_CPU_WINDOW = "1m"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cluster metadata
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
CONTROL_PLANE_NODES = ["titan-0a", "titan-0b", "titan-0c"]
|
||||
CONTROL_DEPENDENCIES = ["titan-db"]
|
||||
CONTROL_DEPENDENCIES = ["titan-db", "titan-jh"]
|
||||
CONTROL_ALL = CONTROL_PLANE_NODES + CONTROL_DEPENDENCIES
|
||||
WORKER_NODES = [
|
||||
"titan-04",
|
||||
@ -61,11 +64,12 @@ WORKER_NODES = [
|
||||
"titan-09",
|
||||
"titan-10",
|
||||
"titan-11",
|
||||
"titan-20",
|
||||
"titan-21",
|
||||
"titan-12",
|
||||
"titan-13",
|
||||
"titan-14",
|
||||
"titan-15",
|
||||
"titan-16",
|
||||
"titan-17",
|
||||
"titan-18",
|
||||
"titan-19",
|
||||
@ -80,7 +84,22 @@ CONTROL_TOTAL = len(CONTROL_PLANE_NODES)
|
||||
WORKER_TOTAL = len(WORKER_NODES)
|
||||
CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
|
||||
WORKER_SUFFIX = f"/{WORKER_TOTAL}"
|
||||
CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system"
|
||||
# Namespaces considered infrastructure (excluded from workload counts)
|
||||
INFRA_NAMESPACES = [
|
||||
"kube-system",
|
||||
"longhorn-system",
|
||||
"metallb-system",
|
||||
"monitoring",
|
||||
"logging",
|
||||
"cert-manager",
|
||||
"flux-system",
|
||||
"traefik",
|
||||
"maintenance",
|
||||
"postgres",
|
||||
]
|
||||
INFRA_REGEX = f"^({'|'.join(INFRA_NAMESPACES)})$"
|
||||
# Namespaces allowed on control plane without counting as workloads
|
||||
CP_ALLOWED_NS = INFRA_REGEX
|
||||
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
|
||||
GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4]
|
||||
CONTROL_WORKLOADS_EXPR = (
|
||||
@ -170,22 +189,48 @@ def node_io_expr(scope=""):
|
||||
return scoped_node_expr(base, scope)
|
||||
|
||||
|
||||
def namespace_selector(scope_var):
|
||||
return f'namespace!="",pod!="",container!="",container!="POD",{scope_var}'
|
||||
|
||||
|
||||
def namespace_gpu_selector(scope_var):
|
||||
return f'namespace!="",pod!="",{scope_var}'
|
||||
|
||||
|
||||
def namespace_cpu_raw(scope_var):
|
||||
return (
|
||||
"sum(rate(container_cpu_usage_seconds_total"
|
||||
f"{{{namespace_selector(scope_var)}}}[{NAMESPACE_CPU_WINDOW}])) by (namespace)"
|
||||
)
|
||||
|
||||
|
||||
def namespace_ram_raw(scope_var):
|
||||
return f"sum(container_memory_working_set_bytes{{{namespace_selector(scope_var)}}}) by (namespace)"
|
||||
|
||||
|
||||
def namespace_gpu_usage_instant(scope_var):
|
||||
return f"sum(DCGM_FI_DEV_GPU_UTIL{{{namespace_gpu_selector(scope_var)}}}) by (namespace)"
|
||||
|
||||
|
||||
def namespace_share_expr(resource_expr):
|
||||
selected = f"( {resource_expr} ) and on(namespace) ( {NAMESPACE_TOP_FILTER} )"
|
||||
total = f"clamp_min(sum( {selected} ), 1)"
|
||||
return f"100 * ( {selected} ) / {total}"
|
||||
total = f"clamp_min(sum( {resource_expr} ), 1)"
|
||||
return f"100 * ( {resource_expr} ) / {total}"
|
||||
|
||||
|
||||
def namespace_cpu_share_expr():
|
||||
return namespace_share_expr(NAMESPACE_CPU_RAW)
|
||||
def namespace_cpu_share_expr(scope_var):
|
||||
return namespace_share_expr(namespace_cpu_raw(scope_var))
|
||||
|
||||
|
||||
def namespace_ram_share_expr():
|
||||
return namespace_share_expr(NAMESPACE_RAM_RAW)
|
||||
def namespace_ram_share_expr(scope_var):
|
||||
return namespace_share_expr(namespace_ram_raw(scope_var))
|
||||
|
||||
|
||||
def namespace_gpu_share_expr():
|
||||
return namespace_share_expr(NAMESPACE_GPU_RAW)
|
||||
def namespace_gpu_share_expr(scope_var):
|
||||
usage = namespace_gpu_usage_instant(scope_var)
|
||||
total = f"(sum({usage}) or on() vector(0))"
|
||||
share = f"100 * ({usage}) / clamp_min({total}, 1)"
|
||||
idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)"
|
||||
return f"({share}) or ({idle})"
|
||||
|
||||
|
||||
PROBLEM_PODS_EXPR = (
|
||||
@ -270,46 +315,12 @@ STUCK_TABLE_EXPR = (
|
||||
")"
|
||||
)
|
||||
|
||||
NAMESPACE_CPU_RAW = (
|
||||
'sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)'
|
||||
)
|
||||
NAMESPACE_RAM_RAW = (
|
||||
'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)'
|
||||
)
|
||||
NAMESPACE_SCOPE_WORKLOAD = f'namespace!~"{INFRA_REGEX}"'
|
||||
NAMESPACE_SCOPE_ALL = 'namespace=~".*"'
|
||||
NAMESPACE_SCOPE_INFRA = f'namespace=~"{INFRA_REGEX}"'
|
||||
NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"]
|
||||
GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
|
||||
GPU_NODE_REGEX = "|".join(GPU_NODES)
|
||||
NAMESPACE_GPU_ALLOC = (
|
||||
'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}'
|
||||
' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
|
||||
)
|
||||
NAMESPACE_GPU_USAGE_SHARE = (
|
||||
'sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))'
|
||||
)
|
||||
NAMESPACE_GPU_USAGE_INSTANT = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)'
|
||||
NAMESPACE_GPU_RAW = (
|
||||
"("
|
||||
+ NAMESPACE_GPU_USAGE_SHARE
|
||||
+ ") or on(namespace) ("
|
||||
+ NAMESPACE_CPU_RAW
|
||||
+ " * 0)"
|
||||
)
|
||||
NAMESPACE_GPU_WEIGHT = (
|
||||
"("
|
||||
+ NAMESPACE_GPU_ALLOC
|
||||
+ ") or on(namespace) ("
|
||||
+ NAMESPACE_CPU_RAW
|
||||
+ " * 0)"
|
||||
)
|
||||
NAMESPACE_ACTIVITY_SCORE = (
|
||||
"( "
|
||||
+ NAMESPACE_CPU_RAW
|
||||
+ " ) + ("
|
||||
+ NAMESPACE_RAM_RAW
|
||||
+ " / 1e9) + ("
|
||||
+ NAMESPACE_GPU_WEIGHT
|
||||
+ " * 100)"
|
||||
)
|
||||
NAMESPACE_TOP_FILTER = "(topk(10, " + NAMESPACE_ACTIVITY_SCORE + ") >= bool 0)"
|
||||
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
|
||||
TRAEFIK_NET_INGRESS = (
|
||||
'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
|
||||
@ -560,9 +571,9 @@ def table_panel(
|
||||
return panel
|
||||
|
||||
|
||||
def pie_panel(panel_id, title, expr, grid):
|
||||
def pie_panel(panel_id, title, expr, grid, *, links=None, description=None):
|
||||
"""Return a pie chart panel with readable namespace labels."""
|
||||
return {
|
||||
panel = {
|
||||
"id": panel_id,
|
||||
"type": "piechart",
|
||||
"title": title,
|
||||
@ -586,6 +597,71 @@ def pie_panel(panel_id, title, expr, grid):
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
|
||||
},
|
||||
}
|
||||
if links:
|
||||
panel["links"] = links
|
||||
if description:
|
||||
panel["description"] = description
|
||||
return panel
|
||||
|
||||
|
||||
def namespace_scope_variable(var_name, label):
|
||||
options = [
|
||||
{
|
||||
"text": "workload namespaces only",
|
||||
"value": NAMESPACE_SCOPE_WORKLOAD,
|
||||
"selected": True,
|
||||
},
|
||||
{"text": "all namespaces", "value": NAMESPACE_SCOPE_ALL, "selected": False},
|
||||
{
|
||||
"text": "infrastructure namespaces only",
|
||||
"value": NAMESPACE_SCOPE_INFRA,
|
||||
"selected": False,
|
||||
},
|
||||
]
|
||||
query = (
|
||||
"workload namespaces only : "
|
||||
+ NAMESPACE_SCOPE_WORKLOAD
|
||||
+ ",all namespaces : "
|
||||
+ NAMESPACE_SCOPE_ALL
|
||||
+ ",infrastructure namespaces only : "
|
||||
+ NAMESPACE_SCOPE_INFRA
|
||||
)
|
||||
return {
|
||||
"name": var_name,
|
||||
"label": label,
|
||||
"type": "custom",
|
||||
"query": query,
|
||||
"current": {"text": options[0]["text"], "value": options[0]["value"], "selected": True},
|
||||
"options": options,
|
||||
"hide": 2,
|
||||
"multi": False,
|
||||
"includeAll": False,
|
||||
"refresh": 1,
|
||||
"sort": 0,
|
||||
"skipUrlSync": False,
|
||||
}
|
||||
|
||||
|
||||
def namespace_scope_links(var_name):
|
||||
def with_value(value):
|
||||
encoded = urllib.parse.quote(value, safe="")
|
||||
params = []
|
||||
for other in NAMESPACE_SCOPE_VARS:
|
||||
if other == var_name:
|
||||
params.append(f"var-{other}={encoded}")
|
||||
else:
|
||||
params.append(f"var-{other}=${{{other}}}")
|
||||
return "?" + "&".join(params)
|
||||
|
||||
return [
|
||||
{"title": "Workload namespaces only", "url": with_value(NAMESPACE_SCOPE_WORKLOAD), "targetBlank": False},
|
||||
{"title": "All namespaces", "url": with_value(NAMESPACE_SCOPE_ALL), "targetBlank": False},
|
||||
{
|
||||
"title": "Infrastructure namespaces only",
|
||||
"url": with_value(NAMESPACE_SCOPE_INFRA),
|
||||
"targetBlank": False,
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def bargauge_panel(
|
||||
@ -857,6 +933,115 @@ def build_overview():
|
||||
)
|
||||
)
|
||||
|
||||
mail_bounce_rate_thresholds = {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": None},
|
||||
{"color": "yellow", "value": 5},
|
||||
{"color": "orange", "value": 8},
|
||||
{"color": "red", "value": 10},
|
||||
],
|
||||
}
|
||||
mail_limit_thresholds = {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": None},
|
||||
{"color": "yellow", "value": 70},
|
||||
{"color": "orange", "value": 85},
|
||||
{"color": "red", "value": 95},
|
||||
],
|
||||
}
|
||||
mail_success_thresholds = {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "red", "value": None},
|
||||
{"color": "orange", "value": 90},
|
||||
{"color": "yellow", "value": 95},
|
||||
{"color": "green", "value": 98},
|
||||
],
|
||||
}
|
||||
panels.append(
|
||||
stat_panel(
|
||||
30,
|
||||
"Mail Sent (1d)",
|
||||
'max(postmark_outbound_sent{window="1d"})',
|
||||
{"h": 2, "w": 6, "x": 0, "y": 8},
|
||||
unit="none",
|
||||
links=link_to("atlas-mail"),
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
{
|
||||
"id": 31,
|
||||
"type": "stat",
|
||||
"title": "Mail Bounces (1d)",
|
||||
"datasource": PROM_DS,
|
||||
"gridPos": {"h": 2, "w": 6, "x": 12, "y": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": 'max(postmark_outbound_bounce_rate{window="1d"})',
|
||||
"refId": "A",
|
||||
"legendFormat": "Rate",
|
||||
},
|
||||
{
|
||||
"expr": 'max(postmark_outbound_bounced{window="1d"})',
|
||||
"refId": "B",
|
||||
"legendFormat": "Count",
|
||||
},
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"custom": {"displayMode": "auto"},
|
||||
"thresholds": mail_bounce_rate_thresholds,
|
||||
"unit": "none",
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {"id": "byName", "options": "Rate"},
|
||||
"properties": [{"id": "unit", "value": "percent"}],
|
||||
},
|
||||
{
|
||||
"matcher": {"id": "byName", "options": "Count"},
|
||||
"properties": [{"id": "unit", "value": "none"}],
|
||||
},
|
||||
],
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
|
||||
"textMode": "name_and_value",
|
||||
},
|
||||
"links": link_to("atlas-mail"),
|
||||
}
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
32,
|
||||
"Mail Success Rate (1d)",
|
||||
'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)',
|
||||
{"h": 2, "w": 6, "x": 6, "y": 8},
|
||||
unit="percent",
|
||||
thresholds=mail_success_thresholds,
|
||||
decimals=1,
|
||||
links=link_to("atlas-mail"),
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
33,
|
||||
"Mail Limit Used (30d)",
|
||||
"max(postmark_sending_limit_used_percent)",
|
||||
{"h": 2, "w": 6, "x": 18, "y": 8},
|
||||
unit="percent",
|
||||
thresholds=mail_limit_thresholds,
|
||||
decimals=1,
|
||||
links=link_to("atlas-mail"),
|
||||
)
|
||||
)
|
||||
|
||||
storage_panels = [
|
||||
(23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"),
|
||||
(24, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), "percent"),
|
||||
@ -876,28 +1061,38 @@ def build_overview():
|
||||
)
|
||||
)
|
||||
|
||||
cpu_scope = "$namespace_scope_cpu"
|
||||
gpu_scope = "$namespace_scope_gpu"
|
||||
ram_scope = "$namespace_scope_ram"
|
||||
|
||||
panels.append(
|
||||
pie_panel(
|
||||
11,
|
||||
"Namespace CPU Share",
|
||||
namespace_cpu_share_expr(),
|
||||
namespace_cpu_share_expr(cpu_scope),
|
||||
{"h": 9, "w": 8, "x": 0, "y": 16},
|
||||
links=namespace_scope_links("namespace_scope_cpu"),
|
||||
description="Values are normalized within the selected scope; use panel links to switch scope.",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
pie_panel(
|
||||
12,
|
||||
"Namespace GPU Share",
|
||||
namespace_gpu_share_expr(),
|
||||
namespace_gpu_share_expr(gpu_scope),
|
||||
{"h": 9, "w": 8, "x": 8, "y": 16},
|
||||
links=namespace_scope_links("namespace_scope_gpu"),
|
||||
description="Values are normalized within the selected scope; use panel links to switch scope.",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
pie_panel(
|
||||
13,
|
||||
"Namespace RAM Share",
|
||||
namespace_ram_share_expr(),
|
||||
namespace_ram_share_expr(ram_scope),
|
||||
{"h": 9, "w": 8, "x": 16, "y": 16},
|
||||
links=namespace_scope_links("namespace_scope_ram"),
|
||||
description="Values are normalized within the selected scope; use panel links to switch scope.",
|
||||
)
|
||||
)
|
||||
|
||||
@ -1052,7 +1247,6 @@ def build_overview():
|
||||
links=link_to("atlas-storage"),
|
||||
)
|
||||
)
|
||||
|
||||
return {
|
||||
"uid": "atlas-overview",
|
||||
"title": "Atlas Overview",
|
||||
@ -1063,7 +1257,13 @@ def build_overview():
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": ["atlas", "overview"],
|
||||
"templating": {"list": []},
|
||||
"templating": {
|
||||
"list": [
|
||||
namespace_scope_variable("namespace_scope_cpu", "CPU namespace filter"),
|
||||
namespace_scope_variable("namespace_scope_gpu", "GPU namespace filter"),
|
||||
namespace_scope_variable("namespace_scope_ram", "RAM namespace filter"),
|
||||
]
|
||||
},
|
||||
"time": {"from": "now-1h", "to": "now"},
|
||||
"refresh": "1m",
|
||||
"links": [],
|
||||
@ -1513,6 +1713,33 @@ def build_storage_dashboard():
|
||||
time_from="90d",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
30,
|
||||
"Maintenance Sweepers Ready",
|
||||
'kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"} * 100',
|
||||
{"h": 4, "w": 12, "x": 0, "y": 44},
|
||||
unit="percent",
|
||||
thresholds=PERCENT_THRESHOLDS,
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
31,
|
||||
"Maintenance Cron Freshness (s)",
|
||||
'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob=~"image-sweeper|grafana-smtp-sync"})',
|
||||
{"h": 4, "w": 12, "x": 12, "y": 44},
|
||||
unit="s",
|
||||
thresholds={
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": None},
|
||||
{"color": "yellow", "value": 3600},
|
||||
{"color": "red", "value": 10800},
|
||||
],
|
||||
},
|
||||
)
|
||||
)
|
||||
return {
|
||||
"uid": "atlas-storage",
|
||||
"title": "Atlas Storage",
|
||||
@ -1702,21 +1929,231 @@ def build_network_dashboard():
|
||||
}
|
||||
|
||||
|
||||
def build_mail_dashboard():
|
||||
panels = []
|
||||
|
||||
bounce_rate_thresholds = {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": None},
|
||||
{"color": "yellow", "value": 5},
|
||||
{"color": "orange", "value": 8},
|
||||
{"color": "red", "value": 10},
|
||||
],
|
||||
}
|
||||
limit_thresholds = {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": None},
|
||||
{"color": "yellow", "value": 70},
|
||||
{"color": "orange", "value": 85},
|
||||
{"color": "red", "value": 95},
|
||||
],
|
||||
}
|
||||
success_thresholds = {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "red", "value": None},
|
||||
{"color": "orange", "value": 90},
|
||||
{"color": "yellow", "value": 95},
|
||||
{"color": "green", "value": 98},
|
||||
],
|
||||
}
|
||||
|
||||
panels.append(
|
||||
stat_panel(
|
||||
1,
|
||||
"Sent (1d)",
|
||||
'max(postmark_outbound_sent{window="1d"})',
|
||||
{"h": 4, "w": 6, "x": 0, "y": 0},
|
||||
decimals=0,
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
2,
|
||||
"Sent (7d)",
|
||||
'max(postmark_outbound_sent{window="7d"})',
|
||||
{"h": 4, "w": 6, "x": 6, "y": 0},
|
||||
decimals=0,
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "Mail Bounces (1d)",
|
||||
"datasource": PROM_DS,
|
||||
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 0},
|
||||
"targets": [
|
||||
{
|
||||
"expr": 'max(postmark_outbound_bounce_rate{window="1d"})',
|
||||
"refId": "A",
|
||||
"legendFormat": "Rate",
|
||||
},
|
||||
{
|
||||
"expr": 'max(postmark_outbound_bounced{window="1d"})',
|
||||
"refId": "B",
|
||||
"legendFormat": "Count",
|
||||
},
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"custom": {"displayMode": "auto"},
|
||||
"thresholds": bounce_rate_thresholds,
|
||||
"unit": "none",
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {"id": "byName", "options": "Rate"},
|
||||
"properties": [{"id": "unit", "value": "percent"}],
|
||||
},
|
||||
{
|
||||
"matcher": {"id": "byName", "options": "Count"},
|
||||
"properties": [{"id": "unit", "value": "none"}],
|
||||
},
|
||||
],
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
|
||||
"textMode": "name_and_value",
|
||||
},
|
||||
}
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
4,
|
||||
"Success Rate (1d)",
|
||||
'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)',
|
||||
{"h": 4, "w": 6, "x": 18, "y": 0},
|
||||
unit="percent",
|
||||
thresholds=success_thresholds,
|
||||
decimals=1,
|
||||
)
|
||||
)
|
||||
|
||||
panels.append(
|
||||
stat_panel(
|
||||
5,
|
||||
"Limit Used (30d)",
|
||||
"max(postmark_sending_limit_used_percent)",
|
||||
{"h": 4, "w": 6, "x": 0, "y": 4},
|
||||
thresholds=limit_thresholds,
|
||||
unit="percent",
|
||||
decimals=1,
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
6,
|
||||
"Send Limit (30d)",
|
||||
"max(postmark_sending_limit)",
|
||||
{"h": 4, "w": 6, "x": 6, "y": 4},
|
||||
decimals=0,
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
7,
|
||||
"Last Success",
|
||||
"max(postmark_last_success_timestamp_seconds)",
|
||||
{"h": 4, "w": 6, "x": 12, "y": 4},
|
||||
unit="dateTimeAsIso",
|
||||
decimals=0,
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
8,
|
||||
"Exporter Errors",
|
||||
"sum(postmark_request_errors_total)",
|
||||
{"h": 4, "w": 6, "x": 18, "y": 4},
|
||||
decimals=0,
|
||||
)
|
||||
)
|
||||
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
13,
|
||||
"Bounce Rate (1d vs 7d)",
|
||||
"max by (window) (postmark_outbound_bounce_rate)",
|
||||
{"h": 8, "w": 12, "x": 0, "y": 12},
|
||||
unit="percent",
|
||||
legend="{{window}}",
|
||||
legend_display="table",
|
||||
legend_placement="right",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
14,
|
||||
"Bounced (1d vs 7d)",
|
||||
"max by (window) (postmark_outbound_bounced)",
|
||||
{"h": 8, "w": 12, "x": 12, "y": 12},
|
||||
unit="none",
|
||||
legend="{{window}}",
|
||||
legend_display="table",
|
||||
legend_placement="right",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
15,
|
||||
"Sent (1d vs 7d)",
|
||||
"max by (window) (postmark_outbound_sent)",
|
||||
{"h": 8, "w": 12, "x": 0, "y": 20},
|
||||
unit="none",
|
||||
legend="{{window}}",
|
||||
legend_display="table",
|
||||
legend_placement="right",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
16,
|
||||
"Exporter Errors",
|
||||
"sum(postmark_request_errors_total)",
|
||||
{"h": 8, "w": 12, "x": 12, "y": 20},
|
||||
unit="none",
|
||||
)
|
||||
)
|
||||
|
||||
return {
|
||||
"uid": "atlas-mail",
|
||||
"title": "Atlas Mail",
|
||||
"folderUid": PRIVATE_FOLDER,
|
||||
"editable": True,
|
||||
"panels": panels,
|
||||
"time": {"from": "now-30d", "to": "now"},
|
||||
"annotations": {"list": []},
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": ["atlas", "mail"],
|
||||
}
|
||||
|
||||
|
||||
def build_gpu_dashboard():
|
||||
panels = []
|
||||
gpu_scope = "$namespace_scope_gpu"
|
||||
panels.append(
|
||||
pie_panel(
|
||||
1,
|
||||
"Namespace GPU Share",
|
||||
namespace_gpu_share_expr(),
|
||||
namespace_gpu_share_expr(gpu_scope),
|
||||
{"h": 8, "w": 12, "x": 0, "y": 0},
|
||||
links=namespace_scope_links("namespace_scope_gpu"),
|
||||
description="Values are normalized within the selected scope; use panel links to switch scope.",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
2,
|
||||
"GPU Util by Namespace",
|
||||
NAMESPACE_GPU_USAGE_INSTANT,
|
||||
namespace_gpu_usage_instant(gpu_scope),
|
||||
{"h": 8, "w": 12, "x": 12, "y": 0},
|
||||
unit="percent",
|
||||
legend="{{namespace}}",
|
||||
@ -1757,6 +2194,13 @@ def build_gpu_dashboard():
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": ["atlas", "gpu"],
|
||||
"templating": {
|
||||
"list": [
|
||||
namespace_scope_variable("namespace_scope_cpu", "CPU namespace filter"),
|
||||
namespace_scope_variable("namespace_scope_gpu", "GPU namespace filter"),
|
||||
namespace_scope_variable("namespace_scope_ram", "RAM namespace filter"),
|
||||
]
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@ -1781,6 +2225,10 @@ DASHBOARDS = {
|
||||
"builder": build_network_dashboard,
|
||||
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-network.yaml",
|
||||
},
|
||||
"atlas-mail": {
|
||||
"builder": build_mail_dashboard,
|
||||
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-mail.yaml",
|
||||
},
|
||||
"atlas-gpu": {
|
||||
"builder": build_gpu_dashboard,
|
||||
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gpu.yaml",
|
||||
|
||||
445
scripts/dashboards_render_logs.py
Executable file
445
scripts/dashboards_render_logs.py
Executable file
@ -0,0 +1,445 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generate OpenSearch Dashboards saved objects and render them into ConfigMaps.
|
||||
|
||||
Usage:
|
||||
scripts/dashboards_render_logs.py --build # rebuild NDJSON + ConfigMap
|
||||
scripts/dashboards_render_logs.py # re-render ConfigMap from NDJSON
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import textwrap
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[1]
|
||||
DASHBOARD_DIR = ROOT / "services" / "logging" / "dashboards"
|
||||
NDJSON_PATH = DASHBOARD_DIR / "logs.ndjson"
|
||||
CONFIG_PATH = ROOT / "services" / "logging" / "opensearch-dashboards-objects.yaml"
|
||||
|
||||
CONFIG_TEMPLATE = textwrap.dedent(
|
||||
"""# {relative_path}
|
||||
# Generated by scripts/dashboards_render_logs.py --build
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: opensearch-dashboards-objects
|
||||
namespace: logging
|
||||
data:
|
||||
objects.ndjson: |
|
||||
{payload}
|
||||
"""
|
||||
)
|
||||
|
||||
DASHBOARD_VERSION = "7.10.0"
|
||||
GRID_COLUMNS = 48
|
||||
H_CHART = 10
|
||||
H_ERRORS = 8
|
||||
H_TABLE = 16
|
||||
H_SEARCH = 18
|
||||
TABLE_SIZE = 15
|
||||
TABLE_PER_PAGE = 15
|
||||
|
||||
ERROR_TERMS = ("*error*", "*exception*", "*fail*")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class AppSpec:
|
||||
slug: str
|
||||
title: str
|
||||
query: str
|
||||
index_id: str = "kube-logs"
|
||||
kind: str = "kube"
|
||||
|
||||
|
||||
def error_query(base: str | None = None) -> str:
|
||||
parts = [f'(log : "{term}" or message : "{term}")' for term in ERROR_TERMS]
|
||||
expr = " or ".join(parts)
|
||||
if base:
|
||||
return f"({base}) and ({expr})"
|
||||
return f"({expr})"
|
||||
|
||||
|
||||
def json_line(obj: dict) -> str:
|
||||
return json.dumps(obj, separators=(",", ":"))
|
||||
|
||||
|
||||
def search_source(query: str) -> dict:
|
||||
return {
|
||||
"query": {"language": "kuery", "query": query},
|
||||
"filter": [],
|
||||
"indexRefName": "kibanaSavedObjectMeta.searchSourceJSON.index",
|
||||
}
|
||||
|
||||
|
||||
def index_pattern(object_id: str, title: str, time_field: str = "@timestamp") -> dict:
|
||||
return {
|
||||
"type": "index-pattern",
|
||||
"id": object_id,
|
||||
"attributes": {"title": title, "timeFieldName": time_field},
|
||||
}
|
||||
|
||||
|
||||
def histogram_vis(object_id: str, title: str, query: str, index_id: str) -> dict:
|
||||
vis_state = {
|
||||
"title": title,
|
||||
"type": "histogram",
|
||||
"aggs": [
|
||||
{"id": "1", "enabled": True, "type": "count", "schema": "metric"},
|
||||
{
|
||||
"id": "2",
|
||||
"enabled": True,
|
||||
"type": "date_histogram",
|
||||
"schema": "segment",
|
||||
"params": {"field": "@timestamp", "interval": "auto", "min_doc_count": 1},
|
||||
},
|
||||
],
|
||||
"params": {"addTooltip": True, "addLegend": False, "scale": "linear", "interpolate": "linear"},
|
||||
}
|
||||
return {
|
||||
"type": "visualization",
|
||||
"id": object_id,
|
||||
"attributes": {
|
||||
"title": title,
|
||||
"visState": json.dumps(vis_state, separators=(",", ":")),
|
||||
"uiStateJSON": "{}",
|
||||
"description": "",
|
||||
"version": 1,
|
||||
"kibanaSavedObjectMeta": {
|
||||
"searchSourceJSON": json.dumps(search_source(query), separators=(",", ":"))
|
||||
},
|
||||
},
|
||||
"references": [
|
||||
{
|
||||
"name": "kibanaSavedObjectMeta.searchSourceJSON.index",
|
||||
"type": "index-pattern",
|
||||
"id": index_id,
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def table_vis(object_id: str, title: str, field: str, query: str, index_id: str) -> dict:
|
||||
vis_state = {
|
||||
"title": title,
|
||||
"type": "table",
|
||||
"aggs": [
|
||||
{"id": "1", "enabled": True, "type": "count", "schema": "metric"},
|
||||
{
|
||||
"id": "2",
|
||||
"enabled": True,
|
||||
"type": "terms",
|
||||
"schema": "bucket",
|
||||
"params": {"field": field, "size": TABLE_SIZE, "order": "desc", "orderBy": "1"},
|
||||
},
|
||||
],
|
||||
"params": {
|
||||
"perPage": TABLE_PER_PAGE,
|
||||
"showPartialRows": False,
|
||||
"showMetricsAtAllLevels": False,
|
||||
"sort": {"columnIndex": 1, "direction": "desc"},
|
||||
},
|
||||
}
|
||||
return {
|
||||
"type": "visualization",
|
||||
"id": object_id,
|
||||
"attributes": {
|
||||
"title": title,
|
||||
"visState": json.dumps(vis_state, separators=(",", ":")),
|
||||
"uiStateJSON": "{}",
|
||||
"description": "",
|
||||
"version": 1,
|
||||
"kibanaSavedObjectMeta": {
|
||||
"searchSourceJSON": json.dumps(search_source(query), separators=(",", ":"))
|
||||
},
|
||||
},
|
||||
"references": [
|
||||
{
|
||||
"name": "kibanaSavedObjectMeta.searchSourceJSON.index",
|
||||
"type": "index-pattern",
|
||||
"id": index_id,
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def search_object(object_id: str, title: str, columns: list[str], query: str, index_id: str) -> dict:
|
||||
return {
|
||||
"type": "search",
|
||||
"id": object_id,
|
||||
"attributes": {
|
||||
"title": title,
|
||||
"description": "",
|
||||
"columns": columns,
|
||||
"sort": [["@timestamp", "desc"]],
|
||||
"kibanaSavedObjectMeta": {
|
||||
"searchSourceJSON": json.dumps(search_source(query), separators=(",", ":"))
|
||||
},
|
||||
},
|
||||
"references": [
|
||||
{
|
||||
"name": "kibanaSavedObjectMeta.searchSourceJSON.index",
|
||||
"type": "index-pattern",
|
||||
"id": index_id,
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def grid(x: int, y: int, w: int, h: int, i: int) -> dict:
|
||||
return {"x": x, "y": y, "w": w, "h": h, "i": str(i)}
|
||||
|
||||
|
||||
def panel(panel_id: str, panel_type: str, grid_data: dict, index: int) -> dict:
|
||||
return {
|
||||
"panelIndex": str(index),
|
||||
"gridData": grid_data,
|
||||
"id": panel_id,
|
||||
"type": panel_type,
|
||||
"version": DASHBOARD_VERSION,
|
||||
"embeddableConfig": {},
|
||||
}
|
||||
|
||||
|
||||
def full_width_panels(specs: list[tuple[str, str, int]]) -> list[dict]:
|
||||
panels = []
|
||||
y = 0
|
||||
for index, (panel_id, panel_type, height) in enumerate(specs, start=1):
|
||||
panels.append(panel(panel_id, panel_type, grid(0, y, GRID_COLUMNS, height, index), index))
|
||||
y += height
|
||||
return panels
|
||||
|
||||
|
||||
def dashboard_object(object_id: str, title: str, panels: list[dict]) -> dict:
|
||||
return {
|
||||
"type": "dashboard",
|
||||
"id": object_id,
|
||||
"attributes": {
|
||||
"title": title,
|
||||
"description": "",
|
||||
"hits": 0,
|
||||
"panelsJSON": json.dumps(panels, separators=(",", ":")),
|
||||
"optionsJSON": json.dumps({"useMargins": True, "hidePanelTitles": False}, separators=(",", ":")),
|
||||
"version": 1,
|
||||
"timeRestore": False,
|
||||
"kibanaSavedObjectMeta": {
|
||||
"searchSourceJSON": json.dumps({"query": {"language": "kuery", "query": ""}, "filter": []})
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def app_dashboard_objects(app: AppSpec) -> list[dict]:
|
||||
prefix = f"logs-{app.slug}"
|
||||
objects = []
|
||||
|
||||
if app.kind == "journald":
|
||||
columns = ["@timestamp", "_HOSTNAME", "_SYSTEMD_UNIT", "MESSAGE"]
|
||||
objects.append(histogram_vis(f"{prefix}-volume", f"{app.title} logs", app.query, app.index_id))
|
||||
objects.append(histogram_vis(f"{prefix}-errors", f"{app.title} errors", error_query(app.query), app.index_id))
|
||||
objects.append(table_vis(f"{prefix}-top-units", "Top units", "_SYSTEMD_UNIT.keyword", app.query, app.index_id))
|
||||
objects.append(search_object(f"{prefix}-recent", "Recent logs", columns, app.query, app.index_id))
|
||||
objects.append(
|
||||
search_object(
|
||||
f"{prefix}-recent-errors",
|
||||
"Recent errors",
|
||||
columns,
|
||||
error_query(app.query),
|
||||
app.index_id,
|
||||
)
|
||||
)
|
||||
panels = full_width_panels(
|
||||
[
|
||||
(f"{prefix}-volume", "visualization", H_CHART),
|
||||
(f"{prefix}-errors", "visualization", H_ERRORS),
|
||||
(f"{prefix}-top-units", "visualization", H_TABLE),
|
||||
(f"{prefix}-recent", "search", H_SEARCH),
|
||||
(f"{prefix}-recent-errors", "search", H_SEARCH),
|
||||
]
|
||||
)
|
||||
objects.append(dashboard_object(prefix, f"{app.title} Logs", panels))
|
||||
return objects
|
||||
|
||||
columns = ["@timestamp", "kubernetes.pod_name", "kubernetes.container_name", "log", "message"]
|
||||
objects.append(histogram_vis(f"{prefix}-volume", f"{app.title} logs", app.query, app.index_id))
|
||||
objects.append(histogram_vis(f"{prefix}-errors", f"{app.title} errors", error_query(app.query), app.index_id))
|
||||
objects.append(table_vis(f"{prefix}-top-pods", "Top pods", "kubernetes.pod_name.keyword", app.query, app.index_id))
|
||||
objects.append(
|
||||
table_vis(f"{prefix}-top-containers", "Top containers", "kubernetes.container_name.keyword", app.query, app.index_id)
|
||||
)
|
||||
objects.append(search_object(f"{prefix}-recent", "Recent logs", columns, app.query, app.index_id))
|
||||
objects.append(
|
||||
search_object(
|
||||
f"{prefix}-recent-errors",
|
||||
"Recent errors",
|
||||
columns,
|
||||
error_query(app.query),
|
||||
app.index_id,
|
||||
)
|
||||
)
|
||||
panels = full_width_panels(
|
||||
[
|
||||
(f"{prefix}-volume", "visualization", H_CHART),
|
||||
(f"{prefix}-errors", "visualization", H_ERRORS),
|
||||
(f"{prefix}-top-pods", "visualization", H_TABLE),
|
||||
(f"{prefix}-top-containers", "visualization", H_TABLE),
|
||||
(f"{prefix}-recent", "search", H_SEARCH),
|
||||
(f"{prefix}-recent-errors", "search", H_SEARCH),
|
||||
]
|
||||
)
|
||||
objects.append(dashboard_object(prefix, f"{app.title} Logs", panels))
|
||||
return objects
|
||||
|
||||
|
||||
def overview_objects() -> list[dict]:
|
||||
objects = []
|
||||
objects.append(histogram_vis("logs-overview-volume", "Logs per minute", "*", "kube-logs"))
|
||||
objects.append(histogram_vis("logs-overview-errors", "Errors per minute", error_query(), "kube-logs"))
|
||||
objects.append(
|
||||
table_vis(
|
||||
"logs-overview-top-ns",
|
||||
"Top namespaces",
|
||||
"kubernetes.namespace_name.keyword",
|
||||
"*",
|
||||
"kube-logs",
|
||||
)
|
||||
)
|
||||
objects.append(
|
||||
table_vis(
|
||||
"logs-overview-top-error-ns",
|
||||
"Top error namespaces",
|
||||
"kubernetes.namespace_name.keyword",
|
||||
error_query(),
|
||||
"kube-logs",
|
||||
)
|
||||
)
|
||||
objects.append(table_vis("logs-overview-top-pods", "Top pods", "kubernetes.pod_name.keyword", "*", "kube-logs"))
|
||||
objects.append(
|
||||
table_vis(
|
||||
"logs-overview-top-nodes",
|
||||
"Top nodes",
|
||||
"kubernetes.node_name.keyword",
|
||||
"*",
|
||||
"kube-logs",
|
||||
)
|
||||
)
|
||||
objects.append(
|
||||
search_object(
|
||||
"logs-overview-recent-errors",
|
||||
"Recent errors",
|
||||
["@timestamp", "kubernetes.namespace_name", "kubernetes.pod_name", "log", "message"],
|
||||
error_query(),
|
||||
"kube-logs",
|
||||
)
|
||||
)
|
||||
panels = full_width_panels(
|
||||
[
|
||||
("logs-overview-volume", "visualization", H_CHART),
|
||||
("logs-overview-errors", "visualization", H_ERRORS),
|
||||
("logs-overview-top-ns", "visualization", H_TABLE),
|
||||
("logs-overview-top-error-ns", "visualization", H_TABLE),
|
||||
("logs-overview-top-pods", "visualization", H_TABLE),
|
||||
("logs-overview-top-nodes", "visualization", H_TABLE),
|
||||
("logs-overview-recent-errors", "search", H_SEARCH),
|
||||
]
|
||||
)
|
||||
objects.append(dashboard_object("logs-overview", "Atlas Logs Overview", panels))
|
||||
return objects
|
||||
|
||||
|
||||
def build_objects() -> list[dict]:
|
||||
objects = [
|
||||
index_pattern("kube-logs", "kube-*"),
|
||||
index_pattern("journald-logs", "journald-*"),
|
||||
]
|
||||
|
||||
objects.extend(overview_objects())
|
||||
|
||||
apps = [
|
||||
AppSpec("bstein-dev-home", "bstein-dev-home", 'kubernetes.namespace_name: "bstein-dev-home"'),
|
||||
AppSpec(
|
||||
"pegasus",
|
||||
"pegasus",
|
||||
'kubernetes.namespace_name: "jellyfin" and kubernetes.labels.app: "pegasus"',
|
||||
),
|
||||
AppSpec(
|
||||
"jellyfin",
|
||||
"jellyfin",
|
||||
'kubernetes.namespace_name: "jellyfin" and kubernetes.labels.app: "jellyfin"',
|
||||
),
|
||||
AppSpec("vaultwarden", "vaultwarden", 'kubernetes.namespace_name: "vaultwarden"'),
|
||||
AppSpec("mailu", "mailu", 'kubernetes.namespace_name: "mailu-mailserver"'),
|
||||
AppSpec("nextcloud", "nextcloud", 'kubernetes.namespace_name: "nextcloud"'),
|
||||
AppSpec("gitea", "gitea", 'kubernetes.namespace_name: "gitea"'),
|
||||
AppSpec("jenkins", "jenkins", 'kubernetes.namespace_name: "jenkins"'),
|
||||
AppSpec("harbor", "harbor", 'kubernetes.namespace_name: "harbor"'),
|
||||
AppSpec("vault", "vault", 'kubernetes.namespace_name: "vault"'),
|
||||
AppSpec("keycloak", "keycloak", 'kubernetes.namespace_name: "sso"'),
|
||||
AppSpec("flux-system", "flux-system", 'kubernetes.namespace_name: "flux-system"'),
|
||||
AppSpec("comms", "comms", 'kubernetes.namespace_name: "comms"'),
|
||||
AppSpec(
|
||||
"element-web",
|
||||
"element-web",
|
||||
'kubernetes.namespace_name: "comms" and kubernetes.container_name: "element-web"',
|
||||
),
|
||||
AppSpec(
|
||||
"element-call",
|
||||
"element-call",
|
||||
'kubernetes.namespace_name: "comms" and kubernetes.labels.app: "element-call"',
|
||||
),
|
||||
AppSpec(
|
||||
"matrix-synapse",
|
||||
"matrix-synapse",
|
||||
'kubernetes.namespace_name: "comms" and kubernetes.container_name: "synapse"',
|
||||
),
|
||||
AppSpec(
|
||||
"livekit",
|
||||
"livekit",
|
||||
'kubernetes.namespace_name: "comms" and kubernetes.labels.app: "livekit"',
|
||||
),
|
||||
AppSpec(
|
||||
"coturn",
|
||||
"coturn",
|
||||
'kubernetes.namespace_name: "comms" and kubernetes.labels.app: "coturn"',
|
||||
),
|
||||
AppSpec("lesavka", "lesavka", '_HOSTNAME: "titan-jh"', index_id="journald-logs", kind="journald"),
|
||||
]
|
||||
|
||||
for app in apps:
|
||||
objects.extend(app_dashboard_objects(app))
|
||||
|
||||
return objects
|
||||
|
||||
|
||||
def write_ndjson(objects: list[dict], path: Path) -> None:
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
payload = "\n".join(json_line(obj) for obj in objects)
|
||||
path.write_text(payload + "\n")
|
||||
|
||||
|
||||
def render_configmap(ndjson_path: Path, output_path: Path) -> None:
|
||||
payload_lines = ndjson_path.read_text().splitlines()
|
||||
payload = "\n".join(" " + line for line in payload_lines)
|
||||
relative_path = output_path.relative_to(ROOT)
|
||||
output_path.write_text(CONFIG_TEMPLATE.format(relative_path=relative_path, payload=payload))
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--build", action="store_true", help="Regenerate saved object NDJSON and ConfigMap")
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.build:
|
||||
objects = build_objects()
|
||||
write_ndjson(objects, NDJSON_PATH)
|
||||
|
||||
if not NDJSON_PATH.exists():
|
||||
raise SystemExit(f"Missing NDJSON file: {NDJSON_PATH}. Run with --build first.")
|
||||
|
||||
render_configmap(NDJSON_PATH, CONFIG_PATH)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
554
scripts/knowledge_render_atlas.py
Normal file
554
scripts/knowledge_render_atlas.py
Normal file
@ -0,0 +1,554 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Render Atlas knowledge artifacts from Flux + kustomize manifests.
|
||||
|
||||
Outputs (committed to git for stable diffs + RAG):
|
||||
- knowledge/catalog/*.yaml
|
||||
- knowledge/diagrams/*.mmd
|
||||
|
||||
This is intentionally conservative:
|
||||
- never includes Secret objects
|
||||
- never includes secret values
|
||||
- keeps output deterministic (sorted)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable
|
||||
|
||||
import yaml
|
||||
|
||||
REPO_ROOT = Path(__file__).resolve().parents[1]
|
||||
|
||||
CLUSTER_SCOPED_KINDS = {
|
||||
"Namespace",
|
||||
"Node",
|
||||
"CustomResourceDefinition",
|
||||
"ClusterRole",
|
||||
"ClusterRoleBinding",
|
||||
"StorageClass",
|
||||
"PersistentVolume",
|
||||
"MutatingWebhookConfiguration",
|
||||
"ValidatingWebhookConfiguration",
|
||||
"APIService",
|
||||
}
|
||||
|
||||
INCLUDED_KINDS = {
|
||||
"Namespace",
|
||||
"Deployment",
|
||||
"StatefulSet",
|
||||
"DaemonSet",
|
||||
"Service",
|
||||
"Ingress",
|
||||
"IngressRoute", # traefik
|
||||
"HelmRelease", # only to harvest ingress hostnames from values
|
||||
}
|
||||
|
||||
|
||||
def _run(cmd: list[str], *, cwd: Path) -> str:
|
||||
res = subprocess.run(cmd, cwd=cwd, capture_output=True, text=True, check=False)
|
||||
if res.returncode != 0:
|
||||
raise RuntimeError(
|
||||
f"Command failed ({res.returncode}): {' '.join(cmd)}\n{res.stderr.strip()}"
|
||||
)
|
||||
return res.stdout
|
||||
|
||||
|
||||
def kustomize_build(path: Path) -> str:
    """Render a kustomization directory to a multi-doc YAML string.

    Strategy: try `kubectl kustomize` first; when it fails with a load-
    restriction error, retry with LoadRestrictionsNone; as a last resort fall
    back to the standalone `kustomize` binary (also unrestricted).
    """
    rel = path.relative_to(REPO_ROOT)
    try:
        return _run(["kubectl", "kustomize", str(rel)], cwd=REPO_ROOT)
    except Exception as e:
        msg = str(e)
        # "is not in or below" is kustomize's load-restriction error text.
        if "is not in or below" in msg:
            # Repo uses configMapGenerators that reference ../../scripts/*.py.
            # Kustomize load restriction must be disabled for a full render.
            try:
                return _run(
                    ["kubectl", "kustomize", "--load-restrictor=LoadRestrictionsNone", str(rel)],
                    cwd=REPO_ROOT,
                )
            except Exception:
                # Fall through to the standalone binary below.
                pass
        return _run(["kustomize", "build", "--load-restrictor=LoadRestrictionsNone", str(rel)], cwd=REPO_ROOT)
|
||||
|
||||
|
||||
def _iter_docs(raw_yaml: str) -> Iterable[dict[str, Any]]:
    """Yield every manifest mapping in a multi-document YAML stream.

    v1 List objects are flattened into their items; documents that are not
    mappings or lack a `kind` are skipped.
    """
    for doc in yaml.safe_load_all(raw_yaml):
        if not isinstance(doc, dict):
            continue
        if doc.get("kind") == "List" and isinstance(doc.get("items"), list):
            yield from (item for item in doc["items"] if isinstance(item, dict))
        elif doc.get("kind"):
            yield doc
|
||||
|
||||
|
||||
def _meta(doc: dict[str, Any]) -> tuple[str, str | None]:
|
||||
md = doc.get("metadata") or {}
|
||||
name = md.get("name") or ""
|
||||
namespace = md.get("namespace")
|
||||
return name, namespace
|
||||
|
||||
|
||||
def _is_namespaced(doc: dict[str, Any]) -> bool:
    """True unless the doc's kind is a known cluster-scoped kind."""
    return (doc.get("kind") or "") not in CLUSTER_SCOPED_KINDS
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class FluxKustomization:
    """A Flux Kustomization CR reduced to the fields this renderer needs."""

    # metadata.name of the Kustomization CR
    name: str
    # spec.path, normalised relative to the repo root
    path: str
    # spec.targetNamespace, if set
    target_namespace: str | None
|
||||
|
||||
|
||||
def find_flux_kustomizations() -> list[FluxKustomization]:
    """Find Flux Kustomization CRs under clusters/atlas/flux-system.

    Scans every *.yaml file recursively, keeps docs whose apiVersion is in
    the kustomize.toolkit.fluxcd.io group and whose spec.path is a non-empty
    string, and returns them sorted by name.
    """
    root = REPO_ROOT / "clusters" / "atlas" / "flux-system"
    items: list[FluxKustomization] = []
    for file in sorted(root.rglob("*.yaml")):
        raw = file.read_text()
        for doc in _iter_docs(raw):
            if doc.get("kind") != "Kustomization":
                continue
            api = str(doc.get("apiVersion") or "")
            if not api.startswith("kustomize.toolkit.fluxcd.io/"):
                continue
            name, _ = _meta(doc)
            spec = doc.get("spec") or {}
            path = spec.get("path")
            if not isinstance(path, str) or not path.strip():
                continue
            # BUGFIX: the previous `lstrip("./")` stripped the *character set*
            # {'.', '/'}, so a path like "../shared" collapsed to "shared".
            # Remove only a literal leading "./" prefix (repeatedly, in case
            # of "././x").
            clean = path.strip()
            while clean.startswith("./"):
                clean = clean[2:]
            items.append(
                FluxKustomization(
                    name=name,
                    path=clean,
                    target_namespace=spec.get("targetNamespace"),
                )
            )
    return sorted(items, key=lambda k: k.name)
|
||||
|
||||
|
||||
def _safe_string_scan_for_hosts(value: Any) -> set[str]:
|
||||
"""Best-effort host scan from HelmRelease values without chart rendering."""
|
||||
hosts: set[str] = set()
|
||||
if isinstance(value, str):
|
||||
for m in re.finditer(r"(?i)([a-z0-9-]+(?:\.[a-z0-9-]+)+)", value):
|
||||
host = m.group(1).lower()
|
||||
if host.endswith("bstein.dev"):
|
||||
hosts.add(host)
|
||||
return hosts
|
||||
if isinstance(value, list):
|
||||
for item in value:
|
||||
hosts |= _safe_string_scan_for_hosts(item)
|
||||
return hosts
|
||||
if isinstance(value, dict):
|
||||
for item in value.values():
|
||||
hosts |= _safe_string_scan_for_hosts(item)
|
||||
return hosts
|
||||
return hosts
|
||||
|
||||
|
||||
def _service_ports(svc: dict[str, Any]) -> list[dict[str, Any]]:
|
||||
spec = svc.get("spec") or {}
|
||||
out: list[dict[str, Any]] = []
|
||||
for p in spec.get("ports") or []:
|
||||
if not isinstance(p, dict):
|
||||
continue
|
||||
out.append(
|
||||
{
|
||||
"name": p.get("name"),
|
||||
"port": p.get("port"),
|
||||
"targetPort": p.get("targetPort"),
|
||||
"protocol": p.get("protocol", "TCP"),
|
||||
}
|
||||
)
|
||||
return out
|
||||
|
||||
|
||||
def _workload_labels(doc: dict[str, Any]) -> dict[str, str]:
|
||||
tpl = (doc.get("spec") or {}).get("template") or {}
|
||||
md = tpl.get("metadata") or {}
|
||||
labels = md.get("labels") or {}
|
||||
return {str(k): str(v) for k, v in labels.items()} if isinstance(labels, dict) else {}
|
||||
|
||||
|
||||
def _service_selector(doc: dict[str, Any]) -> dict[str, str]:
|
||||
spec = doc.get("spec") or {}
|
||||
sel = spec.get("selector") or {}
|
||||
return {str(k): str(v) for k, v in sel.items()} if isinstance(sel, dict) else {}
|
||||
|
||||
|
||||
def _selector_matches(selector: dict[str, str], labels: dict[str, str]) -> bool:
|
||||
if not selector:
|
||||
return False
|
||||
return all(labels.get(k) == v for k, v in selector.items())
|
||||
|
||||
|
||||
def _sanitize_node_id(text: str) -> str:
|
||||
return re.sub(r"[^a-zA-Z0-9_]", "_", text)
|
||||
|
||||
|
||||
def extract_catalog(
    rendered: list[tuple[FluxKustomization, list[dict[str, Any]]]],
) -> tuple[dict[str, Any], dict[str, Any], str]:
    """Build knowledge catalog + mermaid diagram from rendered docs.

    Returns (catalog, summary, diagram): the full catalog mapping, a small
    counts-only summary, and a mermaid flowchart string of
    host -> Service -> workload edges grouped by namespace.
    """
    # Index workloads and services for mapping.
    workloads: dict[tuple[str, str], dict[str, Any]] = {}
    services: dict[tuple[str, str], dict[str, Any]] = {}
    ingresses: list[dict[str, Any]] = []
    ingressroutes: list[dict[str, Any]] = []
    helmrelease_hosts: dict[str, list[str]] = {}

    for src, docs in rendered:
        for doc in docs:
            kind = doc.get("kind")
            if kind not in INCLUDED_KINDS:
                continue
            # Defensive only: "Secret" is not in INCLUDED_KINDS, so this
            # branch is unreachable today; kept as a safety net.
            if kind == "Secret":
                continue

            name, namespace = _meta(doc)
            # Inherit the Flux Kustomization's targetNamespace for namespaced
            # docs that did not declare one; copy the doc to avoid mutating
            # the caller's structure.
            if _is_namespaced(doc) and not namespace and src.target_namespace:
                namespace = src.target_namespace
                doc = dict(doc)
                doc.setdefault("metadata", {})["namespace"] = namespace

            if kind in ("Deployment", "StatefulSet", "DaemonSet"):
                workloads[(namespace or "", name)] = {
                    "kind": kind,
                    "namespace": namespace or "",
                    "name": name,
                    "labels": _workload_labels(doc),
                    "serviceAccountName": ((doc.get("spec") or {}).get("template") or {})
                    .get("spec", {})
                    .get("serviceAccountName"),
                    "nodeSelector": ((doc.get("spec") or {}).get("template") or {})
                    .get("spec", {})
                    .get("nodeSelector", {}),
                    # De-duplicated, sorted container images.
                    "images": sorted(
                        {
                            c.get("image")
                            for c in (
                                (((doc.get("spec") or {}).get("template") or {}).get("spec") or {}).get(
                                    "containers"
                                )
                                or []
                            )
                            if isinstance(c, dict) and c.get("image")
                        }
                    ),
                }
            elif kind == "Service":
                services[(namespace or "", name)] = {
                    "namespace": namespace or "",
                    "name": name,
                    "type": (doc.get("spec") or {}).get("type", "ClusterIP"),
                    "selector": _service_selector(doc),
                    "ports": _service_ports(doc),
                }
            elif kind == "Ingress":
                ingresses.append({"source": src.name, "doc": doc})
            elif kind == "IngressRoute":
                ingressroutes.append({"source": src.name, "doc": doc})
            elif kind == "HelmRelease":
                spec = doc.get("spec") or {}
                vals = spec.get("values") or {}
                hosts = sorted(_safe_string_scan_for_hosts(vals))
                if hosts:
                    helmrelease_hosts[f"{src.name}:{namespace or ''}/{name}"] = hosts

    # Map services to workloads (same namespace, selector subset of labels).
    service_to_workloads: dict[tuple[str, str], list[dict[str, str]]] = {}
    for (ns, svc_name), svc in services.items():
        selector = svc.get("selector") or {}
        matches: list[dict[str, str]] = []
        for (w_ns, w_name), w in workloads.items():
            if w_ns != ns:
                continue
            if _selector_matches(selector, w.get("labels") or {}):
                matches.append({"kind": w["kind"], "name": w_name})
        service_to_workloads[(ns, svc_name)] = sorted(matches, key=lambda m: (m["kind"], m["name"]))

    # Extract HTTP endpoints.
    endpoints: list[dict[str, Any]] = []

    def add_endpoint(
        *,
        host: str,
        path: str,
        namespace: str,
        service: str,
        port: Any,
        source: str,
        kind: str,
        obj_name: str,
    ):
        # Closure over endpoints/service_to_workloads; appends one record.
        wk = service_to_workloads.get((namespace, service), [])
        endpoints.append(
            {
                "host": host,
                "path": path,
                "backend": {
                    "namespace": namespace,
                    "service": service,
                    "port": port,
                    "workloads": wk,
                },
                "via": {"kind": kind, "name": obj_name, "source": source},
            }
        )

    # networking.k8s.io Ingress rules.
    for item in ingresses:
        doc = item["doc"]
        source = item["source"]
        name, namespace = _meta(doc)
        namespace = namespace or ""
        spec = doc.get("spec") or {}
        for rule in spec.get("rules") or []:
            if not isinstance(rule, dict):
                continue
            host = (rule.get("host") or "").strip()
            http = rule.get("http") or {}
            for p in http.get("paths") or []:
                if not isinstance(p, dict):
                    continue
                backend = (p.get("backend") or {}).get("service") or {}
                svc_name = backend.get("name")
                # Numbered port wins; falls back to named port.
                svc_port = (backend.get("port") or {}).get("number") or (backend.get("port") or {}).get("name")
                if not host or not svc_name:
                    continue
                add_endpoint(
                    host=host,
                    path=p.get("path") or "/",
                    namespace=namespace,
                    service=svc_name,
                    port=svc_port,
                    source=source,
                    kind="Ingress",
                    obj_name=name,
                )

    # Traefik IngressRoutes: parse Host(`...`) / PathPrefix(`...`) from the
    # match expression.
    host_re = re.compile(r"Host\(`([^`]+)`\)")
    pathprefix_re = re.compile(r"PathPrefix\(`([^`]+)`\)")
    for item in ingressroutes:
        doc = item["doc"]
        source = item["source"]
        name, namespace = _meta(doc)
        namespace = namespace or ""
        spec = doc.get("spec") or {}
        for route in spec.get("routes") or []:
            if not isinstance(route, dict):
                continue
            match = route.get("match") or ""
            hosts = host_re.findall(match)
            pathprefixes = pathprefix_re.findall(match) or ["/"]
            for svc in route.get("services") or []:
                if not isinstance(svc, dict):
                    continue
                svc_name = svc.get("name")
                svc_port = svc.get("port")
                if not svc_name:
                    continue
                # Cartesian product: every host x every path prefix.
                for host in hosts:
                    for pp in pathprefixes:
                        add_endpoint(
                            host=host,
                            path=pp,
                            namespace=namespace,
                            service=svc_name,
                            port=svc_port,
                            source=source,
                            kind="IngressRoute",
                            obj_name=name,
                        )

    # Deterministic ordering for stable diffs.
    endpoints = sorted(
        endpoints,
        key=lambda e: (
            e["host"],
            e["path"],
            e["backend"]["namespace"],
            e["backend"]["service"],
        ),
    )

    catalog = {
        "cluster": "atlas",
        "sources": [
            {"name": k.name, "path": k.path, "targetNamespace": k.target_namespace}
            for k, _ in rendered
        ],
        "workloads": sorted(
            list(workloads.values()),
            key=lambda w: (w["namespace"], w["kind"], w["name"]),
        ),
        "services": sorted(
            list(services.values()),
            key=lambda s: (s["namespace"], s["name"]),
        ),
        "http_endpoints": endpoints,
        "helmrelease_host_hints": {k: v for k, v in sorted(helmrelease_hosts.items())},
    }

    # Mermaid diagram: host -> service -> workload (grouped by namespace).
    ns_nodes: dict[str, list[str]] = {}
    lines: list[str] = ["flowchart LR"]
    edges: set[tuple[str, str]] = set()

    def ensure_ns_node(ns: str, node_id: str):
        # Register node_id under its namespace exactly once, preserving order.
        ns_nodes.setdefault(ns, [])
        if node_id not in ns_nodes[ns]:
            ns_nodes[ns].append(node_id)

    host_nodes: dict[str, str] = {}

    for ep in endpoints:
        host = ep["host"]
        host_id = host_nodes.get(host)
        if not host_id:
            host_id = f"host_{_sanitize_node_id(host)}"
            host_nodes[host] = host_id
            lines.append(f'  {host_id}["{host}"]')

        ns = ep["backend"]["namespace"]
        svc = ep["backend"]["service"]
        svc_id = f"svc_{_sanitize_node_id(ns)}_{_sanitize_node_id(svc)}"
        if svc_id not in ns_nodes.get(ns, []):
            lines.append(f'  {svc_id}["{ns}/{svc} (Service)"]')
        ensure_ns_node(ns, svc_id)

        # Emit each edge at most once.
        if (host_id, svc_id) not in edges:
            edges.add((host_id, svc_id))
            lines.append(f"  {host_id} --> {svc_id}")

        for w in ep["backend"]["workloads"]:
            w_id = f"wl_{_sanitize_node_id(ns)}_{_sanitize_node_id(w['name'])}"
            if w_id not in ns_nodes.get(ns, []):
                lines.append(f'  {w_id}["{ns}/{w["name"]} ({w["kind"]})"]')
            ensure_ns_node(ns, w_id)
            if (svc_id, w_id) not in edges:
                edges.add((svc_id, w_id))
                lines.append(f"  {svc_id} --> {w_id}")

    # Wrap namespace subgraphs at the end for stability (sorted namespaces).
    if ns_nodes:
        lines.append("")
        for ns in sorted(ns_nodes.keys()):
            lines.append(f"  subgraph { _sanitize_node_id(ns) }[{ns}]")
            for node_id in ns_nodes[ns]:
                lines.append(f"    {node_id}")
            lines.append("  end")

    diagram = "\n".join(lines).rstrip() + "\n"

    summary = {
        "counts": {
            "workloads": len(workloads),
            "services": len(services),
            "http_endpoints": len(endpoints),
            "helmrelease_host_hints": sum(len(v) for v in helmrelease_hosts.values()),
        }
    }

    return catalog, summary, diagram
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point; returns a process exit code.

    Without --write, prints a JSON counts summary. With --write, renders
    knowledge/catalog/* and knowledge/diagrams/* plus a runbooks.json built
    from knowledge/runbooks/*.md front matter.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", default="knowledge", help="Output base directory (default: knowledge/)")
    ap.add_argument(
        "--write",
        action="store_true",
        help="Write generated files (otherwise just print a summary).",
    )
    args = ap.parse_args()

    out_dir = REPO_ROOT / args.out
    flux = find_flux_kustomizations()
    if not flux:
        print("No Flux Kustomizations found under clusters/atlas/flux-system.", file=sys.stderr)
        return 2

    rendered: list[tuple[FluxKustomization, list[dict[str, Any]]]] = []
    for k in flux:
        path = REPO_ROOT / k.path
        # Skip Kustomizations whose path does not exist in this checkout.
        if not path.exists():
            continue
        raw = kustomize_build(path)
        # Belt-and-braces: drop any Secret docs before cataloguing.
        docs = [d for d in _iter_docs(raw) if d.get("kind") != "Secret"]
        rendered.append((k, docs))

    rendered = sorted(rendered, key=lambda item: item[0].name)
    catalog, summary, diagram = extract_catalog(rendered)

    if not args.write:
        print(json.dumps(summary, indent=2, sort_keys=True))
        return 0

    (out_dir / "catalog").mkdir(parents=True, exist_ok=True)
    (out_dir / "diagrams").mkdir(parents=True, exist_ok=True)

    catalog_path = out_dir / "catalog" / "atlas.yaml"
    catalog_json_path = out_dir / "catalog" / "atlas.json"
    summary_path = out_dir / "catalog" / "atlas-summary.json"
    diagram_path = out_dir / "diagrams" / "atlas-http.mmd"
    runbooks_json_path = out_dir / "catalog" / "runbooks.json"

    catalog_path.write_text(
        "# Generated by scripts/knowledge_render_atlas.py (do not edit by hand)\n"
        + yaml.safe_dump(catalog, sort_keys=False),
        encoding="utf-8",
    )
    catalog_json_path.write_text(json.dumps(catalog, indent=2, sort_keys=False) + "\n", encoding="utf-8")
    summary_path.write_text(json.dumps(summary, indent=2, sort_keys=True) + "\n", encoding="utf-8")
    diagram_path.write_text(diagram, encoding="utf-8")

    # Render runbooks into JSON for lightweight, dependency-free consumption in-cluster.
    runbooks_dir = out_dir / "runbooks"
    runbooks: list[dict[str, Any]] = []
    if runbooks_dir.exists():
        for md_file in sorted(runbooks_dir.glob("*.md")):
            raw = md_file.read_text(encoding="utf-8")
            fm: dict[str, Any] = {}
            body = raw
            # Parse optional YAML front matter delimited by "---" lines;
            # on any parse failure fall back to treating the whole file as body.
            if raw.startswith("---\n"):
                try:
                    _, rest = raw.split("---\n", 1)
                    fm_raw, body = rest.split("\n---\n", 1)
                    fm = yaml.safe_load(fm_raw) or {}
                except Exception:
                    fm = {}
                    body = raw
            runbooks.append(
                {
                    "path": str(md_file.relative_to(out_dir)),
                    "title": fm.get("title") or md_file.stem,
                    "tags": fm.get("tags") or [],
                    "entrypoints": fm.get("entrypoints") or [],
                    "source_paths": fm.get("source_paths") or [],
                    "body": body.strip(),
                }
            )
    # Written even when empty so consumers always find the file.
    runbooks_json_path.write_text(json.dumps(runbooks, indent=2, sort_keys=False) + "\n", encoding="utf-8")

    print(f"Wrote {catalog_path.relative_to(REPO_ROOT)}")
    print(f"Wrote {catalog_json_path.relative_to(REPO_ROOT)}")
    print(f"Wrote {summary_path.relative_to(REPO_ROOT)}")
    print(f"Wrote {diagram_path.relative_to(REPO_ROOT)}")
    print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
313
scripts/logging_render_observability.py
Executable file
313
scripts/logging_render_observability.py
Executable file
@ -0,0 +1,313 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generate OpenSearch Observability seed objects and render them into ConfigMaps.
|
||||
|
||||
Usage:
|
||||
scripts/logging_render_observability.py --build # rebuild JSON + ConfigMap
|
||||
scripts/logging_render_observability.py # re-render ConfigMap from JSON
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import textwrap
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
# Repository root (this script lives in scripts/).
ROOT = Path(__file__).resolve().parents[1]
# Committed JSON payloads the ConfigMap is rendered from.
OBS_DIR = ROOT / "services" / "logging" / "observability"
APPS_PATH = OBS_DIR / "applications.json"
QUERIES_PATH = OBS_DIR / "saved_queries.json"
VIS_PATH = OBS_DIR / "saved_visualizations.json"
# Rendered ConfigMap manifest consumed by the logging stack.
CONFIG_PATH = ROOT / "services" / "logging" / "opensearch-observability-objects.yaml"

# ConfigMap skeleton. The three payload placeholders sit at column 0 because
# the JSON is pre-indented (see render_configmap) before substitution.
CONFIG_TEMPLATE = textwrap.dedent(
    """# {relative_path}
# Generated by scripts/logging_render_observability.py --build
apiVersion: v1
kind: ConfigMap
metadata:
  name: opensearch-observability-objects
  namespace: logging
data:
  applications.json: |
{applications}
  saved_queries.json: |
{queries}
  saved_visualizations.json: |
{visualizations}
"""
)

# Defaults applied to every saved query / visualization object.
DEFAULT_RANGE = {"start": "now-24h", "end": "now", "text": ""}
DEFAULT_TIMESTAMP = {"name": "@timestamp", "type": "timestamp"}
DEFAULT_FIELDS = {"text": "", "tokens": []}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class AppSpec:
    """An Observability "application" seed (one per service)."""

    # application display name
    name: str
    # PPL base query scoping the app's logs
    base_query: str
    # "kube" or "journald" — selects which error filter applies
    kind: str = "kube"
    description: str = ""


@dataclass(frozen=True)
class QuerySpec:
    """A saved-query seed: name + PPL query."""

    name: str
    query: str
    description: str = ""


@dataclass(frozen=True)
class VisualizationSpec:
    """A saved-visualization seed: name, PPL query and chart type."""

    name: str
    query: str
    # chart type, e.g. "line" or "bar"
    vis_type: str
    description: str = ""
|
||||
|
||||
|
||||
def source_query(index: str, where: str | None = None) -> str:
    """Build a PPL query `source = <index>`, optionally piped into a where clause."""
    if where:
        return f"source = {index} | where {where}"
    return f"source = {index}"
|
||||
|
||||
|
||||
def error_filter(fields: list[str]) -> str:
    """OR together match() clauses searching each field for error-ish text."""
    return " or ".join(f"match({field}, 'error|exception|fail')" for field in fields)
|
||||
|
||||
|
||||
def saved_query(spec: QuerySpec) -> dict:
    """Serialise a QuerySpec into the observability saved-query JSON shape."""
    payload = {
        "name": spec.name,
        "description": spec.description,
        "query": spec.query,
    }
    payload["selected_date_range"] = DEFAULT_RANGE
    payload["selected_timestamp"] = DEFAULT_TIMESTAMP
    payload["selected_fields"] = DEFAULT_FIELDS
    return payload
|
||||
|
||||
|
||||
def saved_visualization(spec: VisualizationSpec) -> dict:
    """Serialise a VisualizationSpec into the saved-visualization JSON shape."""
    payload = {
        "name": spec.name,
        "description": spec.description,
        "query": spec.query,
        "type": spec.vis_type,
    }
    payload["selected_date_range"] = DEFAULT_RANGE
    payload["selected_timestamp"] = DEFAULT_TIMESTAMP
    payload["selected_fields"] = DEFAULT_FIELDS
    return payload
|
||||
|
||||
|
||||
def build_objects() -> tuple[list[dict], list[dict], list[dict]]:
    """Build the (applications, saved_queries, saved_visualizations) payloads.

    The app table below is the single source of truth: each AppSpec yields
    one application plus a "<name> logs" and "<name> errors" saved query.
    """
    kube_error = error_filter(["log", "message"])
    journald_error = error_filter(["MESSAGE"])

    apps = [
        AppSpec("bstein-dev-home", source_query("kube-*", "kubernetes.namespace_name = 'bstein-dev-home'")),
        AppSpec(
            "pegasus",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'jellyfin' and kubernetes.labels.app = 'pegasus'",
            ),
        ),
        AppSpec(
            "jellyfin",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'jellyfin' and kubernetes.labels.app = 'jellyfin'",
            ),
        ),
        AppSpec("vaultwarden", source_query("kube-*", "kubernetes.namespace_name = 'vaultwarden'")),
        AppSpec("mailu", source_query("kube-*", "kubernetes.namespace_name = 'mailu-mailserver'")),
        AppSpec("nextcloud", source_query("kube-*", "kubernetes.namespace_name = 'nextcloud'")),
        AppSpec("gitea", source_query("kube-*", "kubernetes.namespace_name = 'gitea'")),
        AppSpec("jenkins", source_query("kube-*", "kubernetes.namespace_name = 'jenkins'")),
        AppSpec("harbor", source_query("kube-*", "kubernetes.namespace_name = 'harbor'")),
        AppSpec("vault", source_query("kube-*", "kubernetes.namespace_name = 'vault'")),
        AppSpec("keycloak", source_query("kube-*", "kubernetes.namespace_name = 'sso'")),
        AppSpec("flux-system", source_query("kube-*", "kubernetes.namespace_name = 'flux-system'")),
        AppSpec("comms", source_query("kube-*", "kubernetes.namespace_name = 'comms'")),
        AppSpec(
            "element-web",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'comms' and kubernetes.container_name = 'element-web'",
            ),
        ),
        AppSpec(
            "element-call",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'element-call'",
            ),
        ),
        AppSpec(
            "matrix-synapse",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'comms' and kubernetes.container_name = 'synapse'",
            ),
        ),
        AppSpec(
            "livekit",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'livekit'",
            ),
        ),
        AppSpec(
            "coturn",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'coturn'",
            ),
        ),
        # Host-level journald logs rather than kube logs.
        AppSpec(
            "lesavka",
            source_query("journald-*", "_HOSTNAME = 'titan-jh'"),
            kind="journald",
        ),
    ]

    applications = [
        {
            "name": app.name,
            "description": app.description,
            "baseQuery": app.base_query,
            "servicesEntities": [],
            "traceGroups": [app.name],
        }
        for app in apps
    ]

    # Global catch-all queries first, then two per application.
    queries = [
        saved_query(QuerySpec("kube logs", source_query("kube-*"))),
        saved_query(QuerySpec("kube errors", f"{source_query('kube-*')} | where {kube_error}")),
        saved_query(QuerySpec("journald logs", source_query("journald-*"))),
        saved_query(QuerySpec("journald errors", f"{source_query('journald-*')} | where {journald_error}")),
    ]

    for app in apps:
        query_base = app.base_query
        # journald apps match on MESSAGE; kube apps on log/message fields.
        error_clause = journald_error if app.kind == "journald" else kube_error
        queries.append(saved_query(QuerySpec(f"{app.name} logs", query_base)))
        queries.append(saved_query(QuerySpec(f"{app.name} errors", f"{query_base} | where {error_clause}")))

    visualizations = [
        saved_visualization(
            VisualizationSpec(
                "[Kube] Logs per hour",
                "source = kube-* | stats count() as log_count by span(`@timestamp`, 1h)",
                "line",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Kube] Errors per hour",
                f"source = kube-* | where {kube_error} | stats count() as error_count by span(`@timestamp`, 1h)",
                "line",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Kube] Top namespaces",
                "source = kube-* | stats count() as log_count by kubernetes.namespace_name | sort - log_count",
                "bar",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Kube] Top error namespaces",
                f"source = kube-* | where {kube_error} | stats count() as error_count by kubernetes.namespace_name | sort - error_count",
                "bar",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Kube] Top pods",
                "source = kube-* | stats count() as log_count by kubernetes.pod_name | sort - log_count",
                "bar",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Kube] Top error pods",
                f"source = kube-* | where {kube_error} | stats count() as error_count by kubernetes.pod_name | sort - error_count",
                "bar",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Kube] Top nodes",
                "source = kube-* | stats count() as log_count by kubernetes.node_name | sort - log_count",
                "bar",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Journald] Top units",
                "source = journald-* | stats count() as log_count by _SYSTEMD_UNIT | sort - log_count",
                "bar",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Journald] Top error units",
                f"source = journald-* | where {journald_error} | stats count() as error_count by _SYSTEMD_UNIT | sort - error_count",
                "bar",
            )
        ),
    ]

    return applications, queries, visualizations
|
||||
|
||||
|
||||
def write_json(payload: list[dict], path: Path) -> None:
    """Write payload as 2-space-indented JSON plus a trailing newline.

    Parent directories are created as needed.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, indent=2)
    path.write_text(serialized + "\n")
|
||||
|
||||
|
||||
def render_configmap(apps_path: Path, queries_path: Path, vis_path: Path, output_path: Path) -> None:
    """Render the three JSON payload files into the committed ConfigMap manifest."""
    rendered = CONFIG_TEMPLATE.format(
        relative_path=output_path.relative_to(ROOT),
        applications=indent_payload(apps_path),
        queries=indent_payload(queries_path),
        visualizations=indent_payload(vis_path),
    )
    output_path.write_text(rendered)
|
||||
|
||||
|
||||
def indent_payload(path: Path) -> str:
    """Read a file and indent every line by four spaces (no trailing newline).

    Four spaces places the payload inside the ConfigMap's `|` block scalar.
    """
    indented = [f"    {line}" for line in path.read_text().splitlines()]
    return "\n".join(indented)
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: optionally rebuild payloads, then render the ConfigMap.

    With --build the JSON payloads are regenerated first; without it the
    ConfigMap is re-rendered from the committed JSON (which must exist).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--build", action="store_true", help="Regenerate JSON payloads and ConfigMap")
    args = parser.parse_args()

    if args.build:
        applications, queries, visualizations = build_objects()
        write_json(applications, APPS_PATH)
        write_json(queries, QUERIES_PATH)
        write_json(visualizations, VIS_PATH)

    # All three payloads are required for a render, whether just built or not.
    if not (APPS_PATH.exists() and QUERIES_PATH.exists() and VIS_PATH.exists()):
        raise SystemExit("Missing observability JSON payloads. Run with --build first.")

    render_configmap(APPS_PATH, QUERIES_PATH, VIS_PATH, CONFIG_PATH)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
149
scripts/monitoring_postmark_exporter.py
Normal file
149
scripts/monitoring_postmark_exporter.py
Normal file
@ -0,0 +1,149 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import datetime as dt
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
|
||||
import requests
|
||||
from prometheus_client import Gauge, Info, start_http_server
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Window:
    """A reporting window for Postmark outbound stats."""

    # value of the Prometheus `window` label (e.g. "7d")
    label: str
    # days back from today that the window starts (0 = today only)
    days: int
||||
|
||||
|
||||
# Reporting windows exposed via the `window` metric label.
WINDOWS = [
    Window("today", 0),
    Window("1d", 1),
    Window("7d", 7),
    Window("30d", 30),
]

# Runtime configuration, all via environment variables.
API_BASE = os.environ.get("POSTMARK_API_BASE", "https://api.postmarkapp.com").rstrip("/")
POLL_INTERVAL_SECONDS = int(os.environ.get("POLL_INTERVAL_SECONDS", "60"))
LISTEN_ADDRESS = os.environ.get("LISTEN_ADDRESS", "0.0.0.0")
LISTEN_PORT = int(os.environ.get("LISTEN_PORT", "8000"))

PRIMARY_TOKEN = os.environ.get("POSTMARK_SERVER_TOKEN", "").strip()
FALLBACK_TOKEN = os.environ.get("POSTMARK_SERVER_TOKEN_FALLBACK", "").strip()
# Which window's "sent" count is compared against the configured limit.
LIMIT_WINDOW = os.environ.get("POSTMARK_SENDING_LIMIT_WINDOW", "30d").strip()
LIMIT_RAW = os.environ.get("POSTMARK_SENDING_LIMIT", "").strip()
try:
    SENDING_LIMIT = float(LIMIT_RAW) if LIMIT_RAW else 0.0
except ValueError:
    # An unparseable limit behaves as "no limit configured".
    SENDING_LIMIT = 0.0

EXPORTER_INFO = Info("postmark_exporter", "Exporter build info")
EXPORTER_INFO.info(
    {
        "api_base": API_BASE,
        "windows": ",".join(window.label for window in WINDOWS),
    }
)

POSTMARK_API_UP = Gauge("postmark_api_up", "Whether Postmark API is reachable (1) or not (0)")
POSTMARK_LAST_SUCCESS = Gauge(
    "postmark_last_success_timestamp_seconds",
    "Unix timestamp of the last successful Postmark stats refresh",
)
# NOTE(review): declared as a Gauge although the `_total` suffix suggests a
# Counter would be conventional — confirm before changing the metric type,
# since dashboards may already query it.
POSTMARK_REQUEST_ERRORS = Gauge(
    "postmark_request_errors_total",
    "Total Postmark stats request errors since exporter start",
)

POSTMARK_OUTBOUND_SENT = Gauge(
    "postmark_outbound_sent",
    "Outbound emails sent within the selected window",
    labelnames=("window",),
)
POSTMARK_OUTBOUND_BOUNCED = Gauge(
    "postmark_outbound_bounced",
    "Outbound emails bounced within the selected window",
    labelnames=("window",),
)
POSTMARK_OUTBOUND_BOUNCE_RATE = Gauge(
    "postmark_outbound_bounce_rate",
    "Outbound bounce rate percentage within the selected window",
    labelnames=("window",),
)
POSTMARK_SENDING_LIMIT_GAUGE = Gauge(
    "postmark_sending_limit",
    "Configured Postmark sending limit for the active account",
)
POSTMARK_SENDING_LIMIT_USED = Gauge(
    "postmark_sending_limit_used",
    "Messages sent within the configured send limit window",
)
POSTMARK_SENDING_LIMIT_USED_PERCENT = Gauge(
    "postmark_sending_limit_used_percent",
    "Percent of the configured send limit used within the limit window",
)
|
||||
|
||||
|
||||
def fetch_outbound_stats(token: str, window: Window) -> dict:
    """GET /stats/outbound for a window ending today.

    Raises requests.HTTPError on non-2xx responses; returns the decoded JSON.
    """
    end = dt.date.today()
    start = end - dt.timedelta(days=window.days)
    response = requests.get(
        f"{API_BASE}/stats/outbound",
        headers={
            "Accept": "application/json",
            "X-Postmark-Server-Token": token,
        },
        params={"fromdate": start.isoformat(), "todate": end.isoformat()},
        timeout=15,
    )
    response.raise_for_status()
    return response.json()
|
||||
|
||||
|
||||
def update_metrics(token: str) -> None:
    """Refresh every outbound gauge from the Postmark stats API.

    Fetches each window, updates the per-window gauges, then derives the
    sending-limit gauges from the configured limit window's sent count.
    Propagates any fetch exception to the caller.
    """
    sent_by_window = {}
    for window in WINDOWS:
        data = fetch_outbound_stats(token, window)
        # API may return null for counts; coerce to 0.
        sent = int(data.get("Sent", 0) or 0)
        bounced = int(data.get("Bounced", 0) or 0)
        # Guard against division by zero when nothing was sent.
        rate = (bounced / sent * 100.0) if sent else 0.0
        sent_by_window[window.label] = sent
        POSTMARK_OUTBOUND_SENT.labels(window=window.label).set(sent)
        POSTMARK_OUTBOUND_BOUNCED.labels(window=window.label).set(bounced)
        POSTMARK_OUTBOUND_BOUNCE_RATE.labels(window=window.label).set(rate)

    POSTMARK_SENDING_LIMIT_GAUGE.set(SENDING_LIMIT)
    limit_window_sent = sent_by_window.get(LIMIT_WINDOW, 0)
    POSTMARK_SENDING_LIMIT_USED.set(limit_window_sent)
    if SENDING_LIMIT:
        POSTMARK_SENDING_LIMIT_USED_PERCENT.set(limit_window_sent / SENDING_LIMIT * 100.0)
    else:
        # No (or invalid) limit configured: report 0% rather than dividing by zero.
        POSTMARK_SENDING_LIMIT_USED_PERCENT.set(0.0)
|
||||
|
||||
|
||||
def main() -> None:
    """Serve Prometheus metrics and poll the Postmark API forever.

    Tokens are tried in priority order each cycle (primary first, then the
    fallback).  The previous implementation rotated the token on every
    iteration regardless of success, so every other scrape reported the
    fallback account's stats even when the primary token was healthy.
    """
    if not PRIMARY_TOKEN and not FALLBACK_TOKEN:
        raise SystemExit("POSTMARK_SERVER_TOKEN or POSTMARK_SERVER_TOKEN_FALLBACK is required")

    start_http_server(LISTEN_PORT, addr=LISTEN_ADDRESS)

    tokens = [token for token in (PRIMARY_TOKEN, FALLBACK_TOKEN) if token]

    while True:
        for token in tokens:
            try:
                update_metrics(token)
            except Exception as exc:  # noqa: BLE001
                POSTMARK_REQUEST_ERRORS.inc()
                print(f"postmark_exporter: refresh failed: {exc}", flush=True)
            else:
                POSTMARK_API_UP.set(1)
                POSTMARK_LAST_SUCCESS.set(time.time())
                break
        else:
            # Every configured token failed this cycle.
            POSTMARK_API_UP.set(0)
        time.sleep(POLL_INTERVAL_SECONDS)
|
||||
|
||||
|
||||
# Script entry point: start the metrics server and poll loop.
if __name__ == "__main__":
    main()
|
||||
35
scripts/monitoring_render_postmark_exporter.py
Normal file
35
scripts/monitoring_render_postmark_exporter.py
Normal file
@ -0,0 +1,35 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def indent(text: str, spaces: int) -> str:
    """Prefix every non-blank line of ``text`` with ``spaces`` spaces.

    Lines consisting only of a newline are left untouched so the rendered
    YAML block scalar carries no trailing whitespace on blank lines.
    """
    pad = " " * spaces
    out = []
    for line in text.splitlines(keepends=True):
        out.append(pad + line if line.strip("\n") else line)
    return "".join(out)
|
||||
|
||||
|
||||
def main() -> None:
    """Render the exporter script into its ConfigMap manifest.

    Reads scripts/monitoring_postmark_exporter.py and rewrites
    services/monitoring/postmark-exporter-script.yaml with the script
    embedded as a YAML block scalar.
    """
    repo_root = Path(__file__).resolve().parents[1]
    script_path = repo_root / "scripts" / "monitoring_postmark_exporter.py"
    manifest_path = repo_root / "services" / "monitoring" / "postmark-exporter-script.yaml"

    body = script_path.read_text(encoding="utf-8")
    # The block scalar needs a trailing newline to terminate cleanly.
    if not body.endswith("\n"):
        body += "\n"

    header = (
        "# services/monitoring/postmark-exporter-script.yaml\n"
        "apiVersion: v1\n"
        "kind: ConfigMap\n"
        "metadata:\n"
        "  name: postmark-exporter-script\n"
        "data:\n"
        "  monitoring_postmark_exporter.py: |\n"
    )
    manifest_path.write_text(header + indent(body, 4), encoding="utf-8")
|
||||
|
||||
|
||||
# Script entry point: re-render the ConfigMap manifest.
if __name__ == "__main__":
    main()
|
||||
@ -1,49 +0,0 @@
|
||||
#!/bin/bash
# Sync Keycloak realm users into the Nextcloud Mail app: one IMAP/SMTP
# account per user, authenticated with the user's mailu_app_password
# attribute.  Intended to run inside the Nextcloud container.
set -euo pipefail

# Required connection/auth settings; abort immediately if any is unset.
KC_BASE="${KC_BASE:?}"
KC_REALM="${KC_REALM:?}"
KC_ADMIN_USER="${KC_ADMIN_USER:?}"
KC_ADMIN_PASS="${KC_ADMIN_PASS:?}"

# jq/curl are needed below; install them on first run in the container.
if ! command -v jq >/dev/null 2>&1; then
  apt-get update && apt-get install -y jq curl >/dev/null
fi

account_exists() {
  # Skip if the account email is already present in the mail app.
  # NOTE(review): matches the email with a space on either side to avoid
  # substring hits in the table output — confirm against the actual
  # `occ mail:account:list` output format.
  runuser -u www-data -- php occ mail:account:list 2>/dev/null | grep -Fq " ${1}" || \
    runuser -u www-data -- php occ mail:account:list 2>/dev/null | grep -Fq "${1} "
}

# Obtain a Keycloak admin token via the admin-cli password grant.
token=$(
  curl -s -d "grant_type=password" \
    -d "client_id=admin-cli" \
    -d "username=${KC_ADMIN_USER}" \
    -d "password=${KC_ADMIN_PASS}" \
    "${KC_BASE}/realms/master/protocol/openid-connect/token" | jq -r '.access_token'
)

if [[ -z "${token}" || "${token}" == "null" ]]; then
  echo "Failed to obtain admin token"
  exit 1
fi

# Fetch up to 2000 realm users in a single page.
users=$(curl -s -H "Authorization: Bearer ${token}" \
  "${KC_BASE}/admin/realms/${KC_REALM}/users?max=2000")

echo "${users}" | jq -c '.[]' | while read -r user; do
  username=$(echo "${user}" | jq -r '.username')
  email=$(echo "${user}" | jq -r '.email // empty')
  app_pw=$(echo "${user}" | jq -r '.attributes.mailu_app_password[0] // empty')
  # Users without an email or an app password cannot be provisioned.
  [[ -z "${email}" || -z "${app_pw}" ]] && continue
  if account_exists "${email}"; then
    echo "Skipping ${email}, already exists"
    continue
  fi
  echo "Syncing ${email}"
  # IMAP over SSL (993) plus SMTP submission with STARTTLS (587); a failure
  # for one user must not abort the loop, hence the trailing `|| true`.
  runuser -u www-data -- php occ mail:account:create \
    "${username}" "${username}" "${email}" \
    mail.bstein.dev 993 ssl "${email}" "${app_pw}" \
    mail.bstein.dev 587 tls "${email}" "${app_pw}" login || true
done
|
||||
@ -1,65 +0,0 @@
|
||||
#!/bin/bash
# Nextcloud maintenance: apply Atlas theming, set the default quota, and
# reseed the External Sites app with the Atlas service links.
set -euo pipefail

NC_URL="${NC_URL:-https://cloud.bstein.dev}"
ADMIN_USER="${ADMIN_USER:?}"
ADMIN_PASS="${ADMIN_PASS:?}"

# Non-interactive install of the tooling used below.
export DEBIAN_FRONTEND=noninteractive
apt-get update -qq
apt-get install -y -qq curl jq >/dev/null

# Run an occ command as the web server user.
run_occ() {
  runuser -u www-data -- php occ "$@"
}

# Timestamped progress logging.
log() { echo "[$(date -Is)] $*"; }

log "Applying Atlas theming"
run_occ theming:config name "Atlas Cloud"
run_occ theming:config slogan "Unified access to Atlas services"
run_occ theming:config url "https://cloud.bstein.dev"
run_occ theming:config color "#0f172a"
run_occ theming:config disable-user-theming yes

log "Setting default quota to 200 GB"
run_occ config:app:set files default_quota --value "200 GB"

# External Sites OCS endpoints; all requests authenticate as the admin.
API_BASE="${NC_URL}/ocs/v2.php/apps/external/api/v1"
AUTH=(-u "${ADMIN_USER}:${ADMIN_PASS}" -H "OCS-APIRequest: true")

log "Removing existing external links"
existing=$(curl -sf "${AUTH[@]}" "${API_BASE}?format=json" | jq -r '.ocs.data[].id // empty')
for id in ${existing}; do
  # Best-effort delete; a missing id must not abort the run.
  curl -sf "${AUTH[@]}" -X DELETE "${API_BASE}/sites/${id}?format=json" >/dev/null || true
done

# "Name|URL" pairs to seed as external links.
SITES=(
  "Vaultwarden|https://vault.bstein.dev"
  "Jellyfin|https://stream.bstein.dev"
  "Gitea|https://scm.bstein.dev"
  "Jenkins|https://ci.bstein.dev"
  "Harbor|https://registry.bstein.dev"
  "Vault|https://secret.bstein.dev"
  "Jitsi|https://meet.bstein.dev"
  "Grafana|https://metrics.bstein.dev"
  "Chat LLM|https://chat.ai.bstein.dev"
  "Vision|https://draw.ai.bstein.dev"
  "STT/TTS|https://talk.ai.bstein.dev"
)

log "Seeding external links"
for entry in "${SITES[@]}"; do
  IFS="|" read -r name url <<<"${entry}"
  curl -sf "${AUTH[@]}" -X POST "${API_BASE}/sites?format=json" \
    -d "name=${name}" \
    -d "url=${url}" \
    -d "lang=" \
    -d "type=link" \
    -d "device=" \
    -d "icon=" \
    -d "groups[]=" \
    -d "redirect=1" >/dev/null
done

log "Maintenance run completed"
|
||||
509
scripts/test_atlas_user_cleanup.py
Executable file
509
scripts/test_atlas_user_cleanup.py
Executable file
@ -0,0 +1,509 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Clean up Atlas test users and portal requests (manual-only).
|
||||
|
||||
Default behavior is DRY RUN. This script is intended for operators to clean up
|
||||
test accounts created via the bstein-dev-home onboarding portal.
|
||||
|
||||
Targets (best-effort):
|
||||
- Keycloak users in realm "atlas"
|
||||
- Atlas portal Postgres rows (access_requests + dependent tables)
|
||||
- Vaultwarden users/invites created by the portal
|
||||
|
||||
Safety:
|
||||
- Requires an explicit username prefix (e.g. "test-")
|
||||
- Dry-run unless --apply is set
|
||||
- --apply requires an explicit --confirm guard
|
||||
- Validates prefixes to a conservative charset
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Iterable
|
||||
|
||||
|
||||
_SAFE_PREFIX_RE = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9._-]{0,63}$")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class KeycloakUser:
    """A Keycloak user as returned by the admin users API."""

    user_id: str  # Keycloak internal id ("id" field)
    username: str
    email: str  # may be empty when the account has no email set
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PortalRequestRow:
    """One row from the portal's access_requests table."""

    request_code: str
    username: str
    status: str
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class VaultwardenUser:
    """A Vaultwarden user as listed by the /admin/users endpoint."""

    user_id: str
    email: str
    status: int  # Vaultwarden "_status"; normalized to -1 when missing/non-int
|
||||
|
||||
|
||||
def _run(cmd: list[str], *, input_bytes: bytes | None = None) -> str:
|
||||
proc = subprocess.run(
|
||||
cmd,
|
||||
input=input_bytes,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
check=False,
|
||||
)
|
||||
if proc.returncode != 0:
|
||||
stderr = proc.stderr.decode("utf-8", errors="replace").strip()
|
||||
raise RuntimeError(f"command failed ({proc.returncode}): {' '.join(cmd)}\n{stderr}")
|
||||
return proc.stdout.decode("utf-8", errors="replace")
|
||||
|
||||
|
||||
def _kubectl_get_secret_value(namespace: str, name: str, key: str) -> str:
    """Read and base64-decode one key from a Kubernetes secret."""
    jsonpath = f"jsonpath={{.data.{key}}}"
    encoded = _run(
        ["kubectl", "-n", namespace, "get", "secret", name, "-o", jsonpath]
    ).strip()
    if not encoded:
        raise RuntimeError(f"secret {namespace}/{name} key {key} is empty")
    return base64.b64decode(encoded).decode("utf-8").strip()
|
||||
|
||||
|
||||
def _kubectl_first_pod(namespace: str) -> str:
    """Return the name of the first pod listed in ``namespace``."""
    listing = json.loads(_run(["kubectl", "-n", namespace, "get", "pods", "-o", "json"]))
    pods = listing.get("items") or []
    if not isinstance(pods, list) or not pods:
        raise RuntimeError(f"no pods found in namespace {namespace}")
    name = pods[0].get("metadata", {}).get("name")
    if not isinstance(name, str) or not name:
        raise RuntimeError(f"unexpected pod list in namespace {namespace}")
    return name
|
||||
|
||||
|
||||
def _validate_prefixes(prefixes: list[str]) -> list[str]:
    """Strip, validate, and return the non-empty prefixes in order.

    Exits with an error for any prefix outside the conservative charset,
    and when nothing usable remains after stripping.
    """
    cleaned: list[str] = []
    for raw in prefixes:
        candidate = raw.strip()
        if not candidate:
            continue
        if not _SAFE_PREFIX_RE.match(candidate):
            raise SystemExit(
                f"invalid prefix '{candidate}': must match {_SAFE_PREFIX_RE.pattern} (alnum plus ._-)"
            )
        cleaned.append(candidate)
    if not cleaned:
        raise SystemExit("at least one --prefix is required")
    return cleaned
|
||||
|
||||
|
||||
def _starts_with_any(value: str, prefixes: Iterable[str]) -> bool:
|
||||
return any(value.startswith(p) for p in prefixes)
|
||||
|
||||
|
||||
def _keycloak_token(server: str, realm: str, client_id: str, client_secret: str) -> str:
    """Fetch a Keycloak access token via the client_credentials grant."""
    form = {
        "grant_type": "client_credentials",
        "client_id": client_id,
        "client_secret": client_secret,
    }
    request = urllib.request.Request(
        f"{server}/realms/{realm}/protocol/openid-connect/token",
        data=urllib.parse.urlencode(form).encode("utf-8"),
        method="POST",
    )
    request.add_header("Content-Type", "application/x-www-form-urlencoded")
    with urllib.request.urlopen(request, timeout=15) as resp:
        body = json.loads(resp.read().decode("utf-8"))
    token = body.get("access_token")
    if not isinstance(token, str) or not token:
        raise RuntimeError("failed to obtain keycloak access token")
    return token
|
||||
|
||||
|
||||
def _keycloak_list_users(server: str, realm: str, token: str, search: str) -> list[KeycloakUser]:
    """Search realm users (up to 1000) and parse them into KeycloakUser records.

    Entries without a usable id/username are silently skipped.
    """
    query = urllib.parse.urlencode({"max": "1000", "search": search})
    request = urllib.request.Request(f"{server}/admin/realms/{realm}/users?{query}", method="GET")
    request.add_header("Authorization", f"Bearer {token}")
    with urllib.request.urlopen(request, timeout=30) as resp:
        payload = json.loads(resp.read().decode("utf-8"))
    if not isinstance(payload, list):
        raise RuntimeError("unexpected keycloak users response")
    parsed: list[KeycloakUser] = []
    for entry in payload:
        if not isinstance(entry, dict):
            continue
        user_id = entry.get("id")
        username = entry.get("username") or ""
        if not isinstance(user_id, str) or not user_id or not isinstance(username, str):
            continue
        parsed.append(
            KeycloakUser(user_id=user_id, username=username, email=str(entry.get("email") or ""))
        )
    return parsed
|
||||
|
||||
|
||||
def _keycloak_delete_user(server: str, realm: str, token: str, user_id: str) -> None:
    """Delete one realm user; a 404 (already gone) is treated as success."""
    request = urllib.request.Request(
        f"{server}/admin/realms/{realm}/users/{user_id}", method="DELETE"
    )
    request.add_header("Authorization", f"Bearer {token}")
    try:
        with urllib.request.urlopen(request, timeout=30) as resp:
            resp.read()
    except urllib.error.HTTPError as exc:
        # NOTE(review): urllib.error resolves only because urllib.request
        # imports it transitively — consider an explicit `import urllib.error`.
        if exc.code != 404:
            raise
|
||||
|
||||
|
||||
def _psql_json(portal_db_url: str, sql: str) -> list[dict[str, Any]]:
    """Run ``sql`` through psql inside the first pod of the postgres namespace.

    Output is parsed from unaligned tab-separated rows (-At -F "\\t"); each
    returned dict has a single "cols" key with the raw column strings.
    """
    pod = _kubectl_first_pod("postgres")
    output = _run(
        [
            "kubectl", "-n", "postgres", "exec", "-i", pod, "--",
            "psql", portal_db_url, "-At", "-F", "\t", "-c", sql,
        ]
    )
    return [{"cols": line.split("\t")} for line in output.splitlines()]
|
||||
|
||||
|
||||
def _portal_list_requests(portal_db_url: str, prefixes: list[str]) -> list[PortalRequestRow]:
    """List access_requests whose username starts with any of ``prefixes``.

    Prefixes are already restricted by _validate_prefixes to alnum plus
    ``._-`` (no quotes), so interpolating them into the SQL is injection-safe;
    LIKE wildcards still need escaping, since an underscore in a prefix would
    otherwise match ANY single character.
    """

    def _like(prefix: str) -> str:
        # Escape LIKE specials; backslash is PostgreSQL's default escape char.
        return prefix.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")

    clauses = " OR ".join([f"username LIKE '{_like(p)}%'" for p in prefixes])
    sql = (
        "SELECT request_code, username, status "
        "FROM access_requests "
        f"WHERE {clauses} "
        "ORDER BY created_at DESC;"
    )
    raw_rows = _psql_json(portal_db_url, sql)
    parsed: list[PortalRequestRow] = []
    for row in raw_rows:
        cols = row.get("cols") or []
        if len(cols) < 3:
            # Blank/short lines from psql output are not data rows.
            continue
        parsed.append(PortalRequestRow(request_code=cols[0], username=cols[1], status=cols[2]))
    return parsed
|
||||
|
||||
|
||||
def _portal_delete_requests(portal_db_url: str, prefixes: list[str]) -> int:
    """Delete matching access_requests rows and return the deleted count.

    Fixes two defects:
      * LIKE wildcards in prefixes are escaped (an underscore would otherwise
        match any single character).
      * The "DELETE <n>" tally regex previously used doubled backslashes
        inside a raw string (r"DELETE\\s+(\\d+)"), which matched literal
        backslashes — it could never match psql's command tag, so the
        function always reported 0 deletions.
    """
    escaped = [
        p.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_") for p in prefixes
    ]
    clauses = " OR ".join([f"username LIKE '{p}%'" for p in escaped])
    sql = f"DELETE FROM access_requests WHERE {clauses};"
    postgres_pod = _kubectl_first_pod("postgres")
    out = _run(
        [
            "kubectl", "-n", "postgres", "exec", "-i", postgres_pod, "--",
            "psql", portal_db_url, "-c", sql,
        ]
    )
    # psql prints a "DELETE <n>" command tag on success.
    match = re.search(r"DELETE\s+(\d+)", out)
    return int(match.group(1)) if match else 0
|
||||
|
||||
|
||||
def _vaultwarden_admin_cookie(admin_token: str, base_url: str) -> str:
    """Log into the Vaultwarden admin page and return its session cookie."""
    request = urllib.request.Request(
        f"{base_url}/admin",
        data=urllib.parse.urlencode({"token": admin_token}).encode("utf-8"),
        method="POST",
    )
    request.add_header("Content-Type", "application/x-www-form-urlencoded")
    try:
        with urllib.request.urlopen(request, timeout=10) as resp:
            set_cookie = resp.headers.get("Set-Cookie") or ""
    except urllib.error.HTTPError as exc:
        # Vaultwarden throttles the admin login endpoint aggressively.
        if exc.code == 429:
            raise RuntimeError("vaultwarden admin rate limited (HTTP 429)") from exc
        raise
    cookie = set_cookie.split(";", 1)[0].strip()
    if not cookie:
        raise RuntimeError("vaultwarden admin cookie missing")
    return cookie
|
||||
|
||||
|
||||
def _vaultwarden_list_users(base_url: str, cookie: str) -> list[VaultwardenUser]:
    """Fetch every user from the Vaultwarden admin API.

    Entries without a string id/email are skipped; a missing or non-integer
    "_status" field is normalized to -1.
    """
    request = urllib.request.Request(f"{base_url}/admin/users", method="GET")
    request.add_header("Cookie", cookie)
    try:
        with urllib.request.urlopen(request, timeout=30) as resp:
            payload = json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        if exc.code == 429:
            raise RuntimeError("vaultwarden admin rate limited (HTTP 429)") from exc
        raise
    if not isinstance(payload, list):
        raise RuntimeError("unexpected vaultwarden /admin/users response")
    parsed: list[VaultwardenUser] = []
    for entry in payload:
        if not isinstance(entry, dict):
            continue
        user_id = entry.get("id")
        email = entry.get("email")
        if not (isinstance(user_id, str) and user_id and isinstance(email, str) and email):
            continue
        status = entry.get("_status")
        parsed.append(
            VaultwardenUser(
                user_id=user_id,
                email=email,
                status=status if isinstance(status, int) else -1,
            )
        )
    return parsed
|
||||
|
||||
|
||||
def _vaultwarden_delete_user(base_url: str, cookie: str, user_id: str) -> None:
    """Delete one Vaultwarden user; 404 (already gone) is treated as success."""
    request = urllib.request.Request(f"{base_url}/admin/users/{user_id}", method="DELETE")
    request.add_header("Cookie", cookie)
    try:
        with urllib.request.urlopen(request, timeout=30) as resp:
            resp.read()
    except urllib.error.HTTPError as exc:
        if exc.code == 404:
            return
        if exc.code == 429:
            raise RuntimeError("vaultwarden admin rate limited (HTTP 429)") from exc
        raise
|
||||
|
||||
|
||||
def _port_forward(namespace: str, target: str, local_port: int, remote_port: int) -> subprocess.Popen[bytes]:
    """Start ``kubectl port-forward`` to ``target`` and return the process.

    stdout/stderr are discarded to avoid leaking internal details in output.
    The caller is responsible for terminating the returned process.
    """
    cmd = [
        "kubectl",
        "-n",
        namespace,
        "port-forward",
        target,
        f"{local_port}:{remote_port}",
        "--address",
        "127.0.0.1",
    ]
    return subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
||||
|
||||
|
||||
def main() -> int:
    """Entry point: enumerate and (with --apply --confirm) delete test users.

    Order of operations: portal DB rows first, then Keycloak users, then
    Vaultwarden users via a temporary port-forward.  Returns 0 on success,
    1 when the Vaultwarden step fails.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--prefix",
        action="append",
        default=[],
        help="Username prefix to match (repeatable). Example: --prefix test-",
    )
    parser.add_argument(
        "--apply",
        action="store_true",
        help="Actually delete; otherwise dry-run only.",
    )
    parser.add_argument(
        "--confirm",
        default="",
        help=(
            "Required when using --apply. Must exactly equal the comma-separated "
            "sorted prefix list (e.g. 'atlas-,bob-,e2e-,test-')."
        ),
    )
    parser.add_argument("--skip-keycloak", action="store_true", help="Skip Keycloak user deletion.")
    parser.add_argument("--skip-portal-db", action="store_true", help="Skip portal DB cleanup.")
    parser.add_argument("--skip-vaultwarden", action="store_true", help="Skip Vaultwarden cleanup.")
    parser.add_argument(
        "--protect-keycloak-username",
        action="append",
        default=[],
        help="Keycloak usernames that must never be deleted (repeatable).",
    )
    parser.add_argument(
        "--protect-vaultwarden-email",
        action="append",
        default=[],
        help="Vaultwarden emails that must never be deleted (repeatable).",
    )
    args = parser.parse_args()

    # Normalize inputs; --confirm must equal the sorted, comma-joined prefixes.
    prefixes = sorted(set(_validate_prefixes(args.prefix)))
    apply = bool(args.apply)
    expected_confirm = ",".join(prefixes)
    # "bstein" and "robotuser" are always protected, regardless of flags.
    protected_keycloak = {"bstein", "robotuser", *[u.strip() for u in args.protect_keycloak_username if u.strip()]}
    protected_vaultwarden = {e.strip() for e in args.protect_vaultwarden_email if e.strip()}

    if apply and args.confirm != expected_confirm:
        raise SystemExit(
            f"refusing to apply without --confirm '{expected_confirm}' (got '{args.confirm}')"
        )

    # Banner describing what is about to happen.
    print("Atlas test-user cleanup")
    print("prefixes:", expected_confirm)
    print("mode:", "APPLY (destructive)" if apply else "DRY RUN (no changes)")
    if protected_keycloak:
        print("protected keycloak usernames:", ", ".join(sorted(protected_keycloak)))
    if protected_vaultwarden:
        print("protected vaultwarden emails:", ", ".join(sorted(protected_vaultwarden)))
    print()

    # --- Portal database -------------------------------------------------
    if not args.skip_portal_db:
        portal_db_url = _kubectl_get_secret_value("bstein-dev-home", "atlas-portal-db", "PORTAL_DATABASE_URL")
        requests = _portal_list_requests(portal_db_url, prefixes)
        print(f"Portal DB: {len(requests)} access_requests matched")
        # Only the first 50 matches are listed to keep output readable.
        for row in requests[:50]:
            print(f"  {row.request_code}\t{row.status}\t{row.username}")
        if len(requests) > 50:
            print(f"  ... and {len(requests) - 50} more")
        if apply and requests:
            deleted = _portal_delete_requests(portal_db_url, prefixes)
            print(f"Portal DB: deleted {deleted} access_requests (cascade removes tasks/steps/artifacts).")
        print()

    # --- Keycloak ---------------------------------------------------------
    if not args.skip_keycloak:
        kc_server = os.getenv("KEYCLOAK_PUBLIC_URL", "https://sso.bstein.dev").rstrip("/")
        kc_realm = os.getenv("KEYCLOAK_REALM", "atlas")
        kc_client_id = os.getenv("KEYCLOAK_ADMIN_CLIENT_ID", "bstein-dev-home-admin")
        kc_client_secret = _kubectl_get_secret_value(
            "bstein-dev-home", "bstein-dev-home-keycloak-admin", "client_secret"
        )
        token = _keycloak_token(kc_server, kc_realm, kc_client_id, kc_client_secret)
        # Keycloak search is fuzzy: re-check the prefix and dedupe by user id.
        found: dict[str, KeycloakUser] = {}
        for prefix in prefixes:
            for user in _keycloak_list_users(kc_server, kc_realm, token, prefix):
                if not _starts_with_any(user.username, prefixes):
                    continue
                if user.username in protected_keycloak:
                    continue
                found[user.user_id] = user
        users = list(found.values())
        users.sort(key=lambda u: u.username)
        print(f"Keycloak: {len(users)} users matched")
        for user in users[:50]:
            email = user.email or "-"
            print(f"  {user.username}\t{email}\t{user.user_id}")
        if len(users) > 50:
            print(f"  ... and {len(users) - 50} more")
        if apply and users:
            for user in users:
                _keycloak_delete_user(kc_server, kc_realm, token, user.user_id)
            print(f"Keycloak: deleted {len(users)} users.")
        print()

    # --- Vaultwarden (via temporary port-forward) ------------------------
    if not args.skip_vaultwarden:
        pf = _port_forward("vaultwarden", "svc/vaultwarden-service", 18081, 80)
        try:
            # wait briefly for the port-forward to come up
            for _ in range(30):
                try:
                    urllib.request.urlopen("http://127.0.0.1:18081/", timeout=1).read(1)
                    break
                except Exception:
                    time.sleep(0.2)

            admin_token = _kubectl_get_secret_value("vaultwarden", "vaultwarden-admin", "ADMIN_TOKEN")
            base_url = "http://127.0.0.1:18081"
            try:
                # Login and listing are retried with exponential backoff
                # (capped at 60s) because the admin endpoint rate limits.
                cookie = ""
                for attempt in range(7):
                    try:
                        cookie = _vaultwarden_admin_cookie(admin_token, base_url)
                        break
                    except RuntimeError as exc:
                        if "rate limited" in str(exc).lower():
                            time.sleep(min(60.0, 2.0**attempt))
                            continue
                        raise
                if not cookie:
                    raise RuntimeError("vaultwarden admin login repeatedly rate limited")

                users: list[VaultwardenUser] = []
                for attempt in range(7):
                    try:
                        users = _vaultwarden_list_users(base_url, cookie)
                        break
                    except RuntimeError as exc:
                        if "rate limited" in str(exc).lower():
                            time.sleep(min(60.0, 2.0**attempt))
                            continue
                        raise
                # NOTE(review): an instance with genuinely zero users also
                # trips this error path — acceptable for this deployment.
                if not users:
                    raise RuntimeError("vaultwarden user list unavailable (possibly rate limited)")
            except RuntimeError as exc:
                print(f"Vaultwarden: ERROR: {exc}")
                print()
                return 1
            # Match on the local part of the email against the same prefixes.
            matched: list[VaultwardenUser] = []
            for user in users:
                local = user.email.split("@", 1)[0]
                if _starts_with_any(local, prefixes):
                    if user.email in protected_vaultwarden:
                        continue
                    matched.append(user)
            matched.sort(key=lambda u: u.email)
            print(f"Vaultwarden: {len(matched)} users matched")
            for user in matched[:50]:
                print(f"  {user.email}\tstatus={user.status}\t{user.user_id}")
            if len(matched) > 50:
                print(f"  ... and {len(matched) - 50} more")
            if apply and matched:
                for user in matched:
                    _vaultwarden_delete_user(base_url, cookie, user.user_id)
                print(f"Vaultwarden: deleted {len(matched)} users.")
            print()
        finally:
            # Always tear down the port-forward, even on early return.
            pf.terminate()
            try:
                pf.wait(timeout=3)
            except Exception:
                pf.kill()
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
276
scripts/test_user_cleanup.py
Executable file
276
scripts/test_user_cleanup.py
Executable file
@ -0,0 +1,276 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Iterable
|
||||
from urllib.parse import quote
|
||||
|
||||
import httpx
|
||||
|
||||
from atlas_portal import db, settings
|
||||
from atlas_portal.keycloak import admin_client
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class KeycloakUser:
    """A Keycloak user (id + username) from the admin users API."""

    id: str  # Keycloak internal id
    username: str
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PortalRequest:
    """One access_requests row from the portal database."""

    request_code: str
    username: str
    status: str
|
||||
|
||||
|
||||
def _dedupe_by_id(users: Iterable[KeycloakUser]) -> list[KeycloakUser]:
|
||||
seen: set[str] = set()
|
||||
out: list[KeycloakUser] = []
|
||||
for user in users:
|
||||
if user.id in seen:
|
||||
continue
|
||||
seen.add(user.id)
|
||||
out.append(user)
|
||||
return out
|
||||
|
||||
|
||||
def _iter_keycloak_users_for_prefix(prefix: str, max_results: int) -> list[KeycloakUser]:
    """Query Keycloak for users whose username starts with ``prefix``.

    Keycloak's search can return fuzzy matches, so a strict prefix match is
    re-applied client-side; service accounts are always excluded.
    """
    client = admin_client()
    if not client.ready():
        raise RuntimeError("keycloak admin client not configured in this environment")

    url = f"{settings.KEYCLOAK_ADMIN_URL}/admin/realms/{settings.KEYCLOAK_REALM}/users"
    params = {"search": prefix, "max": str(max_results), "briefRepresentation": "true"}
    with httpx.Client(timeout=settings.HTTP_CHECK_TIMEOUT_SEC) as http:
        resp = http.get(url, params=params, headers=client.headers())
        resp.raise_for_status()
        payload = resp.json()

    if not isinstance(payload, list):
        return []

    matches: list[KeycloakUser] = []
    for entry in payload:
        if not isinstance(entry, dict):
            continue
        name = entry.get("username")
        uid = entry.get("id")
        if not isinstance(name, str) or not isinstance(uid, str):
            continue
        if not name.startswith(prefix) or name.startswith("service-account-"):
            continue
        matches.append(KeycloakUser(id=uid, username=name))
    return matches
|
||||
|
||||
|
||||
def _find_keycloak_users(prefixes: list[str], max_results: int, protected: set[str]) -> list[KeycloakUser]:
    """Collect matching users across all prefixes, deduped, minus ``protected``."""
    collected: list[KeycloakUser] = []
    for prefix in prefixes:
        collected += _iter_keycloak_users_for_prefix(prefix, max_results=max_results)
    return [user for user in _dedupe_by_id(collected) if user.username not in protected]
|
||||
|
||||
|
||||
def _delete_keycloak_users(users: list[KeycloakUser]) -> None:
    """Delete the given Keycloak users; a 404 is ignored for idempotency."""
    if not users:
        return

    client = admin_client()
    if not client.ready():
        raise RuntimeError("keycloak admin client not configured in this environment")

    base = f"{settings.KEYCLOAK_ADMIN_URL}/admin/realms/{settings.KEYCLOAK_REALM}/users"
    with httpx.Client(timeout=settings.HTTP_CHECK_TIMEOUT_SEC) as http:
        for user in users:
            response = http.delete(f"{base}/{quote(user.id, safe='')}", headers=client.headers())
            # Deleting a non-existent user is treated as success.
            if response.status_code != 404:
                response.raise_for_status()
|
||||
|
||||
|
||||
def _find_portal_requests(prefixes: list[str], max_results: int) -> list[PortalRequest]:
    """Fetch matching access_requests rows; [] when the DB is not configured."""
    if not db.configured():
        return []

    raw_rows: list[dict[str, Any]] = []
    with db.connect() as conn:
        for prefix in prefixes:
            cursor = conn.execute(
                """
                SELECT request_code, username, status
                FROM access_requests
                WHERE username LIKE %s
                ORDER BY created_at DESC
                LIMIT %s
                """,
                (f"{prefix}%", max_results),
            )
            batch = cursor.fetchall()
            if isinstance(batch, list):
                raw_rows.extend(r for r in batch if isinstance(r, dict))

    requests: list[PortalRequest] = []
    for row in raw_rows:
        code = row.get("request_code")
        name = row.get("username")
        state = row.get("status")
        # Skip rows with unexpected/missing column types.
        if isinstance(code, str) and isinstance(name, str) and isinstance(state, str):
            requests.append(PortalRequest(request_code=code, username=name, status=state))
    return requests
|
||||
|
||||
|
||||
def _delete_portal_requests(prefixes: list[str]) -> int:
    """Delete access_requests rows whose username starts with any prefix.

    Returns the number of deleted rows (0 when the DB is not configured).
    Unlike the values here, prefixes are not charset-validated upstream, so
    LIKE wildcard characters are escaped: otherwise a prefix of "test_"
    would match "testX" (``_`` is a single-character wildcard) and a bare
    "%" would match every row.
    """
    if not db.configured():
        return 0

    deleted = 0
    with db.connect() as conn:
        for prefix in prefixes:
            # Backslash is PostgreSQL's default LIKE escape character.
            escaped = (
                prefix.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
            )
            cursor = conn.execute(
                "DELETE FROM access_requests WHERE username LIKE %s",
                (f"{escaped}%",),
            )
            deleted += cursor.rowcount or 0
    return deleted
|
||||
|
||||
|
||||
def _summarize_portal_requests(rows: list[PortalRequest]) -> dict[str, int]:
|
||||
counts: dict[str, int] = defaultdict(int)
|
||||
for row in rows:
|
||||
counts[row.status] += 1
|
||||
return dict(counts)
|
||||
|
||||
|
||||
def _parse_args(argv: list[str]) -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(
|
||||
prog="test_user_cleanup",
|
||||
description=(
|
||||
"Manual-only cleanup for test users/requests. "
|
||||
"This script is intended to be run inside the bstein-dev-home backend container."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--prefix",
|
||||
action="append",
|
||||
required=True,
|
||||
help="Username prefix to target (repeatable). Example: --prefix test-",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max",
|
||||
type=int,
|
||||
default=500,
|
||||
help="Maximum users/requests to enumerate per prefix (default: 500).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--apply",
|
||||
action="store_true",
|
||||
help="Apply deletions (default is dry-run). Requires --confirm.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--confirm",
|
||||
default="",
|
||||
help="Required when using --apply. Must exactly equal the comma-separated prefix list.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-keycloak",
|
||||
action="store_true",
|
||||
help="Skip deleting Keycloak users.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-portal",
|
||||
action="store_true",
|
||||
help="Skip deleting portal (DB) access requests.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--protect",
|
||||
action="append",
|
||||
default=[],
|
||||
help="Extra usernames to never delete (repeatable).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
help="List matched usernames/request codes.",
|
||||
)
|
||||
return parser.parse_args(argv)
|
||||
|
||||
|
||||
def main(argv: list[str]) -> int:
|
||||
args = _parse_args(argv)
|
||||
prefixes = sorted({p.strip() for p in args.prefix if p.strip()})
|
||||
if not prefixes:
|
||||
print("error: no valid --prefix values provided", file=sys.stderr)
|
||||
return 2
|
||||
|
||||
expected_confirm = ",".join(prefixes)
|
||||
protected = {"bstein", "robotuser", *[p.strip() for p in args.protect if p.strip()]}
|
||||
|
||||
if args.apply and args.confirm != expected_confirm:
|
||||
print(
|
||||
f"error: refusing to apply without --confirm '{expected_confirm}' (got '{args.confirm}')",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return 2
|
||||
|
||||
keycloak_users: list[KeycloakUser] = []
|
||||
portal_requests: list[PortalRequest] = []
|
||||
|
||||
if not args.skip_keycloak:
|
||||
keycloak_users = _find_keycloak_users(prefixes, max_results=args.max, protected=protected)
|
||||
|
||||
if not args.skip_portal:
|
||||
portal_requests = _find_portal_requests(prefixes, max_results=args.max)
|
||||
|
||||
print(f"prefixes: {expected_confirm}")
|
||||
print(f"mode: {'APPLY' if args.apply else 'DRY-RUN'}")
|
||||
if protected:
|
||||
print(f"protected usernames: {', '.join(sorted(protected))}")
|
||||
|
||||
if not args.skip_keycloak:
|
||||
print(f"keycloak users matched: {len(keycloak_users)}")
|
||||
if args.verbose and keycloak_users:
|
||||
for user in sorted(keycloak_users, key=lambda u: u.username):
|
||||
print(f" - {user.username}")
|
||||
|
||||
if not args.skip_portal:
|
||||
print(f"portal requests matched: {len(portal_requests)}")
|
||||
if portal_requests:
|
||||
summary = _summarize_portal_requests(portal_requests)
|
||||
summary_str = ", ".join(f"{k}={v}" for k, v in sorted(summary.items()))
|
||||
print(f" statuses: {summary_str}")
|
||||
if args.verbose and portal_requests:
|
||||
for req in portal_requests[: min(50, len(portal_requests))]:
|
||||
print(f" - {req.request_code} ({req.status})")
|
||||
if len(portal_requests) > 50:
|
||||
print(f" ... and {len(portal_requests) - 50} more")
|
||||
|
||||
if not args.apply:
|
||||
print("dry-run complete (no changes made)")
|
||||
return 0
|
||||
|
||||
if not args.skip_portal:
|
||||
deleted = _delete_portal_requests(prefixes)
|
||||
print(f"deleted portal requests: {deleted}")
|
||||
|
||||
if not args.skip_keycloak:
|
||||
_delete_keycloak_users(keycloak_users)
|
||||
print(f"deleted keycloak users: {len(keycloak_users)}")
|
||||
|
||||
print("done")
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main(sys.argv[1:]))
|
||||
|
||||
18
scripts/test_user_cleanup.sh
Executable file
18
scripts/test_user_cleanup.sh
Executable file
@ -0,0 +1,18 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# Manual-only helper to run `scripts/test_user_cleanup.py` inside the portal backend container.
|
||||
#
|
||||
# Usage (dry-run):
|
||||
# scripts/test_user_cleanup.sh --prefix test-
|
||||
#
|
||||
# Usage (apply):
|
||||
# scripts/test_user_cleanup.sh --prefix test- --apply --confirm test-
|
||||
|
||||
NS="${PORTAL_NAMESPACE:-bstein-dev-home}"
|
||||
TARGET="${PORTAL_BACKEND_EXEC_TARGET:-deploy/bstein-dev-home-backend}"
|
||||
|
||||
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
|
||||
|
||||
cat "${SCRIPT_DIR}/test_user_cleanup.py" | kubectl -n "${NS}" exec -i "${TARGET}" -- python - "$@"
|
||||
|
||||
318
scripts/test_vaultwarden_user_cleanup.py
Executable file
318
scripts/test_vaultwarden_user_cleanup.py
Executable file
@ -0,0 +1,318 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Clean up Vaultwarden test users and invites (manual-only).
|
||||
|
||||
This script deletes Vaultwarden rows directly from the Postgres database. It is
|
||||
intended only for removing test fallout (e.g. e2e-*, test-*) and is deliberately
|
||||
conservative:
|
||||
|
||||
- Requires one or more explicit email prefixes (repeatable).
|
||||
- Dry-run by default; --apply requires an exact --confirm guard.
|
||||
- Refuses to delete any user with dependent data in Vaultwarden tables.
|
||||
- Supports a protected email allowlist to prevent catastrophic mistakes.
|
||||
|
||||
Example (dry-run):
|
||||
scripts/test_vaultwarden_user_cleanup.py --prefix e2e-
|
||||
|
||||
Example (apply):
|
||||
scripts/test_vaultwarden_user_cleanup.py --prefix e2e- --apply --confirm e2e-
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from typing import Iterable, Sequence
|
||||
|
||||
|
||||
_SAFE_PREFIX_RE = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9._-]{0,63}$")
|
||||
_UUID_RE = re.compile(r"^[0-9a-fA-F-]{32,36}$")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class VaultwardenUser:
|
||||
uuid: str
|
||||
email: str
|
||||
dependent_rows: int
|
||||
|
||||
|
||||
def _run(cmd: Sequence[str], *, input_bytes: bytes | None = None) -> str:
|
||||
proc = subprocess.run(
|
||||
list(cmd),
|
||||
input=input_bytes,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
check=False,
|
||||
)
|
||||
if proc.returncode != 0:
|
||||
stderr = proc.stderr.decode("utf-8", errors="replace").strip()
|
||||
raise RuntimeError(f"command failed ({proc.returncode}): {' '.join(cmd)}\n{stderr}")
|
||||
return proc.stdout.decode("utf-8", errors="replace")
|
||||
|
||||
|
||||
def _kubectl_first_pod(namespace: str) -> str:
|
||||
raw = _run(["kubectl", "-n", namespace, "get", "pods", "-o", "json"])
|
||||
data = json.loads(raw)
|
||||
items = data.get("items") or []
|
||||
if not isinstance(items, list) or not items:
|
||||
raise RuntimeError(f"no pods found in namespace {namespace}")
|
||||
name = items[0].get("metadata", {}).get("name")
|
||||
if not isinstance(name, str) or not name:
|
||||
raise RuntimeError(f"unexpected pod list in namespace {namespace}")
|
||||
return name
|
||||
|
||||
|
||||
def _psql(sql: str) -> str:
|
||||
pod = _kubectl_first_pod("postgres")
|
||||
return _run(
|
||||
[
|
||||
"kubectl",
|
||||
"-n",
|
||||
"postgres",
|
||||
"exec",
|
||||
"-i",
|
||||
pod,
|
||||
"--",
|
||||
"psql",
|
||||
"-U",
|
||||
"postgres",
|
||||
"-d",
|
||||
"vaultwarden",
|
||||
"-At",
|
||||
"-F",
|
||||
"\t",
|
||||
"-c",
|
||||
sql,
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def _validate_prefixes(prefixes: Iterable[str]) -> list[str]:
|
||||
cleaned: list[str] = []
|
||||
for prefix in prefixes:
|
||||
prefix = prefix.strip()
|
||||
if not prefix:
|
||||
continue
|
||||
if not _SAFE_PREFIX_RE.match(prefix):
|
||||
raise SystemExit(
|
||||
f"invalid prefix '{prefix}': must match {_SAFE_PREFIX_RE.pattern} (alnum plus ._-)"
|
||||
)
|
||||
if not prefix.endswith("-"):
|
||||
raise SystemExit(f"refusing prefix '{prefix}': must end with '-' for safety")
|
||||
cleaned.append(prefix)
|
||||
if not cleaned:
|
||||
raise SystemExit("at least one --prefix is required")
|
||||
return sorted(set(cleaned))
|
||||
|
||||
|
||||
def _parse_rows(tsv: str) -> list[list[str]]:
|
||||
rows: list[list[str]] = []
|
||||
for line in tsv.splitlines():
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
rows.append(line.split("\t"))
|
||||
return rows
|
||||
|
||||
|
||||
def _sql_or_email_prefixes(prefixes: list[str]) -> str:
|
||||
# prefixes validated to safe charset; safe to interpolate.
|
||||
clauses = [f"email LIKE '{p}%'" for p in prefixes]
|
||||
return " OR ".join(clauses) if clauses else "FALSE"
|
||||
|
||||
|
||||
def _sql_quote(value: str) -> str:
|
||||
return "'" + value.replace("'", "''") + "'"
|
||||
|
||||
|
||||
def _sql_text_array(values: Iterable[str]) -> str:
|
||||
items = ",".join(_sql_quote(v) for v in values)
|
||||
return f"ARRAY[{items}]::text[]"
|
||||
|
||||
|
||||
def _list_users(prefixes: list[str], protected: set[str]) -> list[VaultwardenUser]:
|
||||
clause = _sql_or_email_prefixes(prefixes)
|
||||
sql = f"""
|
||||
WITH candidates AS (
|
||||
SELECT uuid, email
|
||||
FROM users
|
||||
WHERE enabled
|
||||
AND ({clause})
|
||||
AND email <> ALL({_sql_text_array(sorted(protected))})
|
||||
)
|
||||
SELECT
|
||||
candidates.uuid,
|
||||
candidates.email,
|
||||
(
|
||||
(SELECT COUNT(*) FROM auth_requests WHERE user_uuid = candidates.uuid) +
|
||||
(SELECT COUNT(*) FROM ciphers WHERE user_uuid = candidates.uuid) +
|
||||
(SELECT COUNT(*) FROM devices WHERE user_uuid = candidates.uuid) +
|
||||
(SELECT COUNT(*) FROM emergency_access WHERE grantor_uuid = candidates.uuid OR grantee_uuid = candidates.uuid) +
|
||||
(SELECT COUNT(*) FROM favorites WHERE user_uuid = candidates.uuid) +
|
||||
(SELECT COUNT(*) FROM folders WHERE user_uuid = candidates.uuid) +
|
||||
(SELECT COUNT(*) FROM sends WHERE user_uuid = candidates.uuid) +
|
||||
(SELECT COUNT(*) FROM twofactor WHERE user_uuid = candidates.uuid) +
|
||||
(SELECT COUNT(*) FROM twofactor_incomplete WHERE user_uuid = candidates.uuid) +
|
||||
(SELECT COUNT(*) FROM users_collections WHERE user_uuid = candidates.uuid) +
|
||||
(SELECT COUNT(*) FROM users_organizations WHERE user_uuid = candidates.uuid)
|
||||
) AS dependent_rows
|
||||
FROM candidates
|
||||
ORDER BY candidates.email;
|
||||
"""
|
||||
out = _psql(sql)
|
||||
users: list[VaultwardenUser] = []
|
||||
for row in _parse_rows(out):
|
||||
if len(row) < 3:
|
||||
continue
|
||||
uuid, email, dep_raw = row[0].strip(), row[1].strip(), row[2].strip()
|
||||
if not uuid or not email:
|
||||
continue
|
||||
if not _UUID_RE.match(uuid):
|
||||
continue
|
||||
try:
|
||||
dep = int(dep_raw)
|
||||
except ValueError:
|
||||
dep = 0
|
||||
users.append(VaultwardenUser(uuid=uuid, email=email, dependent_rows=dep))
|
||||
return users
|
||||
|
||||
|
||||
def _list_invitations(prefixes: list[str], protected: set[str]) -> list[str]:
|
||||
clause = _sql_or_email_prefixes(prefixes)
|
||||
protected_clause = ""
|
||||
if protected:
|
||||
protected_clause = f"AND email <> ALL({_sql_text_array(sorted(protected))})"
|
||||
sql = f"SELECT email FROM invitations WHERE ({clause}) {protected_clause} ORDER BY email;"
|
||||
out = _psql(sql)
|
||||
invites: list[str] = []
|
||||
for row in _parse_rows(out):
|
||||
if not row:
|
||||
continue
|
||||
email = row[0].strip()
|
||||
if email:
|
||||
invites.append(email)
|
||||
return invites
|
||||
|
||||
|
||||
def _delete_invitations(emails: list[str]) -> int:
|
||||
if not emails:
|
||||
return 0
|
||||
email_list = ",".join(_sql_quote(e) for e in emails)
|
||||
sql = f"DELETE FROM invitations WHERE email IN ({email_list});"
|
||||
out = _psql(sql)
|
||||
match = re.search(r"DELETE\s+(\d+)", out)
|
||||
return int(match.group(1)) if match else 0
|
||||
|
||||
|
||||
def _delete_users(uuids: list[str]) -> int:
|
||||
if not uuids:
|
||||
return 0
|
||||
uuid_list = ",".join(_sql_quote(u) for u in uuids)
|
||||
sql = f"DELETE FROM users WHERE uuid IN ({uuid_list});"
|
||||
out = _psql(sql)
|
||||
match = re.search(r"DELETE\s+(\d+)", out)
|
||||
return int(match.group(1)) if match else 0
|
||||
|
||||
|
||||
def _parse_args(argv: list[str]) -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(
|
||||
prog="test_vaultwarden_user_cleanup",
|
||||
description="Manual-only cleanup for Vaultwarden test users/invites (DB-level).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--prefix",
|
||||
action="append",
|
||||
required=True,
|
||||
help="Email prefix to target (repeatable). Example: --prefix e2e-",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--apply",
|
||||
action="store_true",
|
||||
help="Apply deletions (default is dry-run). Requires --confirm.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--confirm",
|
||||
default="",
|
||||
help="Required when using --apply. Must exactly equal the comma-separated prefix list.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--protect-email",
|
||||
action="append",
|
||||
default=[],
|
||||
help="Vaultwarden emails that must never be deleted (repeatable).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
help="List matched emails (and invitation emails).",
|
||||
)
|
||||
return parser.parse_args(argv)
|
||||
|
||||
|
||||
def main(argv: list[str]) -> int:
|
||||
args = _parse_args(argv)
|
||||
prefixes = _validate_prefixes(args.prefix)
|
||||
expected_confirm = ",".join(prefixes)
|
||||
|
||||
protected = {e.strip() for e in args.protect_email if e.strip()}
|
||||
protected |= {
|
||||
"brad@bstein.dev",
|
||||
"edstein87@outlook.com",
|
||||
"indifox8@gmail.com",
|
||||
"mgs.stein@gmail.com",
|
||||
"patriot87@gmail.com",
|
||||
}
|
||||
|
||||
if args.apply and args.confirm != expected_confirm:
|
||||
print(
|
||||
f"error: refusing to apply without --confirm '{expected_confirm}' (got '{args.confirm}')",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return 2
|
||||
|
||||
users = _list_users(prefixes, protected=protected)
|
||||
invites = _list_invitations(prefixes, protected=protected)
|
||||
|
||||
print(f"prefixes: {expected_confirm}")
|
||||
print(f"mode: {'APPLY' if args.apply else 'DRY-RUN'}")
|
||||
if protected:
|
||||
print(f"protected emails: {', '.join(sorted(protected))}")
|
||||
print(f"vaultwarden users matched: {len(users)}")
|
||||
print(f"vaultwarden invitations matched: {len(invites)}")
|
||||
|
||||
if args.verbose:
|
||||
for user in users[: min(100, len(users))]:
|
||||
print(f" user: {user.email} (deps={user.dependent_rows})")
|
||||
if len(users) > 100:
|
||||
print(f" ... and {len(users) - 100} more users")
|
||||
for email in invites[: min(100, len(invites))]:
|
||||
print(f" invite: {email}")
|
||||
if len(invites) > 100:
|
||||
print(f" ... and {len(invites) - 100} more invitations")
|
||||
|
||||
unsafe = [u for u in users if u.dependent_rows > 0]
|
||||
if unsafe:
|
||||
print("refusing to delete users with dependent data:", file=sys.stderr)
|
||||
for user in unsafe[: min(50, len(unsafe))]:
|
||||
print(f" - {user.email} deps={user.dependent_rows}", file=sys.stderr)
|
||||
if len(unsafe) > 50:
|
||||
print(f" ... and {len(unsafe) - 50} more", file=sys.stderr)
|
||||
return 2
|
||||
|
||||
if not args.apply:
|
||||
print("dry-run complete (no changes made)")
|
||||
return 0
|
||||
|
||||
deleted_invites = _delete_invitations(invites)
|
||||
deleted_users = _delete_users([u.uuid for u in users])
|
||||
print(f"deleted vaultwarden invitations: {deleted_invites}")
|
||||
print(f"deleted vaultwarden users: {deleted_users}")
|
||||
print("done")
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main(sys.argv[1:]))
|
||||
15
scripts/test_vaultwarden_user_cleanup.sh
Executable file
15
scripts/test_vaultwarden_user_cleanup.sh
Executable file
@ -0,0 +1,15 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# Manual-only helper to clean Vaultwarden test users and invites from Postgres.
|
||||
#
|
||||
# Usage (dry-run):
|
||||
# scripts/test_vaultwarden_user_cleanup.sh --prefix e2e-
|
||||
#
|
||||
# Usage (apply):
|
||||
# scripts/test_vaultwarden_user_cleanup.sh --prefix e2e- --apply --confirm e2e-
|
||||
|
||||
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
|
||||
|
||||
python3 "${SCRIPT_DIR}/test_vaultwarden_user_cleanup.py" "$@"
|
||||
|
||||
@ -20,7 +20,13 @@ def load_sync_module(monkeypatch):
|
||||
}
|
||||
for k, v in env.items():
|
||||
monkeypatch.setenv(k, v)
|
||||
module_path = pathlib.Path(__file__).resolve().parents[1] / "mailu_sync.py"
|
||||
module_path = (
|
||||
pathlib.Path(__file__).resolve().parents[2]
|
||||
/ "services"
|
||||
/ "mailu"
|
||||
/ "scripts"
|
||||
/ "mailu_sync.py"
|
||||
)
|
||||
spec = importlib.util.spec_from_file_location("mailu_sync_testmod", module_path)
|
||||
module = importlib.util.module_from_spec(spec)
|
||||
assert spec.loader is not None
|
||||
@ -102,7 +108,8 @@ def test_kc_get_users_paginates(monkeypatch):
|
||||
sync.SESSION = _PagedSession()
|
||||
users = sync.kc_get_users("tok")
|
||||
assert [u["id"] for u in users] == ["u1", "u2"]
|
||||
assert sync.SESSION.calls == 2
|
||||
# Pagination stops when results < page size.
|
||||
assert sync.SESSION.calls == 1
|
||||
|
||||
|
||||
def test_ensure_mailu_user_skips_foreign_domain(monkeypatch):
|
||||
@ -119,6 +126,7 @@ def test_ensure_mailu_user_skips_foreign_domain(monkeypatch):
|
||||
|
||||
def test_ensure_mailu_user_upserts(monkeypatch):
|
||||
sync = load_sync_module(monkeypatch)
|
||||
monkeypatch.setattr(sync.bcrypt_sha256, "hash", lambda password: f"hash:{password}")
|
||||
captured = {}
|
||||
|
||||
class _Cursor:
|
||||
@ -134,6 +142,7 @@ def test_ensure_mailu_user_upserts(monkeypatch):
|
||||
|
||||
def test_main_generates_password_and_upserts(monkeypatch):
|
||||
sync = load_sync_module(monkeypatch)
|
||||
monkeypatch.setattr(sync.bcrypt_sha256, "hash", lambda password: f"hash:{password}")
|
||||
users = [
|
||||
{"id": "u1", "username": "user1", "email": "user1@example.com", "attributes": {}},
|
||||
{"id": "u2", "username": "user2", "email": "user2@example.com", "attributes": {"mailu_app_password": ["keepme"]}},
|
||||
@ -176,6 +185,6 @@ def test_main_generates_password_and_upserts(monkeypatch):
|
||||
|
||||
sync.main()
|
||||
|
||||
# Should attempt two inserts (third user skipped due to domain mismatch)
|
||||
assert len(updated) == 1 # only one missing attr was backfilled
|
||||
assert conns and len(conns[0]._cursor.executions) == 2
|
||||
# Always backfill mailu_email, even if Keycloak recovery email is external.
|
||||
assert len(updated) == 3
|
||||
assert conns and len(conns[0]._cursor.executions) == 3
|
||||
|
||||
105
services/ai-llm/deployment.yaml
Normal file
105
services/ai-llm/deployment.yaml
Normal file
@ -0,0 +1,105 @@
|
||||
# services/ai-llm/deployment.yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: ollama
|
||||
namespace: ai
|
||||
spec:
|
||||
replicas: 1
|
||||
revisionHistoryLimit: 2
|
||||
strategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxSurge: 0
|
||||
maxUnavailable: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: ollama
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: ollama
|
||||
annotations:
|
||||
ai.bstein.dev/model: qwen2.5-coder:7b-instruct-q4_0
|
||||
ai.bstein.dev/gpu: GPU pool (titan-20/21/22/24)
|
||||
spec:
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: kubernetes.io/hostname
|
||||
operator: In
|
||||
values:
|
||||
- titan-20
|
||||
- titan-21
|
||||
- titan-22
|
||||
- titan-24
|
||||
runtimeClassName: nvidia
|
||||
volumes:
|
||||
- name: models
|
||||
persistentVolumeClaim:
|
||||
claimName: ollama-models
|
||||
initContainers:
|
||||
- name: warm-model
|
||||
image: ollama/ollama:latest
|
||||
env:
|
||||
- name: OLLAMA_HOST
|
||||
value: 0.0.0.0
|
||||
- name: NVIDIA_VISIBLE_DEVICES
|
||||
value: all
|
||||
- name: NVIDIA_DRIVER_CAPABILITIES
|
||||
value: compute,utility
|
||||
- name: OLLAMA_MODELS
|
||||
value: /root/.ollama
|
||||
- name: OLLAMA_MODEL
|
||||
value: qwen2.5-coder:7b-instruct-q4_0
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
- |
|
||||
set -e
|
||||
ollama serve >/tmp/ollama.log 2>&1 &
|
||||
sleep 6
|
||||
ollama pull "${OLLAMA_MODEL}"
|
||||
pkill ollama || true
|
||||
volumeMounts:
|
||||
- name: models
|
||||
mountPath: /root/.ollama
|
||||
resources:
|
||||
requests:
|
||||
cpu: 250m
|
||||
memory: 1Gi
|
||||
nvidia.com/gpu.shared: 1
|
||||
limits:
|
||||
nvidia.com/gpu.shared: 1
|
||||
containers:
|
||||
- name: ollama
|
||||
image: ollama/ollama:latest
|
||||
imagePullPolicy: IfNotPresent
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 11434
|
||||
env:
|
||||
- name: OLLAMA_HOST
|
||||
value: 0.0.0.0
|
||||
- name: OLLAMA_KEEP_ALIVE
|
||||
value: 6h
|
||||
- name: OLLAMA_MODELS
|
||||
value: /root/.ollama
|
||||
- name: NVIDIA_VISIBLE_DEVICES
|
||||
value: all
|
||||
- name: NVIDIA_DRIVER_CAPABILITIES
|
||||
value: compute,utility
|
||||
volumeMounts:
|
||||
- name: models
|
||||
mountPath: /root/.ollama
|
||||
resources:
|
||||
requests:
|
||||
cpu: "2"
|
||||
memory: 8Gi
|
||||
nvidia.com/gpu.shared: 1
|
||||
limits:
|
||||
cpu: "4"
|
||||
memory: 12Gi
|
||||
nvidia.com/gpu.shared: 1
|
||||
9
services/ai-llm/kustomization.yaml
Normal file
9
services/ai-llm/kustomization.yaml
Normal file
@ -0,0 +1,9 @@
|
||||
# services/ai-llm/kustomization.yaml
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
namespace: ai
|
||||
resources:
|
||||
- namespace.yaml
|
||||
- pvc.yaml
|
||||
- deployment.yaml
|
||||
- service.yaml
|
||||
5
services/ai-llm/namespace.yaml
Normal file
5
services/ai-llm/namespace.yaml
Normal file
@ -0,0 +1,5 @@
|
||||
# services/ai-llm/namespace.yaml
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: ai
|
||||
13
services/ai-llm/pvc.yaml
Normal file
13
services/ai-llm/pvc.yaml
Normal file
@ -0,0 +1,13 @@
|
||||
# services/ai-llm/pvc.yaml
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: ollama-models
|
||||
namespace: ai
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 30Gi
|
||||
storageClassName: astreae
|
||||
14
services/ai-llm/service.yaml
Normal file
14
services/ai-llm/service.yaml
Normal file
@ -0,0 +1,14 @@
|
||||
# services/ai-llm/service.yaml
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: ollama
|
||||
namespace: ai
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
app: ollama
|
||||
ports:
|
||||
- name: http
|
||||
port: 11434
|
||||
targetPort: 11434
|
||||
@ -5,7 +5,7 @@ metadata:
|
||||
name: bstein-dev-home-backend
|
||||
namespace: bstein-dev-home
|
||||
spec:
|
||||
replicas: 2
|
||||
replicas: 1
|
||||
revisionHistoryLimit: 3
|
||||
selector:
|
||||
matchLabels:
|
||||
@ -15,6 +15,8 @@ spec:
|
||||
labels:
|
||||
app: bstein-dev-home-backend
|
||||
spec:
|
||||
automountServiceAccountToken: true
|
||||
serviceAccountName: bstein-dev-home
|
||||
nodeSelector:
|
||||
kubernetes.io/arch: arm64
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
@ -22,8 +24,73 @@ spec:
|
||||
- name: harbor-bstein-robot
|
||||
containers:
|
||||
- name: backend
|
||||
image: registry.bstein.dev/bstein/bstein-dev-home-backend:latest
|
||||
image: registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"}
|
||||
imagePullPolicy: Always
|
||||
command: ["gunicorn"]
|
||||
args:
|
||||
- -b
|
||||
- 0.0.0.0:8080
|
||||
- --workers
|
||||
- "2"
|
||||
- --timeout
|
||||
- "180"
|
||||
- app:app
|
||||
env:
|
||||
- name: AI_CHAT_API
|
||||
value: http://ollama.ai.svc.cluster.local:11434
|
||||
- name: AI_CHAT_MODEL
|
||||
value: qwen2.5-coder:7b-instruct-q4_0
|
||||
- name: AI_CHAT_TIMEOUT_SEC
|
||||
value: "60"
|
||||
- name: AI_NODE_NAME
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: spec.nodeName
|
||||
- name: AI_NODE_GPU_MAP
|
||||
value: |
|
||||
{"titan-20": "Jetson Xavier (edge GPU)", "titan-21": "Jetson Xavier (edge GPU)", "titan-22": "RTX 3050 8GB (local GPU)", "titan-24": "RTX 3080 8GB (local GPU)"}
|
||||
- name: KEYCLOAK_ENABLED
|
||||
value: "true"
|
||||
- name: KEYCLOAK_URL
|
||||
value: https://sso.bstein.dev
|
||||
- name: KEYCLOAK_REALM
|
||||
value: atlas
|
||||
- name: KEYCLOAK_CLIENT_ID
|
||||
value: bstein-dev-home
|
||||
- name: KEYCLOAK_ISSUER
|
||||
value: https://sso.bstein.dev/realms/atlas
|
||||
- name: KEYCLOAK_JWKS_URL
|
||||
value: http://keycloak.sso.svc.cluster.local/realms/atlas/protocol/openid-connect/certs
|
||||
- name: KEYCLOAK_ADMIN_URL
|
||||
value: http://keycloak.sso.svc.cluster.local
|
||||
- name: KEYCLOAK_ADMIN_REALM
|
||||
value: atlas
|
||||
- name: KEYCLOAK_ADMIN_CLIENT_ID
|
||||
value: bstein-dev-home-admin
|
||||
- name: KEYCLOAK_ADMIN_CLIENT_SECRET
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: bstein-dev-home-keycloak-admin
|
||||
key: client_secret
|
||||
- name: ACCOUNT_ALLOWED_GROUPS
|
||||
value: ""
|
||||
- name: PORTAL_DATABASE_URL
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: atlas-portal-db
|
||||
key: PORTAL_DATABASE_URL
|
||||
- name: HTTP_CHECK_TIMEOUT_SEC
|
||||
value: "2"
|
||||
- name: ACCESS_REQUEST_SUBMIT_RATE_LIMIT
|
||||
value: "30"
|
||||
- name: ACCESS_REQUEST_SUBMIT_RATE_WINDOW_SEC
|
||||
value: "3600"
|
||||
- name: ACCESS_REQUEST_STATUS_RATE_LIMIT
|
||||
value: "120"
|
||||
- name: ACCESS_REQUEST_STATUS_RATE_WINDOW_SEC
|
||||
value: "60"
|
||||
- name: ACCESS_REQUEST_INTERNAL_EMAIL_ALLOWLIST
|
||||
value: robotuser@bstein.dev
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 8080
|
||||
@ -33,16 +100,18 @@ spec:
|
||||
port: http
|
||||
initialDelaySeconds: 2
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 3
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /api/healthz
|
||||
port: http
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 3
|
||||
resources:
|
||||
requests:
|
||||
cpu: 50m
|
||||
memory: 64Mi
|
||||
cpu: 100m
|
||||
memory: 128Mi
|
||||
limits:
|
||||
cpu: 300m
|
||||
memory: 256Mi
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
|
||||
69
services/bstein-dev-home/chat-ai-gateway-deployment.yaml
Normal file
69
services/bstein-dev-home/chat-ai-gateway-deployment.yaml
Normal file
@ -0,0 +1,69 @@
|
||||
# services/bstein-dev-home/chat-ai-gateway-deployment.yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: chat-ai-gateway
|
||||
namespace: bstein-dev-home
|
||||
spec:
|
||||
replicas: 1
|
||||
revisionHistoryLimit: 2
|
||||
selector:
|
||||
matchLabels:
|
||||
app: chat-ai-gateway
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: chat-ai-gateway
|
||||
spec:
|
||||
nodeSelector:
|
||||
kubernetes.io/arch: arm64
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
containers:
|
||||
- name: gateway
|
||||
image: python:3.11-slim
|
||||
command: ["/bin/sh","-c"]
|
||||
args:
|
||||
- python /app/gateway.py
|
||||
env:
|
||||
- name: UPSTREAM_URL
|
||||
value: http://bstein-dev-home-backend/api/chat
|
||||
- name: CHAT_KEY_MATRIX
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: chat-ai-keys-runtime
|
||||
key: matrix
|
||||
- name: CHAT_KEY_HOMEPAGE
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: chat-ai-keys-runtime
|
||||
key: homepage
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 8080
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: http
|
||||
initialDelaySeconds: 2
|
||||
periodSeconds: 5
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: http
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
resources:
|
||||
requests:
|
||||
cpu: 20m
|
||||
memory: 64Mi
|
||||
limits:
|
||||
cpu: 200m
|
||||
memory: 256Mi
|
||||
volumeMounts:
|
||||
- name: code
|
||||
mountPath: /app/gateway.py
|
||||
subPath: gateway.py
|
||||
volumes:
|
||||
- name: code
|
||||
configMap:
|
||||
name: chat-ai-gateway
|
||||
13
services/bstein-dev-home/chat-ai-gateway-service.yaml
Normal file
13
services/bstein-dev-home/chat-ai-gateway-service.yaml
Normal file
@ -0,0 +1,13 @@
|
||||
# services/bstein-dev-home/chat-ai-gateway-service.yaml
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: chat-ai-gateway
|
||||
namespace: bstein-dev-home
|
||||
spec:
|
||||
selector:
|
||||
app: chat-ai-gateway
|
||||
ports:
|
||||
- name: http
|
||||
port: 80
|
||||
targetPort: 8080
|
||||
@ -5,7 +5,7 @@ metadata:
|
||||
name: bstein-dev-home-frontend
|
||||
namespace: bstein-dev-home
|
||||
spec:
|
||||
replicas: 2
|
||||
replicas: 1
|
||||
revisionHistoryLimit: 3
|
||||
selector:
|
||||
matchLabels:
|
||||
@ -22,7 +22,7 @@ spec:
|
||||
- name: harbor-bstein-robot
|
||||
containers:
|
||||
- name: frontend
|
||||
image: registry.bstein.dev/bstein/bstein-dev-home-frontend:latest
|
||||
image: registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"}
|
||||
imagePullPolicy: Always
|
||||
ports:
|
||||
- name: http
|
||||
|
||||
@ -11,7 +11,7 @@ metadata:
|
||||
cert-manager.io/cluster-issuer: letsencrypt
|
||||
spec:
|
||||
tls:
|
||||
- hosts: [ "bstein.dev" ]
|
||||
- hosts: [ "bstein.dev", "chat.ai.bstein.dev" ]
|
||||
secretName: bstein-dev-home-tls
|
||||
rules:
|
||||
- host: bstein.dev
|
||||
@ -29,3 +29,12 @@ spec:
|
||||
service:
|
||||
name: bstein-dev-home-frontend
|
||||
port: { number: 80 }
|
||||
- host: chat.ai.bstein.dev
|
||||
http:
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: chat-ai-gateway
|
||||
port: { number: 80 }
|
||||
|
||||
@ -5,13 +5,38 @@ namespace: bstein-dev-home
|
||||
resources:
|
||||
- namespace.yaml
|
||||
- image.yaml
|
||||
- rbac.yaml
|
||||
- portal-e2e-client-secret-sync-rbac.yaml
|
||||
- chat-ai-gateway-deployment.yaml
|
||||
- chat-ai-gateway-service.yaml
|
||||
- frontend-deployment.yaml
|
||||
- frontend-service.yaml
|
||||
- backend-deployment.yaml
|
||||
- backend-service.yaml
|
||||
- vaultwarden-cred-sync-cronjob.yaml
|
||||
- portal-onboarding-e2e-test-job.yaml
|
||||
- ingress.yaml
|
||||
images:
|
||||
- name: registry.bstein.dev/bstein/bstein-dev-home-frontend
|
||||
newTag: 0.1.1-0 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"}
|
||||
newTag: registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"}
|
||||
- name: registry.bstein.dev/bstein/bstein-dev-home-backend
|
||||
newTag: 0.1.1-0 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"}
|
||||
newTag: registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"}
|
||||
configMapGenerator:
|
||||
- name: chat-ai-gateway
|
||||
namespace: bstein-dev-home
|
||||
files:
|
||||
- gateway.py=scripts/gateway.py
|
||||
options:
|
||||
disableNameSuffixHash: true
|
||||
- name: vaultwarden-cred-sync-script
|
||||
namespace: bstein-dev-home
|
||||
files:
|
||||
- vaultwarden_cred_sync.py=scripts/vaultwarden_cred_sync.py
|
||||
options:
|
||||
disableNameSuffixHash: true
|
||||
- name: portal-onboarding-e2e-tests
|
||||
namespace: bstein-dev-home
|
||||
files:
|
||||
- test_portal_onboarding_flow.py=scripts/test_portal_onboarding_flow.py
|
||||
options:
|
||||
disableNameSuffixHash: true
|
||||
|
||||
@ -0,0 +1,24 @@
|
||||
# services/bstein-dev-home/portal-e2e-client-secret-sync-rbac.yaml
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
name: portal-e2e-client-secret-sync-target
|
||||
namespace: bstein-dev-home
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["secrets"]
|
||||
verbs: ["get", "create", "patch", "update"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: portal-e2e-client-secret-sync-target
|
||||
namespace: bstein-dev-home
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: portal-e2e-client-secret-sync
|
||||
namespace: sso
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: Role
|
||||
name: portal-e2e-client-secret-sync-target
|
||||
66
services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
Normal file
66
services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
Normal file
@ -0,0 +1,66 @@
|
||||
# services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: portal-onboarding-e2e-test-11
|
||||
namespace: bstein-dev-home
|
||||
spec:
|
||||
backoffLimit: 0
|
||||
template:
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
containers:
|
||||
- name: test
|
||||
image: python:3.11-slim
|
||||
env:
|
||||
- name: PORTAL_BASE_URL
|
||||
value: http://bstein-dev-home-backend.bstein-dev-home.svc.cluster.local
|
||||
- name: KEYCLOAK_ADMIN_URL
|
||||
value: https://sso.bstein.dev
|
||||
- name: KEYCLOAK_REALM
|
||||
value: atlas
|
||||
- name: KEYCLOAK_ADMIN_CLIENT_ID
|
||||
value: bstein-dev-home-admin
|
||||
- name: KEYCLOAK_ADMIN_CLIENT_SECRET
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: bstein-dev-home-keycloak-admin
|
||||
key: client_secret
|
||||
- name: PORTAL_E2E_CLIENT_ID
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: portal-e2e-client
|
||||
key: client_id
|
||||
- name: PORTAL_E2E_CLIENT_SECRET
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: portal-e2e-client
|
||||
key: client_secret
|
||||
- name: PORTAL_TARGET_CLIENT_ID
|
||||
value: bstein-dev-home
|
||||
- name: E2E_PORTAL_ADMIN_USERNAME
|
||||
value: bstein
|
||||
- name: E2E_USERNAME_PREFIX
|
||||
value: e2e-portal
|
||||
- name: E2E_CONTACT_EMAIL
|
||||
value: robotuser@bstein.dev
|
||||
- name: E2E_IMAP_KEYCLOAK_USERNAME
|
||||
value: robotuser
|
||||
- name: E2E_DEADLINE_SECONDS
|
||||
value: "600"
|
||||
- name: E2E_POLL_SECONDS
|
||||
value: "10"
|
||||
command: ["/bin/sh", "-c"]
|
||||
args:
|
||||
- |
|
||||
set -euo pipefail
|
||||
python /scripts/test_portal_onboarding_flow.py
|
||||
volumeMounts:
|
||||
- name: tests
|
||||
mountPath: /scripts
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: tests
|
||||
configMap:
|
||||
name: portal-onboarding-e2e-tests
|
||||
defaultMode: 0555
|
||||
108
services/bstein-dev-home/rbac.yaml
Normal file
108
services/bstein-dev-home/rbac.yaml
Normal file
@ -0,0 +1,108 @@
|
||||
# services/bstein-dev-home/rbac.yaml
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: bstein-dev-home
|
||||
namespace: bstein-dev-home
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: bstein-dev-home-ai-reader
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["pods"]
|
||||
verbs: ["get", "list", "watch"]
|
||||
    resourceNames: []  # NOTE(review): an empty resourceNames list imposes no restriction (all pods); populate or remove if scoping was intended
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: bstein-dev-home-ai-reader
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: bstein-dev-home-ai-reader
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: bstein-dev-home
|
||||
namespace: bstein-dev-home
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: bstein-dev-home-vaultwarden-admin-secret-reader
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["secrets"]
|
||||
verbs: ["get"]
|
||||
resourceNames: ["vaultwarden-admin"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: bstein-dev-home-vaultwarden-admin-secret-reader
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: bstein-dev-home-vaultwarden-admin-secret-reader
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: bstein-dev-home
|
||||
namespace: bstein-dev-home
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
name: bstein-dev-home-vaultwarden-admin-token-reader
|
||||
namespace: vaultwarden
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["secrets"]
|
||||
verbs: ["get"]
|
||||
resourceNames: ["vaultwarden-admin"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: bstein-dev-home-vaultwarden-admin-token-reader
|
||||
namespace: vaultwarden
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: Role
|
||||
name: bstein-dev-home-vaultwarden-admin-token-reader
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: bstein-dev-home
|
||||
namespace: bstein-dev-home
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
name: bstein-dev-home-nextcloud-mail-sync
|
||||
namespace: nextcloud
|
||||
rules:
|
||||
- apiGroups: ["batch"]
|
||||
resources: ["cronjobs"]
|
||||
verbs: ["get"]
|
||||
resourceNames: ["nextcloud-mail-sync"]
|
||||
- apiGroups: ["batch"]
|
||||
resources: ["jobs"]
|
||||
verbs: ["create", "get", "list", "watch"]
|
||||
- apiGroups: [""]
|
||||
resources: ["pods"]
|
||||
verbs: ["get", "list"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: bstein-dev-home-nextcloud-mail-sync
|
||||
namespace: nextcloud
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: Role
|
||||
name: bstein-dev-home-nextcloud-mail-sync
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: bstein-dev-home
|
||||
namespace: bstein-dev-home
|
||||
70
services/bstein-dev-home/scripts/gateway.py
Normal file
70
services/bstein-dev-home/scripts/gateway.py
Normal file
@ -0,0 +1,70 @@
|
||||
import json
|
||||
import os
|
||||
from http.server import BaseHTTPRequestHandler, HTTPServer
|
||||
from urllib import request, error
|
||||
|
||||
# Backend endpoint that authenticated chat POSTs are proxied to.
UPSTREAM = os.environ.get("UPSTREAM_URL", "http://bstein-dev-home-backend/api/chat")
# Per-caller API keys; blank values are dropped from the allow-list below.
KEY_MATRIX = os.environ.get("CHAT_KEY_MATRIX", "")
KEY_HOMEPAGE = os.environ.get("CHAT_KEY_HOMEPAGE", "")

# Set of accepted x-api-key values; when empty every POST is rejected (fail closed).
ALLOWED = {k for k in (KEY_MATRIX, KEY_HOMEPAGE) if k}
|
||||
|
||||
class Handler(BaseHTTPRequestHandler):
    """Tiny authenticating reverse proxy in front of the chat backend.

    GET /healthz (or /) answers a liveness probe; POST / requires a known
    ``x-api-key`` header and forwards the JSON body to ``UPSTREAM``.
    """

    def _send_json(self, code: int, payload: dict):
        """Serialize ``payload`` as JSON and send it with HTTP status ``code``."""
        body = json.dumps(payload).encode()
        self.send_response(code)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def do_GET(self):  # noqa: N802
        """Health endpoint only; everything else is 404."""
        if self.path in ("/healthz", "/"):
            return self._send_json(200, {"ok": True})
        return self._send_json(404, {"error": "not_found"})

    def do_POST(self):  # noqa: N802
        """Authenticate via x-api-key, then proxy the request body to UPSTREAM."""
        if self.path != "/":
            return self._send_json(404, {"error": "not_found"})

        # Fail closed: with no configured keys, ALLOWED is empty and every
        # request is rejected.
        key = self.headers.get("x-api-key", "")
        if not key or key not in ALLOWED:
            return self._send_json(401, {"error": "unauthorized"})

        length = int(self.headers.get("content-length", "0") or "0")
        raw = self.rfile.read(length) if length else b"{}"

        try:
            upstream_req = request.Request(
                UPSTREAM,
                data=raw,
                headers={"Content-Type": "application/json"},
                method="POST",
            )
            with request.urlopen(upstream_req, timeout=90) as resp:
                data = resp.read()
                self.send_response(resp.status)
                for k, v in resp.headers.items():
                    # urlopen already decoded the body and we re-frame it with
                    # an explicit Content-Length below, so framing/hop-by-hop
                    # headers must not be copied through. BUG FIX: the original
                    # forwarded Transfer-Encoding, which corrupts responses the
                    # upstream sent chunked.
                    if k.lower() in (
                        "content-length",
                        "transfer-encoding",
                        "connection",
                        "server",
                        "date",
                    ):
                        continue
                    self.send_header(k, v)
                self.send_header("Content-Length", str(len(data)))
                self.end_headers()
                self.wfile.write(data)
        except error.HTTPError as e:
            # Pass upstream HTTP errors through with their original status.
            data = e.read() if hasattr(e, "read") else b""
            self.send_response(e.code)
            self.send_header("Content-Type", "application/json")
            self.send_header("Content-Length", str(len(data)))
            self.end_headers()
            self.wfile.write(data)
        except Exception:
            # Network failures / timeouts / unreachable upstream: generic 502.
            return self._send_json(502, {"error": "bad_gateway"})
|
||||
|
||||
def main():
    """Serve the gateway on PORT (default 8080) until interrupted."""
    listen_port = int(os.environ.get("PORT", "8080"))
    server = HTTPServer(("0.0.0.0", listen_port), Handler)
    server.serve_forever()
|
||||
|
||||
# Run the gateway when executed as a script.
if __name__ == "__main__":
    main()
|
||||
428
services/bstein-dev-home/scripts/test_portal_onboarding_flow.py
Normal file
428
services/bstein-dev-home/scripts/test_portal_onboarding_flow.py
Normal file
@ -0,0 +1,428 @@
|
||||
#!/usr/bin/env python3
|
||||
import email
|
||||
import http.client
|
||||
import imaplib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import ssl
|
||||
import sys
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
|
||||
|
||||
def _env(name: str, default: str | None = None) -> str:
|
||||
value = os.environ.get(name, default)
|
||||
if value is None or value == "":
|
||||
raise SystemExit(f"missing required env var: {name}")
|
||||
return value
|
||||
|
||||
|
||||
def _post_json(url: str, payload: dict, timeout_s: int = 30) -> dict:
    """POST ``payload`` as JSON to ``url`` and return the parsed JSON reply.

    An empty response body yields {}. Any HTTP error aborts the process with
    the status code and response text.
    """
    encoded = json.dumps(payload).encode()
    req = urllib.request.Request(
        url,
        data=encoded,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout_s) as resp:
            text = resp.read().decode()
    except urllib.error.HTTPError as exc:
        raw = exc.read().decode(errors="replace")
        raise SystemExit(f"HTTP {exc.code} from {url}: {raw}")
    return json.loads(text) if text else {}
|
||||
|
||||
|
||||
def _post_form(url: str, data: dict[str, str], timeout_s: int = 30) -> dict:
    """POST ``data`` url-encoded to ``url`` and return the parsed JSON reply.

    An empty response body yields {}. Any HTTP error aborts the process with
    the status code and response text.
    """
    encoded = urllib.parse.urlencode(data).encode()
    req = urllib.request.Request(
        url,
        data=encoded,
        headers={"Content-Type": "application/x-www-form-urlencoded"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout_s) as resp:
            text = resp.read().decode()
    except urllib.error.HTTPError as exc:
        raw = exc.read().decode(errors="replace")
        raise SystemExit(f"HTTP {exc.code} from {url}: {raw}")
    return json.loads(text) if text else {}
|
||||
|
||||
|
||||
def _get_json(url: str, headers: dict[str, str] | None = None, timeout_s: int = 30) -> object:
    """GET ``url`` and return the parsed JSON body (None when empty).

    Any HTTP error aborts the process with the status code and response text.
    """
    req = urllib.request.Request(url, headers=headers or {}, method="GET")
    try:
        with urllib.request.urlopen(req, timeout=timeout_s) as resp:
            text = resp.read().decode()
    except urllib.error.HTTPError as exc:
        raw = exc.read().decode(errors="replace")
        raise SystemExit(f"HTTP {exc.code} from {url}: {raw}")
    return json.loads(text) if text else None
|
||||
|
||||
|
||||
def _request_json(
    method: str,
    url: str,
    token: str,
    payload: dict | None = None,
    timeout_s: int = 30,
) -> dict:
    """Send a bearer-authenticated request, optionally with a JSON body.

    Returns the parsed JSON reply ({} when empty); aborts the process on any
    HTTP error with the status code and response text.
    """
    headers = {"Authorization": f"Bearer {token}"}
    body = None
    if payload is not None:
        body = json.dumps(payload).encode()
        headers["Content-Type"] = "application/json"
    req = urllib.request.Request(url, data=body, headers=headers, method=method)
    try:
        with urllib.request.urlopen(req, timeout=timeout_s) as resp:
            text = resp.read().decode()
    except urllib.error.HTTPError as exc:
        raw = exc.read().decode(errors="replace")
        raise SystemExit(f"HTTP {exc.code} from {url}: {raw}")
    return json.loads(text) if text else {}
|
||||
|
||||
|
||||
def _keycloak_client_token(keycloak_base: str, realm: str, client_id: str, client_secret: str) -> str:
    """Obtain a client-credentials access token from Keycloak for ``client_id``."""
    token_url = f"{keycloak_base.rstrip('/')}/realms/{realm}/protocol/openid-connect/token"
    form = {
        "grant_type": "client_credentials",
        "client_id": client_id,
        "client_secret": client_secret,
    }
    response = _post_form(token_url, form, timeout_s=20)
    access_token = response.get("access_token")
    if not (isinstance(access_token, str) and access_token):
        raise SystemExit("keycloak token response missing access_token")
    return access_token
|
||||
|
||||
|
||||
def _keycloak_token_exchange(
    *,
    keycloak_base: str,
    realm: str,
    client_id: str,
    client_secret: str,
    subject_token: str,
    requested_subject: str,
    audience: str,
) -> str:
    """Exchange ``subject_token`` for a token impersonating ``requested_subject``.

    Uses the RFC 8693 token-exchange grant against the realm token endpoint and
    returns the resulting access token.
    """
    token_url = f"{keycloak_base.rstrip('/')}/realms/{realm}/protocol/openid-connect/token"
    form = {
        "grant_type": "urn:ietf:params:oauth:grant-type:token-exchange",
        "client_id": client_id,
        "client_secret": client_secret,
        "subject_token": subject_token,
        "requested_subject": requested_subject,
        "audience": audience,
    }
    response = _post_form(token_url, form, timeout_s=20)
    access_token = response.get("access_token")
    if not (isinstance(access_token, str) and access_token):
        raise SystemExit("keycloak token exchange response missing access_token")
    return access_token
|
||||
|
||||
|
||||
def _keycloak_find_user(keycloak_base: str, realm: str, token: str, username: str) -> dict | None:
    """Exact-match lookup of a realm user; None when absent or malformed."""
    query = urllib.parse.urlencode({"username": username, "exact": "true", "max": "1"})
    url = f"{keycloak_base.rstrip('/')}/admin/realms/{realm}/users?{query}"
    users = _get_json(url, headers={"Authorization": f"Bearer {token}"}, timeout_s=20)
    if isinstance(users, list) and users:
        first = users[0]
        if isinstance(first, dict):
            return first
    return None
|
||||
|
||||
|
||||
def _keycloak_get_user(keycloak_base: str, realm: str, token: str, user_id: str) -> dict:
    """Fetch the full admin-API representation of one user; exit on bad payload."""
    encoded_id = urllib.parse.quote(user_id, safe="")
    url = f"{keycloak_base.rstrip('/')}/admin/realms/{realm}/users/{encoded_id}"
    data = _get_json(url, headers={"Authorization": f"Bearer {token}"}, timeout_s=20)
    if not isinstance(data, dict):
        raise SystemExit("unexpected keycloak user payload")
    return data
|
||||
|
||||
|
||||
def _extract_attr(attributes: object, key: str) -> str:
|
||||
if not isinstance(attributes, dict):
|
||||
return ""
|
||||
value = attributes.get(key)
|
||||
if isinstance(value, list) and value and isinstance(value[0], str):
|
||||
return value[0]
|
||||
if isinstance(value, str):
|
||||
return value
|
||||
return ""
|
||||
|
||||
|
||||
def _imap_wait_for_verify_token(
    *,
    host: str,
    port: int,
    username: str,
    password: str,
    request_code: str,
    deadline_sec: int,
) -> str:
    """Poll an IMAP inbox until the verification mail for ``request_code`` arrives.

    Searches INBOX for messages containing ``request_code``, extracts the first
    URL containing a ``verify=`` query parameter from the text/plain body and
    returns that token. Raises SystemExit when ``deadline_sec`` elapses first.
    """
    # The internal mail host serves a non-public certificate, so verification
    # is disabled — but via the public API rather than the private
    # ssl._create_unverified_context() helper the original used.
    ssl_context = ssl.create_default_context()
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE
    deadline_at = time.monotonic() + deadline_sec

    with imaplib.IMAP4_SSL(host, port, ssl_context=ssl_context) as client:
        client.login(username, password)
        client.select("INBOX")

        while time.monotonic() < deadline_at:
            status, data = client.search(None, "TEXT", request_code)
            if status == "OK" and data and data[0]:
                ids = data[0].split()
                msg_id = ids[-1]  # newest matching message
                fetch_status, msg_data = client.fetch(msg_id, "(RFC822)")
                if fetch_status != "OK" or not msg_data:
                    time.sleep(2)
                    continue

                raw = msg_data[0][1] if isinstance(msg_data[0], tuple) and len(msg_data[0]) > 1 else None
                if not isinstance(raw, (bytes, bytearray)):
                    time.sleep(2)
                    continue

                message = email.message_from_bytes(raw)
                body = None
                if message.is_multipart():
                    # Prefer the first text/plain part.
                    for part in message.walk():
                        if part.get_content_type() == "text/plain":
                            payload = part.get_payload(decode=True)
                            if isinstance(payload, (bytes, bytearray)):
                                body = payload.decode(errors="replace")
                            break
                else:
                    payload = message.get_payload(decode=True)
                    if isinstance(payload, (bytes, bytearray)):
                        body = payload.decode(errors="replace")

                if not body:
                    time.sleep(2)
                    continue

                url = None
                for line in body.splitlines():
                    candidate = line.strip()
                    if "verify=" in candidate and candidate.startswith("http"):
                        url = candidate
                        break
                if not url:
                    # BUG FIX: the original raw string used "\\S", which in a
                    # raw string is a literal backslash + S and never matches;
                    # "\S" (any non-whitespace) is what was intended.
                    match = re.search(r"https?://\S+verify=\S+", body)
                    url = match.group(0) if match else None
                if not url:
                    time.sleep(2)
                    continue

                parsed = urllib.parse.urlparse(url)
                query = urllib.parse.parse_qs(parsed.query)
                token = query.get("verify", [""])[0]
                if isinstance(token, str) and token:
                    return token
            time.sleep(2)

    raise SystemExit("verification email not found before deadline")
|
||||
|
||||
|
||||
def main() -> int:
    """End-to-end portal onboarding flow check.

    Submits an access request, pulls the verification token from the robot
    mailbox over IMAP, verifies it, approves the request as the portal admin
    via Keycloak token exchange, polls until provisioning finishes, then
    asserts the resulting Keycloak user state. Returns 0 on success; any
    failure exits via SystemExit with a diagnostic message.
    """
    # --- configuration from environment -----------------------------------
    portal_base = _env("PORTAL_BASE_URL").rstrip("/")

    keycloak_base = _env("KEYCLOAK_ADMIN_URL").rstrip("/")
    realm = _env("KEYCLOAK_REALM", "atlas")
    kc_admin_client_id = _env("KEYCLOAK_ADMIN_CLIENT_ID")
    kc_admin_client_secret = _env("KEYCLOAK_ADMIN_CLIENT_SECRET")
    portal_e2e_client_id = _env("PORTAL_E2E_CLIENT_ID")
    portal_e2e_client_secret = _env("PORTAL_E2E_CLIENT_SECRET")
    portal_target_client_id = os.environ.get("PORTAL_TARGET_CLIENT_ID", "bstein-dev-home").strip() or "bstein-dev-home"
    portal_admin_username = os.environ.get("E2E_PORTAL_ADMIN_USERNAME", "bstein").strip() or "bstein"

    contact_email = os.environ.get("E2E_CONTACT_EMAIL", "robotuser@bstein.dev").strip()
    if not contact_email:
        raise SystemExit("E2E_CONTACT_EMAIL must not be empty")

    imap_host = os.environ.get("E2E_IMAP_HOST", "mailu-front.mailu-mailserver.svc.cluster.local").strip()
    imap_port = int(os.environ.get("E2E_IMAP_PORT", "993"))
    imap_keycloak_username = os.environ.get("E2E_IMAP_KEYCLOAK_USERNAME", "robotuser").strip()
    imap_wait_sec = int(os.environ.get("E2E_IMAP_WAIT_SECONDS", "90"))

    # --- resolve the robot mailbox credentials from Keycloak attributes ----
    try:
        token = _keycloak_client_token(keycloak_base, realm, kc_admin_client_id, kc_admin_client_secret)
    except SystemExit as exc:
        raise SystemExit(f"failed to fetch keycloak token for admin client {kc_admin_client_id!r}: {exc}")
    mailbox_user = _keycloak_find_user(keycloak_base, realm, token, imap_keycloak_username)
    if not mailbox_user:
        raise SystemExit(f"unable to locate Keycloak mailbox user {imap_keycloak_username!r}")
    mailbox_user_id = mailbox_user.get("id")
    if not isinstance(mailbox_user_id, str) or not mailbox_user_id:
        raise SystemExit("mailbox user missing id")

    mailbox_full = _keycloak_get_user(keycloak_base, realm, token, mailbox_user_id)
    mailbox_attrs = mailbox_full.get("attributes")
    mailu_email = _extract_attr(mailbox_attrs, "mailu_email")
    if not mailu_email:
        mailu_email = contact_email
    mailu_password = _extract_attr(mailbox_attrs, "mailu_app_password")
    if not mailu_password:
        raise SystemExit(f"Keycloak user {imap_keycloak_username!r} missing mailu_app_password attribute")

    # --- submit an access request with a unique, timestamped username ------
    username_prefix = os.environ.get("E2E_USERNAME_PREFIX", "e2e-user")
    now = int(time.time())
    username = f"{username_prefix}-{now}"

    submit_url = f"{portal_base}/api/access/request"
    submit_payload = {"username": username, "email": contact_email, "note": "portal onboarding e2e"}
    submit = None
    # Retry transient connection failures up to 5 times.
    for attempt in range(1, 6):
        try:
            submit = _post_json(submit_url, submit_payload, timeout_s=20)
            break
        except (http.client.RemoteDisconnected, TimeoutError, urllib.error.URLError) as exc:
            if attempt == 5:
                raise SystemExit(f"portal submit failed after {attempt} attempts: {exc}")
            time.sleep(2)
    if not isinstance(submit, dict):
        raise SystemExit("portal submit did not return json")

    request_code = submit.get("request_code")
    if not isinstance(request_code, str) or not request_code:
        raise SystemExit(f"request submit did not return request_code: {submit}")

    # --- email verification: fetch token via IMAP, post it back ------------
    verify_token = _imap_wait_for_verify_token(
        host=imap_host,
        port=imap_port,
        username=mailu_email,
        password=mailu_password,
        request_code=request_code,
        deadline_sec=imap_wait_sec,
    )
    verify_resp = _post_json(
        f"{portal_base}/api/access/request/verify",
        {"request_code": request_code, "token": verify_token},
        timeout_s=30,
    )
    if not isinstance(verify_resp, dict) or verify_resp.get("ok") is not True:
        raise SystemExit(f"unexpected verify response: {verify_resp}")

    # --- approve the request as the portal admin via token exchange --------
    portal_admin = _keycloak_find_user(keycloak_base, realm, token, portal_admin_username)
    if not portal_admin:
        raise SystemExit(f"unable to locate portal admin user {portal_admin_username!r} via Keycloak admin API")
    portal_admin_user_id = portal_admin.get("id")
    if not isinstance(portal_admin_user_id, str) or not portal_admin_user_id:
        raise SystemExit("portal admin user missing id")

    try:
        e2e_subject_token = _keycloak_client_token(keycloak_base, realm, portal_e2e_client_id, portal_e2e_client_secret)
    except SystemExit as exc:
        raise SystemExit(f"failed to fetch keycloak token for E2E client {portal_e2e_client_id!r}: {exc}")
    try:
        portal_bearer = _keycloak_token_exchange(
            keycloak_base=keycloak_base,
            realm=realm,
            client_id=portal_e2e_client_id,
            client_secret=portal_e2e_client_secret,
            subject_token=e2e_subject_token,
            requested_subject=portal_admin_user_id,
            audience=portal_target_client_id,
        )
    except SystemExit as exc:
        raise SystemExit(f"failed to exchange token for portal approval as {portal_admin_username!r}: {exc}")

    approve_url = f"{portal_base}/api/admin/access/requests/{urllib.parse.quote(username, safe='')}/approve"
    approve_timeout_s = int(os.environ.get("E2E_APPROVE_TIMEOUT_SECONDS", "180"))
    approve_attempts = int(os.environ.get("E2E_APPROVE_ATTEMPTS", "3"))
    approve_resp = None
    approve_error = None
    for attempt in range(1, approve_attempts + 1):
        try:
            approve_resp = _request_json("POST", approve_url, portal_bearer, payload=None, timeout_s=approve_timeout_s)
            approve_error = None
            break
        except (http.client.RemoteDisconnected, TimeoutError, urllib.error.URLError) as exc:
            approve_error = str(exc)
            if attempt == approve_attempts:
                break
            time.sleep(3)
    # A missing approval response is tolerated: approval may have landed even
    # though the connection dropped, so we fall through to status polling.
    if approve_resp is None:
        print(
            "WARNING: portal approval request did not return a response; "
            f"continuing to poll status (last_error={approve_error})"
        )
    elif not isinstance(approve_resp, dict) or approve_resp.get("ok") is not True:
        raise SystemExit(f"unexpected approval response: {approve_resp}")

    # --- poll until provisioning reaches a terminal state -------------------
    status_url = f"{portal_base}/api/access/request/status"
    deadline_s = int(os.environ.get("E2E_DEADLINE_SECONDS", "600"))
    interval_s = int(os.environ.get("E2E_POLL_SECONDS", "10"))
    deadline_at = time.monotonic() + deadline_s

    last_status = None
    last_error = None
    while True:
        try:
            status_payload = _post_json(status_url, {"request_code": request_code}, timeout_s=60)
            last_error = None
        except (http.client.RemoteDisconnected, TimeoutError, urllib.error.URLError) as exc:
            last_error = str(exc)
            if time.monotonic() >= deadline_at:
                raise SystemExit(f"timed out waiting for provisioning to finish (last error={last_error})")
            time.sleep(interval_s)
            continue
        status = status_payload.get("status")
        if isinstance(status, str):
            last_status = status

        if status in ("awaiting_onboarding", "ready"):
            break
        if status in ("denied", "unknown"):
            raise SystemExit(f"request transitioned to unexpected terminal status: {status_payload}")
        if time.monotonic() >= deadline_at:
            suffix = f" (last error={last_error})" if last_error else ""
            raise SystemExit(f"timed out waiting for provisioning to finish (last status={last_status}){suffix}")
        time.sleep(interval_s)

    # Refresh admin token (it may expire during the provisioning wait).
    token = _keycloak_client_token(keycloak_base, realm, kc_admin_client_id, kc_admin_client_secret)

    # --- assert the provisioned Keycloak user looks right -------------------
    user = _keycloak_find_user(keycloak_base, realm, token, username)
    if not user:
        raise SystemExit("expected Keycloak user was not created")
    user_id = user.get("id")
    if not isinstance(user_id, str) or not user_id:
        raise SystemExit("created user missing id")

    full = _keycloak_get_user(keycloak_base, realm, token, user_id)
    required_actions = full.get("requiredActions") or []
    required: set[str] = set()
    if isinstance(required_actions, list):
        required = {a for a in required_actions if isinstance(a, str)}

    unexpected = sorted(required.intersection({"UPDATE_PASSWORD", "VERIFY_EMAIL", "CONFIGURE_TOTP"}))
    if unexpected:
        raise SystemExit(
            "Keycloak user should not require actions at first login "
            f"(Vaultwarden-first onboarding): unexpected requiredActions={unexpected} full={sorted(required)}"
        )

    email_verified = full.get("emailVerified")
    if email_verified is not True:
        raise SystemExit(f"Keycloak user should have emailVerified=true: emailVerified={email_verified!r}")

    kc_email = full.get("email")
    if isinstance(kc_email, str) and contact_email and kc_email != contact_email:
        raise SystemExit(f"Keycloak user email mismatch: expected {contact_email!r} got {kc_email!r}")

    print(f"PASS: onboarding provisioning completed for {request_code} ({username})")
    return 0
|
||||
|
||||
|
||||
# Propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
193
services/bstein-dev-home/scripts/vaultwarden_cred_sync.py
Normal file
193
services/bstein-dev-home/scripts/vaultwarden_cred_sync.py
Normal file
@ -0,0 +1,193 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
import time
|
||||
from typing import Any, Iterable
|
||||
|
||||
import httpx
|
||||
|
||||
from atlas_portal import settings
|
||||
from atlas_portal.keycloak import admin_client
|
||||
from atlas_portal.vaultwarden import invite_user
|
||||
|
||||
|
||||
# Keycloak user-attribute keys this sync reads/writes to stay idempotent.
VAULTWARDEN_EMAIL_ATTR = "vaultwarden_email"  # invite address actually used
VAULTWARDEN_STATUS_ATTR = "vaultwarden_status"  # e.g. "invited" / "already_present"
VAULTWARDEN_SYNCED_AT_ATTR = "vaultwarden_synced_at"  # UTC timestamp of last sync
|
||||
|
||||
|
||||
def _iter_keycloak_users(page_size: int = 200) -> Iterable[dict[str, Any]]:
    """Yield every user in the realm via the Keycloak admin API, page by page.

    Raises RuntimeError when the admin client is not configured.
    """
    client = admin_client()
    if not client.ready():
        raise RuntimeError("keycloak admin client not configured")

    url = f"{settings.KEYCLOAK_ADMIN_URL}/admin/realms/{settings.KEYCLOAK_REALM}/users"
    offset = 0
    while True:
        # Request the full (non-brief) representation: the idempotency
        # attributes (vaultwarden_status / vaultwarden_email) may be omitted
        # from Keycloak's default brief representation.
        headers = client.headers()
        query = {"first": str(offset), "max": str(page_size), "briefRepresentation": "false"}
        with httpx.Client(timeout=settings.HTTP_CHECK_TIMEOUT_SEC) as http:
            resp = http.get(url, params=query, headers=headers)
            resp.raise_for_status()
            page = resp.json()

        if not isinstance(page, list) or not page:
            return

        for entry in page:
            if isinstance(entry, dict):
                yield entry

        # A short page means we've reached the end of the realm.
        if len(page) < page_size:
            return
        offset += page_size
|
||||
|
||||
|
||||
def _extract_attr(attrs: Any, key: str) -> str:
|
||||
if not isinstance(attrs, dict):
|
||||
return ""
|
||||
raw = attrs.get(key)
|
||||
if isinstance(raw, list):
|
||||
for item in raw:
|
||||
if isinstance(item, str) and item.strip():
|
||||
return item.strip()
|
||||
return ""
|
||||
if isinstance(raw, str) and raw.strip():
|
||||
return raw.strip()
|
||||
return ""
|
||||
|
||||
|
||||
def _vaultwarden_email_for_user(user: dict[str, Any]) -> str:
    """Choose the Vaultwarden invite address for a Keycloak user, or "".

    Preference order: explicit vaultwarden_email attribute, then mailu_email,
    then the Keycloak email when it belongs to the Mailu domain. Returns ""
    when no trustworthy address exists.
    """
    raw_username = user.get("username")
    username = raw_username.strip() if isinstance(raw_username, str) else ""
    if not username:
        return ""

    attrs = user.get("attributes")
    for attr_key in (VAULTWARDEN_EMAIL_ATTR, "mailu_email"):
        candidate = _extract_attr(attrs, attr_key)
        if candidate:
            return candidate

    raw_email = user.get("email")
    email = raw_email.strip() if isinstance(raw_email, str) else ""
    if email and email.lower().endswith(f"@{settings.MAILU_DOMAIN.lower()}"):
        return email

    # Don't guess an internal mailbox address until Mailu sync has run and
    # stored mailu_email — that would spam Vaultwarden invites that can never
    # be delivered (unknown recipient).
    return ""
|
||||
|
||||
|
||||
def _set_user_attribute_if_missing(username: str, user: dict[str, Any], key: str, value: str) -> None:
    """Write ``key`` = ``value`` on the user unless blank or already present."""
    cleaned = (value or "").strip()
    if not cleaned:
        return
    if _extract_attr(user.get("attributes"), key):
        return
    admin_client().set_user_attribute(username, key, cleaned)
|
||||
|
||||
|
||||
def _set_user_attribute(username: str, key: str, value: str) -> None:
    """Unconditionally write ``key`` = ``value`` on the user; blanks are no-ops."""
    cleaned = (value or "").strip()
    if not cleaned:
        return
    admin_client().set_user_attribute(username, key, cleaned)
|
||||
|
||||
|
||||
def main() -> int:
    """Invite every eligible Keycloak user to Vaultwarden, idempotently.

    Iterates realm users, skips blank/disabled/service accounts and users
    already marked invited/present, invites the rest, and records the outcome
    back onto the user's Keycloak attributes. Returns 0 on full success,
    2 when any invite failed.
    """
    processed = 0
    created = 0
    skipped = 0
    failures = 0

    for user in _iter_keycloak_users():
        username = (user.get("username") if isinstance(user.get("username"), str) else "") or ""
        username = username.strip()
        if not username:
            skipped += 1
            continue

        # Disabled accounts are skipped; missing/True "enabled" is treated as active.
        enabled = user.get("enabled")
        if enabled is False:
            skipped += 1
            continue

        # Service accounts never get a Vaultwarden mailbox.
        if user.get("serviceAccountClientId") or username.startswith("service-account-"):
            skipped += 1
            continue

        # Fetch the full user payload so we can reliably read attributes (and skip re-invites).
        user_id = (user.get("id") if isinstance(user.get("id"), str) else "") or ""
        user_id = user_id.strip()
        full_user = user
        if user_id:
            try:
                full_user = admin_client().get_user(user_id)
            except Exception:
                # Best-effort: fall back to the (possibly brief) listing payload.
                full_user = user

        current_status = _extract_attr(full_user.get("attributes"), VAULTWARDEN_STATUS_ATTR)
        current_synced_at = _extract_attr(full_user.get("attributes"), VAULTWARDEN_SYNCED_AT_ATTR)
        email = _vaultwarden_email_for_user(full_user)
        if not email:
            print(f"skip {username}: missing email", file=sys.stderr)
            skipped += 1
            continue

        # Record the chosen invite address once; failures here must not block the invite.
        try:
            _set_user_attribute_if_missing(username, full_user, VAULTWARDEN_EMAIL_ATTR, email)
        except Exception:
            pass

        # If we've already successfully invited or confirmed presence, do not re-invite on every cron run.
        # Vaultwarden returns 409 for "already exists", which is idempotent but noisy and can trigger rate limits.
        if current_status in {"invited", "already_present"}:
            if not current_synced_at:
                # Backfill the sync timestamp for users recorded before it existed.
                try:
                    _set_user_attribute(
                        username,
                        VAULTWARDEN_SYNCED_AT_ATTR,
                        time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
                    )
                except Exception:
                    pass
            skipped += 1
            continue

        processed += 1
        result = invite_user(email)
        if result.ok:
            created += 1
            print(f"ok {username}: {result.status}")
            # Persist outcome for idempotency; attribute writes are best-effort.
            try:
                _set_user_attribute(username, VAULTWARDEN_STATUS_ATTR, result.status)
                _set_user_attribute(username, VAULTWARDEN_SYNCED_AT_ATTR, time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()))
            except Exception:
                pass
        else:
            failures += 1
            print(f"err {username}: {result.status} {result.detail}", file=sys.stderr)
            # Record the failure status too, so operators can inspect it in Keycloak.
            try:
                _set_user_attribute(username, VAULTWARDEN_STATUS_ATTR, result.status)
                _set_user_attribute(username, VAULTWARDEN_SYNCED_AT_ATTR, time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()))
            except Exception:
                pass

    print(
        f"done processed={processed} created_or_present={created} skipped={skipped} failures={failures}",
        file=sys.stderr,
    )
    return 0 if failures == 0 else 2
|
||||
|
||||
|
||||
# Exit with main()'s status (0 = success, 2 = at least one invite failed).
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
59
services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml
Normal file
59
services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml
Normal file
@ -0,0 +1,59 @@
|
||||
# services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml
|
||||
apiVersion: batch/v1
|
||||
kind: CronJob
|
||||
metadata:
|
||||
name: vaultwarden-cred-sync
|
||||
namespace: bstein-dev-home
|
||||
spec:
|
||||
schedule: "*/15 * * * *"
|
||||
concurrencyPolicy: Forbid
|
||||
successfulJobsHistoryLimit: 1
|
||||
failedJobsHistoryLimit: 3
|
||||
jobTemplate:
|
||||
spec:
|
||||
backoffLimit: 0
|
||||
template:
|
||||
spec:
|
||||
serviceAccountName: bstein-dev-home
|
||||
restartPolicy: Never
|
||||
nodeSelector:
|
||||
kubernetes.io/arch: arm64
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
imagePullSecrets:
|
||||
- name: harbor-bstein-robot
|
||||
containers:
|
||||
- name: sync
|
||||
image: registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"}
|
||||
imagePullPolicy: Always
|
||||
command:
|
||||
- python
|
||||
- /scripts/vaultwarden_cred_sync.py
|
||||
env:
|
||||
- name: PYTHONPATH
|
||||
value: /app
|
||||
- name: KEYCLOAK_ENABLED
|
||||
value: "true"
|
||||
- name: KEYCLOAK_REALM
|
||||
value: atlas
|
||||
- name: KEYCLOAK_ADMIN_URL
|
||||
value: http://keycloak.sso.svc.cluster.local
|
||||
- name: KEYCLOAK_ADMIN_REALM
|
||||
value: atlas
|
||||
- name: KEYCLOAK_ADMIN_CLIENT_ID
|
||||
value: bstein-dev-home-admin
|
||||
- name: KEYCLOAK_ADMIN_CLIENT_SECRET
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: bstein-dev-home-keycloak-admin
|
||||
key: client_secret
|
||||
- name: HTTP_CHECK_TIMEOUT_SEC
|
||||
value: "20"
|
||||
volumeMounts:
|
||||
- name: vaultwarden-cred-sync-script
|
||||
mountPath: /scripts
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: vaultwarden-cred-sync-script
|
||||
configMap:
|
||||
name: vaultwarden-cred-sync-script
|
||||
defaultMode: 0555
|
||||
@ -1,31 +0,0 @@
|
||||
# services/ci-demo/deployment.yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: ci-demo
|
||||
namespace: ci-demo
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: ci-demo
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: ci-demo
|
||||
spec:
|
||||
nodeSelector:
|
||||
hardware: rpi4
|
||||
containers:
|
||||
- name: ci-demo
|
||||
image: registry.bstein.dev/infra/ci-demo:latest
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 8080
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /
|
||||
port: http
|
||||
initialDelaySeconds: 2
|
||||
periodSeconds: 5
|
||||
|
||||
@ -1,24 +0,0 @@
|
||||
# services/ci-demo/image.yaml
|
||||
apiVersion: image.toolkit.fluxcd.io/v1
|
||||
kind: ImageRepository
|
||||
metadata:
|
||||
name: ci-demo
|
||||
namespace: flux-system
|
||||
spec:
|
||||
image: registry.bstein.dev/infra/ci-demo
|
||||
interval: 1m0s
|
||||
---
|
||||
apiVersion: image.toolkit.fluxcd.io/v1
|
||||
kind: ImagePolicy
|
||||
metadata:
|
||||
name: ci-demo
|
||||
namespace: flux-system
|
||||
spec:
|
||||
imageRepositoryRef:
|
||||
name: ci-demo
|
||||
filterTags:
|
||||
pattern: '^v(?P<version>0\.0\.0-\d+)$'
|
||||
extract: '$version'
|
||||
policy:
|
||||
semver:
|
||||
range: ">=0.0.0-0"
|
||||
@ -1,11 +0,0 @@
|
||||
# services/ci-demo/kustomization.yaml
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- namespace.yaml
|
||||
- image.yaml
|
||||
- deployment.yaml
|
||||
- service.yaml
|
||||
images:
|
||||
- name: registry.bstein.dev/infra/ci-demo
|
||||
newTag: registry.bstein.dev/infra/ci-demo:v0.0.0-3 # {"$imagepolicy": "flux-system:ci-demo"}
|
||||
@ -1,6 +0,0 @@
|
||||
# services/ci-demo/namespace.yaml
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: ci-demo
|
||||
|
||||
31
services/comms/NOTES.md
Normal file
31
services/comms/NOTES.md
Normal file
@ -0,0 +1,31 @@
|
||||
# services/comms/NOTES.md
|
||||
|
||||
Purpose: Matrix + Element + LiveKit stack for Othrys (live.bstein.dev).
|
||||
|
||||
Core flow
|
||||
- Matrix Authentication Service (MAS) handles login/SSO and issues Matrix access tokens.
|
||||
- Synapse is the homeserver; MAS fronts login, Synapse serves client/server APIs.
|
||||
- Element Web provides the main UI; Element Call embeds LiveKit for group video.
|
||||
- LiveKit handles SFU media; Coturn provides TURN for NAT traversal.
|
||||
- matrix-guest-register provisions MAS guest accounts and performs MAS password login to mint device-bound guest tokens (no Keycloak).
|
||||
|
||||
Operational jobs
|
||||
- mas-db-ensure-job: ensures MAS database role/database + secret in comms.
|
||||
- comms-secrets-ensure-job: creates runtime secrets (TURN, LiveKit, Synapse, atlasbot).
|
||||
- synapse-signingkey-ensure-job: ensures Synapse signing key secret.
|
||||
- synapse-seeder-admin-ensure-job: ensures Synapse admin user exists.
|
||||
- synapse-user-seed-job: seeds atlasbot + othrys-seeder users/passwords.
|
||||
- mas-local-users-ensure-job: ensures MAS local users exist (seeder/bot).
|
||||
- seed-othrys-room: (suspended) creates Othrys + joins locals.
|
||||
- reset-othrys-room: suspended CronJob for a manual room reset + pin invite.
|
||||
- pin-othrys-invite: (suspended) pin invite message if missing.
|
||||
- guest-name-randomizer: renames numeric/guest users to adj-noun names.
|
||||
- bstein-force-leave: one-off room leave cleanup.
|
||||
|
||||
Manual re-runs
|
||||
- Unsuspend a CronJob only when needed; re-suspend after completion.
|
||||
|
||||
Ports
|
||||
- Traefik (HTTPS) via LB on 192.168.22.9.
|
||||
- Coturn LB on 192.168.22.5 (3478/5349 + UDP range).
|
||||
- LiveKit LB on 192.168.22.6 (7880/7881/7882/7883).
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user