diff --git a/.gitignore b/.gitignore index 88b0632..8e09aa9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,8 @@ *.md !README.md +!knowledge/**/*.md +!services/comms/knowledge/**/*.md +__pycache__/ +*.py[cod] +.pytest_cache +.venv diff --git a/clusters/atlas/applications/kustomization.yaml b/clusters/atlas/applications/kustomization.yaml index a32ec81..ed6d795 100644 --- a/clusters/atlas/applications/kustomization.yaml +++ b/clusters/atlas/applications/kustomization.yaml @@ -5,8 +5,9 @@ resources: - ../../services/crypto - ../../services/gitea - ../../services/jellyfin - - ../../services/jitsi + - ../../services/comms - ../../services/monitoring + - ../../services/logging - ../../services/pegasus - ../../services/vault - ../../services/bstein-dev-home diff --git a/clusters/atlas/flux-system/applications/ai-llm/kustomization.yaml b/clusters/atlas/flux-system/applications/ai-llm/kustomization.yaml new file mode 100644 index 0000000..3572a6c --- /dev/null +++ b/clusters/atlas/flux-system/applications/ai-llm/kustomization.yaml @@ -0,0 +1,23 @@ +# clusters/atlas/flux-system/applications/ai-llm/kustomization.yaml +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: ai-llm + namespace: flux-system +spec: + interval: 10m + path: ./services/ai-llm + targetNamespace: ai + prune: true + sourceRef: + kind: GitRepository + name: flux-system + namespace: flux-system + wait: true + healthChecks: + - apiVersion: apps/v1 + kind: Deployment + name: ollama + namespace: ai + dependsOn: + - name: core diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml index 6245fb0..ddd55a1 100644 --- a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml +++ b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml @@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1 kind: ImageUpdateAutomation 
metadata: name: bstein-dev-home - namespace: flux-system + namespace: bstein-dev-home spec: interval: 1m0s sourceRef: @@ -13,14 +13,14 @@ spec: git: checkout: ref: - branch: feature/bstein-dev-home + branch: feature/sso-hardening commit: author: email: ops@bstein.dev name: flux-bot - messageTemplate: "chore(bstein-dev-home): update images to {{range .Updated.Images}}{{.}}{{end}}" + messageTemplate: "chore(bstein-dev-home): automated image update" push: - branch: feature/bstein-dev-home + branch: feature/sso-hardening update: strategy: Setters path: services/bstein-dev-home diff --git a/clusters/atlas/flux-system/applications/ci-demo/image-automation.yaml b/clusters/atlas/flux-system/applications/ci-demo/image-automation.yaml deleted file mode 100644 index dd3e85e..0000000 --- a/clusters/atlas/flux-system/applications/ci-demo/image-automation.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# clusters/atlas/flux-system/applications/ci-demo/image-automation.yaml -apiVersion: image.toolkit.fluxcd.io/v1 -kind: ImageUpdateAutomation -metadata: - name: ci-demo - namespace: flux-system -spec: - interval: 1m0s - sourceRef: - kind: GitRepository - name: flux-system - namespace: flux-system - git: - checkout: - ref: - branch: feature/ci-gitops - commit: - author: - email: ops@bstein.dev - name: flux-bot - messageTemplate: "chore(ci-demo): apply image updates" - push: - branch: feature/ci-gitops - update: - strategy: Setters - path: services/ci-demo diff --git a/clusters/atlas/flux-system/applications/comms/kustomization.yaml b/clusters/atlas/flux-system/applications/comms/kustomization.yaml new file mode 100644 index 0000000..0fb664a --- /dev/null +++ b/clusters/atlas/flux-system/applications/comms/kustomization.yaml @@ -0,0 +1,17 @@ +# clusters/atlas/flux-system/applications/comms/kustomization.yaml +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: comms + namespace: flux-system +spec: + interval: 10m + prune: true + sourceRef: + kind: 
GitRepository + name: flux-system + path: ./services/comms + targetNamespace: comms + timeout: 2m + dependsOn: + - name: traefik diff --git a/clusters/atlas/flux-system/applications/jellyfin/kustomization.yaml b/clusters/atlas/flux-system/applications/jellyfin/kustomization.yaml index 0d314ca..dda35d7 100644 --- a/clusters/atlas/flux-system/applications/jellyfin/kustomization.yaml +++ b/clusters/atlas/flux-system/applications/jellyfin/kustomization.yaml @@ -15,5 +15,6 @@ spec: namespace: flux-system dependsOn: - name: core + - name: openldap wait: true timeout: 5m diff --git a/clusters/atlas/flux-system/applications/jenkins/kustomization.yaml b/clusters/atlas/flux-system/applications/jenkins/kustomization.yaml index 98a7211..37d3d23 100644 --- a/clusters/atlas/flux-system/applications/jenkins/kustomization.yaml +++ b/clusters/atlas/flux-system/applications/jenkins/kustomization.yaml @@ -16,8 +16,12 @@ spec: - name: helm - name: traefik healthChecks: - - apiVersion: helm.toolkit.fluxcd.io/v2 - kind: HelmRelease + - apiVersion: apps/v1 + kind: Deployment + name: jenkins + namespace: jenkins + - apiVersion: v1 + kind: Service name: jenkins namespace: jenkins wait: false diff --git a/clusters/atlas/flux-system/applications/kustomization.yaml b/clusters/atlas/flux-system/applications/kustomization.yaml index 5825734..6788653 100644 --- a/clusters/atlas/flux-system/applications/kustomization.yaml +++ b/clusters/atlas/flux-system/applications/kustomization.yaml @@ -4,7 +4,8 @@ kind: Kustomization resources: - gitea/kustomization.yaml - vault/kustomization.yaml - - jitsi/kustomization.yaml + - vaultwarden/kustomization.yaml + - comms/kustomization.yaml - crypto/kustomization.yaml - monerod/kustomization.yaml - pegasus/kustomization.yaml @@ -16,9 +17,14 @@ resources: - jellyfin/kustomization.yaml - xmr-miner/kustomization.yaml - sui-metrics/kustomization.yaml + - openldap/kustomization.yaml - keycloak/kustomization.yaml - oauth2-proxy/kustomization.yaml - 
mailu/kustomization.yaml - jenkins/kustomization.yaml - - ci-demo/kustomization.yaml - - ci-demo/image-automation.yaml + - ai-llm/kustomization.yaml + - nextcloud/kustomization.yaml + - nextcloud-mail-sync/kustomization.yaml + - postgres/kustomization.yaml + - outline/kustomization.yaml + - planka/kustomization.yaml diff --git a/clusters/atlas/flux-system/applications/nextcloud-mail-sync/kustomization.yaml b/clusters/atlas/flux-system/applications/nextcloud-mail-sync/kustomization.yaml new file mode 100644 index 0000000..1eef5c4 --- /dev/null +++ b/clusters/atlas/flux-system/applications/nextcloud-mail-sync/kustomization.yaml @@ -0,0 +1,17 @@ +# clusters/atlas/flux-system/applications/nextcloud-mail-sync/kustomization.yaml +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: nextcloud-mail-sync + namespace: flux-system +spec: + interval: 10m + prune: true + sourceRef: + kind: GitRepository + name: flux-system + path: ./services/nextcloud-mail-sync + targetNamespace: nextcloud + timeout: 2m + dependsOn: + - name: keycloak diff --git a/clusters/atlas/flux-system/applications/nextcloud/kustomization.yaml b/clusters/atlas/flux-system/applications/nextcloud/kustomization.yaml new file mode 100644 index 0000000..9bc39c1 --- /dev/null +++ b/clusters/atlas/flux-system/applications/nextcloud/kustomization.yaml @@ -0,0 +1,16 @@ +# clusters/atlas/flux-system/applications/nextcloud/kustomization.yaml +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: nextcloud + namespace: flux-system +spec: + interval: 10m + path: ./services/nextcloud + targetNamespace: nextcloud + prune: true + sourceRef: + kind: GitRepository + name: flux-system + namespace: flux-system + wait: true diff --git a/clusters/atlas/flux-system/applications/jitsi/kustomization.yaml b/clusters/atlas/flux-system/applications/openldap/kustomization.yaml similarity index 66% rename from 
clusters/atlas/flux-system/applications/jitsi/kustomization.yaml rename to clusters/atlas/flux-system/applications/openldap/kustomization.yaml index 8e96feb..d4657c0 100644 --- a/clusters/atlas/flux-system/applications/jitsi/kustomization.yaml +++ b/clusters/atlas/flux-system/applications/openldap/kustomization.yaml @@ -1,18 +1,18 @@ -# clusters/atlas/flux-system/applications/jitsi/kustomization.yaml +# clusters/atlas/flux-system/applications/openldap/kustomization.yaml apiVersion: kustomize.toolkit.fluxcd.io/v1 kind: Kustomization metadata: - name: jitsi + name: openldap namespace: flux-system spec: interval: 10m - path: ./services/jitsi - targetNamespace: jitsi prune: true sourceRef: kind: GitRepository name: flux-system namespace: flux-system + path: ./services/openldap + targetNamespace: sso dependsOn: - name: core wait: true diff --git a/clusters/atlas/flux-system/applications/outline/kustomization.yaml b/clusters/atlas/flux-system/applications/outline/kustomization.yaml new file mode 100644 index 0000000..429d093 --- /dev/null +++ b/clusters/atlas/flux-system/applications/outline/kustomization.yaml @@ -0,0 +1,28 @@ +# clusters/atlas/flux-system/applications/outline/kustomization.yaml +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: outline + namespace: flux-system +spec: + interval: 10m + path: ./services/outline + prune: true + sourceRef: + kind: GitRepository + name: flux-system + targetNamespace: outline + dependsOn: + - name: keycloak + - name: mailu + - name: traefik + healthChecks: + - apiVersion: apps/v1 + kind: Deployment + name: outline + namespace: outline + - apiVersion: v1 + kind: Service + name: outline + namespace: outline + wait: false diff --git a/clusters/atlas/flux-system/applications/planka/kustomization.yaml b/clusters/atlas/flux-system/applications/planka/kustomization.yaml new file mode 100644 index 0000000..5219a5d --- /dev/null +++ b/clusters/atlas/flux-system/applications/planka/kustomization.yaml @@ 
-0,0 +1,28 @@ +# clusters/atlas/flux-system/applications/planka/kustomization.yaml +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: planka + namespace: flux-system +spec: + interval: 10m + path: ./services/planka + prune: true + sourceRef: + kind: GitRepository + name: flux-system + targetNamespace: planka + dependsOn: + - name: keycloak + - name: mailu + - name: traefik + healthChecks: + - apiVersion: apps/v1 + kind: Deployment + name: planka + namespace: planka + - apiVersion: v1 + kind: Service + name: planka + namespace: planka + wait: false diff --git a/clusters/atlas/flux-system/applications/postgres/kustomization.yaml b/clusters/atlas/flux-system/applications/postgres/kustomization.yaml new file mode 100644 index 0000000..07df4c7 --- /dev/null +++ b/clusters/atlas/flux-system/applications/postgres/kustomization.yaml @@ -0,0 +1,24 @@ +# clusters/atlas/flux-system/applications/postgres/kustomization.yaml +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: postgres + namespace: flux-system +spec: + interval: 10m + path: ./services/postgres + prune: true + force: true + sourceRef: + kind: GitRepository + name: flux-system + targetNamespace: postgres + dependsOn: + - name: vault + - name: vault-csi + healthChecks: + - apiVersion: apps/v1 + kind: StatefulSet + name: postgres + namespace: postgres + wait: true diff --git a/clusters/atlas/flux-system/applications/vaultwarden/kustomization.yaml b/clusters/atlas/flux-system/applications/vaultwarden/kustomization.yaml new file mode 100644 index 0000000..783d5e1 --- /dev/null +++ b/clusters/atlas/flux-system/applications/vaultwarden/kustomization.yaml @@ -0,0 +1,20 @@ +# clusters/atlas/flux-system/applications/vaultwarden/kustomization.yaml +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: vaultwarden + namespace: flux-system +spec: + interval: 10m + suspend: false + sourceRef: + kind: GitRepository + name: flux-system + 
namespace: flux-system + path: ./services/vaultwarden + targetNamespace: vaultwarden + prune: true + wait: true + dependsOn: + - name: helm + - name: traefik diff --git a/clusters/atlas/flux-system/gotk-sync.yaml b/clusters/atlas/flux-system/gotk-sync.yaml index 473ab99..713e739 100644 --- a/clusters/atlas/flux-system/gotk-sync.yaml +++ b/clusters/atlas/flux-system/gotk-sync.yaml @@ -8,7 +8,7 @@ metadata: spec: interval: 1m0s ref: - branch: main + branch: feature/sso-hardening secretRef: name: flux-system-gitea url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git diff --git a/clusters/atlas/flux-system/platform/kustomization.yaml b/clusters/atlas/flux-system/platform/kustomization.yaml index 040e478..7da2ca3 100644 --- a/clusters/atlas/flux-system/platform/kustomization.yaml +++ b/clusters/atlas/flux-system/platform/kustomization.yaml @@ -4,7 +4,11 @@ kind: Kustomization resources: - core/kustomization.yaml - helm/kustomization.yaml + - metallb/kustomization.yaml - traefik/kustomization.yaml - gitops-ui/kustomization.yaml - monitoring/kustomization.yaml + - logging/kustomization.yaml + - maintenance/kustomization.yaml - longhorn-ui/kustomization.yaml + - vault-csi/kustomization.yaml diff --git a/clusters/atlas/flux-system/platform/logging/kustomization.yaml b/clusters/atlas/flux-system/platform/logging/kustomization.yaml new file mode 100644 index 0000000..c51eb5e --- /dev/null +++ b/clusters/atlas/flux-system/platform/logging/kustomization.yaml @@ -0,0 +1,14 @@ +# clusters/atlas/flux-system/platform/logging/kustomization.yaml +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: logging + namespace: flux-system +spec: + interval: 10m + path: ./services/logging + prune: true + sourceRef: + kind: GitRepository + name: flux-system + wait: false diff --git a/clusters/atlas/flux-system/applications/ci-demo/kustomization.yaml b/clusters/atlas/flux-system/platform/maintenance/kustomization.yaml similarity index 56% rename from 
clusters/atlas/flux-system/applications/ci-demo/kustomization.yaml rename to clusters/atlas/flux-system/platform/maintenance/kustomization.yaml index 09f598d..fc655a4 100644 --- a/clusters/atlas/flux-system/applications/ci-demo/kustomization.yaml +++ b/clusters/atlas/flux-system/platform/maintenance/kustomization.yaml @@ -1,17 +1,14 @@ -# clusters/atlas/flux-system/applications/ci-demo/kustomization.yaml +# clusters/atlas/flux-system/platform/maintenance/kustomization.yaml apiVersion: kustomize.toolkit.fluxcd.io/v1 kind: Kustomization metadata: - name: ci-demo + name: maintenance namespace: flux-system spec: interval: 10m - path: ./services/ci-demo + path: ./services/maintenance prune: true sourceRef: kind: GitRepository name: flux-system - namespace: flux-system - dependsOn: - - name: core wait: false diff --git a/clusters/atlas/flux-system/platform/metallb/kustomization.yaml b/clusters/atlas/flux-system/platform/metallb/kustomization.yaml new file mode 100644 index 0000000..98baaff --- /dev/null +++ b/clusters/atlas/flux-system/platform/metallb/kustomization.yaml @@ -0,0 +1,16 @@ +# clusters/atlas/flux-system/platform/metallb/kustomization.yaml +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: metallb + namespace: flux-system +spec: + interval: 30m + sourceRef: + kind: GitRepository + name: flux-system + namespace: flux-system + path: ./infrastructure/metallb + prune: true + wait: true + targetNamespace: metallb-system diff --git a/clusters/atlas/flux-system/platform/traefik/kustomization.yaml b/clusters/atlas/flux-system/platform/traefik/kustomization.yaml index 0f53de7..336eb89 100644 --- a/clusters/atlas/flux-system/platform/traefik/kustomization.yaml +++ b/clusters/atlas/flux-system/platform/traefik/kustomization.yaml @@ -15,4 +15,5 @@ spec: namespace: flux-system dependsOn: - name: core + - name: metallb wait: true diff --git a/clusters/atlas/flux-system/platform/vault-csi/kustomization.yaml 
b/clusters/atlas/flux-system/platform/vault-csi/kustomization.yaml new file mode 100644 index 0000000..5a56941 --- /dev/null +++ b/clusters/atlas/flux-system/platform/vault-csi/kustomization.yaml @@ -0,0 +1,16 @@ +# clusters/atlas/flux-system/platform/vault-csi/kustomization.yaml +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: vault-csi + namespace: flux-system +spec: + interval: 30m + sourceRef: + kind: GitRepository + name: flux-system + namespace: flux-system + path: ./infrastructure/vault-csi + prune: true + wait: true + targetNamespace: kube-system diff --git a/clusters/atlas/platform/kustomization.yaml b/clusters/atlas/platform/kustomization.yaml index c7b144a..43fa993 100644 --- a/clusters/atlas/platform/kustomization.yaml +++ b/clusters/atlas/platform/kustomization.yaml @@ -5,3 +5,4 @@ resources: - ../../../infrastructure/modules/base - ../../../infrastructure/modules/profiles/atlas-ha - ../../../infrastructure/sources/cert-manager/letsencrypt.yaml + - ../../../infrastructure/metallb diff --git a/dockerfiles/Dockerfile.data-prepper b/dockerfiles/Dockerfile.data-prepper new file mode 100644 index 0000000..b33c18c --- /dev/null +++ b/dockerfiles/Dockerfile.data-prepper @@ -0,0 +1,16 @@ +FROM --platform=$BUILDPLATFORM opensearchproject/data-prepper:2.8.0 AS source + +FROM --platform=$TARGETPLATFORM eclipse-temurin:17-jre + +ENV DATA_PREPPER_PATH=/usr/share/data-prepper + +RUN useradd -u 10001 -M -U -d / -s /usr/sbin/nologin data_prepper \ + && mkdir -p /var/log/data-prepper + +COPY --from=source /usr/share/data-prepper /usr/share/data-prepper + +RUN chown -R 10001:10001 /usr/share/data-prepper /var/log/data-prepper + +USER 10001 +WORKDIR /usr/share/data-prepper +CMD ["bin/data-prepper"] diff --git a/hosts/roles/titan_jh/tasks/main.yaml b/hosts/roles/titan_jh/tasks/main.yaml index 0f66730..ad899a3 100644 --- a/hosts/roles/titan_jh/tasks/main.yaml +++ b/hosts/roles/titan_jh/tasks/main.yaml @@ -1,5 +1,18 @@ # 
hosts/roles/titan_jh/tasks/main.yaml --- +- name: Install node exporter + ansible.builtin.package: + name: prometheus-node-exporter + state: present + tags: ['jumphost', 'monitoring'] + +- name: Enable node exporter + ansible.builtin.service: + name: prometheus-node-exporter + enabled: true + state: started + tags: ['jumphost', 'monitoring'] + - name: Placeholder for jumphost hardening ansible.builtin.debug: msg: "Harden SSH, manage bastion tooling, and configure audit logging here." diff --git a/infrastructure/metallb/ippool.yaml b/infrastructure/metallb/ippool.yaml new file mode 100644 index 0000000..e792280 --- /dev/null +++ b/infrastructure/metallb/ippool.yaml @@ -0,0 +1,20 @@ +# infrastructure/metallb/ippool.yaml +apiVersion: metallb.io/v1beta1 +kind: IPAddressPool +metadata: + name: communication-pool + namespace: metallb-system +spec: + addresses: + - 192.168.22.4-192.168.22.6 + - 192.168.22.9-192.168.22.9 + autoAssign: true +--- +apiVersion: metallb.io/v1beta1 +kind: L2Advertisement +metadata: + name: communication-adv + namespace: metallb-system +spec: + ipAddressPools: + - communication-pool diff --git a/infrastructure/metallb/kustomization.yaml b/infrastructure/metallb/kustomization.yaml new file mode 100644 index 0000000..1a1452c --- /dev/null +++ b/infrastructure/metallb/kustomization.yaml @@ -0,0 +1,10 @@ +# infrastructure/metallb/kustomization.yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - namespace.yaml + - metallb-rendered.yaml + - ippool.yaml +patchesStrategicMerge: + - patches/node-placement.yaml + - patches/speaker-loglevel.yaml diff --git a/infrastructure/metallb/metallb-rendered.yaml b/infrastructure/metallb/metallb-rendered.yaml new file mode 100644 index 0000000..0f8ad10 --- /dev/null +++ b/infrastructure/metallb/metallb-rendered.yaml @@ -0,0 +1,2411 @@ +--- +# Source: metallb/templates/service-accounts.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: metallb-controller + namespace: 
"metallb-system" + labels: + helm.sh/chart: metallb-0.15.3 + app.kubernetes.io/name: metallb + app.kubernetes.io/instance: metallb + app.kubernetes.io/version: "v0.15.3" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: controller +--- +# Source: metallb/templates/service-accounts.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: metallb-speaker + namespace: "metallb-system" + labels: + helm.sh/chart: metallb-0.15.3 + app.kubernetes.io/name: metallb + app.kubernetes.io/instance: metallb + app.kubernetes.io/version: "v0.15.3" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: speaker +--- +# Source: metallb/templates/webhooks.yaml +apiVersion: v1 +kind: Secret +metadata: + name: metallb-webhook-cert + namespace: "metallb-system" + labels: + helm.sh/chart: metallb-0.15.3 + app.kubernetes.io/name: metallb + app.kubernetes.io/instance: metallb + app.kubernetes.io/version: "v0.15.3" + app.kubernetes.io/managed-by: Helm +--- +# Source: metallb/templates/exclude-l2-config.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: metallb-excludel2 + namespace: "metallb-system" + labels: + helm.sh/chart: metallb-0.15.3 + app.kubernetes.io/name: metallb + app.kubernetes.io/instance: metallb + app.kubernetes.io/version: "v0.15.3" + app.kubernetes.io/managed-by: Helm +data: + excludel2.yaml: | + announcedInterfacesToExclude: + - ^docker.* + - ^cbr.* + - ^dummy.* + - ^virbr.* + - ^lxcbr.* + - ^veth.* + - ^lo$ + - ^cali.* + - ^tunl.* + - ^flannel.* + - ^kube-ipvs.* + - ^cni.* + - ^nodelocaldns.* + - ^lxc.* +--- +# Source: metallb/templates/speaker.yaml +# FRR expects to have these files owned by frr:frr on startup. +# Having them in a ConfigMap allows us to modify behaviors: for example enabling more daemons on startup. 
+apiVersion: v1 +kind: ConfigMap +metadata: + name: metallb-frr-startup + namespace: "metallb-system" + labels: + helm.sh/chart: metallb-0.15.3 + app.kubernetes.io/name: metallb + app.kubernetes.io/instance: metallb + app.kubernetes.io/version: "v0.15.3" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: speaker +data: + daemons: | + # This file tells the frr package which daemons to start. + # + # Sample configurations for these daemons can be found in + # /usr/share/doc/frr/examples/. + # + # ATTENTION: + # + # When activating a daemon for the first time, a config file, even if it is + # empty, has to be present *and* be owned by the user and group "frr", else + # the daemon will not be started by /etc/init.d/frr. The permissions should + # be u=rw,g=r,o=. + # When using "vtysh" such a config file is also needed. It should be owned by + # group "frrvty" and set to ug=rw,o= though. Check /etc/pam.d/frr, too. + # + # The watchfrr and zebra daemons are always started. + # + bgpd=yes + ospfd=no + ospf6d=no + ripd=no + ripngd=no + isisd=no + pimd=no + ldpd=no + nhrpd=no + eigrpd=no + babeld=no + sharpd=no + pbrd=no + bfdd=yes + fabricd=no + vrrpd=no + + # + # If this option is set the /etc/init.d/frr script automatically loads + # the config via "vtysh -b" when the servers are started. + # Check /etc/pam.d/frr if you intend to use "vtysh"! 
+ # + vtysh_enable=yes + zebra_options=" -A 127.0.0.1 -s 90000000 --limit-fds 100000" + bgpd_options=" -A 127.0.0.1 -p 0 --limit-fds 100000" + ospfd_options=" -A 127.0.0.1" + ospf6d_options=" -A ::1" + ripd_options=" -A 127.0.0.1" + ripngd_options=" -A ::1" + isisd_options=" -A 127.0.0.1" + pimd_options=" -A 127.0.0.1" + ldpd_options=" -A 127.0.0.1" + nhrpd_options=" -A 127.0.0.1" + eigrpd_options=" -A 127.0.0.1" + babeld_options=" -A 127.0.0.1" + sharpd_options=" -A 127.0.0.1" + pbrd_options=" -A 127.0.0.1" + staticd_options="-A 127.0.0.1 --limit-fds 100000" + bfdd_options=" -A 127.0.0.1 --limit-fds 100000" + fabricd_options="-A 127.0.0.1" + vrrpd_options=" -A 127.0.0.1" + + # configuration profile + # + #frr_profile="traditional" + #frr_profile="datacenter" + + # + # This is the maximum number of FD's that will be available. + # Upon startup this is read by the control files and ulimit + # is called. Uncomment and use a reasonable value for your + # setup if you are expecting a large number of peers in + # say BGP. + #MAX_FDS=1024 + + # The list of daemons to watch is automatically generated by the init script. + #watchfrr_options="" + + # for debugging purposes, you can specify a "wrap" command to start instead + # of starting the daemon directly, e.g. to use valgrind on ospfd: + # ospfd_wrap="/usr/bin/valgrind" + # or you can use "all_wrap" for all daemons, e.g. to use perf record: + # all_wrap="/usr/bin/perf record --call-graph -" + # the normal daemon command is added to this at the end. + vtysh.conf: |+ + service integrated-vtysh-config + frr.conf: |+ + ! This file gets overriden the first time the speaker renders a config. + ! So anything configured here is only temporary. 
+ frr version 8.0 + frr defaults traditional + hostname Router + line vty + log file /etc/frr/frr.log informational +--- +# Source: metallb/charts/crds/templates/crds.yaml +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: bfdprofiles.metallb.io +spec: + group: metallb.io + names: + kind: BFDProfile + listKind: BFDProfileList + plural: bfdprofiles + singular: bfdprofile + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .spec.passiveMode + name: Passive Mode + type: boolean + - jsonPath: .spec.transmitInterval + name: Transmit Interval + type: integer + - jsonPath: .spec.receiveInterval + name: Receive Interval + type: integer + - jsonPath: .spec.detectMultiplier + name: Multiplier + type: integer + name: v1beta1 + schema: + openAPIV3Schema: + description: |- + BFDProfile represents the settings of the bfd session that can be + optionally associated with a BGP session. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: BFDProfileSpec defines the desired state of BFDProfile. + properties: + detectMultiplier: + description: |- + Configures the detection multiplier to determine + packet loss. 
The remote transmission interval will be multiplied + by this value to determine the connection loss detection timer. + format: int32 + maximum: 255 + minimum: 2 + type: integer + echoInterval: + description: |- + Configures the minimal echo receive transmission + interval that this system is capable of handling in milliseconds. + Defaults to 50ms + format: int32 + maximum: 60000 + minimum: 10 + type: integer + echoMode: + description: |- + Enables or disables the echo transmission mode. + This mode is disabled by default, and not supported on multi + hops setups. + type: boolean + minimumTtl: + description: |- + For multi hop sessions only: configure the minimum + expected TTL for an incoming BFD control packet. + format: int32 + maximum: 254 + minimum: 1 + type: integer + passiveMode: + description: |- + Mark session as passive: a passive session will not + attempt to start the connection and will wait for control packets + from peer before it begins replying. + type: boolean + receiveInterval: + description: |- + The minimum interval that this system is capable of + receiving control packets in milliseconds. + Defaults to 300ms. + format: int32 + maximum: 60000 + minimum: 10 + type: integer + transmitInterval: + description: |- + The minimum transmission interval (less jitter) + that this system wants to use to send BFD control packets in + milliseconds. Defaults to 300ms + format: int32 + maximum: 60000 + minimum: 10 + type: integer + type: object + status: + description: BFDProfileStatus defines the observed state of BFDProfile. 
+ type: object + type: object + served: true + storage: true + subresources: + status: {} +--- +# Source: metallb/charts/crds/templates/crds.yaml +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: bgpadvertisements.metallb.io +spec: + group: metallb.io + names: + kind: BGPAdvertisement + listKind: BGPAdvertisementList + plural: bgpadvertisements + singular: bgpadvertisement + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .spec.ipAddressPools + name: IPAddressPools + type: string + - jsonPath: .spec.ipAddressPoolSelectors + name: IPAddressPool Selectors + type: string + - jsonPath: .spec.peers + name: Peers + type: string + - jsonPath: .spec.nodeSelectors + name: Node Selectors + priority: 10 + type: string + name: v1beta1 + schema: + openAPIV3Schema: + description: |- + BGPAdvertisement allows to advertise the IPs coming + from the selected IPAddressPools via BGP, setting the parameters of the + BGP Advertisement. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: BGPAdvertisementSpec defines the desired state of BGPAdvertisement. 
+ properties: + aggregationLength: + default: 32 + description: The aggregation-length advertisement option lets you “roll up” the /32s into a larger prefix. Defaults to 32. Works for IPv4 addresses. + format: int32 + minimum: 1 + type: integer + aggregationLengthV6: + default: 128 + description: The aggregation-length advertisement option lets you “roll up” the /128s into a larger prefix. Defaults to 128. Works for IPv6 addresses. + format: int32 + type: integer + communities: + description: |- + The BGP communities to be associated with the announcement. Each item can be a standard community of the + form 1234:1234, a large community of the form large:1234:1234:1234 or the name of an alias defined in the + Community CRD. + items: + type: string + type: array + ipAddressPoolSelectors: + description: |- + A selector for the IPAddressPools which would get advertised via this advertisement. + If no IPAddressPool is selected by this or by the list, the advertisement is applied to all the IPAddressPools. + items: + description: |- + A label selector is a label query over a set of resources. The result of matchLabels and + matchExpressions are ANDed. An empty label selector matches all objects. A null + label selector matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. 
If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + type: array + ipAddressPools: + description: The list of IPAddressPools to advertise via this advertisement, selected by name. + items: + type: string + type: array + localPref: + description: |- + The BGP LOCAL_PREF attribute which is used by BGP best path algorithm, + Path with higher localpref is preferred over one with lower localpref. + format: int32 + type: integer + nodeSelectors: + description: NodeSelectors allows to limit the nodes to announce as next hops for the LoadBalancer IP. When empty, all the nodes having are announced as next hops. + items: + description: |- + A label selector is a label query over a set of resources. The result of matchLabels and + matchExpressions are ANDed. An empty label selector matches all objects. A null + label selector matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. 
+ Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + type: array + peers: + description: |- + Peers limits the bgppeer to advertise the ips of the selected pools to. + When empty, the loadbalancer IP is announced to all the BGPPeers configured. + items: + type: string + type: array + type: object + status: + description: BGPAdvertisementStatus defines the observed state of BGPAdvertisement. 
+ type: object + type: object + served: true + storage: true + subresources: + status: {} +--- +# Source: metallb/charts/crds/templates/crds.yaml +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: bgppeers.metallb.io +spec: + conversion: + strategy: Webhook + webhook: + clientConfig: + service: + name: metallb-webhook-service + namespace: metallb-system + path: /convert + conversionReviewVersions: + - v1beta1 + - v1beta2 + group: metallb.io + names: + kind: BGPPeer + listKind: BGPPeerList + plural: bgppeers + singular: bgppeer + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .spec.peerAddress + name: Address + type: string + - jsonPath: .spec.peerASN + name: ASN + type: string + - jsonPath: .spec.bfdProfile + name: BFD Profile + type: string + - jsonPath: .spec.ebgpMultiHop + name: Multi Hops + type: string + deprecated: true + deprecationWarning: v1beta1 is deprecated, please use v1beta2 + name: v1beta1 + schema: + openAPIV3Schema: + description: BGPPeer is the Schema for the peers API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: BGPPeerSpec defines the desired state of Peer. 
+ properties: + bfdProfile: + type: string + ebgpMultiHop: + description: EBGP peer is multi-hops away + type: boolean + holdTime: + description: Requested BGP hold time, per RFC4271. + type: string + keepaliveTime: + description: Requested BGP keepalive time, per RFC4271. + type: string + myASN: + description: AS number to use for the local end of the session. + format: int32 + maximum: 4294967295 + minimum: 0 + type: integer + nodeSelectors: + description: |- + Only connect to this peer on nodes that match one of these + selectors. + items: + properties: + matchExpressions: + items: + properties: + key: + type: string + operator: + type: string + values: + items: + type: string + minItems: 1 + type: array + required: + - key + - operator + - values + type: object + type: array + matchLabels: + additionalProperties: + type: string + type: object + type: object + type: array + password: + description: Authentication password for routers enforcing TCP MD5 authenticated sessions + type: string + peerASN: + description: AS number to expect from the remote end of the session. + format: int32 + maximum: 4294967295 + minimum: 0 + type: integer + peerAddress: + description: Address to dial when establishing the session. + type: string + peerPort: + description: Port to dial when establishing the session. + maximum: 16384 + minimum: 0 + type: integer + routerID: + description: BGP router ID to advertise to the peer + type: string + sourceAddress: + description: Source address to use when establishing the session. + type: string + required: + - myASN + - peerASN + - peerAddress + type: object + status: + description: BGPPeerStatus defines the observed state of Peer. 
+ type: object + type: object + served: true + storage: false + subresources: + status: {} + - additionalPrinterColumns: + - jsonPath: .spec.peerAddress + name: Address + type: string + - jsonPath: .spec.peerASN + name: ASN + type: string + - jsonPath: .spec.bfdProfile + name: BFD Profile + type: string + - jsonPath: .spec.ebgpMultiHop + name: Multi Hops + type: string + name: v1beta2 + schema: + openAPIV3Schema: + description: BGPPeer is the Schema for the peers API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: BGPPeerSpec defines the desired state of Peer. + properties: + bfdProfile: + description: The name of the BFD Profile to be used for the BFD session associated to the BGP session. If not set, the BFD session won't be set up. + type: string + connectTime: + description: Requested BGP connect time, controls how long BGP waits between connection attempts to a neighbor. 
+ type: string + x-kubernetes-validations: + - message: connect time should be between 1 seconds to 65535 + rule: duration(self).getSeconds() >= 1 && duration(self).getSeconds() <= 65535 + - message: connect time should contain a whole number of seconds + rule: duration(self).getMilliseconds() % 1000 == 0 + disableMP: + default: false + description: |- + To set if we want to disable MP BGP that will separate IPv4 and IPv6 route exchanges into distinct BGP sessions. + Deprecated: DisableMP is deprecated in favor of dualStackAddressFamily. + type: boolean + dualStackAddressFamily: + default: false + description: |- + To set if we want to enable the neighbor not only for the ipfamily related to its session, + but also the other one. This allows to advertise/receive IPv4 prefixes over IPv6 sessions and vice versa. + type: boolean + dynamicASN: + description: |- + DynamicASN detects the AS number to use for the remote end of the session + without explicitly setting it via the ASN field. Limited to: + internal - if the neighbor's ASN is different than MyASN connection is denied. + external - if the neighbor's ASN is the same as MyASN the connection is denied. + ASN and DynamicASN are mutually exclusive and one of them must be specified. + enum: + - internal + - external + type: string + ebgpMultiHop: + description: To set if the BGPPeer is multi-hops away. Needed for FRR mode only. + type: boolean + enableGracefulRestart: + description: |- + EnableGracefulRestart allows BGP peer to continue to forward data packets + along known routes while the routing protocol information is being + restored. This field is immutable because it requires restart of the BGP + session. Supported for FRR mode only. + type: boolean + x-kubernetes-validations: + - message: EnableGracefulRestart cannot be changed after creation + rule: self == oldSelf + holdTime: + description: Requested BGP hold time, per RFC4271. 
+ type: string + interface: + description: |- + Interface is the node interface over which the unnumbered BGP peering will + be established. No API validation takes place as that string value + represents an interface name on the host and if user provides an invalid + value, only the actual BGP session will not be established. + Address and Interface are mutually exclusive and one of them must be specified. + type: string + keepaliveTime: + description: Requested BGP keepalive time, per RFC4271. + type: string + myASN: + description: AS number to use for the local end of the session. + format: int32 + maximum: 4294967295 + minimum: 0 + type: integer + nodeSelectors: + description: |- + Only connect to this peer on nodes that match one of these + selectors. + items: + description: |- + A label selector is a label query over a set of resources. The result of matchLabels and + matchExpressions are ANDed. An empty label selector matches all objects. A null + label selector matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. 
+ items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + type: array + password: + description: Authentication password for routers enforcing TCP MD5 authenticated sessions + type: string + passwordSecret: + description: |- + passwordSecret is name of the authentication secret for BGP Peer. + the secret must be of type "kubernetes.io/basic-auth", and created in the + same namespace as the MetalLB deployment. The password is stored in the + secret as the key "password". + properties: + name: + description: name is unique within a namespace to reference a secret resource. + type: string + namespace: + description: namespace defines the space within which the secret name must be unique. + type: string + type: object + x-kubernetes-map-type: atomic + peerASN: + description: |- + AS number to expect from the remote end of the session. + ASN and DynamicASN are mutually exclusive and one of them must be specified. + format: int32 + maximum: 4294967295 + minimum: 0 + type: integer + peerAddress: + description: Address to dial when establishing the session. + type: string + peerPort: + default: 179 + description: Port to dial when establishing the session. + maximum: 16384 + minimum: 1 + type: integer + routerID: + description: BGP router ID to advertise to the peer + type: string + sourceAddress: + description: Source address to use when establishing the session. 
+ type: string + vrf: + description: |- + To set if we want to peer with the BGPPeer using an interface belonging to + a host vrf + type: string + required: + - myASN + type: object + status: + description: BGPPeerStatus defines the observed state of Peer. + type: object + type: object + served: true + storage: true + subresources: + status: {} +--- +# Source: metallb/charts/crds/templates/crds.yaml +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: communities.metallb.io +spec: + group: metallb.io + names: + kind: Community + listKind: CommunityList + plural: communities + singular: community + scope: Namespaced + versions: + - name: v1beta1 + schema: + openAPIV3Schema: + description: |- + Community is a collection of aliases for communities. + Users can define named aliases to be used in the BGPPeer CRD. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: CommunitySpec defines the desired state of Community. + properties: + communities: + items: + properties: + name: + description: The name of the alias for the community. + type: string + value: + description: |- + The BGP community value corresponding to the given name. 
Can be a standard community of the form 1234:1234 + or a large community of the form large:1234:1234:1234. + type: string + type: object + type: array + type: object + status: + description: CommunityStatus defines the observed state of Community. + type: object + type: object + served: true + storage: true + subresources: + status: {} +--- +# Source: metallb/charts/crds/templates/crds.yaml +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: configurationstates.metallb.io +spec: + group: metallb.io + names: + kind: ConfigurationState + listKind: ConfigurationStateList + plural: configurationstates + singular: configurationstate + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.result + name: Result + type: string + - jsonPath: .status.errorSummary + name: ErrorSummary + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1beta1 + schema: + openAPIV3Schema: + description: |- + ConfigurationState is a status-only CRD that reports configuration validation results from MetalLB components. + Labels: + - metallb.io/component-type: "controller" or "speaker" + - metallb.io/node-name: node name (only for speaker) + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + status: + description: ConfigurationStateStatus defines the observed state of ConfigurationState. + properties: + conditions: + description: Conditions contains the status conditions from the reconcilers running in this component. + items: + description: Condition contains details for one aspect of the current state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. 
+ enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + errorSummary: + description: |- + ErrorSummary contains the aggregated error messages from reconciliation failures. + This field is empty when Result is "Valid". + type: string + result: + description: Result indicates the configuration validation result. + enum: + - Valid + - Invalid + - Unknown + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} +--- +# Source: metallb/charts/crds/templates/crds.yaml +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: ipaddresspools.metallb.io +spec: + group: metallb.io + names: + kind: IPAddressPool + listKind: IPAddressPoolList + plural: ipaddresspools + singular: ipaddresspool + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .spec.autoAssign + name: Auto Assign + type: boolean + - jsonPath: .spec.avoidBuggyIPs + name: Avoid Buggy IPs + type: boolean + - jsonPath: .spec.addresses + name: Addresses + type: string + name: v1beta1 + schema: + openAPIV3Schema: + description: |- + IPAddressPool represents a pool of IP addresses that can be allocated + to LoadBalancer services. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: IPAddressPoolSpec defines the desired state of IPAddressPool. + properties: + addresses: + description: |- + A list of IP address ranges over which MetalLB has authority. + You can list multiple ranges in a single pool, they will all share the + same settings. Each range can be either a CIDR prefix, or an explicit + start-end range of IPs. + items: + type: string + type: array + autoAssign: + default: true + description: |- + AutoAssign flag used to prevent MetallB from automatic allocation + for a pool. + type: boolean + avoidBuggyIPs: + default: false + description: |- + AvoidBuggyIPs prevents addresses ending with .0 and .255 + to be used by a pool. + type: boolean + serviceAllocation: + description: |- + AllocateTo makes ip pool allocation to specific namespace and/or service. + The controller will use the pool with lowest value of priority in case of + multiple matches. A pool with no priority set will be used only if the + pools with priority can't be used. If multiple matching IPAddressPools are + available it will check for the availability of IPs sorting the matching + IPAddressPools by priority, starting from the highest to the lowest. If + multiple IPAddressPools have the same priority, choice will be random. + properties: + namespaceSelectors: + description: |- + NamespaceSelectors list of label selectors to select namespace(s) for ip pool, + an alternative to using namespace list. 
+ items: + description: |- + A label selector is a label query over a set of resources. The result of matchLabels and + matchExpressions are ANDed. An empty label selector matches all objects. A null + label selector matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + type: array + namespaces: + description: Namespaces list of namespace(s) on which ip pool can be attached. + items: + type: string + type: array + priority: + description: Priority priority given for ip pool while ip allocation on a service. 
+ type: integer + serviceSelectors: + description: |- + ServiceSelectors list of label selector to select service(s) for which ip pool + can be used for ip allocation. + items: + description: |- + A label selector is a label query over a set of resources. The result of matchLabels and + matchExpressions are ANDed. An empty label selector matches all objects. A null + label selector matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + type: array + type: object + required: + - addresses + type: object + status: + description: IPAddressPoolStatus defines the observed state of IPAddressPool. 
+ properties: + assignedIPv4: + description: AssignedIPv4 is the number of assigned IPv4 addresses. + format: int64 + type: integer + assignedIPv6: + description: AssignedIPv6 is the number of assigned IPv6 addresses. + format: int64 + type: integer + availableIPv4: + description: AvailableIPv4 is the number of available IPv4 addresses. + format: int64 + type: integer + availableIPv6: + description: AvailableIPv6 is the number of available IPv6 addresses. + format: int64 + type: integer + required: + - assignedIPv4 + - assignedIPv6 + - availableIPv4 + - availableIPv6 + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} +--- +# Source: metallb/charts/crds/templates/crds.yaml +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: l2advertisements.metallb.io +spec: + group: metallb.io + names: + kind: L2Advertisement + listKind: L2AdvertisementList + plural: l2advertisements + singular: l2advertisement + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .spec.ipAddressPools + name: IPAddressPools + type: string + - jsonPath: .spec.ipAddressPoolSelectors + name: IPAddressPool Selectors + type: string + - jsonPath: .spec.interfaces + name: Interfaces + type: string + - jsonPath: .spec.nodeSelectors + name: Node Selectors + priority: 10 + type: string + name: v1beta1 + schema: + openAPIV3Schema: + description: |- + L2Advertisement allows to advertise the LoadBalancer IPs provided + by the selected pools via L2. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: L2AdvertisementSpec defines the desired state of L2Advertisement. + properties: + interfaces: + description: |- + A list of interfaces to announce from. The LB IP will be announced only from these interfaces. + If the field is not set, we advertise from all the interfaces on the host. + items: + type: string + type: array + ipAddressPoolSelectors: + description: |- + A selector for the IPAddressPools which would get advertised via this advertisement. + If no IPAddressPool is selected by this or by the list, the advertisement is applied to all the IPAddressPools. + items: + description: |- + A label selector is a label query over a set of resources. The result of matchLabels and + matchExpressions are ANDed. An empty label selector matches all objects. A null + label selector matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. 
If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + type: array + ipAddressPools: + description: The list of IPAddressPools to advertise via this advertisement, selected by name. + items: + type: string + type: array + nodeSelectors: + description: NodeSelectors allows to limit the nodes to announce as next hops for the LoadBalancer IP. When empty, all the nodes having are announced as next hops. + items: + description: |- + A label selector is a label query over a set of resources. The result of matchLabels and + matchExpressions are ANDed. An empty label selector matches all objects. A null + label selector matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. 
If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + type: array + type: object + status: + description: L2AdvertisementStatus defines the observed state of L2Advertisement. + type: object + type: object + served: true + storage: true + subresources: + status: {} +--- +# Source: metallb/charts/crds/templates/crds.yaml +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: servicebgpstatuses.metallb.io +spec: + group: metallb.io + names: + kind: ServiceBGPStatus + listKind: ServiceBGPStatusList + plural: servicebgpstatuses + singular: servicebgpstatus + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.node + name: Node + type: string + - jsonPath: .status.serviceName + name: Service Name + type: string + - jsonPath: .status.serviceNamespace + name: Service Namespace + type: string + name: v1beta1 + schema: + openAPIV3Schema: + description: ServiceBGPStatus exposes the BGP peers a service is configured to be advertised to, per relevant node. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. 
+ Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ServiceBGPStatusSpec defines the desired state of ServiceBGPStatus. + type: object + status: + description: MetalLBServiceBGPStatus defines the observed state of ServiceBGPStatus. + properties: + node: + description: Node indicates the node announcing the service. + type: string + x-kubernetes-validations: + - message: Value is immutable + rule: self == oldSelf + peers: + description: |- + Peers indicate the BGP peers for which the service is configured to be advertised to. + The service being actually advertised to a given peer depends on the session state and is not indicated here. + items: + type: string + type: array + serviceName: + description: ServiceName indicates the service this status represents. + type: string + x-kubernetes-validations: + - message: Value is immutable + rule: self == oldSelf + serviceNamespace: + description: ServiceNamespace indicates the namespace of the service. 
+ type: string + x-kubernetes-validations: + - message: Value is immutable + rule: self == oldSelf + type: object + type: object + served: true + storage: true + subresources: + status: {} +--- +# Source: metallb/charts/crds/templates/crds.yaml +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: servicel2statuses.metallb.io +spec: + group: metallb.io + names: + kind: ServiceL2Status + listKind: ServiceL2StatusList + plural: servicel2statuses + singular: servicel2status + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.node + name: Allocated Node + type: string + - jsonPath: .status.serviceName + name: Service Name + type: string + - jsonPath: .status.serviceNamespace + name: Service Namespace + type: string + name: v1beta1 + schema: + openAPIV3Schema: + description: ServiceL2Status reveals the actual traffic status of loadbalancer services in layer2 mode. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ServiceL2StatusSpec defines the desired state of ServiceL2Status. + type: object + status: + description: MetalLBServiceL2Status defines the observed state of ServiceL2Status. 
+ properties: + interfaces: + description: Interfaces indicates the interfaces that receive the directed traffic + items: + description: InterfaceInfo defines interface info of layer2 announcement. + properties: + name: + description: Name the name of network interface card + type: string + type: object + type: array + node: + description: Node indicates the node that receives the directed traffic + type: string + x-kubernetes-validations: + - message: Value is immutable + rule: self == oldSelf + serviceName: + description: ServiceName indicates the service this status represents + type: string + x-kubernetes-validations: + - message: Value is immutable + rule: self == oldSelf + serviceNamespace: + description: ServiceNamespace indicates the namespace of the service + type: string + x-kubernetes-validations: + - message: Value is immutable + rule: self == oldSelf + type: object + type: object + served: true + storage: true + subresources: + status: {} +--- +# Source: metallb/templates/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: metallb:controller + labels: + helm.sh/chart: metallb-0.15.3 + app.kubernetes.io/name: metallb + app.kubernetes.io/instance: metallb + app.kubernetes.io/version: "v0.15.3" + app.kubernetes.io/managed-by: Helm +rules: +- apiGroups: [""] + resources: ["services", "namespaces"] + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: ["nodes"] + verbs: ["list"] +- apiGroups: [""] + resources: ["services/status"] + verbs: ["update"] +- apiGroups: [""] + resources: ["events"] + verbs: ["create", "patch"] +- apiGroups: ["admissionregistration.k8s.io"] + resources: ["validatingwebhookconfigurations"] + resourceNames: ["metallb-webhook-configuration"] + verbs: ["create", "delete", "get", "list", "patch", "update", "watch"] +- apiGroups: ["admissionregistration.k8s.io"] + resources: ["validatingwebhookconfigurations"] + verbs: ["list", "watch"] +- apiGroups: ["apiextensions.k8s.io"] + resources: 
["customresourcedefinitions"] + resourceNames: ["bfdprofiles.metallb.io","bgpadvertisements.metallb.io", + "bgppeers.metallb.io","ipaddresspools.metallb.io","l2advertisements.metallb.io","communities.metallb.io","configurationstates.metallb.io"] + verbs: ["create", "delete", "get", "list", "patch", "update", "watch"] +- apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["list", "watch"] +- apiGroups: ["metallb.io"] + resources: ["configurationstates"] + verbs: ["create", "delete", "get", "list", "patch", "update", "watch"] +- apiGroups: ["metallb.io"] + resources: ["configurationstates/status"] + verbs: ["get", "patch", "update"] +--- +# Source: metallb/templates/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: metallb:speaker + labels: + helm.sh/chart: metallb-0.15.3 + app.kubernetes.io/name: metallb + app.kubernetes.io/instance: metallb + app.kubernetes.io/version: "v0.15.3" + app.kubernetes.io/managed-by: Helm +rules: +- apiGroups: [""] + resources: ["services", "endpoints", "nodes", "namespaces"] + verbs: ["get", "list", "watch"] +- apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: ["events"] + verbs: ["create", "patch"] +- apiGroups: ["metallb.io"] + resources: ["servicel2statuses","servicel2statuses/status","configurationstates","configurationstates/status"] + verbs: ["*"] +--- +# Source: metallb/templates/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: metallb:controller + labels: + helm.sh/chart: metallb-0.15.3 + app.kubernetes.io/name: metallb + app.kubernetes.io/instance: metallb + app.kubernetes.io/version: "v0.15.3" + app.kubernetes.io/managed-by: Helm +subjects: +- kind: ServiceAccount + name: metallb-controller + namespace: metallb-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: metallb:controller +--- +# Source: 
metallb/templates/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: metallb:speaker + labels: + helm.sh/chart: metallb-0.15.3 + app.kubernetes.io/name: metallb + app.kubernetes.io/instance: metallb + app.kubernetes.io/version: "v0.15.3" + app.kubernetes.io/managed-by: Helm +subjects: +- kind: ServiceAccount + name: metallb-speaker + namespace: metallb-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: metallb:speaker +--- +# Source: metallb/templates/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: metallb-pod-lister + namespace: "metallb-system" + labels: + helm.sh/chart: metallb-0.15.3 + app.kubernetes.io/name: metallb + app.kubernetes.io/instance: metallb + app.kubernetes.io/version: "v0.15.3" + app.kubernetes.io/managed-by: Helm +rules: +- apiGroups: [""] + resources: ["pods"] + verbs: ["list", "get"] +- apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: ["configmaps"] + verbs: ["get", "list", "watch"] +- apiGroups: ["metallb.io"] + resources: ["bfdprofiles"] + verbs: ["get", "list", "watch"] +- apiGroups: ["metallb.io"] + resources: ["bgppeers"] + verbs: ["get", "list", "watch"] +- apiGroups: ["metallb.io"] + resources: ["l2advertisements"] + verbs: ["get", "list", "watch"] +- apiGroups: ["metallb.io"] + resources: ["bgpadvertisements"] + verbs: ["get", "list", "watch"] +- apiGroups: ["metallb.io"] + resources: ["ipaddresspools"] + verbs: ["get", "list", "watch"] +- apiGroups: ["metallb.io"] + resources: ["communities"] + verbs: ["get", "list", "watch"] +- apiGroups: ["metallb.io"] + resources: ["servicebgpstatuses","servicebgpstatuses/status"] + verbs: ["*"] +--- +# Source: metallb/templates/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: metallb-controller + namespace: "metallb-system" + labels: + helm.sh/chart: metallb-0.15.3 + app.kubernetes.io/name: 
metallb + app.kubernetes.io/instance: metallb + app.kubernetes.io/version: "v0.15.3" + app.kubernetes.io/managed-by: Helm +rules: +- apiGroups: [""] + resources: ["secrets"] + verbs: ["create", "get", "list", "watch"] +- apiGroups: [""] + resources: ["secrets"] + resourceNames: ["metallb-memberlist"] + verbs: ["list"] +- apiGroups: ["apps"] + resources: ["deployments"] + resourceNames: ["metallb-controller"] + verbs: ["get"] +- apiGroups: [""] + resources: ["secrets"] + verbs: ["create", "delete", "get", "list", "patch", "update", "watch"] +- apiGroups: ["metallb.io"] + resources: ["ipaddresspools"] + verbs: ["get", "list", "watch"] +- apiGroups: ["metallb.io"] + resources: ["ipaddresspools/status"] + verbs: ["update"] +- apiGroups: ["metallb.io"] + resources: ["bgppeers"] + verbs: ["get", "list"] +- apiGroups: ["metallb.io"] + resources: ["bgpadvertisements"] + verbs: ["get", "list"] +- apiGroups: ["metallb.io"] + resources: ["l2advertisements"] + verbs: ["get", "list"] +- apiGroups: ["metallb.io"] + resources: ["communities"] + verbs: ["get", "list","watch"] +- apiGroups: ["metallb.io"] + resources: ["bfdprofiles"] + verbs: ["get", "list","watch"] +--- +# Source: metallb/templates/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: metallb-pod-lister + namespace: "metallb-system" + labels: + helm.sh/chart: metallb-0.15.3 + app.kubernetes.io/name: metallb + app.kubernetes.io/instance: metallb + app.kubernetes.io/version: "v0.15.3" + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: metallb-pod-lister +subjects: +- kind: ServiceAccount + name: metallb-speaker +--- +# Source: metallb/templates/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: metallb-controller + namespace: "metallb-system" + labels: + helm.sh/chart: metallb-0.15.3 + app.kubernetes.io/name: metallb + app.kubernetes.io/instance: metallb + app.kubernetes.io/version: 
"v0.15.3" + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: metallb-controller +subjects: +- kind: ServiceAccount + name: metallb-controller +--- +# Source: metallb/templates/webhooks.yaml +apiVersion: v1 +kind: Service +metadata: + name: metallb-webhook-service + namespace: "metallb-system" + labels: + helm.sh/chart: metallb-0.15.3 + app.kubernetes.io/name: metallb + app.kubernetes.io/instance: metallb + app.kubernetes.io/version: "v0.15.3" + app.kubernetes.io/managed-by: Helm +spec: + ports: + - port: 443 + targetPort: 9443 + selector: + app.kubernetes.io/name: metallb + app.kubernetes.io/instance: metallb + app.kubernetes.io/component: controller +--- +# Source: metallb/templates/speaker.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: metallb-speaker + namespace: "metallb-system" + labels: + helm.sh/chart: metallb-0.15.3 + app.kubernetes.io/name: metallb + app.kubernetes.io/instance: metallb + app.kubernetes.io/version: "v0.15.3" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: speaker +spec: + updateStrategy: + type: RollingUpdate + selector: + matchLabels: + app.kubernetes.io/name: metallb + app.kubernetes.io/instance: metallb + app.kubernetes.io/component: speaker + template: + metadata: + labels: + app.kubernetes.io/name: metallb + app.kubernetes.io/instance: metallb + app.kubernetes.io/component: speaker + spec: + serviceAccountName: metallb-speaker + terminationGracePeriodSeconds: 0 + hostNetwork: true + volumes: + - name: memberlist + secret: + secretName: metallb-memberlist + defaultMode: 420 + - name: metallb-excludel2 + configMap: + defaultMode: 256 + name: metallb-excludel2 + - name: frr-sockets + emptyDir: {} + - name: frr-startup + configMap: + name: metallb-frr-startup + - name: frr-conf + emptyDir: {} + - name: reloader + emptyDir: {} + - name: metrics + emptyDir: {} + - name: frr-tmp + emptyDir: {} + - name: frr-lib + emptyDir: {} + - name: frr-log + 
emptyDir: {} + initContainers: + # Copies the initial config files with the right permissions to the shared volume. + - name: cp-frr-files + image: quay.io/frrouting/frr:10.4.1 + securityContext: + runAsUser: 100 + runAsGroup: 101 + command: ["/bin/sh", "-c", "cp -rLf /tmp/frr/* /etc/frr/"] + volumeMounts: + - name: frr-startup + mountPath: /tmp/frr + - name: frr-conf + mountPath: /etc/frr + # Copies the reloader to the shared volume between the speaker and reloader. + - name: cp-reloader + image: quay.io/metallb/speaker:v0.15.3 + command: ["/cp-tool","/frr-reloader.sh","/etc/frr_reloader/frr-reloader.sh"] + volumeMounts: + - name: reloader + mountPath: /etc/frr_reloader + # Copies the metrics exporter + - name: cp-metrics + image: quay.io/metallb/speaker:v0.15.3 + command: ["/cp-tool","/frr-metrics","/etc/frr_metrics/frr-metrics"] + volumeMounts: + - name: metrics + mountPath: /etc/frr_metrics + shareProcessNamespace: true + containers: + - name: speaker + image: quay.io/metallb/speaker:v0.15.3 + args: + - --port=7472 + - --log-level=info + env: + - name: METALLB_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: METALLB_HOST + valueFrom: + fieldRef: + fieldPath: status.hostIP + - name: METALLB_ML_BIND_ADDR + valueFrom: + fieldRef: + fieldPath: status.podIP + + - name: METALLB_ML_LABELS + value: "app.kubernetes.io/name=metallb,app.kubernetes.io/component=speaker" + - name: METALLB_ML_BIND_PORT + value: "7946" + - name: METALLB_ML_SECRET_KEY_PATH + value: "/etc/ml_secret_key" + - name: FRR_CONFIG_FILE + value: /etc/frr_reloader/frr.conf + - name: FRR_RELOADER_PID_FILE + value: /etc/frr_reloader/reloader.pid + - name: METALLB_BGP_TYPE + value: frr + - name: METALLB_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + ports: + - name: monitoring + containerPort: 7472 + - name: memberlist-tcp + containerPort: 7946 + protocol: TCP + - name: memberlist-udp + containerPort: 7946 + protocol: UDP + livenessProbe: + httpGet: + path: /metrics + 
port: monitoring + initialDelaySeconds: 10 + periodSeconds: 10 + timeoutSeconds: 1 + successThreshold: 1 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /metrics + port: monitoring + initialDelaySeconds: 10 + periodSeconds: 10 + timeoutSeconds: 1 + successThreshold: 1 + failureThreshold: 3 + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + add: + - NET_RAW + volumeMounts: + - name: memberlist + mountPath: /etc/ml_secret_key + - name: reloader + mountPath: /etc/frr_reloader + - name: metallb-excludel2 + mountPath: /etc/metallb + - name: frr + securityContext: + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + capabilities: + add: + - NET_ADMIN + - NET_RAW + - SYS_ADMIN + - NET_BIND_SERVICE + image: quay.io/frrouting/frr:10.4.1 + env: + - name: TINI_SUBREAPER + value: "true" + volumeMounts: + - name: frr-sockets + mountPath: /var/run/frr + - name: frr-conf + mountPath: /etc/frr + - name: frr-tmp + mountPath: /var/tmp/frr + - name: frr-lib + mountPath: /var/lib/frr + # The command is FRR's default entrypoint & waiting for the log file to appear and tailing it. + # If the log file isn't created in 60 seconds the tail fails and the container is restarted. + # This workaround is needed to have the frr logs as part of kubectl logs -c frr < speaker_pod_name >. 
+        command:
+        - /bin/sh
+        - -c
+        - |
+          /sbin/tini -- /usr/lib/frr/docker-start &
+          attempts=0
+          until [[ -f /etc/frr/frr.log || $attempts -eq 60 ]]; do
+            sleep 1
+            attempts=$(( $attempts + 1 ))
+          done
+          tail -f /etc/frr/frr.log
+        livenessProbe:
+          httpGet:
+            path: /livez
+            port: 7473
+          initialDelaySeconds: 10
+          periodSeconds: 10
+          timeoutSeconds: 1
+          successThreshold: 1
+          failureThreshold: 3
+        startupProbe:
+          httpGet:
+            path: /livez
+            port: 7473
+          failureThreshold: 30
+          periodSeconds: 5
+      - name: reloader
+        image: quay.io/frrouting/frr:10.4.1
+        securityContext:
+          readOnlyRootFilesystem: true
+          allowPrivilegeEscalation: false
+        command: ["/etc/frr_reloader/frr-reloader.sh"]
+        volumeMounts:
+        - name: frr-sockets
+          mountPath: /var/run/frr
+        - name: frr-conf
+          mountPath: /etc/frr
+        - name: reloader
+          mountPath: /etc/frr_reloader
+        - name: frr-log
+          mountPath: /var/log/frr
+      - name: frr-metrics
+        image: quay.io/frrouting/frr:10.4.1
+        securityContext:
+          readOnlyRootFilesystem: true
+          allowPrivilegeEscalation: false
+        command: ["/etc/frr_metrics/frr-metrics"]
+        args:
+        - --metrics-port=7473
+        env:
+        - name: VTYSH_HISTFILE
+          value: /dev/null
+        ports:
+        - containerPort: 7473
+          name: frrmetrics
+        volumeMounts:
+        - name: frr-sockets
+          mountPath: /var/run/frr
+        - name: frr-conf
+          mountPath: /etc/frr
+        - name: metrics
+          mountPath: /etc/frr_metrics
+      nodeSelector:
+        "kubernetes.io/os": linux
+      tolerations:
+      - key: node-role.kubernetes.io/master
+        effect: NoSchedule
+        operator: Exists
+      - key: node-role.kubernetes.io/control-plane
+        effect: NoSchedule
+        operator: Exists
+---
+# Source: metallb/templates/controller.yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: metallb-controller
+  namespace: "metallb-system"
+  labels:
+    helm.sh/chart: metallb-0.15.3
+    app.kubernetes.io/name: metallb
+    app.kubernetes.io/instance: metallb
+    app.kubernetes.io/version: "v0.15.3"
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/component: controller
+spec:
+  strategy:
+    type: 
RollingUpdate + selector: + matchLabels: + app.kubernetes.io/name: metallb + app.kubernetes.io/instance: metallb + app.kubernetes.io/component: controller + template: + metadata: + labels: + app.kubernetes.io/name: metallb + app.kubernetes.io/instance: metallb + app.kubernetes.io/component: controller + spec: + serviceAccountName: metallb-controller + terminationGracePeriodSeconds: 0 + securityContext: + fsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + containers: + - name: controller + image: quay.io/metallb/controller:v0.15.3 + args: + - --port=7472 + - --log-level=info + - --webhook-mode=enabled + - --tls-min-version=VersionTLS12 + env: + - name: METALLB_ML_SECRET_NAME + value: metallb-memberlist + - name: METALLB_DEPLOYMENT + value: metallb-controller + - name: METALLB_BGP_TYPE + value: frr + ports: + - name: monitoring + containerPort: 7472 + - containerPort: 9443 + name: webhook-server + protocol: TCP + volumeMounts: + - mountPath: /tmp/k8s-webhook-server/serving-certs + name: cert + readOnly: true + livenessProbe: + httpGet: + path: /metrics + port: monitoring + initialDelaySeconds: 10 + periodSeconds: 10 + timeoutSeconds: 1 + successThreshold: 1 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /metrics + port: monitoring + initialDelaySeconds: 10 + periodSeconds: 10 + timeoutSeconds: 1 + successThreshold: 1 + failureThreshold: 3 + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + nodeSelector: + "kubernetes.io/os": linux + volumes: + - name: cert + secret: + defaultMode: 420 + secretName: metallb-webhook-cert +--- +# Source: metallb/templates/webhooks.yaml +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingWebhookConfiguration +metadata: + name: metallb-webhook-configuration + labels: + helm.sh/chart: metallb-0.15.3 + app.kubernetes.io/name: metallb + app.kubernetes.io/instance: metallb + app.kubernetes.io/version: "v0.15.3" + app.kubernetes.io/managed-by: Helm 
+webhooks: +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: metallb-webhook-service + namespace: metallb-system + path: /validate-metallb-io-v1beta2-bgppeer + failurePolicy: Fail + name: bgppeervalidationwebhook.metallb.io + rules: + - apiGroups: + - metallb.io + apiVersions: + - v1beta2 + operations: + - CREATE + - UPDATE + resources: + - bgppeers + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: metallb-webhook-service + namespace: metallb-system + path: /validate-metallb-io-v1beta1-ipaddresspool + failurePolicy: Fail + name: ipaddresspoolvalidationwebhook.metallb.io + rules: + - apiGroups: + - metallb.io + apiVersions: + - v1beta1 + operations: + - CREATE + - UPDATE + resources: + - ipaddresspools + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: metallb-webhook-service + namespace: metallb-system + path: /validate-metallb-io-v1beta1-bgpadvertisement + failurePolicy: Fail + name: bgpadvertisementvalidationwebhook.metallb.io + rules: + - apiGroups: + - metallb.io + apiVersions: + - v1beta1 + operations: + - CREATE + - UPDATE + resources: + - bgpadvertisements + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: metallb-webhook-service + namespace: metallb-system + path: /validate-metallb-io-v1beta1-community + failurePolicy: Fail + name: communityvalidationwebhook.metallb.io + rules: + - apiGroups: + - metallb.io + apiVersions: + - v1beta1 + operations: + - CREATE + - UPDATE + resources: + - communities + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: metallb-webhook-service + namespace: metallb-system + path: /validate-metallb-io-v1beta1-bfdprofile + failurePolicy: Fail + name: bfdprofilevalidationwebhook.metallb.io + rules: + - apiGroups: + - metallb.io + apiVersions: + - v1beta1 + operations: + - CREATE + - DELETE + resources: + - bfdprofiles + sideEffects: None +- 
admissionReviewVersions: + - v1 + clientConfig: + service: + name: metallb-webhook-service + namespace: metallb-system + path: /validate-metallb-io-v1beta1-l2advertisement + failurePolicy: Fail + name: l2advertisementvalidationwebhook.metallb.io + rules: + - apiGroups: + - metallb.io + apiVersions: + - v1beta1 + operations: + - CREATE + - UPDATE + resources: + - l2advertisements + sideEffects: None diff --git a/infrastructure/metallb/namespace.yaml b/infrastructure/metallb/namespace.yaml new file mode 100644 index 0000000..02b2add --- /dev/null +++ b/infrastructure/metallb/namespace.yaml @@ -0,0 +1,5 @@ +# infrastructure/metallb/namespace.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: metallb-system diff --git a/infrastructure/metallb/patches/node-placement.yaml b/infrastructure/metallb/patches/node-placement.yaml new file mode 100644 index 0000000..c42ae99 --- /dev/null +++ b/infrastructure/metallb/patches/node-placement.yaml @@ -0,0 +1,27 @@ +# infrastructure/metallb/patches/node-placement.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: metallb-controller + namespace: metallb-system +spec: + template: + spec: + containers: + - name: controller + args: + - --port=7472 + - --log-level=info + - --webhook-mode=enabled + - --tls-min-version=VersionTLS12 + - --lb-class=metallb + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: hardware + operator: In + values: + - rpi4 + - rpi5 diff --git a/infrastructure/metallb/patches/speaker-loglevel.yaml b/infrastructure/metallb/patches/speaker-loglevel.yaml new file mode 100644 index 0000000..61b8942 --- /dev/null +++ b/infrastructure/metallb/patches/speaker-loglevel.yaml @@ -0,0 +1,15 @@ +# infrastructure/metallb/patches/speaker-loglevel.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: metallb-speaker + namespace: metallb-system +spec: + template: + spec: + containers: + - name: speaker + args: + - --port=7472 + - 
--log-level=info + - --lb-class=metallb diff --git a/infrastructure/modules/profiles/atlas-ha/kustomization.yaml b/infrastructure/modules/profiles/atlas-ha/kustomization.yaml index 7e69171..0502e01 100644 --- a/infrastructure/modules/profiles/atlas-ha/kustomization.yaml +++ b/infrastructure/modules/profiles/atlas-ha/kustomization.yaml @@ -2,6 +2,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: + - ../components/device-plugin-config - ../components/device-plugin-jetson - ../components/device-plugin-minipc - ../components/device-plugin-tethys diff --git a/infrastructure/modules/profiles/components/device-plugin-config/configmap.yaml b/infrastructure/modules/profiles/components/device-plugin-config/configmap.yaml new file mode 100644 index 0000000..73c61cf --- /dev/null +++ b/infrastructure/modules/profiles/components/device-plugin-config/configmap.yaml @@ -0,0 +1,15 @@ +# infrastructure/modules/profiles/components/device-plugin-config/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: nvidia-device-plugin-config + namespace: kube-system +data: + config.yaml: | + version: v1 + sharing: + timeSlicing: + renameByDefault: true + resources: + - name: nvidia.com/gpu + replicas: 4 diff --git a/infrastructure/modules/profiles/components/device-plugin-config/kustomization.yaml b/infrastructure/modules/profiles/components/device-plugin-config/kustomization.yaml new file mode 100644 index 0000000..346f526 --- /dev/null +++ b/infrastructure/modules/profiles/components/device-plugin-config/kustomization.yaml @@ -0,0 +1,5 @@ +# infrastructure/modules/profiles/components/device-plugin-config/kustomization.yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - configmap.yaml diff --git a/infrastructure/modules/profiles/components/device-plugin-jetson/daemonset.yaml b/infrastructure/modules/profiles/components/device-plugin-jetson/daemonset.yaml index f4953ea..2e29134 100644 --- 
a/infrastructure/modules/profiles/components/device-plugin-jetson/daemonset.yaml +++ b/infrastructure/modules/profiles/components/device-plugin-jetson/daemonset.yaml @@ -30,7 +30,8 @@ spec: imagePullPolicy: IfNotPresent args: - "--fail-on-init-error=false" - - "--device-list-strategy=envvar,cdi" + - "--device-list-strategy=envvar" + - "--config-file=/config/config.yaml" securityContext: privileged: true env: @@ -41,7 +42,12 @@ spec: volumeMounts: - name: device-plugin mountPath: /var/lib/kubelet/device-plugins + - name: config + mountPath: /config volumes: - name: device-plugin hostPath: path: /var/lib/kubelet/device-plugins + - name: config + configMap: + name: nvidia-device-plugin-config diff --git a/infrastructure/modules/profiles/components/device-plugin-minipc/daemonset.yaml b/infrastructure/modules/profiles/components/device-plugin-minipc/daemonset.yaml index 76b6c06..309593a 100644 --- a/infrastructure/modules/profiles/components/device-plugin-minipc/daemonset.yaml +++ b/infrastructure/modules/profiles/components/device-plugin-minipc/daemonset.yaml @@ -32,6 +32,7 @@ spec: - "--fail-on-init-error=false" - "--device-list-strategy=envvar" - "--mig-strategy=none" + - "--config-file=/config/config.yaml" securityContext: privileged: true env: @@ -42,7 +43,12 @@ spec: volumeMounts: - name: device-plugin mountPath: /var/lib/kubelet/device-plugins + - name: config + mountPath: /config volumes: - name: device-plugin hostPath: path: /var/lib/kubelet/device-plugins + - name: config + configMap: + name: nvidia-device-plugin-config diff --git a/infrastructure/modules/profiles/components/device-plugin-tethys/daemonset.yaml b/infrastructure/modules/profiles/components/device-plugin-tethys/daemonset.yaml index a15930a..884befa 100644 --- a/infrastructure/modules/profiles/components/device-plugin-tethys/daemonset.yaml +++ b/infrastructure/modules/profiles/components/device-plugin-tethys/daemonset.yaml @@ -33,6 +33,7 @@ spec: - "--fail-on-init-error=false" - 
"--device-list-strategy=envvar" - "--mig-strategy=none" + - "--config-file=/config/config.yaml" securityContext: privileged: true env: @@ -43,7 +44,12 @@ spec: volumeMounts: - name: device-plugin mountPath: /var/lib/kubelet/device-plugins + - name: config + mountPath: /config volumes: - name: device-plugin hostPath: path: /var/lib/kubelet/device-plugins + - name: config + configMap: + name: nvidia-device-plugin-config diff --git a/infrastructure/modules/profiles/tethys-hybrid/kustomization.yaml b/infrastructure/modules/profiles/tethys-hybrid/kustomization.yaml index b55c059..ad951ec 100644 --- a/infrastructure/modules/profiles/tethys-hybrid/kustomization.yaml +++ b/infrastructure/modules/profiles/tethys-hybrid/kustomization.yaml @@ -2,4 +2,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: + - ../components/device-plugin-config - ../components/device-plugin-tethys diff --git a/infrastructure/sources/helm/fluent-bit.yaml b/infrastructure/sources/helm/fluent-bit.yaml new file mode 100644 index 0000000..b4cb214 --- /dev/null +++ b/infrastructure/sources/helm/fluent-bit.yaml @@ -0,0 +1,9 @@ +# infrastructure/sources/helm/fluent-bit.yaml +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: fluent + namespace: flux-system +spec: + interval: 1h + url: https://fluent.github.io/helm-charts diff --git a/infrastructure/sources/helm/kustomization.yaml b/infrastructure/sources/helm/kustomization.yaml index 3ded0f1..c8d20bb 100644 --- a/infrastructure/sources/helm/kustomization.yaml +++ b/infrastructure/sources/helm/kustomization.yaml @@ -2,11 +2,15 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: + - fluent-bit.yaml - grafana.yaml - hashicorp.yaml - jetstack.yaml - jenkins.yaml - mailu.yaml + - opentelemetry.yaml + - opensearch.yaml - harbor.yaml - prometheus.yaml - victoria-metrics.yaml + - secrets-store-csi.yaml diff --git a/infrastructure/sources/helm/opensearch.yaml 
b/infrastructure/sources/helm/opensearch.yaml new file mode 100644 index 0000000..e5b60c3 --- /dev/null +++ b/infrastructure/sources/helm/opensearch.yaml @@ -0,0 +1,9 @@ +# infrastructure/sources/helm/opensearch.yaml +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: opensearch + namespace: flux-system +spec: + interval: 1h + url: https://opensearch-project.github.io/helm-charts diff --git a/infrastructure/sources/helm/opentelemetry.yaml b/infrastructure/sources/helm/opentelemetry.yaml new file mode 100644 index 0000000..03d0b00 --- /dev/null +++ b/infrastructure/sources/helm/opentelemetry.yaml @@ -0,0 +1,9 @@ +# infrastructure/sources/helm/opentelemetry.yaml +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: opentelemetry + namespace: flux-system +spec: + interval: 1h + url: https://open-telemetry.github.io/opentelemetry-helm-charts diff --git a/infrastructure/sources/helm/secrets-store-csi.yaml b/infrastructure/sources/helm/secrets-store-csi.yaml new file mode 100644 index 0000000..1fc4ae5 --- /dev/null +++ b/infrastructure/sources/helm/secrets-store-csi.yaml @@ -0,0 +1,9 @@ +# infrastructure/sources/helm/secrets-store-csi.yaml +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: secrets-store-csi-driver + namespace: flux-system +spec: + interval: 1h + url: https://kubernetes-sigs.github.io/secrets-store-csi-driver/charts diff --git a/infrastructure/traefik/clusterrole.yaml b/infrastructure/traefik/clusterrole.yaml index 52ed126..353eaff 100644 --- a/infrastructure/traefik/clusterrole.yaml +++ b/infrastructure/traefik/clusterrole.yaml @@ -71,9 +71,10 @@ rules: - tlsoptions - tlsstores - serverstransports + - serverstransporttcps - traefikservices + - middlewaretcps verbs: - get - list - watch - diff --git a/infrastructure/traefik/kustomization.yaml b/infrastructure/traefik/kustomization.yaml index 1dce445..4e36574 100644 --- a/infrastructure/traefik/kustomization.yaml +++ 
b/infrastructure/traefik/kustomization.yaml @@ -10,3 +10,4 @@ resources: - clusterrole.yaml - clusterrolebinding.yaml - service.yaml + - traefik-service-lb.yaml diff --git a/infrastructure/traefik/traefik-service-lb.yaml b/infrastructure/traefik/traefik-service-lb.yaml new file mode 100644 index 0000000..e4929f1 --- /dev/null +++ b/infrastructure/traefik/traefik-service-lb.yaml @@ -0,0 +1,24 @@ +# infrastructure/traefik/traefik-service-lb.yaml +apiVersion: v1 +kind: Service +metadata: + name: traefik + namespace: kube-system + annotations: + metallb.universe.tf/address-pool: communication-pool +spec: + type: LoadBalancer + loadBalancerClass: metallb + loadBalancerIP: 192.168.22.9 + ports: + - name: web + port: 80 + targetPort: web + protocol: TCP + - name: websecure + port: 443 + targetPort: websecure + protocol: TCP + selector: + app.kubernetes.io/instance: traefik-kube-system + app.kubernetes.io/name: traefik diff --git a/infrastructure/vault-csi/kustomization.yaml b/infrastructure/vault-csi/kustomization.yaml new file mode 100644 index 0000000..a5d223d --- /dev/null +++ b/infrastructure/vault-csi/kustomization.yaml @@ -0,0 +1,6 @@ +# infrastructure/vault-csi/kustomization.yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - secrets-store-csi-driver.yaml + - vault-csi-provider.yaml diff --git a/infrastructure/vault-csi/secrets-store-csi-driver.yaml b/infrastructure/vault-csi/secrets-store-csi-driver.yaml new file mode 100644 index 0000000..0b249fc --- /dev/null +++ b/infrastructure/vault-csi/secrets-store-csi-driver.yaml @@ -0,0 +1,20 @@ +# infrastructure/vault-csi/secrets-store-csi-driver.yaml +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: secrets-store-csi-driver + namespace: kube-system +spec: + interval: 15m + chart: + spec: + chart: secrets-store-csi-driver + version: "~1.3.0" + sourceRef: + kind: HelmRepository + name: secrets-store-csi-driver + namespace: flux-system + values: + syncSecret: + 
enabled: true + enableSecretRotation: false diff --git a/infrastructure/vault-csi/vault-csi-provider.yaml b/infrastructure/vault-csi/vault-csi-provider.yaml new file mode 100644 index 0000000..0b63d1c --- /dev/null +++ b/infrastructure/vault-csi/vault-csi-provider.yaml @@ -0,0 +1,111 @@ +# infrastructure/vault-csi/vault-csi-provider.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: vault-csi-provider + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: vault-csi-provider-clusterrole +rules: + - apiGroups: [""] + resources: ["serviceaccounts/token"] + verbs: ["create"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: vault-csi-provider-clusterrolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: vault-csi-provider-clusterrole +subjects: + - kind: ServiceAccount + name: vault-csi-provider + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: vault-csi-provider-role + namespace: kube-system +rules: + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get"] + resourceNames: ["vault-csi-provider-hmac-key"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["create"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: vault-csi-provider-rolebinding + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: vault-csi-provider-role +subjects: + - kind: ServiceAccount + name: vault-csi-provider + namespace: kube-system +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: vault-csi-provider + namespace: kube-system + labels: { app.kubernetes.io/name: vault-csi-provider } +spec: + updateStrategy: + type: RollingUpdate + selector: + matchLabels: { app.kubernetes.io/name: vault-csi-provider } + template: + metadata: + labels: { app.kubernetes.io/name: vault-csi-provider } + spec: + serviceAccountName: 
vault-csi-provider + containers: + - name: provider-vault-installer + image: hashicorp/vault-csi-provider:1.7.0 + imagePullPolicy: IfNotPresent + args: + - -endpoint=/provider/vault.sock + - -log-level=info + resources: + requests: { cpu: 50m, memory: 100Mi } + limits: { cpu: 50m, memory: 100Mi } + volumeMounts: + - { name: providervol, mountPath: "/provider" } + livenessProbe: + httpGet: + path: "/health/ready" + port: 8080 + scheme: "HTTP" + failureThreshold: 2 + initialDelaySeconds: 5 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 3 + readinessProbe: + httpGet: + path: "/health/ready" + port: 8080 + scheme: "HTTP" + failureThreshold: 2 + initialDelaySeconds: 5 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 3 + volumes: + - name: providervol + hostPath: + path: "/var/run/secrets-store-csi-providers" + nodeSelector: + kubernetes.io/os: linux diff --git a/knowledge/INDEX.md b/knowledge/INDEX.md new file mode 100644 index 0000000..fac9153 --- /dev/null +++ b/knowledge/INDEX.md @@ -0,0 +1,22 @@ +Atlas Knowledge Base (KB) + +This folder is the source-of-truth “memory” for Atlas/Titan assistants (and for humans). It is designed to be: +- Accurate (grounded in GitOps + read-only cluster tools) +- Maintainable (small docs + deterministic generators) +- Safe (no secrets; refer to Secret/Vault paths by name only) + +Layout +- `knowledge/runbooks/`: human-written docs (short, chunkable Markdown). +- `knowledge/catalog/`: generated machine-readable facts (YAML/JSON). +- `knowledge/diagrams/`: generated Mermaid diagrams (`.mmd`) derived from the catalog. + +Regeneration +- Update manifests/docs, then regenerate generated artifacts: + - `python scripts/knowledge_render_atlas.py --write` + +Authoring rules +- Never include secret values. Prefer `secretRef` names or Vault paths like `kv/atlas/...`. +- Prefer stable identifiers: Kubernetes `namespace/name`, DNS hostnames, Flux kustomization paths. 
+- Keep each runbook small; one topic per file; use headings. +- When in doubt, link to the exact file path in this repo that configures the behavior. + diff --git a/knowledge/catalog/atlas-summary.json b/knowledge/catalog/atlas-summary.json new file mode 100644 index 0000000..2139e29 --- /dev/null +++ b/knowledge/catalog/atlas-summary.json @@ -0,0 +1,8 @@ +{ + "counts": { + "helmrelease_host_hints": 7, + "http_endpoints": 35, + "services": 44, + "workloads": 49 + } +} diff --git a/knowledge/catalog/atlas.json b/knowledge/catalog/atlas.json new file mode 100644 index 0000000..92f08f4 --- /dev/null +++ b/knowledge/catalog/atlas.json @@ -0,0 +1,2771 @@ +{ + "cluster": "atlas", + "sources": [ + { + "name": "ai-llm", + "path": "services/ai-llm", + "targetNamespace": "ai" + }, + { + "name": "bstein-dev-home", + "path": "services/bstein-dev-home", + "targetNamespace": "bstein-dev-home" + }, + { + "name": "ci-demo", + "path": "services/ci-demo", + "targetNamespace": null + }, + { + "name": "communication", + "path": "services/comms", + "targetNamespace": "comms" + }, + { + "name": "core", + "path": "infrastructure/core", + "targetNamespace": null + }, + { + "name": "crypto", + "path": "services/crypto", + "targetNamespace": "crypto" + }, + { + "name": "flux-system", + "path": "clusters/atlas/flux-system", + "targetNamespace": null + }, + { + "name": "gitea", + "path": "services/gitea", + "targetNamespace": "gitea" + }, + { + "name": "gitops-ui", + "path": "services/gitops-ui", + "targetNamespace": "flux-system" + }, + { + "name": "harbor", + "path": "services/harbor", + "targetNamespace": "harbor" + }, + { + "name": "helm", + "path": "infrastructure/sources/helm", + "targetNamespace": "flux-system" + }, + { + "name": "jellyfin", + "path": "services/jellyfin", + "targetNamespace": "jellyfin" + }, + { + "name": "jenkins", + "path": "services/jenkins", + "targetNamespace": "jenkins" + }, + { + "name": "keycloak", + "path": "services/keycloak", + "targetNamespace": "sso" + }, 
+ { + "name": "longhorn-ui", + "path": "infrastructure/longhorn/ui-ingress", + "targetNamespace": "longhorn-system" + }, + { + "name": "mailu", + "path": "services/mailu", + "targetNamespace": "mailu-mailserver" + }, + { + "name": "metallb", + "path": "infrastructure/metallb", + "targetNamespace": "metallb-system" + }, + { + "name": "monerod", + "path": "services/crypto/monerod", + "targetNamespace": "crypto" + }, + { + "name": "monitoring", + "path": "services/monitoring", + "targetNamespace": null + }, + { + "name": "nextcloud", + "path": "services/nextcloud", + "targetNamespace": "nextcloud" + }, + { + "name": "nextcloud-mail-sync", + "path": "services/nextcloud-mail-sync", + "targetNamespace": "nextcloud" + }, + { + "name": "oauth2-proxy", + "path": "services/oauth2-proxy", + "targetNamespace": "sso" + }, + { + "name": "openldap", + "path": "services/openldap", + "targetNamespace": "sso" + }, + { + "name": "pegasus", + "path": "services/pegasus", + "targetNamespace": "jellyfin" + }, + { + "name": "sui-metrics", + "path": "services/sui-metrics/overlays/atlas", + "targetNamespace": "sui-metrics" + }, + { + "name": "traefik", + "path": "infrastructure/traefik", + "targetNamespace": "traefik" + }, + { + "name": "vault", + "path": "services/vault", + "targetNamespace": "vault" + }, + { + "name": "vault-csi", + "path": "infrastructure/vault-csi", + "targetNamespace": "kube-system" + }, + { + "name": "vaultwarden", + "path": "services/vaultwarden", + "targetNamespace": "vaultwarden" + }, + { + "name": "xmr-miner", + "path": "services/crypto/xmr-miner", + "targetNamespace": "crypto" + } + ], + "workloads": [ + { + "kind": "Deployment", + "namespace": "ai", + "name": "ollama", + "labels": { + "app": "ollama" + }, + "serviceAccountName": null, + "nodeSelector": {}, + "images": [ + "ollama/ollama:latest" + ] + }, + { + "kind": "Deployment", + "namespace": "bstein-dev-home", + "name": "bstein-dev-home-backend", + "labels": { + "app": "bstein-dev-home-backend" + }, + 
"serviceAccountName": "bstein-dev-home", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-84" + ] + }, + { + "kind": "Deployment", + "namespace": "bstein-dev-home", + "name": "bstein-dev-home-frontend", + "labels": { + "app": "bstein-dev-home-frontend" + }, + "serviceAccountName": null, + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-84" + ] + }, + { + "kind": "Deployment", + "namespace": "bstein-dev-home", + "name": "chat-ai-gateway", + "labels": { + "app": "chat-ai-gateway" + }, + "serviceAccountName": null, + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "python:3.11-slim" + ] + }, + { + "kind": "Deployment", + "namespace": "ci-demo", + "name": "ci-demo", + "labels": { + "app.kubernetes.io/name": "ci-demo" + }, + "serviceAccountName": null, + "nodeSelector": { + "hardware": "rpi4" + }, + "images": [ + "registry.bstein.dev/infra/ci-demo:v0.0.0-3" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "atlasbot", + "labels": { + "app": "atlasbot" + }, + "serviceAccountName": "atlasbot", + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "python:3.11-slim" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "coturn", + "labels": { + "app": "coturn" + }, + "serviceAccountName": null, + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "ghcr.io/coturn/coturn:4.6.2" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "element-call", + "labels": { + "app": "element-call" + }, + "serviceAccountName": null, + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "ghcr.io/element-hq/element-call:latest" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": 
"livekit", + "labels": { + "app": "livekit" + }, + "serviceAccountName": null, + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "livekit/livekit-server:v1.9.0" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "livekit-token-service", + "labels": { + "app": "livekit-token-service" + }, + "serviceAccountName": null, + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "ghcr.io/element-hq/lk-jwt-service:0.3.0" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "matrix-authentication-service", + "labels": { + "app": "matrix-authentication-service" + }, + "serviceAccountName": null, + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "ghcr.io/element-hq/matrix-authentication-service:1.8.0" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "matrix-guest-register", + "labels": { + "app.kubernetes.io/name": "matrix-guest-register" + }, + "serviceAccountName": null, + "nodeSelector": {}, + "images": [ + "python:3.11-slim" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "matrix-wellknown", + "labels": { + "app": "matrix-wellknown" + }, + "serviceAccountName": null, + "nodeSelector": {}, + "images": [ + "nginx:1.27-alpine" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "othrys-element-element-web", + "labels": { + "app.kubernetes.io/instance": "othrys-element", + "app.kubernetes.io/name": "element-web" + }, + "serviceAccountName": "othrys-element-element-web", + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "ghcr.io/element-hq/element-web:v1.12.6" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "othrys-synapse-matrix-synapse", + "labels": { + "app.kubernetes.io/component": "synapse", + "app.kubernetes.io/instance": "othrys-synapse", + "app.kubernetes.io/name": "matrix-synapse" + }, + "serviceAccountName": "default", + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + 
"ghcr.io/element-hq/synapse:v1.144.0" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "othrys-synapse-redis-master", + "labels": { + "app.kubernetes.io/component": "master", + "app.kubernetes.io/instance": "othrys-synapse", + "app.kubernetes.io/managed-by": "Helm", + "app.kubernetes.io/name": "redis", + "helm.sh/chart": "redis-17.17.1" + }, + "serviceAccountName": "othrys-synapse-redis", + "nodeSelector": {}, + "images": [ + "docker.io/bitnamilegacy/redis:7.0.12-debian-11-r34" + ] + }, + { + "kind": "DaemonSet", + "namespace": "crypto", + "name": "monero-xmrig", + "labels": { + "app": "monero-xmrig" + }, + "serviceAccountName": null, + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "ghcr.io/tari-project/xmrig:latest" + ] + }, + { + "kind": "Deployment", + "namespace": "crypto", + "name": "monero-p2pool", + "labels": { + "app": "monero-p2pool" + }, + "serviceAccountName": null, + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "debian:bookworm-slim" + ] + }, + { + "kind": "Deployment", + "namespace": "crypto", + "name": "monerod", + "labels": { + "app": "monerod" + }, + "serviceAccountName": null, + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/crypto/monerod:0.18.4.1" + ] + }, + { + "kind": "Deployment", + "namespace": "flux-system", + "name": "helm-controller", + "labels": { + "app": "helm-controller", + "app.kubernetes.io/component": "helm-controller", + "app.kubernetes.io/instance": "flux-system", + "app.kubernetes.io/part-of": "flux", + "app.kubernetes.io/version": "v2.7.5" + }, + "serviceAccountName": "helm-controller", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "ghcr.io/fluxcd/helm-controller:v1.4.5" + ] + }, + { + "kind": "Deployment", + "namespace": "flux-system", + "name": "image-automation-controller", + "labels": { + "app": "image-automation-controller", + 
"app.kubernetes.io/component": "image-automation-controller", + "app.kubernetes.io/instance": "flux-system", + "app.kubernetes.io/part-of": "flux", + "app.kubernetes.io/version": "v2.7.5" + }, + "serviceAccountName": "image-automation-controller", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "ghcr.io/fluxcd/image-automation-controller:v1.0.4" + ] + }, + { + "kind": "Deployment", + "namespace": "flux-system", + "name": "image-reflector-controller", + "labels": { + "app": "image-reflector-controller", + "app.kubernetes.io/component": "image-reflector-controller", + "app.kubernetes.io/instance": "flux-system", + "app.kubernetes.io/part-of": "flux", + "app.kubernetes.io/version": "v2.7.5" + }, + "serviceAccountName": "image-reflector-controller", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "ghcr.io/fluxcd/image-reflector-controller:v1.0.4" + ] + }, + { + "kind": "Deployment", + "namespace": "flux-system", + "name": "kustomize-controller", + "labels": { + "app": "kustomize-controller", + "app.kubernetes.io/component": "kustomize-controller", + "app.kubernetes.io/instance": "flux-system", + "app.kubernetes.io/part-of": "flux", + "app.kubernetes.io/version": "v2.7.5" + }, + "serviceAccountName": "kustomize-controller", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "ghcr.io/fluxcd/kustomize-controller:v1.7.3" + ] + }, + { + "kind": "Deployment", + "namespace": "flux-system", + "name": "notification-controller", + "labels": { + "app": "notification-controller", + "app.kubernetes.io/component": "notification-controller", + "app.kubernetes.io/instance": "flux-system", + "app.kubernetes.io/part-of": "flux", + "app.kubernetes.io/version": "v2.7.5" + }, + "serviceAccountName": "notification-controller", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "ghcr.io/fluxcd/notification-controller:v1.7.5" + ] + }, + { + "kind": "Deployment", + "namespace": "flux-system", + "name": 
"source-controller", + "labels": { + "app": "source-controller", + "app.kubernetes.io/component": "source-controller", + "app.kubernetes.io/instance": "flux-system", + "app.kubernetes.io/part-of": "flux", + "app.kubernetes.io/version": "v2.7.5" + }, + "serviceAccountName": "source-controller", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "ghcr.io/fluxcd/source-controller:v1.7.4" + ] + }, + { + "kind": "Deployment", + "namespace": "gitea", + "name": "gitea", + "labels": { + "app": "gitea" + }, + "serviceAccountName": null, + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "gitea/gitea:1.23" + ] + }, + { + "kind": "Deployment", + "namespace": "jellyfin", + "name": "jellyfin", + "labels": { + "app": "jellyfin" + }, + "serviceAccountName": null, + "nodeSelector": {}, + "images": [ + "docker.io/jellyfin/jellyfin:10.11.5" + ] + }, + { + "kind": "Deployment", + "namespace": "jellyfin", + "name": "pegasus", + "labels": { + "app": "pegasus" + }, + "serviceAccountName": null, + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "alpine:3.20", + "registry.bstein.dev/streaming/pegasus:1.2.32" + ] + }, + { + "kind": "Deployment", + "namespace": "jenkins", + "name": "jenkins", + "labels": { + "app": "jenkins" + }, + "serviceAccountName": "jenkins", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "jenkins/jenkins:2.528.3-jdk21" + ] + }, + { + "kind": "DaemonSet", + "namespace": "kube-system", + "name": "nvidia-device-plugin-jetson", + "labels": { + "app.kubernetes.io/instance": "jetson", + "app.kubernetes.io/name": "nvidia-device-plugin" + }, + "serviceAccountName": null, + "nodeSelector": { + "jetson": "true", + "kubernetes.io/arch": "arm64" + }, + "images": [ + "nvcr.io/nvidia/k8s-device-plugin:v0.16.2" + ] + }, + { + "kind": "DaemonSet", + "namespace": "kube-system", + "name": 
"nvidia-device-plugin-minipc", + "labels": { + "app.kubernetes.io/instance": "titan22", + "app.kubernetes.io/name": "nvidia-device-plugin" + }, + "serviceAccountName": null, + "nodeSelector": { + "kubernetes.io/arch": "amd64", + "kubernetes.io/hostname": "titan-22" + }, + "images": [ + "nvcr.io/nvidia/k8s-device-plugin:v0.16.2" + ] + }, + { + "kind": "DaemonSet", + "namespace": "kube-system", + "name": "nvidia-device-plugin-tethys", + "labels": { + "app.kubernetes.io/instance": "titan24", + "app.kubernetes.io/name": "nvidia-device-plugin" + }, + "serviceAccountName": null, + "nodeSelector": { + "kubernetes.io/arch": "amd64", + "kubernetes.io/hostname": "titan-24" + }, + "images": [ + "nvcr.io/nvidia/k8s-device-plugin:v0.16.2" + ] + }, + { + "kind": "DaemonSet", + "namespace": "kube-system", + "name": "vault-csi-provider", + "labels": { + "app.kubernetes.io/name": "vault-csi-provider" + }, + "serviceAccountName": "vault-csi-provider", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "hashicorp/vault-csi-provider:1.7.0" + ] + }, + { + "kind": "Deployment", + "namespace": "longhorn-system", + "name": "oauth2-proxy-longhorn", + "labels": { + "app": "oauth2-proxy-longhorn" + }, + "serviceAccountName": null, + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "quay.io/oauth2-proxy/oauth2-proxy:v7.6.0" + ] + }, + { + "kind": "DaemonSet", + "namespace": "mailu-mailserver", + "name": "vip-controller", + "labels": { + "app": "vip-controller" + }, + "serviceAccountName": "vip-controller", + "nodeSelector": { + "mailu.bstein.dev/vip": "true" + }, + "images": [ + "lachlanevenson/k8s-kubectl:latest" + ] + }, + { + "kind": "Deployment", + "namespace": "mailu-mailserver", + "name": "mailu-sync-listener", + "labels": { + "app": "mailu-sync-listener" + }, + "serviceAccountName": null, + "nodeSelector": {}, + "images": [ + "python:3.11-alpine" + ] + }, + { + "kind": "DaemonSet", + "namespace": "metallb-system", + "name": 
"metallb-speaker", + "labels": { + "app.kubernetes.io/component": "speaker", + "app.kubernetes.io/instance": "metallb", + "app.kubernetes.io/name": "metallb" + }, + "serviceAccountName": "metallb-speaker", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "quay.io/frrouting/frr:10.4.1", + "quay.io/metallb/speaker:v0.15.3" + ] + }, + { + "kind": "Deployment", + "namespace": "metallb-system", + "name": "metallb-controller", + "labels": { + "app.kubernetes.io/component": "controller", + "app.kubernetes.io/instance": "metallb", + "app.kubernetes.io/name": "metallb" + }, + "serviceAccountName": "metallb-controller", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "quay.io/metallb/controller:v0.15.3" + ] + }, + { + "kind": "DaemonSet", + "namespace": "monitoring", + "name": "dcgm-exporter", + "labels": { + "app": "dcgm-exporter" + }, + "serviceAccountName": "default", + "nodeSelector": {}, + "images": [ + "registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04" + ] + }, + { + "kind": "Deployment", + "namespace": "monitoring", + "name": "postmark-exporter", + "labels": { + "app": "postmark-exporter" + }, + "serviceAccountName": null, + "nodeSelector": {}, + "images": [ + "python:3.12-alpine" + ] + }, + { + "kind": "Deployment", + "namespace": "nextcloud", + "name": "collabora", + "labels": { + "app": "collabora" + }, + "serviceAccountName": null, + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "collabora/code:latest" + ] + }, + { + "kind": "Deployment", + "namespace": "nextcloud", + "name": "nextcloud", + "labels": { + "app": "nextcloud" + }, + "serviceAccountName": null, + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "nextcloud:29-apache" + ] + }, + { + "kind": "Deployment", + "namespace": "sso", + "name": "keycloak", + "labels": { + "app": "keycloak" + }, + "serviceAccountName": null, + "nodeSelector": {}, + "images": [ + "quay.io/keycloak/keycloak:26.0.7" + ] + }, + { + "kind": 
"Deployment", + "namespace": "sso", + "name": "oauth2-proxy", + "labels": { + "app": "oauth2-proxy" + }, + "serviceAccountName": null, + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "quay.io/oauth2-proxy/oauth2-proxy:v7.6.0" + ] + }, + { + "kind": "StatefulSet", + "namespace": "sso", + "name": "openldap", + "labels": { + "app": "openldap" + }, + "serviceAccountName": null, + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "docker.io/osixia/openldap:1.5.0" + ] + }, + { + "kind": "Deployment", + "namespace": "sui-metrics", + "name": "sui-metrics", + "labels": { + "app": "sui-metrics" + }, + "serviceAccountName": "sui-metrics", + "nodeSelector": { + "kubernetes.io/hostname": "titan-24" + }, + "images": [ + "victoriametrics/vmagent:v1.103.0" + ] + }, + { + "kind": "Deployment", + "namespace": "traefik", + "name": "traefik", + "labels": { + "app": "traefik" + }, + "serviceAccountName": "traefik-ingress-controller", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "traefik:v3.3.3" + ] + }, + { + "kind": "StatefulSet", + "namespace": "vault", + "name": "vault", + "labels": { + "app": "vault" + }, + "serviceAccountName": "vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "hashicorp/vault:1.17.6" + ] + }, + { + "kind": "Deployment", + "namespace": "vaultwarden", + "name": "vaultwarden", + "labels": { + "app": "vaultwarden" + }, + "serviceAccountName": null, + "nodeSelector": {}, + "images": [ + "vaultwarden/server:1.33.2" + ] + } + ], + "services": [ + { + "namespace": "ai", + "name": "ollama", + "type": "ClusterIP", + "selector": { + "app": "ollama" + }, + "ports": [ + { + "name": "http", + "port": 11434, + "targetPort": 11434, + "protocol": "TCP" + } + ] + }, + { + "namespace": "bstein-dev-home", + "name": "bstein-dev-home-backend", + "type": "ClusterIP", + 
"selector": { + "app": "bstein-dev-home-backend" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 8080, + "protocol": "TCP" + } + ] + }, + { + "namespace": "bstein-dev-home", + "name": "bstein-dev-home-frontend", + "type": "ClusterIP", + "selector": { + "app": "bstein-dev-home-frontend" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 80, + "protocol": "TCP" + } + ] + }, + { + "namespace": "bstein-dev-home", + "name": "chat-ai-gateway", + "type": "ClusterIP", + "selector": { + "app": "chat-ai-gateway" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 8080, + "protocol": "TCP" + } + ] + }, + { + "namespace": "ci-demo", + "name": "ci-demo", + "type": "ClusterIP", + "selector": { + "app.kubernetes.io/name": "ci-demo" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "comms", + "name": "coturn", + "type": "LoadBalancer", + "selector": { + "app": "coturn" + }, + "ports": [ + { + "name": "turn-udp", + "port": 3478, + "targetPort": 3478, + "protocol": "UDP" + }, + { + "name": "turn-tcp", + "port": 3478, + "targetPort": 3478, + "protocol": "TCP" + }, + { + "name": "turn-tls", + "port": 5349, + "targetPort": 5349, + "protocol": "TCP" + }, + { + "name": "relay-50000", + "port": 50000, + "targetPort": 50000, + "protocol": "UDP" + }, + { + "name": "relay-50001", + "port": 50001, + "targetPort": 50001, + "protocol": "UDP" + }, + { + "name": "relay-50002", + "port": 50002, + "targetPort": 50002, + "protocol": "UDP" + }, + { + "name": "relay-50003", + "port": 50003, + "targetPort": 50003, + "protocol": "UDP" + }, + { + "name": "relay-50004", + "port": 50004, + "targetPort": 50004, + "protocol": "UDP" + }, + { + "name": "relay-50005", + "port": 50005, + "targetPort": 50005, + "protocol": "UDP" + }, + { + "name": "relay-50006", + "port": 50006, + "targetPort": 50006, + "protocol": "UDP" + }, + { + "name": "relay-50007", + "port": 50007, + 
"targetPort": 50007, + "protocol": "UDP" + }, + { + "name": "relay-50008", + "port": 50008, + "targetPort": 50008, + "protocol": "UDP" + }, + { + "name": "relay-50009", + "port": 50009, + "targetPort": 50009, + "protocol": "UDP" + }, + { + "name": "relay-50010", + "port": 50010, + "targetPort": 50010, + "protocol": "UDP" + }, + { + "name": "relay-50011", + "port": 50011, + "targetPort": 50011, + "protocol": "UDP" + }, + { + "name": "relay-50012", + "port": 50012, + "targetPort": 50012, + "protocol": "UDP" + }, + { + "name": "relay-50013", + "port": 50013, + "targetPort": 50013, + "protocol": "UDP" + }, + { + "name": "relay-50014", + "port": 50014, + "targetPort": 50014, + "protocol": "UDP" + }, + { + "name": "relay-50015", + "port": 50015, + "targetPort": 50015, + "protocol": "UDP" + }, + { + "name": "relay-50016", + "port": 50016, + "targetPort": 50016, + "protocol": "UDP" + }, + { + "name": "relay-50017", + "port": 50017, + "targetPort": 50017, + "protocol": "UDP" + }, + { + "name": "relay-50018", + "port": 50018, + "targetPort": 50018, + "protocol": "UDP" + }, + { + "name": "relay-50019", + "port": 50019, + "targetPort": 50019, + "protocol": "UDP" + }, + { + "name": "relay-50020", + "port": 50020, + "targetPort": 50020, + "protocol": "UDP" + }, + { + "name": "relay-50021", + "port": 50021, + "targetPort": 50021, + "protocol": "UDP" + }, + { + "name": "relay-50022", + "port": 50022, + "targetPort": 50022, + "protocol": "UDP" + }, + { + "name": "relay-50023", + "port": 50023, + "targetPort": 50023, + "protocol": "UDP" + }, + { + "name": "relay-50024", + "port": 50024, + "targetPort": 50024, + "protocol": "UDP" + }, + { + "name": "relay-50025", + "port": 50025, + "targetPort": 50025, + "protocol": "UDP" + }, + { + "name": "relay-50026", + "port": 50026, + "targetPort": 50026, + "protocol": "UDP" + }, + { + "name": "relay-50027", + "port": 50027, + "targetPort": 50027, + "protocol": "UDP" + }, + { + "name": "relay-50028", + "port": 50028, + "targetPort": 50028, + 
"protocol": "UDP" + }, + { + "name": "relay-50029", + "port": 50029, + "targetPort": 50029, + "protocol": "UDP" + }, + { + "name": "relay-50030", + "port": 50030, + "targetPort": 50030, + "protocol": "UDP" + }, + { + "name": "relay-50031", + "port": 50031, + "targetPort": 50031, + "protocol": "UDP" + }, + { + "name": "relay-50032", + "port": 50032, + "targetPort": 50032, + "protocol": "UDP" + }, + { + "name": "relay-50033", + "port": 50033, + "targetPort": 50033, + "protocol": "UDP" + }, + { + "name": "relay-50034", + "port": 50034, + "targetPort": 50034, + "protocol": "UDP" + }, + { + "name": "relay-50035", + "port": 50035, + "targetPort": 50035, + "protocol": "UDP" + }, + { + "name": "relay-50036", + "port": 50036, + "targetPort": 50036, + "protocol": "UDP" + }, + { + "name": "relay-50037", + "port": 50037, + "targetPort": 50037, + "protocol": "UDP" + }, + { + "name": "relay-50038", + "port": 50038, + "targetPort": 50038, + "protocol": "UDP" + }, + { + "name": "relay-50039", + "port": 50039, + "targetPort": 50039, + "protocol": "UDP" + }, + { + "name": "relay-50040", + "port": 50040, + "targetPort": 50040, + "protocol": "UDP" + }, + { + "name": "relay-50041", + "port": 50041, + "targetPort": 50041, + "protocol": "UDP" + }, + { + "name": "relay-50042", + "port": 50042, + "targetPort": 50042, + "protocol": "UDP" + }, + { + "name": "relay-50043", + "port": 50043, + "targetPort": 50043, + "protocol": "UDP" + }, + { + "name": "relay-50044", + "port": 50044, + "targetPort": 50044, + "protocol": "UDP" + }, + { + "name": "relay-50045", + "port": 50045, + "targetPort": 50045, + "protocol": "UDP" + }, + { + "name": "relay-50046", + "port": 50046, + "targetPort": 50046, + "protocol": "UDP" + }, + { + "name": "relay-50047", + "port": 50047, + "targetPort": 50047, + "protocol": "UDP" + }, + { + "name": "relay-50048", + "port": 50048, + "targetPort": 50048, + "protocol": "UDP" + }, + { + "name": "relay-50049", + "port": 50049, + "targetPort": 50049, + "protocol": "UDP" + }, + 
{ + "name": "relay-50050", + "port": 50050, + "targetPort": 50050, + "protocol": "UDP" + } + ] + }, + { + "namespace": "comms", + "name": "element-call", + "type": "ClusterIP", + "selector": { + "app": "element-call" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 8080, + "protocol": "TCP" + } + ] + }, + { + "namespace": "comms", + "name": "livekit", + "type": "LoadBalancer", + "selector": { + "app": "livekit" + }, + "ports": [ + { + "name": "http", + "port": 7880, + "targetPort": 7880, + "protocol": "TCP" + }, + { + "name": "rtc-tcp", + "port": 7881, + "targetPort": 7881, + "protocol": "TCP" + }, + { + "name": "rtc-udp-7882", + "port": 7882, + "targetPort": 7882, + "protocol": "UDP" + }, + { + "name": "rtc-udp-7883", + "port": 7883, + "targetPort": 7883, + "protocol": "UDP" + } + ] + }, + { + "namespace": "comms", + "name": "livekit-token-service", + "type": "ClusterIP", + "selector": { + "app": "livekit-token-service" + }, + "ports": [ + { + "name": "http", + "port": 8080, + "targetPort": 8080, + "protocol": "TCP" + } + ] + }, + { + "namespace": "comms", + "name": "matrix-authentication-service", + "type": "ClusterIP", + "selector": { + "app": "matrix-authentication-service" + }, + "ports": [ + { + "name": "http", + "port": 8080, + "targetPort": "http", + "protocol": "TCP" + }, + { + "name": "internal", + "port": 8081, + "targetPort": "internal", + "protocol": "TCP" + } + ] + }, + { + "namespace": "comms", + "name": "matrix-guest-register", + "type": "ClusterIP", + "selector": { + "app.kubernetes.io/name": "matrix-guest-register" + }, + "ports": [ + { + "name": "http", + "port": 8080, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "comms", + "name": "matrix-wellknown", + "type": "ClusterIP", + "selector": { + "app": "matrix-wellknown" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 80, + "protocol": "TCP" + } + ] + }, + { + "namespace": "comms", + "name": "othrys-element-element-web", + 
"type": "ClusterIP", + "selector": { + "app.kubernetes.io/instance": "othrys-element", + "app.kubernetes.io/name": "element-web" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "comms", + "name": "othrys-synapse-matrix-synapse", + "type": "ClusterIP", + "selector": { + "app.kubernetes.io/component": "synapse", + "app.kubernetes.io/instance": "othrys-synapse", + "app.kubernetes.io/name": "matrix-synapse" + }, + "ports": [ + { + "name": "http", + "port": 8008, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "comms", + "name": "othrys-synapse-redis-headless", + "type": "ClusterIP", + "selector": { + "app.kubernetes.io/instance": "othrys-synapse", + "app.kubernetes.io/name": "redis" + }, + "ports": [ + { + "name": "tcp-redis", + "port": 6379, + "targetPort": "redis", + "protocol": "TCP" + } + ] + }, + { + "namespace": "comms", + "name": "othrys-synapse-redis-master", + "type": "ClusterIP", + "selector": { + "app.kubernetes.io/component": "master", + "app.kubernetes.io/instance": "othrys-synapse", + "app.kubernetes.io/name": "redis" + }, + "ports": [ + { + "name": "tcp-redis", + "port": 6379, + "targetPort": "redis", + "protocol": "TCP" + } + ] + }, + { + "namespace": "comms", + "name": "othrys-synapse-replication", + "type": "ClusterIP", + "selector": { + "app.kubernetes.io/component": "synapse", + "app.kubernetes.io/instance": "othrys-synapse", + "app.kubernetes.io/name": "matrix-synapse" + }, + "ports": [ + { + "name": "replication", + "port": 9093, + "targetPort": "replication", + "protocol": "TCP" + } + ] + }, + { + "namespace": "crypto", + "name": "monerod", + "type": "ClusterIP", + "selector": { + "app": "monerod" + }, + "ports": [ + { + "name": "rpc", + "port": 18081, + "targetPort": 18081, + "protocol": "TCP" + }, + { + "name": "p2p", + "port": 18080, + "targetPort": 18080, + "protocol": "TCP" + }, + { + "name": "zmq", + "port": 18083, + "targetPort": 
18083, + "protocol": "TCP" + } + ] + }, + { + "namespace": "crypto", + "name": "p2pool", + "type": "ClusterIP", + "selector": { + "app": "p2pool" + }, + "ports": [ + { + "name": "stratum", + "port": 3333, + "targetPort": 3333, + "protocol": "TCP" + } + ] + }, + { + "namespace": "flux-system", + "name": "notification-controller", + "type": "ClusterIP", + "selector": { + "app": "notification-controller" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "flux-system", + "name": "source-controller", + "type": "ClusterIP", + "selector": { + "app": "source-controller" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "flux-system", + "name": "webhook-receiver", + "type": "ClusterIP", + "selector": { + "app": "notification-controller" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http-webhook", + "protocol": "TCP" + } + ] + }, + { + "namespace": "gitea", + "name": "gitea", + "type": "ClusterIP", + "selector": { + "app": "gitea" + }, + "ports": [ + { + "name": "http", + "port": 3000, + "targetPort": 3000, + "protocol": "TCP" + } + ] + }, + { + "namespace": "gitea", + "name": "gitea-ssh", + "type": "NodePort", + "selector": { + "app": "gitea" + }, + "ports": [ + { + "name": "ssh", + "port": 2242, + "targetPort": 2242, + "protocol": "TCP" + } + ] + }, + { + "namespace": "jellyfin", + "name": "jellyfin", + "type": "ClusterIP", + "selector": { + "app": "jellyfin" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 8096, + "protocol": "TCP" + } + ] + }, + { + "namespace": "jellyfin", + "name": "pegasus", + "type": "ClusterIP", + "selector": { + "app": "pegasus" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "jenkins", + "name": "jenkins", + "type": "ClusterIP", + "selector": { + "app": "jenkins" + 
}, + "ports": [ + { + "name": "http", + "port": 8080, + "targetPort": 8080, + "protocol": "TCP" + }, + { + "name": "agent-listener", + "port": 50000, + "targetPort": 50000, + "protocol": "TCP" + } + ] + }, + { + "namespace": "kube-system", + "name": "traefik", + "type": "LoadBalancer", + "selector": { + "app.kubernetes.io/instance": "traefik-kube-system", + "app.kubernetes.io/name": "traefik" + }, + "ports": [ + { + "name": "web", + "port": 80, + "targetPort": "web", + "protocol": "TCP" + }, + { + "name": "websecure", + "port": 443, + "targetPort": "websecure", + "protocol": "TCP" + } + ] + }, + { + "namespace": "longhorn-system", + "name": "oauth2-proxy-longhorn", + "type": "ClusterIP", + "selector": { + "app": "oauth2-proxy-longhorn" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 4180, + "protocol": "TCP" + } + ] + }, + { + "namespace": "mailu-mailserver", + "name": "mailu-front-lb", + "type": "LoadBalancer", + "selector": { + "app.kubernetes.io/component": "front", + "app.kubernetes.io/instance": "mailu", + "app.kubernetes.io/name": "mailu" + }, + "ports": [ + { + "name": "smtp", + "port": 25, + "targetPort": 25, + "protocol": "TCP" + }, + { + "name": "smtps", + "port": 465, + "targetPort": 465, + "protocol": "TCP" + }, + { + "name": "submission", + "port": 587, + "targetPort": 587, + "protocol": "TCP" + }, + { + "name": "imaps", + "port": 993, + "targetPort": 993, + "protocol": "TCP" + }, + { + "name": "pop3s", + "port": 995, + "targetPort": 995, + "protocol": "TCP" + }, + { + "name": "sieve", + "port": 4190, + "targetPort": 4190, + "protocol": "TCP" + } + ] + }, + { + "namespace": "mailu-mailserver", + "name": "mailu-sync-listener", + "type": "ClusterIP", + "selector": { + "app": "mailu-sync-listener" + }, + "ports": [ + { + "name": "http", + "port": 8080, + "targetPort": 8080, + "protocol": "TCP" + } + ] + }, + { + "namespace": "metallb-system", + "name": "metallb-webhook-service", + "type": "ClusterIP", + "selector": { + 
"app.kubernetes.io/component": "controller", + "app.kubernetes.io/instance": "metallb", + "app.kubernetes.io/name": "metallb" + }, + "ports": [ + { + "name": null, + "port": 443, + "targetPort": 9443, + "protocol": "TCP" + } + ] + }, + { + "namespace": "monitoring", + "name": "dcgm-exporter", + "type": "ClusterIP", + "selector": { + "app": "dcgm-exporter" + }, + "ports": [ + { + "name": "metrics", + "port": 9400, + "targetPort": "metrics", + "protocol": "TCP" + } + ] + }, + { + "namespace": "monitoring", + "name": "postmark-exporter", + "type": "ClusterIP", + "selector": { + "app": "postmark-exporter" + }, + "ports": [ + { + "name": "http", + "port": 8000, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "nextcloud", + "name": "collabora", + "type": "ClusterIP", + "selector": { + "app": "collabora" + }, + "ports": [ + { + "name": "http", + "port": 9980, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "nextcloud", + "name": "nextcloud", + "type": "ClusterIP", + "selector": { + "app": "nextcloud" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "sso", + "name": "keycloak", + "type": "ClusterIP", + "selector": { + "app": "keycloak" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "sso", + "name": "oauth2-proxy", + "type": "ClusterIP", + "selector": { + "app": "oauth2-proxy" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 4180, + "protocol": "TCP" + } + ] + }, + { + "namespace": "sso", + "name": "openldap", + "type": "ClusterIP", + "selector": { + "app": "openldap" + }, + "ports": [ + { + "name": "ldap", + "port": 389, + "targetPort": "ldap", + "protocol": "TCP" + }, + { + "name": "ldaps", + "port": 636, + "targetPort": "ldaps", + "protocol": "TCP" + } + ] + }, + { + "namespace": "sui-metrics", + "name": "sui-metrics", + "type": 
"ClusterIP", + "selector": { + "app": "sui-metrics" + }, + "ports": [ + { + "name": "http", + "port": 8429, + "targetPort": 8429, + "protocol": "TCP" + } + ] + }, + { + "namespace": "traefik", + "name": "traefik-metrics", + "type": "ClusterIP", + "selector": { + "app": "traefik" + }, + "ports": [ + { + "name": "metrics", + "port": 9100, + "targetPort": "metrics", + "protocol": "TCP" + } + ] + }, + { + "namespace": "vault", + "name": "vault", + "type": "ClusterIP", + "selector": { + "app": "vault" + }, + "ports": [ + { + "name": "api", + "port": 8200, + "targetPort": 8200, + "protocol": "TCP" + }, + { + "name": "cluster", + "port": 8201, + "targetPort": 8201, + "protocol": "TCP" + } + ] + }, + { + "namespace": "vault", + "name": "vault-internal", + "type": "ClusterIP", + "selector": { + "app": "vault" + }, + "ports": [ + { + "name": "api", + "port": 8200, + "targetPort": 8200, + "protocol": "TCP" + }, + { + "name": "cluster", + "port": 8201, + "targetPort": 8201, + "protocol": "TCP" + } + ] + }, + { + "namespace": "vaultwarden", + "name": "vaultwarden-service", + "type": "ClusterIP", + "selector": { + "app": "vaultwarden" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + } + ], + "http_endpoints": [ + { + "host": "auth.bstein.dev", + "path": "/", + "backend": { + "namespace": "sso", + "service": "oauth2-proxy", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "oauth2-proxy" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "oauth2-proxy", + "source": "oauth2-proxy" + } + }, + { + "host": "bstein.dev", + "path": "/", + "backend": { + "namespace": "bstein-dev-home", + "service": "bstein-dev-home-frontend", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "bstein-dev-home-frontend" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "bstein-dev-home", + "source": "bstein-dev-home" + } + }, + { + "host": "bstein.dev", + "path": "/.well-known/matrix/client", + 
"backend": { + "namespace": "comms", + "service": "matrix-wellknown", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-wellknown" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-wellknown-bstein-dev", + "source": "communication" + } + }, + { + "host": "bstein.dev", + "path": "/.well-known/matrix/server", + "backend": { + "namespace": "comms", + "service": "matrix-wellknown", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-wellknown" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-wellknown-bstein-dev", + "source": "communication" + } + }, + { + "host": "bstein.dev", + "path": "/api", + "backend": { + "namespace": "bstein-dev-home", + "service": "bstein-dev-home-backend", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "bstein-dev-home-backend" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "bstein-dev-home", + "source": "bstein-dev-home" + } + }, + { + "host": "call.live.bstein.dev", + "path": "/", + "backend": { + "namespace": "comms", + "service": "element-call", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "element-call" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "element-call", + "source": "communication" + } + }, + { + "host": "chat.ai.bstein.dev", + "path": "/", + "backend": { + "namespace": "bstein-dev-home", + "service": "chat-ai-gateway", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "chat-ai-gateway" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "bstein-dev-home", + "source": "bstein-dev-home" + } + }, + { + "host": "ci.bstein.dev", + "path": "/", + "backend": { + "namespace": "jenkins", + "service": "jenkins", + "port": "http", + "workloads": [ + { + "kind": "Deployment", + "name": "jenkins" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "jenkins", + "source": "jenkins" + } + }, + { + "host": "cloud.bstein.dev", + "path": "/", + "backend": { + "namespace": 
"nextcloud", + "service": "nextcloud", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "nextcloud" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "nextcloud", + "source": "nextcloud" + } + }, + { + "host": "kit.live.bstein.dev", + "path": "/livekit/jwt", + "backend": { + "namespace": "comms", + "service": "livekit-token-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "livekit-token-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "livekit-jwt-ingress", + "source": "communication" + } + }, + { + "host": "kit.live.bstein.dev", + "path": "/livekit/sfu", + "backend": { + "namespace": "comms", + "service": "livekit", + "port": 7880, + "workloads": [ + { + "kind": "Deployment", + "name": "livekit" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "livekit-ingress", + "source": "communication" + } + }, + { + "host": "live.bstein.dev", + "path": "/", + "backend": { + "namespace": "comms", + "service": "othrys-element-element-web", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "othrys-element-element-web" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "othrys-element-element-web", + "source": "communication" + } + }, + { + "host": "live.bstein.dev", + "path": "/.well-known/matrix/client", + "backend": { + "namespace": "comms", + "service": "matrix-wellknown", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-wellknown" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-wellknown", + "source": "communication" + } + }, + { + "host": "live.bstein.dev", + "path": "/.well-known/matrix/server", + "backend": { + "namespace": "comms", + "service": "matrix-wellknown", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-wellknown" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-wellknown", + "source": "communication" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix", + "backend": { 
+ "namespace": "comms", + "service": "othrys-synapse-matrix-synapse", + "port": 8008, + "workloads": [ + { + "kind": "Deployment", + "name": "othrys-synapse-matrix-synapse" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "communication" + } + }, + { + "host": "longhorn.bstein.dev", + "path": "/", + "backend": { + "namespace": "longhorn-system", + "service": "oauth2-proxy-longhorn", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "oauth2-proxy-longhorn" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "longhorn-ingress", + "source": "longhorn-ui" + } + }, + { + "host": "mail.bstein.dev", + "path": "/", + "backend": { + "namespace": "mailu-mailserver", + "service": "mailu-front", + "port": 443, + "workloads": [] + }, + "via": { + "kind": "IngressRoute", + "name": "mailu", + "source": "mailu" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "communication" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/.well-known/matrix/client", + "backend": { + "namespace": "comms", + "service": "matrix-wellknown", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-wellknown" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-wellknown-matrix-live", + "source": "communication" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/.well-known/matrix/server", + "backend": { + "namespace": "comms", + "service": "matrix-wellknown", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-wellknown" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-wellknown-matrix-live", + "source": "communication" + } + }, + { + "host": "matrix.live.bstein.dev", + 
"path": "/_matrix", + "backend": { + "namespace": "comms", + "service": "othrys-synapse-matrix-synapse", + "port": 8008, + "workloads": [ + { + "kind": "Deployment", + "name": "othrys-synapse-matrix-synapse" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "communication" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/_matrix/client/r0/register", + "backend": { + "namespace": "comms", + "service": "matrix-guest-register", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-guest-register" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "communication" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/_matrix/client/v3/login", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "communication" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/_matrix/client/v3/logout", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "communication" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/_matrix/client/v3/refresh", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "communication" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/_matrix/client/v3/register", + "backend": { + "namespace": "comms", + "service": "matrix-guest-register", + "port": 8080, + 
"workloads": [ + { + "kind": "Deployment", + "name": "matrix-guest-register" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "communication" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/_synapse", + "backend": { + "namespace": "comms", + "service": "othrys-synapse-matrix-synapse", + "port": 8008, + "workloads": [ + { + "kind": "Deployment", + "name": "othrys-synapse-matrix-synapse" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "communication" + } + }, + { + "host": "monero.bstein.dev", + "path": "/", + "backend": { + "namespace": "crypto", + "service": "monerod", + "port": 18081, + "workloads": [ + { + "kind": "Deployment", + "name": "monerod" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "monerod", + "source": "monerod" + } + }, + { + "host": "office.bstein.dev", + "path": "/", + "backend": { + "namespace": "nextcloud", + "service": "collabora", + "port": 9980, + "workloads": [ + { + "kind": "Deployment", + "name": "collabora" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "collabora", + "source": "nextcloud" + } + }, + { + "host": "pegasus.bstein.dev", + "path": "/", + "backend": { + "namespace": "jellyfin", + "service": "pegasus", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "pegasus" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "pegasus", + "source": "pegasus" + } + }, + { + "host": "scm.bstein.dev", + "path": "/", + "backend": { + "namespace": "gitea", + "service": "gitea", + "port": 3000, + "workloads": [ + { + "kind": "Deployment", + "name": "gitea" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "gitea-ingress", + "source": "gitea" + } + }, + { + "host": "secret.bstein.dev", + "path": "/", + "backend": { + "namespace": "vault", + "service": "vault", + "port": 8200, + "workloads": [ + { + "kind": "StatefulSet", + "name": "vault" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "vault", + "source": "vault" 
+ } + }, + { + "host": "sso.bstein.dev", + "path": "/", + "backend": { + "namespace": "sso", + "service": "keycloak", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "keycloak" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "keycloak", + "source": "keycloak" + } + }, + { + "host": "stream.bstein.dev", + "path": "/", + "backend": { + "namespace": "jellyfin", + "service": "jellyfin", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "jellyfin" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "jellyfin", + "source": "jellyfin" + } + }, + { + "host": "vault.bstein.dev", + "path": "/", + "backend": { + "namespace": "vaultwarden", + "service": "vaultwarden-service", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "vaultwarden" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "vaultwarden-ingress", + "source": "vaultwarden" + } + } + ], + "helmrelease_host_hints": { + "gitops-ui:flux-system/weave-gitops": [ + "cd.bstein.dev" + ], + "harbor:harbor/harbor": [ + "registry.bstein.dev" + ], + "mailu:mailu-mailserver/mailu": [ + "bstein.dev", + "mail.bstein.dev" + ], + "monitoring:monitoring/alertmanager": [ + "alerts.bstein.dev" + ], + "monitoring:monitoring/grafana": [ + "metrics.bstein.dev", + "sso.bstein.dev" + ] + } +} diff --git a/knowledge/catalog/atlas.yaml b/knowledge/catalog/atlas.yaml new file mode 100644 index 0000000..d628b7b --- /dev/null +++ b/knowledge/catalog/atlas.yaml @@ -0,0 +1,1786 @@ +# Generated by scripts/knowledge_render_atlas.py (do not edit by hand) +cluster: atlas +sources: +- name: ai-llm + path: services/ai-llm + targetNamespace: ai +- name: bstein-dev-home + path: services/bstein-dev-home + targetNamespace: bstein-dev-home +- name: communication + path: services/comms + targetNamespace: comms +- name: core + path: infrastructure/core + targetNamespace: null +- name: crypto + path: services/crypto + targetNamespace: crypto +- name: flux-system + path: 
clusters/atlas/flux-system + targetNamespace: null +- name: gitea + path: services/gitea + targetNamespace: gitea +- name: gitops-ui + path: services/gitops-ui + targetNamespace: flux-system +- name: harbor + path: services/harbor + targetNamespace: harbor +- name: helm + path: infrastructure/sources/helm + targetNamespace: flux-system +- name: jellyfin + path: services/jellyfin + targetNamespace: jellyfin +- name: jenkins + path: services/jenkins + targetNamespace: jenkins +- name: keycloak + path: services/keycloak + targetNamespace: sso +- name: longhorn-ui + path: infrastructure/longhorn/ui-ingress + targetNamespace: longhorn-system +- name: mailu + path: services/mailu + targetNamespace: mailu-mailserver +- name: metallb + path: infrastructure/metallb + targetNamespace: metallb-system +- name: monerod + path: services/crypto/monerod + targetNamespace: crypto +- name: monitoring + path: services/monitoring + targetNamespace: null +- name: nextcloud + path: services/nextcloud + targetNamespace: nextcloud +- name: nextcloud-mail-sync + path: services/nextcloud-mail-sync + targetNamespace: nextcloud +- name: oauth2-proxy + path: services/oauth2-proxy + targetNamespace: sso +- name: openldap + path: services/openldap + targetNamespace: sso +- name: pegasus + path: services/pegasus + targetNamespace: jellyfin +- name: sui-metrics + path: services/sui-metrics/overlays/atlas + targetNamespace: sui-metrics +- name: traefik + path: infrastructure/traefik + targetNamespace: traefik +- name: vault + path: services/vault + targetNamespace: vault +- name: vault-csi + path: infrastructure/vault-csi + targetNamespace: kube-system +- name: vaultwarden + path: services/vaultwarden + targetNamespace: vaultwarden +- name: xmr-miner + path: services/crypto/xmr-miner + targetNamespace: crypto +workloads: +- kind: Deployment + namespace: ai + name: ollama + labels: + app: ollama + serviceAccountName: null + nodeSelector: {} + images: + - ollama/ollama:latest +- kind: Deployment + 
namespace: bstein-dev-home + name: bstein-dev-home-backend + labels: + app: bstein-dev-home-backend + serviceAccountName: bstein-dev-home + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-84 +- kind: Deployment + namespace: bstein-dev-home + name: bstein-dev-home-frontend + labels: + app: bstein-dev-home-frontend + serviceAccountName: null + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-84 +- kind: Deployment + namespace: bstein-dev-home + name: chat-ai-gateway + labels: + app: chat-ai-gateway + serviceAccountName: null + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - python:3.11-slim +- kind: Deployment + namespace: comms + name: atlasbot + labels: + app: atlasbot + serviceAccountName: atlasbot + nodeSelector: + hardware: rpi5 + images: + - python:3.11-slim +- kind: Deployment + namespace: comms + name: coturn + labels: + app: coturn + serviceAccountName: null + nodeSelector: + hardware: rpi5 + images: + - ghcr.io/coturn/coturn:4.6.2 +- kind: Deployment + namespace: comms + name: element-call + labels: + app: element-call + serviceAccountName: null + nodeSelector: + hardware: rpi5 + images: + - ghcr.io/element-hq/element-call:latest +- kind: Deployment + namespace: comms + name: livekit + labels: + app: livekit + serviceAccountName: null + nodeSelector: + hardware: rpi5 + images: + - livekit/livekit-server:v1.9.0 +- kind: Deployment + namespace: comms + name: livekit-token-service + labels: + app: livekit-token-service + serviceAccountName: null + nodeSelector: + hardware: rpi5 + images: + - ghcr.io/element-hq/lk-jwt-service:0.3.0 +- kind: Deployment + namespace: comms + name: matrix-authentication-service + labels: + app: matrix-authentication-service + serviceAccountName: null + nodeSelector: + 
hardware: rpi5 + images: + - ghcr.io/element-hq/matrix-authentication-service:1.8.0 +- kind: Deployment + namespace: comms + name: matrix-guest-register + labels: + app.kubernetes.io/name: matrix-guest-register + serviceAccountName: null + nodeSelector: {} + images: + - python:3.11-slim +- kind: Deployment + namespace: comms + name: matrix-wellknown + labels: + app: matrix-wellknown + serviceAccountName: null + nodeSelector: {} + images: + - nginx:1.27-alpine +- kind: Deployment + namespace: comms + name: othrys-element-element-web + labels: + app.kubernetes.io/instance: othrys-element + app.kubernetes.io/name: element-web + serviceAccountName: othrys-element-element-web + nodeSelector: + hardware: rpi5 + images: + - ghcr.io/element-hq/element-web:v1.12.6 +- kind: Deployment + namespace: comms + name: othrys-synapse-matrix-synapse + labels: + app.kubernetes.io/component: synapse + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/name: matrix-synapse + serviceAccountName: default + nodeSelector: + hardware: rpi5 + images: + - ghcr.io/element-hq/synapse:v1.144.0 +- kind: Deployment + namespace: comms + name: othrys-synapse-redis-master + labels: + app.kubernetes.io/component: master + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: redis + helm.sh/chart: redis-17.17.1 + serviceAccountName: othrys-synapse-redis + nodeSelector: {} + images: + - docker.io/bitnamilegacy/redis:7.0.12-debian-11-r34 +- kind: DaemonSet + namespace: crypto + name: monero-xmrig + labels: + app: monero-xmrig + serviceAccountName: null + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - ghcr.io/tari-project/xmrig:latest +- kind: Deployment + namespace: crypto + name: monero-p2pool + labels: + app: monero-p2pool + serviceAccountName: null + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - debian:bookworm-slim +- kind: Deployment + namespace: crypto + name: monerod + labels: + app: 
monerod + serviceAccountName: null + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - registry.bstein.dev/crypto/monerod:0.18.4.1 +- kind: Deployment + namespace: flux-system + name: helm-controller + labels: + app: helm-controller + app.kubernetes.io/component: helm-controller + app.kubernetes.io/instance: flux-system + app.kubernetes.io/part-of: flux + app.kubernetes.io/version: v2.7.5 + serviceAccountName: helm-controller + nodeSelector: + kubernetes.io/os: linux + images: + - ghcr.io/fluxcd/helm-controller:v1.4.5 +- kind: Deployment + namespace: flux-system + name: image-automation-controller + labels: + app: image-automation-controller + app.kubernetes.io/component: image-automation-controller + app.kubernetes.io/instance: flux-system + app.kubernetes.io/part-of: flux + app.kubernetes.io/version: v2.7.5 + serviceAccountName: image-automation-controller + nodeSelector: + kubernetes.io/os: linux + images: + - ghcr.io/fluxcd/image-automation-controller:v1.0.4 +- kind: Deployment + namespace: flux-system + name: image-reflector-controller + labels: + app: image-reflector-controller + app.kubernetes.io/component: image-reflector-controller + app.kubernetes.io/instance: flux-system + app.kubernetes.io/part-of: flux + app.kubernetes.io/version: v2.7.5 + serviceAccountName: image-reflector-controller + nodeSelector: + kubernetes.io/os: linux + images: + - ghcr.io/fluxcd/image-reflector-controller:v1.0.4 +- kind: Deployment + namespace: flux-system + name: kustomize-controller + labels: + app: kustomize-controller + app.kubernetes.io/component: kustomize-controller + app.kubernetes.io/instance: flux-system + app.kubernetes.io/part-of: flux + app.kubernetes.io/version: v2.7.5 + serviceAccountName: kustomize-controller + nodeSelector: + kubernetes.io/os: linux + images: + - ghcr.io/fluxcd/kustomize-controller:v1.7.3 +- kind: Deployment + namespace: flux-system + name: notification-controller + labels: + app: notification-controller + 
app.kubernetes.io/component: notification-controller + app.kubernetes.io/instance: flux-system + app.kubernetes.io/part-of: flux + app.kubernetes.io/version: v2.7.5 + serviceAccountName: notification-controller + nodeSelector: + kubernetes.io/os: linux + images: + - ghcr.io/fluxcd/notification-controller:v1.7.5 +- kind: Deployment + namespace: flux-system + name: source-controller + labels: + app: source-controller + app.kubernetes.io/component: source-controller + app.kubernetes.io/instance: flux-system + app.kubernetes.io/part-of: flux + app.kubernetes.io/version: v2.7.5 + serviceAccountName: source-controller + nodeSelector: + kubernetes.io/os: linux + images: + - ghcr.io/fluxcd/source-controller:v1.7.4 +- kind: Deployment + namespace: gitea + name: gitea + labels: + app: gitea + serviceAccountName: null + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - gitea/gitea:1.23 +- kind: Deployment + namespace: jellyfin + name: jellyfin + labels: + app: jellyfin + serviceAccountName: null + nodeSelector: {} + images: + - docker.io/jellyfin/jellyfin:10.11.5 +- kind: Deployment + namespace: jellyfin + name: pegasus + labels: + app: pegasus + serviceAccountName: null + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - alpine:3.20 + - registry.bstein.dev/streaming/pegasus:1.2.32 +- kind: Deployment + namespace: jenkins + name: jenkins + labels: + app: jenkins + serviceAccountName: jenkins + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - jenkins/jenkins:2.528.3-jdk21 +- kind: DaemonSet + namespace: kube-system + name: nvidia-device-plugin-jetson + labels: + app.kubernetes.io/instance: jetson + app.kubernetes.io/name: nvidia-device-plugin + serviceAccountName: null + nodeSelector: + jetson: 'true' + kubernetes.io/arch: arm64 + images: + - nvcr.io/nvidia/k8s-device-plugin:v0.16.2 +- kind: DaemonSet + namespace: kube-system + name: nvidia-device-plugin-minipc + 
labels: + app.kubernetes.io/instance: titan22 + app.kubernetes.io/name: nvidia-device-plugin + serviceAccountName: null + nodeSelector: + kubernetes.io/arch: amd64 + kubernetes.io/hostname: titan-22 + images: + - nvcr.io/nvidia/k8s-device-plugin:v0.16.2 +- kind: DaemonSet + namespace: kube-system + name: nvidia-device-plugin-tethys + labels: + app.kubernetes.io/instance: titan24 + app.kubernetes.io/name: nvidia-device-plugin + serviceAccountName: null + nodeSelector: + kubernetes.io/arch: amd64 + kubernetes.io/hostname: titan-24 + images: + - nvcr.io/nvidia/k8s-device-plugin:v0.16.2 +- kind: DaemonSet + namespace: kube-system + name: vault-csi-provider + labels: + app.kubernetes.io/name: vault-csi-provider + serviceAccountName: vault-csi-provider + nodeSelector: + kubernetes.io/os: linux + images: + - hashicorp/vault-csi-provider:1.7.0 +- kind: Deployment + namespace: longhorn-system + name: oauth2-proxy-longhorn + labels: + app: oauth2-proxy-longhorn + serviceAccountName: null + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 +- kind: DaemonSet + namespace: mailu-mailserver + name: vip-controller + labels: + app: vip-controller + serviceAccountName: vip-controller + nodeSelector: + mailu.bstein.dev/vip: 'true' + images: + - lachlanevenson/k8s-kubectl:latest +- kind: Deployment + namespace: mailu-mailserver + name: mailu-sync-listener + labels: + app: mailu-sync-listener + serviceAccountName: null + nodeSelector: {} + images: + - python:3.11-alpine +- kind: DaemonSet + namespace: metallb-system + name: metallb-speaker + labels: + app.kubernetes.io/component: speaker + app.kubernetes.io/instance: metallb + app.kubernetes.io/name: metallb + serviceAccountName: metallb-speaker + nodeSelector: + kubernetes.io/os: linux + images: + - quay.io/frrouting/frr:10.4.1 + - quay.io/metallb/speaker:v0.15.3 +- kind: Deployment + namespace: metallb-system + name: metallb-controller + labels: + 
app.kubernetes.io/component: controller + app.kubernetes.io/instance: metallb + app.kubernetes.io/name: metallb + serviceAccountName: metallb-controller + nodeSelector: + kubernetes.io/os: linux + images: + - quay.io/metallb/controller:v0.15.3 +- kind: DaemonSet + namespace: monitoring + name: dcgm-exporter + labels: + app: dcgm-exporter + serviceAccountName: default + nodeSelector: {} + images: + - registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04 +- kind: Deployment + namespace: monitoring + name: postmark-exporter + labels: + app: postmark-exporter + serviceAccountName: null + nodeSelector: {} + images: + - python:3.12-alpine +- kind: Deployment + namespace: nextcloud + name: collabora + labels: + app: collabora + serviceAccountName: null + nodeSelector: + hardware: rpi5 + images: + - collabora/code:latest +- kind: Deployment + namespace: nextcloud + name: nextcloud + labels: + app: nextcloud + serviceAccountName: null + nodeSelector: + hardware: rpi5 + images: + - nextcloud:29-apache +- kind: Deployment + namespace: sso + name: keycloak + labels: + app: keycloak + serviceAccountName: null + nodeSelector: {} + images: + - quay.io/keycloak/keycloak:26.0.7 +- kind: Deployment + namespace: sso + name: oauth2-proxy + labels: + app: oauth2-proxy + serviceAccountName: null + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 +- kind: StatefulSet + namespace: sso + name: openldap + labels: + app: openldap + serviceAccountName: null + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - docker.io/osixia/openldap:1.5.0 +- kind: Deployment + namespace: sui-metrics + name: sui-metrics + labels: + app: sui-metrics + serviceAccountName: sui-metrics + nodeSelector: + kubernetes.io/hostname: titan-24 + images: + - victoriametrics/vmagent:v1.103.0 +- kind: Deployment + namespace: traefik + name: traefik + labels: + app: traefik + serviceAccountName: 
traefik-ingress-controller + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - traefik:v3.3.3 +- kind: StatefulSet + namespace: vault + name: vault + labels: + app: vault + serviceAccountName: vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - hashicorp/vault:1.17.6 +- kind: Deployment + namespace: vaultwarden + name: vaultwarden + labels: + app: vaultwarden + serviceAccountName: null + nodeSelector: {} + images: + - vaultwarden/server:1.33.2 +services: +- namespace: ai + name: ollama + type: ClusterIP + selector: + app: ollama + ports: + - name: http + port: 11434 + targetPort: 11434 + protocol: TCP +- namespace: bstein-dev-home + name: bstein-dev-home-backend + type: ClusterIP + selector: + app: bstein-dev-home-backend + ports: + - name: http + port: 80 + targetPort: 8080 + protocol: TCP +- namespace: bstein-dev-home + name: bstein-dev-home-frontend + type: ClusterIP + selector: + app: bstein-dev-home-frontend + ports: + - name: http + port: 80 + targetPort: 80 + protocol: TCP +- namespace: bstein-dev-home + name: chat-ai-gateway + type: ClusterIP + selector: + app: chat-ai-gateway + ports: + - name: http + port: 80 + targetPort: 8080 + protocol: TCP +- namespace: ci-demo + name: ci-demo + type: ClusterIP + selector: + app.kubernetes.io/name: ci-demo + ports: + - name: http + port: 80 + targetPort: http + protocol: TCP +- namespace: comms + name: coturn + type: LoadBalancer + selector: + app: coturn + ports: + - name: turn-udp + port: 3478 + targetPort: 3478 + protocol: UDP + - name: turn-tcp + port: 3478 + targetPort: 3478 + protocol: TCP + - name: turn-tls + port: 5349 + targetPort: 5349 + protocol: TCP + - name: relay-50000 + port: 50000 + targetPort: 50000 + protocol: UDP + - name: relay-50001 + port: 50001 + targetPort: 50001 + protocol: UDP + - name: relay-50002 + port: 50002 + targetPort: 50002 + protocol: UDP + - name: relay-50003 + port: 50003 + targetPort: 50003 + protocol: UDP + - 
name: relay-50004 + port: 50004 + targetPort: 50004 + protocol: UDP + - name: relay-50005 + port: 50005 + targetPort: 50005 + protocol: UDP + - name: relay-50006 + port: 50006 + targetPort: 50006 + protocol: UDP + - name: relay-50007 + port: 50007 + targetPort: 50007 + protocol: UDP + - name: relay-50008 + port: 50008 + targetPort: 50008 + protocol: UDP + - name: relay-50009 + port: 50009 + targetPort: 50009 + protocol: UDP + - name: relay-50010 + port: 50010 + targetPort: 50010 + protocol: UDP + - name: relay-50011 + port: 50011 + targetPort: 50011 + protocol: UDP + - name: relay-50012 + port: 50012 + targetPort: 50012 + protocol: UDP + - name: relay-50013 + port: 50013 + targetPort: 50013 + protocol: UDP + - name: relay-50014 + port: 50014 + targetPort: 50014 + protocol: UDP + - name: relay-50015 + port: 50015 + targetPort: 50015 + protocol: UDP + - name: relay-50016 + port: 50016 + targetPort: 50016 + protocol: UDP + - name: relay-50017 + port: 50017 + targetPort: 50017 + protocol: UDP + - name: relay-50018 + port: 50018 + targetPort: 50018 + protocol: UDP + - name: relay-50019 + port: 50019 + targetPort: 50019 + protocol: UDP + - name: relay-50020 + port: 50020 + targetPort: 50020 + protocol: UDP + - name: relay-50021 + port: 50021 + targetPort: 50021 + protocol: UDP + - name: relay-50022 + port: 50022 + targetPort: 50022 + protocol: UDP + - name: relay-50023 + port: 50023 + targetPort: 50023 + protocol: UDP + - name: relay-50024 + port: 50024 + targetPort: 50024 + protocol: UDP + - name: relay-50025 + port: 50025 + targetPort: 50025 + protocol: UDP + - name: relay-50026 + port: 50026 + targetPort: 50026 + protocol: UDP + - name: relay-50027 + port: 50027 + targetPort: 50027 + protocol: UDP + - name: relay-50028 + port: 50028 + targetPort: 50028 + protocol: UDP + - name: relay-50029 + port: 50029 + targetPort: 50029 + protocol: UDP + - name: relay-50030 + port: 50030 + targetPort: 50030 + protocol: UDP + - name: relay-50031 + port: 50031 + targetPort: 50031 + 
protocol: UDP + - name: relay-50032 + port: 50032 + targetPort: 50032 + protocol: UDP + - name: relay-50033 + port: 50033 + targetPort: 50033 + protocol: UDP + - name: relay-50034 + port: 50034 + targetPort: 50034 + protocol: UDP + - name: relay-50035 + port: 50035 + targetPort: 50035 + protocol: UDP + - name: relay-50036 + port: 50036 + targetPort: 50036 + protocol: UDP + - name: relay-50037 + port: 50037 + targetPort: 50037 + protocol: UDP + - name: relay-50038 + port: 50038 + targetPort: 50038 + protocol: UDP + - name: relay-50039 + port: 50039 + targetPort: 50039 + protocol: UDP + - name: relay-50040 + port: 50040 + targetPort: 50040 + protocol: UDP + - name: relay-50041 + port: 50041 + targetPort: 50041 + protocol: UDP + - name: relay-50042 + port: 50042 + targetPort: 50042 + protocol: UDP + - name: relay-50043 + port: 50043 + targetPort: 50043 + protocol: UDP + - name: relay-50044 + port: 50044 + targetPort: 50044 + protocol: UDP + - name: relay-50045 + port: 50045 + targetPort: 50045 + protocol: UDP + - name: relay-50046 + port: 50046 + targetPort: 50046 + protocol: UDP + - name: relay-50047 + port: 50047 + targetPort: 50047 + protocol: UDP + - name: relay-50048 + port: 50048 + targetPort: 50048 + protocol: UDP + - name: relay-50049 + port: 50049 + targetPort: 50049 + protocol: UDP + - name: relay-50050 + port: 50050 + targetPort: 50050 + protocol: UDP +- namespace: comms + name: element-call + type: ClusterIP + selector: + app: element-call + ports: + - name: http + port: 80 + targetPort: 8080 + protocol: TCP +- namespace: comms + name: livekit + type: LoadBalancer + selector: + app: livekit + ports: + - name: http + port: 7880 + targetPort: 7880 + protocol: TCP + - name: rtc-tcp + port: 7881 + targetPort: 7881 + protocol: TCP + - name: rtc-udp-7882 + port: 7882 + targetPort: 7882 + protocol: UDP + - name: rtc-udp-7883 + port: 7883 + targetPort: 7883 + protocol: UDP +- namespace: comms + name: livekit-token-service + type: ClusterIP + selector: + app: 
livekit-token-service + ports: + - name: http + port: 8080 + targetPort: 8080 + protocol: TCP +- namespace: comms + name: matrix-authentication-service + type: ClusterIP + selector: + app: matrix-authentication-service + ports: + - name: http + port: 8080 + targetPort: http + protocol: TCP + - name: internal + port: 8081 + targetPort: internal + protocol: TCP +- namespace: comms + name: matrix-guest-register + type: ClusterIP + selector: + app.kubernetes.io/name: matrix-guest-register + ports: + - name: http + port: 8080 + targetPort: http + protocol: TCP +- namespace: comms + name: matrix-wellknown + type: ClusterIP + selector: + app: matrix-wellknown + ports: + - name: http + port: 80 + targetPort: 80 + protocol: TCP +- namespace: comms + name: othrys-element-element-web + type: ClusterIP + selector: + app.kubernetes.io/instance: othrys-element + app.kubernetes.io/name: element-web + ports: + - name: http + port: 80 + targetPort: http + protocol: TCP +- namespace: comms + name: othrys-synapse-matrix-synapse + type: ClusterIP + selector: + app.kubernetes.io/component: synapse + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/name: matrix-synapse + ports: + - name: http + port: 8008 + targetPort: http + protocol: TCP +- namespace: comms + name: othrys-synapse-redis-headless + type: ClusterIP + selector: + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/name: redis + ports: + - name: tcp-redis + port: 6379 + targetPort: redis + protocol: TCP +- namespace: comms + name: othrys-synapse-redis-master + type: ClusterIP + selector: + app.kubernetes.io/component: master + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/name: redis + ports: + - name: tcp-redis + port: 6379 + targetPort: redis + protocol: TCP +- namespace: comms + name: othrys-synapse-replication + type: ClusterIP + selector: + app.kubernetes.io/component: synapse + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/name: matrix-synapse + ports: + - 
name: replication + port: 9093 + targetPort: replication + protocol: TCP +- namespace: crypto + name: monerod + type: ClusterIP + selector: + app: monerod + ports: + - name: rpc + port: 18081 + targetPort: 18081 + protocol: TCP + - name: p2p + port: 18080 + targetPort: 18080 + protocol: TCP + - name: zmq + port: 18083 + targetPort: 18083 + protocol: TCP +- namespace: crypto + name: p2pool + type: ClusterIP + selector: + app: p2pool + ports: + - name: stratum + port: 3333 + targetPort: 3333 + protocol: TCP +- namespace: flux-system + name: notification-controller + type: ClusterIP + selector: + app: notification-controller + ports: + - name: http + port: 80 + targetPort: http + protocol: TCP +- namespace: flux-system + name: source-controller + type: ClusterIP + selector: + app: source-controller + ports: + - name: http + port: 80 + targetPort: http + protocol: TCP +- namespace: flux-system + name: webhook-receiver + type: ClusterIP + selector: + app: notification-controller + ports: + - name: http + port: 80 + targetPort: http-webhook + protocol: TCP +- namespace: gitea + name: gitea + type: ClusterIP + selector: + app: gitea + ports: + - name: http + port: 3000 + targetPort: 3000 + protocol: TCP +- namespace: gitea + name: gitea-ssh + type: NodePort + selector: + app: gitea + ports: + - name: ssh + port: 2242 + targetPort: 2242 + protocol: TCP +- namespace: jellyfin + name: jellyfin + type: ClusterIP + selector: + app: jellyfin + ports: + - name: http + port: 80 + targetPort: 8096 + protocol: TCP +- namespace: jellyfin + name: pegasus + type: ClusterIP + selector: + app: pegasus + ports: + - name: http + port: 80 + targetPort: http + protocol: TCP +- namespace: jenkins + name: jenkins + type: ClusterIP + selector: + app: jenkins + ports: + - name: http + port: 8080 + targetPort: 8080 + protocol: TCP + - name: agent-listener + port: 50000 + targetPort: 50000 + protocol: TCP +- namespace: kube-system + name: traefik + type: LoadBalancer + selector: + 
app.kubernetes.io/instance: traefik-kube-system + app.kubernetes.io/name: traefik + ports: + - name: web + port: 80 + targetPort: web + protocol: TCP + - name: websecure + port: 443 + targetPort: websecure + protocol: TCP +- namespace: longhorn-system + name: oauth2-proxy-longhorn + type: ClusterIP + selector: + app: oauth2-proxy-longhorn + ports: + - name: http + port: 80 + targetPort: 4180 + protocol: TCP +- namespace: mailu-mailserver + name: mailu-front-lb + type: LoadBalancer + selector: + app.kubernetes.io/component: front + app.kubernetes.io/instance: mailu + app.kubernetes.io/name: mailu + ports: + - name: smtp + port: 25 + targetPort: 25 + protocol: TCP + - name: smtps + port: 465 + targetPort: 465 + protocol: TCP + - name: submission + port: 587 + targetPort: 587 + protocol: TCP + - name: imaps + port: 993 + targetPort: 993 + protocol: TCP + - name: pop3s + port: 995 + targetPort: 995 + protocol: TCP + - name: sieve + port: 4190 + targetPort: 4190 + protocol: TCP +- namespace: mailu-mailserver + name: mailu-sync-listener + type: ClusterIP + selector: + app: mailu-sync-listener + ports: + - name: http + port: 8080 + targetPort: 8080 + protocol: TCP +- namespace: metallb-system + name: metallb-webhook-service + type: ClusterIP + selector: + app.kubernetes.io/component: controller + app.kubernetes.io/instance: metallb + app.kubernetes.io/name: metallb + ports: + - name: null + port: 443 + targetPort: 9443 + protocol: TCP +- namespace: monitoring + name: dcgm-exporter + type: ClusterIP + selector: + app: dcgm-exporter + ports: + - name: metrics + port: 9400 + targetPort: metrics + protocol: TCP +- namespace: monitoring + name: postmark-exporter + type: ClusterIP + selector: + app: postmark-exporter + ports: + - name: http + port: 8000 + targetPort: http + protocol: TCP +- namespace: nextcloud + name: collabora + type: ClusterIP + selector: + app: collabora + ports: + - name: http + port: 9980 + targetPort: http + protocol: TCP +- namespace: nextcloud + name: 
nextcloud + type: ClusterIP + selector: + app: nextcloud + ports: + - name: http + port: 80 + targetPort: http + protocol: TCP +- namespace: sso + name: keycloak + type: ClusterIP + selector: + app: keycloak + ports: + - name: http + port: 80 + targetPort: http + protocol: TCP +- namespace: sso + name: oauth2-proxy + type: ClusterIP + selector: + app: oauth2-proxy + ports: + - name: http + port: 80 + targetPort: 4180 + protocol: TCP +- namespace: sso + name: openldap + type: ClusterIP + selector: + app: openldap + ports: + - name: ldap + port: 389 + targetPort: ldap + protocol: TCP + - name: ldaps + port: 636 + targetPort: ldaps + protocol: TCP +- namespace: sui-metrics + name: sui-metrics + type: ClusterIP + selector: + app: sui-metrics + ports: + - name: http + port: 8429 + targetPort: 8429 + protocol: TCP +- namespace: traefik + name: traefik-metrics + type: ClusterIP + selector: + app: traefik + ports: + - name: metrics + port: 9100 + targetPort: metrics + protocol: TCP +- namespace: vault + name: vault + type: ClusterIP + selector: + app: vault + ports: + - name: api + port: 8200 + targetPort: 8200 + protocol: TCP + - name: cluster + port: 8201 + targetPort: 8201 + protocol: TCP +- namespace: vault + name: vault-internal + type: ClusterIP + selector: + app: vault + ports: + - name: api + port: 8200 + targetPort: 8200 + protocol: TCP + - name: cluster + port: 8201 + targetPort: 8201 + protocol: TCP +- namespace: vaultwarden + name: vaultwarden-service + type: ClusterIP + selector: + app: vaultwarden + ports: + - name: http + port: 80 + targetPort: http + protocol: TCP +http_endpoints: +- host: auth.bstein.dev + path: / + backend: + namespace: sso + service: oauth2-proxy + port: 80 + workloads: + - kind: Deployment + name: oauth2-proxy + via: + kind: Ingress + name: oauth2-proxy + source: oauth2-proxy +- host: bstein.dev + path: / + backend: + namespace: bstein-dev-home + service: bstein-dev-home-frontend + port: 80 + workloads: + - kind: Deployment + name: 
bstein-dev-home-frontend + via: + kind: Ingress + name: bstein-dev-home + source: bstein-dev-home +- host: bstein.dev + path: /.well-known/matrix/client + backend: + namespace: comms + service: matrix-wellknown + port: 80 + workloads: &id001 + - kind: Deployment + name: matrix-wellknown + via: + kind: Ingress + name: matrix-wellknown-bstein-dev + source: communication +- host: bstein.dev + path: /.well-known/matrix/server + backend: + namespace: comms + service: matrix-wellknown + port: 80 + workloads: *id001 + via: + kind: Ingress + name: matrix-wellknown-bstein-dev + source: communication +- host: bstein.dev + path: /api + backend: + namespace: bstein-dev-home + service: bstein-dev-home-backend + port: 80 + workloads: + - kind: Deployment + name: bstein-dev-home-backend + via: + kind: Ingress + name: bstein-dev-home + source: bstein-dev-home +- host: call.live.bstein.dev + path: / + backend: + namespace: comms + service: element-call + port: 80 + workloads: + - kind: Deployment + name: element-call + via: + kind: Ingress + name: element-call + source: communication +- host: chat.ai.bstein.dev + path: / + backend: + namespace: bstein-dev-home + service: chat-ai-gateway + port: 80 + workloads: + - kind: Deployment + name: chat-ai-gateway + via: + kind: Ingress + name: bstein-dev-home + source: bstein-dev-home +- host: ci.bstein.dev + path: / + backend: + namespace: jenkins + service: jenkins + port: http + workloads: + - kind: Deployment + name: jenkins + via: + kind: Ingress + name: jenkins + source: jenkins +- host: cloud.bstein.dev + path: / + backend: + namespace: nextcloud + service: nextcloud + port: 80 + workloads: + - kind: Deployment + name: nextcloud + via: + kind: Ingress + name: nextcloud + source: nextcloud +- host: kit.live.bstein.dev + path: /livekit/jwt + backend: + namespace: comms + service: livekit-token-service + port: 8080 + workloads: + - kind: Deployment + name: livekit-token-service + via: + kind: Ingress + name: livekit-jwt-ingress + 
source: communication +- host: kit.live.bstein.dev + path: /livekit/sfu + backend: + namespace: comms + service: livekit + port: 7880 + workloads: + - kind: Deployment + name: livekit + via: + kind: Ingress + name: livekit-ingress + source: communication +- host: live.bstein.dev + path: / + backend: + namespace: comms + service: othrys-element-element-web + port: 80 + workloads: + - kind: Deployment + name: othrys-element-element-web + via: + kind: Ingress + name: othrys-element-element-web + source: communication +- host: live.bstein.dev + path: /.well-known/matrix/client + backend: + namespace: comms + service: matrix-wellknown + port: 80 + workloads: *id001 + via: + kind: Ingress + name: matrix-wellknown + source: communication +- host: live.bstein.dev + path: /.well-known/matrix/server + backend: + namespace: comms + service: matrix-wellknown + port: 80 + workloads: *id001 + via: + kind: Ingress + name: matrix-wellknown + source: communication +- host: live.bstein.dev + path: /_matrix + backend: + namespace: comms + service: othrys-synapse-matrix-synapse + port: 8008 + workloads: &id002 + - kind: Deployment + name: othrys-synapse-matrix-synapse + via: + kind: Ingress + name: matrix-routing + source: communication +- host: longhorn.bstein.dev + path: / + backend: + namespace: longhorn-system + service: oauth2-proxy-longhorn + port: 80 + workloads: + - kind: Deployment + name: oauth2-proxy-longhorn + via: + kind: Ingress + name: longhorn-ingress + source: longhorn-ui +- host: mail.bstein.dev + path: / + backend: + namespace: mailu-mailserver + service: mailu-front + port: 443 + workloads: [] + via: + kind: IngressRoute + name: mailu + source: mailu +- host: matrix.live.bstein.dev + path: / + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: &id003 + - kind: Deployment + name: matrix-authentication-service + via: + kind: Ingress + name: matrix-routing + source: communication +- host: matrix.live.bstein.dev + path: 
/.well-known/matrix/client + backend: + namespace: comms + service: matrix-wellknown + port: 80 + workloads: *id001 + via: + kind: Ingress + name: matrix-wellknown-matrix-live + source: communication +- host: matrix.live.bstein.dev + path: /.well-known/matrix/server + backend: + namespace: comms + service: matrix-wellknown + port: 80 + workloads: *id001 + via: + kind: Ingress + name: matrix-wellknown-matrix-live + source: communication +- host: matrix.live.bstein.dev + path: /_matrix + backend: + namespace: comms + service: othrys-synapse-matrix-synapse + port: 8008 + workloads: *id002 + via: + kind: Ingress + name: matrix-routing + source: communication +- host: matrix.live.bstein.dev + path: /_matrix/client/r0/register + backend: + namespace: comms + service: matrix-guest-register + port: 8080 + workloads: &id004 + - kind: Deployment + name: matrix-guest-register + via: + kind: Ingress + name: matrix-routing + source: communication +- host: matrix.live.bstein.dev + path: /_matrix/client/v3/login + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: *id003 + via: + kind: Ingress + name: matrix-routing + source: communication +- host: matrix.live.bstein.dev + path: /_matrix/client/v3/logout + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: *id003 + via: + kind: Ingress + name: matrix-routing + source: communication +- host: matrix.live.bstein.dev + path: /_matrix/client/v3/refresh + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: *id003 + via: + kind: Ingress + name: matrix-routing + source: communication +- host: matrix.live.bstein.dev + path: /_matrix/client/v3/register + backend: + namespace: comms + service: matrix-guest-register + port: 8080 + workloads: *id004 + via: + kind: Ingress + name: matrix-routing + source: communication +- host: matrix.live.bstein.dev + path: /_synapse + backend: + namespace: comms + service: 
othrys-synapse-matrix-synapse + port: 8008 + workloads: *id002 + via: + kind: Ingress + name: matrix-routing + source: communication +- host: monero.bstein.dev + path: / + backend: + namespace: crypto + service: monerod + port: 18081 + workloads: + - kind: Deployment + name: monerod + via: + kind: Ingress + name: monerod + source: monerod +- host: office.bstein.dev + path: / + backend: + namespace: nextcloud + service: collabora + port: 9980 + workloads: + - kind: Deployment + name: collabora + via: + kind: Ingress + name: collabora + source: nextcloud +- host: pegasus.bstein.dev + path: / + backend: + namespace: jellyfin + service: pegasus + port: 80 + workloads: + - kind: Deployment + name: pegasus + via: + kind: Ingress + name: pegasus + source: pegasus +- host: scm.bstein.dev + path: / + backend: + namespace: gitea + service: gitea + port: 3000 + workloads: + - kind: Deployment + name: gitea + via: + kind: Ingress + name: gitea-ingress + source: gitea +- host: secret.bstein.dev + path: / + backend: + namespace: vault + service: vault + port: 8200 + workloads: + - kind: StatefulSet + name: vault + via: + kind: Ingress + name: vault + source: vault +- host: sso.bstein.dev + path: / + backend: + namespace: sso + service: keycloak + port: 80 + workloads: + - kind: Deployment + name: keycloak + via: + kind: Ingress + name: keycloak + source: keycloak +- host: stream.bstein.dev + path: / + backend: + namespace: jellyfin + service: jellyfin + port: 80 + workloads: + - kind: Deployment + name: jellyfin + via: + kind: Ingress + name: jellyfin + source: jellyfin +- host: vault.bstein.dev + path: / + backend: + namespace: vaultwarden + service: vaultwarden-service + port: 80 + workloads: + - kind: Deployment + name: vaultwarden + via: + kind: Ingress + name: vaultwarden-ingress + source: vaultwarden +helmrelease_host_hints: + gitops-ui:flux-system/weave-gitops: + - cd.bstein.dev + harbor:harbor/harbor: + - registry.bstein.dev + mailu:mailu-mailserver/mailu: + - bstein.dev 
+ - mail.bstein.dev + monitoring:monitoring/alertmanager: + - alerts.bstein.dev + monitoring:monitoring/grafana: + - metrics.bstein.dev + - sso.bstein.dev diff --git a/knowledge/catalog/runbooks.json b/knowledge/catalog/runbooks.json new file mode 100644 index 0000000..0718562 --- /dev/null +++ b/knowledge/catalog/runbooks.json @@ -0,0 +1,89 @@ +[ + { + "path": "runbooks/ci-gitea-jenkins.md", + "title": "CI: Gitea \u2192 Jenkins pipeline", + "tags": [ + "atlas", + "ci", + "gitea", + "jenkins" + ], + "entrypoints": [ + "scm.bstein.dev", + "ci.bstein.dev" + ], + "source_paths": [ + "services/gitea", + "services/jenkins", + "scripts/jenkins_cred_sync.sh", + "scripts/gitea_cred_sync.sh" + ], + "body": "# CI: Gitea \u2192 Jenkins pipeline\n\n## What this is\nAtlas uses Gitea for source control and Jenkins for CI. Authentication is via Keycloak (SSO).\n\n## Where it is configured\n- Gitea manifests: `services/gitea/`\n- Jenkins manifests: `services/jenkins/`\n- Credential sync helpers: `scripts/gitea_cred_sync.sh`, `scripts/jenkins_cred_sync.sh`\n\n## What users do (typical flow)\n- Create a repo in Gitea.\n- Create/update a Jenkins job/pipeline that can fetch the repo.\n- Configure a webhook (or SCM polling) so pushes trigger builds.\n\n## Troubleshooting (common)\n- \u201cWebhook not firing\u201d: confirm ingress host, webhook URL, and Jenkins job is reachable.\n- \u201cAuth denied cloning\u201d: confirm Keycloak group membership and that Jenkins has a valid token/credential configured." 
+ }, + { + "path": "runbooks/comms-verify.md", + "title": "Othrys verification checklist", + "tags": [ + "comms", + "matrix", + "element", + "livekit" + ], + "entrypoints": [ + "https://live.bstein.dev", + "https://matrix.live.bstein.dev" + ], + "source_paths": [], + "body": "1) Guest join:\n- Open a private window and visit:\n `https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join`\n- Confirm the guest join flow works and the displayname becomes `-`.\n\n2) Keycloak login:\n- Log in from `https://live.bstein.dev` and confirm MAS -> Keycloak -> Element redirect.\n\n3) Video rooms:\n- Start an Element Call room and confirm audio/video with a second account.\n- Check that guests can read public rooms but cannot start calls.\n\n4) Well-known:\n- `https://live.bstein.dev/.well-known/matrix/client` returns JSON.\n- `https://matrix.live.bstein.dev/.well-known/matrix/client` returns JSON.\n\n5) TURN reachability:\n- Confirm `turn.live.bstein.dev:3478` and `turns:5349` are reachable from WAN." 
+ }, + { + "path": "runbooks/kb-authoring.md", + "title": "KB authoring: what to write (and what not to)", + "tags": [ + "atlas", + "kb", + "runbooks" + ], + "entrypoints": [], + "source_paths": [ + "knowledge/runbooks", + "scripts/knowledge_render_atlas.py" + ], + "body": "# KB authoring: what to write (and what not to)\n\n## The goal\nGive Atlas assistants enough grounded, Atlas-specific context to answer \u201chow do I\u2026?\u201d questions without guessing.\n\n## What to capture (high value)\n- User workflows: \u201cclick here, set X, expected result\u201d\n- Operator workflows: \u201cedit these files, reconcile this kustomization, verify with these commands\u201d\n- Wiring: \u201cthis host routes to this service; this service depends on Postgres/Vault/etc\u201d\n- Failure modes: exact error messages + the 2\u20135 checks that usually resolve them\n- Permissions: Keycloak groups/roles and what they unlock\n\n## What to avoid (low value / fluff)\n- Generic Kubernetes explanations (link to upstream docs instead)\n- Copy-pasting large manifests (prefer file paths + small snippets)\n- Anything that will drift quickly (render it from GitOps instead)\n- Any secret values (reference Secret/Vault locations by name only)\n\n## Document pattern (recommended)\nEach runbook should answer:\n- \u201cWhat is this?\u201d\n- \u201cWhat do users do?\u201d\n- \u201cWhat do operators change (where in Git)?\u201d\n- \u201cHow do we verify it works?\u201d\n- \u201cWhat breaks and how to debug it?\u201d" + }, + { + "path": "runbooks/observability.md", + "title": "Observability: Grafana + VictoriaMetrics (how to query safely)", + "tags": [ + "atlas", + "monitoring", + "grafana", + "victoriametrics" + ], + "entrypoints": [ + "metrics.bstein.dev", + "alerts.bstein.dev" + ], + "source_paths": [ + "services/monitoring" + ], + "body": "# Observability: Grafana + VictoriaMetrics (how to query safely)\n\n## Where it is configured\n- `services/monitoring/helmrelease.yaml` (Grafana + 
Alertmanager + VM values)\n- `services/monitoring/grafana-dashboard-*.yaml` (dashboards and their PromQL)\n\n## Using metrics as a \u201ctool\u201d for Atlas assistants\nThe safest pattern is: map a small set of intents \u2192 fixed PromQL queries, then summarize results.\n\nExamples (intents)\n- \u201cIs the cluster healthy?\u201d \u2192 node readiness + pod restart rate\n- \u201cWhy is Element Call failing?\u201d \u2192 LiveKit/coturn pod restarts + synapse errors + ingress 5xx\n- \u201cIs Jenkins slow?\u201d \u2192 pod CPU/memory + HTTP latency metrics (if exported)\n\n## Why dashboards are not the KB\nDashboards are great references, but the assistant should query VictoriaMetrics directly for live answers and keep the\nKB focused on wiring, runbooks, and stable conventions." + }, + { + "path": "runbooks/template.md", + "title": "", + "tags": [ + "atlas", + "", + "" + ], + "entrypoints": [ + "" + ], + "source_paths": [ + "services/", + "clusters/atlas/<...>" + ], + "body": "# \n\n## What this is\n\n## For users (how to)\n\n## For operators (where configured)\n\n## Troubleshooting (symptoms \u2192 checks)" + } +] diff --git a/knowledge/diagrams/atlas-http.mmd b/knowledge/diagrams/atlas-http.mmd new file mode 100644 index 0000000..ddd33d8 --- /dev/null +++ b/knowledge/diagrams/atlas-http.mmd @@ -0,0 +1,189 @@ +flowchart LR + host_auth_bstein_dev["auth.bstein.dev"] + svc_sso_oauth2_proxy["sso/oauth2-proxy (Service)"] + host_auth_bstein_dev --> svc_sso_oauth2_proxy + wl_sso_oauth2_proxy["sso/oauth2-proxy (Deployment)"] + svc_sso_oauth2_proxy --> wl_sso_oauth2_proxy + host_bstein_dev["bstein.dev"] + svc_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Service)"] + host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_frontend + wl_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Deployment)"] + svc_bstein_dev_home_bstein_dev_home_frontend --> wl_bstein_dev_home_bstein_dev_home_frontend + 
svc_comms_matrix_wellknown["comms/matrix-wellknown (Service)"] + host_bstein_dev --> svc_comms_matrix_wellknown + wl_comms_matrix_wellknown["comms/matrix-wellknown (Deployment)"] + svc_comms_matrix_wellknown --> wl_comms_matrix_wellknown + svc_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Service)"] + host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend + wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"] + svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend + host_call_live_bstein_dev["call.live.bstein.dev"] + svc_comms_element_call["comms/element-call (Service)"] + host_call_live_bstein_dev --> svc_comms_element_call + wl_comms_element_call["comms/element-call (Deployment)"] + svc_comms_element_call --> wl_comms_element_call + host_chat_ai_bstein_dev["chat.ai.bstein.dev"] + svc_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Service)"] + host_chat_ai_bstein_dev --> svc_bstein_dev_home_chat_ai_gateway + wl_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Deployment)"] + svc_bstein_dev_home_chat_ai_gateway --> wl_bstein_dev_home_chat_ai_gateway + host_ci_bstein_dev["ci.bstein.dev"] + svc_jenkins_jenkins["jenkins/jenkins (Service)"] + host_ci_bstein_dev --> svc_jenkins_jenkins + wl_jenkins_jenkins["jenkins/jenkins (Deployment)"] + svc_jenkins_jenkins --> wl_jenkins_jenkins + host_cloud_bstein_dev["cloud.bstein.dev"] + svc_nextcloud_nextcloud["nextcloud/nextcloud (Service)"] + host_cloud_bstein_dev --> svc_nextcloud_nextcloud + wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"] + svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud + host_kit_live_bstein_dev["kit.live.bstein.dev"] + svc_comms_livekit_token_service["comms/livekit-token-service (Service)"] + host_kit_live_bstein_dev --> svc_comms_livekit_token_service + wl_comms_livekit_token_service["comms/livekit-token-service (Deployment)"] + 
svc_comms_livekit_token_service --> wl_comms_livekit_token_service + svc_comms_livekit["comms/livekit (Service)"] + host_kit_live_bstein_dev --> svc_comms_livekit + wl_comms_livekit["comms/livekit (Deployment)"] + svc_comms_livekit --> wl_comms_livekit + host_live_bstein_dev["live.bstein.dev"] + svc_comms_othrys_element_element_web["comms/othrys-element-element-web (Service)"] + host_live_bstein_dev --> svc_comms_othrys_element_element_web + wl_comms_othrys_element_element_web["comms/othrys-element-element-web (Deployment)"] + svc_comms_othrys_element_element_web --> wl_comms_othrys_element_element_web + host_live_bstein_dev --> svc_comms_matrix_wellknown + svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"] + host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse + wl_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Deployment)"] + svc_comms_othrys_synapse_matrix_synapse --> wl_comms_othrys_synapse_matrix_synapse + host_longhorn_bstein_dev["longhorn.bstein.dev"] + svc_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Service)"] + host_longhorn_bstein_dev --> svc_longhorn_system_oauth2_proxy_longhorn + wl_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Deployment)"] + svc_longhorn_system_oauth2_proxy_longhorn --> wl_longhorn_system_oauth2_proxy_longhorn + host_mail_bstein_dev["mail.bstein.dev"] + svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"] + host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front + host_matrix_live_bstein_dev["matrix.live.bstein.dev"] + svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"] + host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service + wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"] + svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service + 
host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown + host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse + svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"] + host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register + wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"] + svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register + host_monero_bstein_dev["monero.bstein.dev"] + svc_crypto_monerod["crypto/monerod (Service)"] + host_monero_bstein_dev --> svc_crypto_monerod + wl_crypto_monerod["crypto/monerod (Deployment)"] + svc_crypto_monerod --> wl_crypto_monerod + host_office_bstein_dev["office.bstein.dev"] + svc_nextcloud_collabora["nextcloud/collabora (Service)"] + host_office_bstein_dev --> svc_nextcloud_collabora + wl_nextcloud_collabora["nextcloud/collabora (Deployment)"] + svc_nextcloud_collabora --> wl_nextcloud_collabora + host_pegasus_bstein_dev["pegasus.bstein.dev"] + svc_jellyfin_pegasus["jellyfin/pegasus (Service)"] + host_pegasus_bstein_dev --> svc_jellyfin_pegasus + wl_jellyfin_pegasus["jellyfin/pegasus (Deployment)"] + svc_jellyfin_pegasus --> wl_jellyfin_pegasus + host_scm_bstein_dev["scm.bstein.dev"] + svc_gitea_gitea["gitea/gitea (Service)"] + host_scm_bstein_dev --> svc_gitea_gitea + wl_gitea_gitea["gitea/gitea (Deployment)"] + svc_gitea_gitea --> wl_gitea_gitea + host_secret_bstein_dev["secret.bstein.dev"] + svc_vault_vault["vault/vault (Service)"] + host_secret_bstein_dev --> svc_vault_vault + wl_vault_vault["vault/vault (StatefulSet)"] + svc_vault_vault --> wl_vault_vault + host_sso_bstein_dev["sso.bstein.dev"] + svc_sso_keycloak["sso/keycloak (Service)"] + host_sso_bstein_dev --> svc_sso_keycloak + wl_sso_keycloak["sso/keycloak (Deployment)"] + svc_sso_keycloak --> wl_sso_keycloak + host_stream_bstein_dev["stream.bstein.dev"] + svc_jellyfin_jellyfin["jellyfin/jellyfin (Service)"] + host_stream_bstein_dev --> svc_jellyfin_jellyfin + 
wl_jellyfin_jellyfin["jellyfin/jellyfin (Deployment)"] + svc_jellyfin_jellyfin --> wl_jellyfin_jellyfin + host_vault_bstein_dev["vault.bstein.dev"] + svc_vaultwarden_vaultwarden_service["vaultwarden/vaultwarden-service (Service)"] + host_vault_bstein_dev --> svc_vaultwarden_vaultwarden_service + wl_vaultwarden_vaultwarden["vaultwarden/vaultwarden (Deployment)"] + svc_vaultwarden_vaultwarden_service --> wl_vaultwarden_vaultwarden + + subgraph bstein_dev_home[bstein-dev-home] + svc_bstein_dev_home_bstein_dev_home_frontend + wl_bstein_dev_home_bstein_dev_home_frontend + svc_bstein_dev_home_bstein_dev_home_backend + wl_bstein_dev_home_bstein_dev_home_backend + svc_bstein_dev_home_chat_ai_gateway + wl_bstein_dev_home_chat_ai_gateway + end + subgraph comms[comms] + svc_comms_matrix_wellknown + wl_comms_matrix_wellknown + svc_comms_element_call + wl_comms_element_call + svc_comms_livekit_token_service + wl_comms_livekit_token_service + svc_comms_livekit + wl_comms_livekit + svc_comms_othrys_element_element_web + wl_comms_othrys_element_element_web + svc_comms_othrys_synapse_matrix_synapse + wl_comms_othrys_synapse_matrix_synapse + svc_comms_matrix_authentication_service + wl_comms_matrix_authentication_service + svc_comms_matrix_guest_register + wl_comms_matrix_guest_register + end + subgraph crypto[crypto] + svc_crypto_monerod + wl_crypto_monerod + end + subgraph gitea[gitea] + svc_gitea_gitea + wl_gitea_gitea + end + subgraph jellyfin[jellyfin] + svc_jellyfin_pegasus + wl_jellyfin_pegasus + svc_jellyfin_jellyfin + wl_jellyfin_jellyfin + end + subgraph jenkins[jenkins] + svc_jenkins_jenkins + wl_jenkins_jenkins + end + subgraph longhorn_system[longhorn-system] + svc_longhorn_system_oauth2_proxy_longhorn + wl_longhorn_system_oauth2_proxy_longhorn + end + subgraph mailu_mailserver[mailu-mailserver] + svc_mailu_mailserver_mailu_front + end + subgraph nextcloud[nextcloud] + svc_nextcloud_nextcloud + wl_nextcloud_nextcloud + svc_nextcloud_collabora + wl_nextcloud_collabora + 
end + subgraph sso[sso] + svc_sso_oauth2_proxy + wl_sso_oauth2_proxy + svc_sso_keycloak + wl_sso_keycloak + end + subgraph vault[vault] + svc_vault_vault + wl_vault_vault + end + subgraph vaultwarden[vaultwarden] + svc_vaultwarden_vaultwarden_service + wl_vaultwarden_vaultwarden + end diff --git a/knowledge/metis.md b/knowledge/metis.md new file mode 100644 index 0000000..5b0d06b --- /dev/null +++ b/knowledge/metis.md @@ -0,0 +1,26 @@ +# Metis (node recovery) + +## Node classes (current map) +- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent) +- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint) +- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks) +- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent) +- amd64 agents: titan-22/24 (Debian 13, k3s agent) +- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, future titan-20/21 (when added), plus any newcomers. + +## Longhorn disk UUIDs (critical nodes) +- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4) +- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4) +- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4) +- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4) + +## Metis repo (~/Development/metis) +- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`). +- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints). +- `AGENTS.md` in repo is untracked and holds raw notes. 
+ +## Next implementation steps +- Add per-class golden image refs and checksums (Harbor or file://) when ready. +- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths. +- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection. +- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited. diff --git a/knowledge/runbooks/ci-gitea-jenkins.md b/knowledge/runbooks/ci-gitea-jenkins.md new file mode 100644 index 0000000..48dc91f --- /dev/null +++ b/knowledge/runbooks/ci-gitea-jenkins.md @@ -0,0 +1,27 @@ +--- +title: "CI: Gitea → Jenkins pipeline" +tags: ["atlas", "ci", "gitea", "jenkins"] +owners: ["brad"] +entrypoints: ["scm.bstein.dev", "ci.bstein.dev"] +source_paths: ["services/gitea", "services/jenkins", "scripts/jenkins_cred_sync.sh", "scripts/gitea_cred_sync.sh"] +--- + +# CI: Gitea → Jenkins pipeline + +## What this is +Atlas uses Gitea for source control and Jenkins for CI. Authentication is via Keycloak (SSO). + +## Where it is configured +- Gitea manifests: `services/gitea/` +- Jenkins manifests: `services/jenkins/` +- Credential sync helpers: `scripts/gitea_cred_sync.sh`, `scripts/jenkins_cred_sync.sh` + +## What users do (typical flow) +- Create a repo in Gitea. +- Create/update a Jenkins job/pipeline that can fetch the repo. +- Configure a webhook (or SCM polling) so pushes trigger builds. + +## Troubleshooting (common) +- “Webhook not firing”: confirm ingress host, webhook URL, and Jenkins job is reachable. +- “Auth denied cloning”: confirm Keycloak group membership and that Jenkins has a valid token/credential configured. 
+ diff --git a/knowledge/runbooks/comms-verify.md b/knowledge/runbooks/comms-verify.md new file mode 100644 index 0000000..8c09d0a --- /dev/null +++ b/knowledge/runbooks/comms-verify.md @@ -0,0 +1,30 @@ +--- +title: Othrys verification checklist +tags: + - comms + - matrix + - element + - livekit +entrypoints: + - https://live.bstein.dev + - https://matrix.live.bstein.dev +--- + +1) Guest join: +- Open a private window and visit: + `https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join` +- Confirm the guest join flow works and the displayname becomes `-`. + +2) Keycloak login: +- Log in from `https://live.bstein.dev` and confirm MAS -> Keycloak -> Element redirect. + +3) Video rooms: +- Start an Element Call room and confirm audio/video with a second account. +- Check that guests can read public rooms but cannot start calls. + +4) Well-known: +- `https://live.bstein.dev/.well-known/matrix/client` returns JSON. +- `https://matrix.live.bstein.dev/.well-known/matrix/client` returns JSON. + +5) TURN reachability: +- Confirm `turn.live.bstein.dev:3478` and `turns:5349` are reachable from WAN. diff --git a/knowledge/runbooks/kb-authoring.md b/knowledge/runbooks/kb-authoring.md new file mode 100644 index 0000000..9378d1d --- /dev/null +++ b/knowledge/runbooks/kb-authoring.md @@ -0,0 +1,34 @@ +--- +title: "KB authoring: what to write (and what not to)" +tags: ["atlas", "kb", "runbooks"] +owners: ["brad"] +entrypoints: [] +source_paths: ["knowledge/runbooks", "scripts/knowledge_render_atlas.py"] +--- + +# KB authoring: what to write (and what not to) + +## The goal +Give Atlas assistants enough grounded, Atlas-specific context to answer “how do I…?” questions without guessing. 
+ +## What to capture (high value) +- User workflows: “click here, set X, expected result” +- Operator workflows: “edit these files, reconcile this kustomization, verify with these commands” +- Wiring: “this host routes to this service; this service depends on Postgres/Vault/etc” +- Failure modes: exact error messages + the 2–5 checks that usually resolve them +- Permissions: Keycloak groups/roles and what they unlock + +## What to avoid (low value / fluff) +- Generic Kubernetes explanations (link to upstream docs instead) +- Copy-pasting large manifests (prefer file paths + small snippets) +- Anything that will drift quickly (render it from GitOps instead) +- Any secret values (reference Secret/Vault locations by name only) + +## Document pattern (recommended) +Each runbook should answer: +- “What is this?” +- “What do users do?” +- “What do operators change (where in Git)?” +- “How do we verify it works?” +- “What breaks and how to debug it?” + diff --git a/knowledge/runbooks/observability.md b/knowledge/runbooks/observability.md new file mode 100644 index 0000000..4c5be6e --- /dev/null +++ b/knowledge/runbooks/observability.md @@ -0,0 +1,26 @@ +--- +title: "Observability: Grafana + VictoriaMetrics (how to query safely)" +tags: ["atlas", "monitoring", "grafana", "victoriametrics"] +owners: ["brad"] +entrypoints: ["metrics.bstein.dev", "alerts.bstein.dev"] +source_paths: ["services/monitoring"] +--- + +# Observability: Grafana + VictoriaMetrics (how to query safely) + +## Where it is configured +- `services/monitoring/helmrelease.yaml` (Grafana + Alertmanager + VM values) +- `services/monitoring/grafana-dashboard-*.yaml` (dashboards and their PromQL) + +## Using metrics as a “tool” for Atlas assistants +The safest pattern is: map a small set of intents → fixed PromQL queries, then summarize results. 
+ +Examples (intents) +- “Is the cluster healthy?” → node readiness + pod restart rate +- “Why is Element Call failing?” → LiveKit/coturn pod restarts + synapse errors + ingress 5xx +- “Is Jenkins slow?” → pod CPU/memory + HTTP latency metrics (if exported) + +## Why dashboards are not the KB +Dashboards are great references, but the assistant should query VictoriaMetrics directly for live answers and keep the +KB focused on wiring, runbooks, and stable conventions. + diff --git a/knowledge/runbooks/template.md b/knowledge/runbooks/template.md new file mode 100644 index 0000000..086c65f --- /dev/null +++ b/knowledge/runbooks/template.md @@ -0,0 +1,18 @@ +--- +title: "" +tags: ["atlas", "", ""] +owners: ["brad"] +entrypoints: [""] +source_paths: ["services/", "clusters/atlas/<...>"] +--- + +# + +## What this is + +## For users (how to) + +## For operators (where configured) + +## Troubleshooting (symptoms → checks) + diff --git a/knowledge/software/metis.md b/knowledge/software/metis.md new file mode 100644 index 0000000..7ca3b39 --- /dev/null +++ b/knowledge/software/metis.md @@ -0,0 +1,73 @@ +# Metis (node recovery) + +## Node classes (current map) +- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent) +- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint) +- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks) +- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent) +- amd64 agents: titan-22/24 (Debian 13, k3s agent) +- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, plus any newcomers. + +### Jetson nodes (titan-20/21) +- Ubuntu 20.04.6 (Focal), kernel 5.10.104-tegra, CRI containerd 2.0.5-k3s2, arch arm64. +- Storage: NVMe 232G at / (ext4); onboard mmc partitions present but root on NVMe; 1.9T sda present (unused). +- k3s agent with drop-in 99-nofile.conf. 
+ +## Longhorn disk UUIDs (critical nodes) +- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4) +- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4) +- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4) +- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4) + +## Metis repo (~/Development/metis) +- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`). +- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints). +- `AGENTS.md` in repo is untracked and holds raw notes. + +## Next implementation steps +- Add per-class golden image refs and checksums (Harbor or file://) when ready. +- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths. +- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection. +- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited. 
+ +## Node OS/Kernel/CRI snapshot (Jan 2026) +- titan-04: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-05: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-06: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-07: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-08: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-09: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-0a: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-0b: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-0c: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-10: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-11: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-12: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-13: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-14: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-15: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-17: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-18: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-19: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-20: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-21: Ubuntu 20.04.6 
LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-22: Debian 13 (trixie), kernel 6.12.41+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64 +- titan-24: Debian 13 (trixie), kernel 6.12.57+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64 + + +### External hosts +- titan-db: Ubuntu 24.10, kernel 6.11.0-1015-raspi, root on /dev/sda2 ext4 (465G), boot vfat /dev/sda1; PostgreSQL service enabled. +- titan-jh: Arch Linux ARM (rolling), kernel 6.18.4-2-rpi, NVMe root ext4 238G (/), boot vfat 512M; ~495 packages installed (pacman -Q). +- titan-23/oceanus: TODO audit (future). + + +### Control plane Pis (titan-0a/0b/0c) +- Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, containerd 2.0.5-k3s2. +- Storage: 477G SSD root (/dev/sda2 ext4), /boot/firmware vfat (/dev/sda1). fstab uses LABEL=writable and LABEL=system-boot. +- k3s server (control-plane taint expected); etcd snapshots not yet cataloged (TODO). + + +## k3s versions +- rpi5 workers/control-plane: k3s v1.33.3+k3s1 (crictl v1.31.0-k3s2) +- rpi4 nodes: k3s v1.31.5+k3s1 (crictl v1.31.0-k3s2) +- Jetson titan-20/21: k3s v1.33.3+k3s1 (per node info), crictl v1.31.0-k3s2 diff --git a/scripts/comms_sync_kb.sh b/scripts/comms_sync_kb.sh new file mode 100755 index 0000000..16f9332 --- /dev/null +++ b/scripts/comms_sync_kb.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +set -euo pipefail + +python scripts/knowledge_render_atlas.py --write +python scripts/knowledge_render_atlas.py --write --out services/comms/knowledge diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 7ad117b..01fe9c7 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -9,6 +9,7 @@ Usage: import argparse import json import textwrap +import urllib.parse from pathlib import Path # --------------------------------------------------------------------------- @@ -45,12 +46,14 @@ PERCENT_THRESHOLDS = { ], } +NAMESPACE_CPU_WINDOW = "1m" + # 
--------------------------------------------------------------------------- # Cluster metadata # --------------------------------------------------------------------------- CONTROL_PLANE_NODES = ["titan-0a", "titan-0b", "titan-0c"] -CONTROL_DEPENDENCIES = ["titan-db"] +CONTROL_DEPENDENCIES = ["titan-db", "titan-jh"] CONTROL_ALL = CONTROL_PLANE_NODES + CONTROL_DEPENDENCIES WORKER_NODES = [ "titan-04", @@ -61,11 +64,12 @@ WORKER_NODES = [ "titan-09", "titan-10", "titan-11", + "titan-20", + "titan-21", "titan-12", "titan-13", "titan-14", "titan-15", - "titan-16", "titan-17", "titan-18", "titan-19", @@ -80,7 +84,22 @@ CONTROL_TOTAL = len(CONTROL_PLANE_NODES) WORKER_TOTAL = len(WORKER_NODES) CONTROL_SUFFIX = f"/{CONTROL_TOTAL}" WORKER_SUFFIX = f"/{WORKER_TOTAL}" -CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system" +# Namespaces considered infrastructure (excluded from workload counts) +INFRA_NAMESPACES = [ + "kube-system", + "longhorn-system", + "metallb-system", + "monitoring", + "logging", + "cert-manager", + "flux-system", + "traefik", + "maintenance", + "postgres", +] +INFRA_REGEX = f"^({'|'.join(INFRA_NAMESPACES)})$" +# Namespaces allowed on control plane without counting as workloads +CP_ALLOWED_NS = INFRA_REGEX LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]" GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4] CONTROL_WORKLOADS_EXPR = ( @@ -170,22 +189,48 @@ def node_io_expr(scope=""): return scoped_node_expr(base, scope) +def namespace_selector(scope_var): + return f'namespace!="",pod!="",container!="",container!="POD",{scope_var}' + + +def namespace_gpu_selector(scope_var): + return f'namespace!="",pod!="",{scope_var}' + + +def namespace_cpu_raw(scope_var): + return ( + "sum(rate(container_cpu_usage_seconds_total" + f"{{{namespace_selector(scope_var)}}}[{NAMESPACE_CPU_WINDOW}])) by (namespace)" + ) + + +def namespace_ram_raw(scope_var): + return f"sum(container_memory_working_set_bytes{{{namespace_selector(scope_var)}}}) by 
(namespace)" + + +def namespace_gpu_usage_instant(scope_var): + return f"sum(DCGM_FI_DEV_GPU_UTIL{{{namespace_gpu_selector(scope_var)}}}) by (namespace)" + + def namespace_share_expr(resource_expr): - selected = f"( {resource_expr} ) and on(namespace) ( {NAMESPACE_TOP_FILTER} )" - total = f"clamp_min(sum( {selected} ), 1)" - return f"100 * ( {selected} ) / {total}" + total = f"clamp_min(sum( {resource_expr} ), 1)" + return f"100 * ( {resource_expr} ) / {total}" -def namespace_cpu_share_expr(): - return namespace_share_expr(NAMESPACE_CPU_RAW) +def namespace_cpu_share_expr(scope_var): + return namespace_share_expr(namespace_cpu_raw(scope_var)) -def namespace_ram_share_expr(): - return namespace_share_expr(NAMESPACE_RAM_RAW) +def namespace_ram_share_expr(scope_var): + return namespace_share_expr(namespace_ram_raw(scope_var)) -def namespace_gpu_share_expr(): - return namespace_share_expr(NAMESPACE_GPU_RAW) +def namespace_gpu_share_expr(scope_var): + usage = namespace_gpu_usage_instant(scope_var) + total = f"(sum({usage}) or on() vector(0))" + share = f"100 * ({usage}) / clamp_min({total}, 1)" + idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)" + return f"({share}) or ({idle})" PROBLEM_PODS_EXPR = ( @@ -270,46 +315,12 @@ STUCK_TABLE_EXPR = ( ")" ) -NAMESPACE_CPU_RAW = ( - 'sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)' -) -NAMESPACE_RAM_RAW = ( - 'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)' -) +NAMESPACE_SCOPE_WORKLOAD = f'namespace!~"{INFRA_REGEX}"' +NAMESPACE_SCOPE_ALL = 'namespace=~".*"' +NAMESPACE_SCOPE_INFRA = f'namespace=~"{INFRA_REGEX}"' +NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"] GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"] GPU_NODE_REGEX = "|".join(GPU_NODES) -NAMESPACE_GPU_ALLOC = ( - 
'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}' - ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)' -) -NAMESPACE_GPU_USAGE_SHARE = ( - 'sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))' -) -NAMESPACE_GPU_USAGE_INSTANT = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)' -NAMESPACE_GPU_RAW = ( - "(" - + NAMESPACE_GPU_USAGE_SHARE - + ") or on(namespace) (" - + NAMESPACE_CPU_RAW - + " * 0)" -) -NAMESPACE_GPU_WEIGHT = ( - "(" - + NAMESPACE_GPU_ALLOC - + ") or on(namespace) (" - + NAMESPACE_CPU_RAW - + " * 0)" -) -NAMESPACE_ACTIVITY_SCORE = ( - "( " - + NAMESPACE_CPU_RAW - + " ) + (" - + NAMESPACE_RAM_RAW - + " / 1e9) + (" - + NAMESPACE_GPU_WEIGHT - + " * 100)" -) -NAMESPACE_TOP_FILTER = "(topk(10, " + NAMESPACE_ACTIVITY_SCORE + ") >= bool 0)" TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" TRAEFIK_NET_INGRESS = ( 'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))' @@ -560,9 +571,9 @@ def table_panel( return panel -def pie_panel(panel_id, title, expr, grid): +def pie_panel(panel_id, title, expr, grid, *, links=None, description=None): """Return a pie chart panel with readable namespace labels.""" - return { + panel = { "id": panel_id, "type": "piechart", "title": title, @@ -586,6 +597,71 @@ def pie_panel(panel_id, title, expr, grid): "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, }, } + if links: + panel["links"] = links + if description: + panel["description"] = description + return panel + + +def namespace_scope_variable(var_name, label): + options = [ + { + "text": "workload namespaces only", + "value": NAMESPACE_SCOPE_WORKLOAD, + "selected": True, + }, + {"text": "all namespaces", "value": NAMESPACE_SCOPE_ALL, "selected": False}, + { + "text": "infrastructure namespaces only", + "value": NAMESPACE_SCOPE_INFRA, + 
"selected": False, + }, + ] + query = ( + "workload namespaces only : " + + NAMESPACE_SCOPE_WORKLOAD + + ",all namespaces : " + + NAMESPACE_SCOPE_ALL + + ",infrastructure namespaces only : " + + NAMESPACE_SCOPE_INFRA + ) + return { + "name": var_name, + "label": label, + "type": "custom", + "query": query, + "current": {"text": options[0]["text"], "value": options[0]["value"], "selected": True}, + "options": options, + "hide": 2, + "multi": False, + "includeAll": False, + "refresh": 1, + "sort": 0, + "skipUrlSync": False, + } + + +def namespace_scope_links(var_name): + def with_value(value): + encoded = urllib.parse.quote(value, safe="") + params = [] + for other in NAMESPACE_SCOPE_VARS: + if other == var_name: + params.append(f"var-{other}={encoded}") + else: + params.append(f"var-{other}=${{{other}}}") + return "?" + "&".join(params) + + return [ + {"title": "Workload namespaces only", "url": with_value(NAMESPACE_SCOPE_WORKLOAD), "targetBlank": False}, + {"title": "All namespaces", "url": with_value(NAMESPACE_SCOPE_ALL), "targetBlank": False}, + { + "title": "Infrastructure namespaces only", + "url": with_value(NAMESPACE_SCOPE_INFRA), + "targetBlank": False, + }, + ] def bargauge_panel( @@ -857,6 +933,115 @@ def build_overview(): ) ) + mail_bounce_rate_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 5}, + {"color": "orange", "value": 8}, + {"color": "red", "value": 10}, + ], + } + mail_limit_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 70}, + {"color": "orange", "value": 85}, + {"color": "red", "value": 95}, + ], + } + mail_success_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "red", "value": None}, + {"color": "orange", "value": 90}, + {"color": "yellow", "value": 95}, + {"color": "green", "value": 98}, + ], + } + panels.append( + stat_panel( + 30, + "Mail Sent (1d)", + 
'max(postmark_outbound_sent{window="1d"})', + {"h": 2, "w": 6, "x": 0, "y": 8}, + unit="none", + links=link_to("atlas-mail"), + ) + ) + panels.append( + { + "id": 31, + "type": "stat", + "title": "Mail Bounces (1d)", + "datasource": PROM_DS, + "gridPos": {"h": 2, "w": 6, "x": 12, "y": 8}, + "targets": [ + { + "expr": 'max(postmark_outbound_bounce_rate{window="1d"})', + "refId": "A", + "legendFormat": "Rate", + }, + { + "expr": 'max(postmark_outbound_bounced{window="1d"})', + "refId": "B", + "legendFormat": "Count", + }, + ], + "fieldConfig": { + "defaults": { + "color": {"mode": "thresholds"}, + "custom": {"displayMode": "auto"}, + "thresholds": mail_bounce_rate_thresholds, + "unit": "none", + }, + "overrides": [ + { + "matcher": {"id": "byName", "options": "Rate"}, + "properties": [{"id": "unit", "value": "percent"}], + }, + { + "matcher": {"id": "byName", "options": "Count"}, + "properties": [{"id": "unit", "value": "none"}], + }, + ], + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, + "textMode": "name_and_value", + }, + "links": link_to("atlas-mail"), + } + ) + panels.append( + stat_panel( + 32, + "Mail Success Rate (1d)", + 'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)', + {"h": 2, "w": 6, "x": 6, "y": 8}, + unit="percent", + thresholds=mail_success_thresholds, + decimals=1, + links=link_to("atlas-mail"), + ) + ) + panels.append( + stat_panel( + 33, + "Mail Limit Used (30d)", + "max(postmark_sending_limit_used_percent)", + {"h": 2, "w": 6, "x": 18, "y": 8}, + unit="percent", + thresholds=mail_limit_thresholds, + decimals=1, + links=link_to("atlas-mail"), + ) + ) + storage_panels = [ (23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"), (24, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), "percent"), @@ -876,28 +1061,38 @@ def build_overview(): ) ) + cpu_scope = "$namespace_scope_cpu" + gpu_scope 
= "$namespace_scope_gpu" + ram_scope = "$namespace_scope_ram" + panels.append( pie_panel( 11, "Namespace CPU Share", - namespace_cpu_share_expr(), + namespace_cpu_share_expr(cpu_scope), {"h": 9, "w": 8, "x": 0, "y": 16}, + links=namespace_scope_links("namespace_scope_cpu"), + description="Values are normalized within the selected scope; use panel links to switch scope.", ) ) panels.append( pie_panel( 12, "Namespace GPU Share", - namespace_gpu_share_expr(), + namespace_gpu_share_expr(gpu_scope), {"h": 9, "w": 8, "x": 8, "y": 16}, + links=namespace_scope_links("namespace_scope_gpu"), + description="Values are normalized within the selected scope; use panel links to switch scope.", ) ) panels.append( pie_panel( 13, "Namespace RAM Share", - namespace_ram_share_expr(), + namespace_ram_share_expr(ram_scope), {"h": 9, "w": 8, "x": 16, "y": 16}, + links=namespace_scope_links("namespace_scope_ram"), + description="Values are normalized within the selected scope; use panel links to switch scope.", ) ) @@ -1052,7 +1247,6 @@ def build_overview(): links=link_to("atlas-storage"), ) ) - return { "uid": "atlas-overview", "title": "Atlas Overview", @@ -1063,7 +1257,13 @@ def build_overview(): "schemaVersion": 39, "style": "dark", "tags": ["atlas", "overview"], - "templating": {"list": []}, + "templating": { + "list": [ + namespace_scope_variable("namespace_scope_cpu", "CPU namespace filter"), + namespace_scope_variable("namespace_scope_gpu", "GPU namespace filter"), + namespace_scope_variable("namespace_scope_ram", "RAM namespace filter"), + ] + }, "time": {"from": "now-1h", "to": "now"}, "refresh": "1m", "links": [], @@ -1513,6 +1713,33 @@ def build_storage_dashboard(): time_from="90d", ) ) + panels.append( + stat_panel( + 30, + "Maintenance Sweepers Ready", + 'kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"} * 
100', + {"h": 4, "w": 12, "x": 0, "y": 44}, + unit="percent", + thresholds=PERCENT_THRESHOLDS, + ) + ) + panels.append( + stat_panel( + 31, + "Maintenance Cron Freshness (s)", + 'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob=~"image-sweeper|grafana-smtp-sync"})', + {"h": 4, "w": 12, "x": 12, "y": 44}, + unit="s", + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 3600}, + {"color": "red", "value": 10800}, + ], + }, + ) + ) return { "uid": "atlas-storage", "title": "Atlas Storage", @@ -1702,21 +1929,231 @@ def build_network_dashboard(): } +def build_mail_dashboard(): + panels = [] + + bounce_rate_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 5}, + {"color": "orange", "value": 8}, + {"color": "red", "value": 10}, + ], + } + limit_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 70}, + {"color": "orange", "value": 85}, + {"color": "red", "value": 95}, + ], + } + success_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "red", "value": None}, + {"color": "orange", "value": 90}, + {"color": "yellow", "value": 95}, + {"color": "green", "value": 98}, + ], + } + + panels.append( + stat_panel( + 1, + "Sent (1d)", + 'max(postmark_outbound_sent{window="1d"})', + {"h": 4, "w": 6, "x": 0, "y": 0}, + decimals=0, + ) + ) + panels.append( + stat_panel( + 2, + "Sent (7d)", + 'max(postmark_outbound_sent{window="7d"})', + {"h": 4, "w": 6, "x": 6, "y": 0}, + decimals=0, + ) + ) + panels.append( + { + "id": 3, + "type": "stat", + "title": "Mail Bounces (1d)", + "datasource": PROM_DS, + "gridPos": {"h": 4, "w": 6, "x": 12, "y": 0}, + "targets": [ + { + "expr": 'max(postmark_outbound_bounce_rate{window="1d"})', + "refId": "A", + "legendFormat": "Rate", + }, + { + "expr": 'max(postmark_outbound_bounced{window="1d"})', + 
"refId": "B", + "legendFormat": "Count", + }, + ], + "fieldConfig": { + "defaults": { + "color": {"mode": "thresholds"}, + "custom": {"displayMode": "auto"}, + "thresholds": bounce_rate_thresholds, + "unit": "none", + }, + "overrides": [ + { + "matcher": {"id": "byName", "options": "Rate"}, + "properties": [{"id": "unit", "value": "percent"}], + }, + { + "matcher": {"id": "byName", "options": "Count"}, + "properties": [{"id": "unit", "value": "none"}], + }, + ], + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, + "textMode": "name_and_value", + }, + } + ) + panels.append( + stat_panel( + 4, + "Success Rate (1d)", + 'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)', + {"h": 4, "w": 6, "x": 18, "y": 0}, + unit="percent", + thresholds=success_thresholds, + decimals=1, + ) + ) + + panels.append( + stat_panel( + 5, + "Limit Used (30d)", + "max(postmark_sending_limit_used_percent)", + {"h": 4, "w": 6, "x": 0, "y": 4}, + thresholds=limit_thresholds, + unit="percent", + decimals=1, + ) + ) + panels.append( + stat_panel( + 6, + "Send Limit (30d)", + "max(postmark_sending_limit)", + {"h": 4, "w": 6, "x": 6, "y": 4}, + decimals=0, + ) + ) + panels.append( + stat_panel( + 7, + "Last Success", + "max(postmark_last_success_timestamp_seconds)", + {"h": 4, "w": 6, "x": 12, "y": 4}, + unit="dateTimeAsIso", + decimals=0, + ) + ) + panels.append( + stat_panel( + 8, + "Exporter Errors", + "sum(postmark_request_errors_total)", + {"h": 4, "w": 6, "x": 18, "y": 4}, + decimals=0, + ) + ) + + panels.append( + timeseries_panel( + 13, + "Bounce Rate (1d vs 7d)", + "max by (window) (postmark_outbound_bounce_rate)", + {"h": 8, "w": 12, "x": 0, "y": 12}, + unit="percent", + legend="{{window}}", + legend_display="table", + legend_placement="right", + ) + ) + panels.append( + timeseries_panel( + 14, + "Bounced (1d vs 7d)", + "max by (window) 
(postmark_outbound_bounced)", + {"h": 8, "w": 12, "x": 12, "y": 12}, + unit="none", + legend="{{window}}", + legend_display="table", + legend_placement="right", + ) + ) + panels.append( + timeseries_panel( + 15, + "Sent (1d vs 7d)", + "max by (window) (postmark_outbound_sent)", + {"h": 8, "w": 12, "x": 0, "y": 20}, + unit="none", + legend="{{window}}", + legend_display="table", + legend_placement="right", + ) + ) + panels.append( + timeseries_panel( + 16, + "Exporter Errors", + "sum(postmark_request_errors_total)", + {"h": 8, "w": 12, "x": 12, "y": 20}, + unit="none", + ) + ) + + return { + "uid": "atlas-mail", + "title": "Atlas Mail", + "folderUid": PRIVATE_FOLDER, + "editable": True, + "panels": panels, + "time": {"from": "now-30d", "to": "now"}, + "annotations": {"list": []}, + "schemaVersion": 39, + "style": "dark", + "tags": ["atlas", "mail"], + } + + def build_gpu_dashboard(): panels = [] + gpu_scope = "$namespace_scope_gpu" panels.append( pie_panel( 1, "Namespace GPU Share", - namespace_gpu_share_expr(), + namespace_gpu_share_expr(gpu_scope), {"h": 8, "w": 12, "x": 0, "y": 0}, + links=namespace_scope_links("namespace_scope_gpu"), + description="Values are normalized within the selected scope; use panel links to switch scope.", ) ) panels.append( timeseries_panel( 2, "GPU Util by Namespace", - NAMESPACE_GPU_USAGE_INSTANT, + namespace_gpu_usage_instant(gpu_scope), {"h": 8, "w": 12, "x": 12, "y": 0}, unit="percent", legend="{{namespace}}", @@ -1757,6 +2194,13 @@ def build_gpu_dashboard(): "schemaVersion": 39, "style": "dark", "tags": ["atlas", "gpu"], + "templating": { + "list": [ + namespace_scope_variable("namespace_scope_cpu", "CPU namespace filter"), + namespace_scope_variable("namespace_scope_gpu", "GPU namespace filter"), + namespace_scope_variable("namespace_scope_ram", "RAM namespace filter"), + ] + }, } @@ -1781,6 +2225,10 @@ DASHBOARDS = { "builder": build_network_dashboard, "configmap": ROOT / "services" / "monitoring" / 
"grafana-dashboard-network.yaml", }, + "atlas-mail": { + "builder": build_mail_dashboard, + "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-mail.yaml", + }, "atlas-gpu": { "builder": build_gpu_dashboard, "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gpu.yaml", diff --git a/scripts/dashboards_render_logs.py b/scripts/dashboards_render_logs.py new file mode 100755 index 0000000..48b592d --- /dev/null +++ b/scripts/dashboards_render_logs.py @@ -0,0 +1,445 @@ +#!/usr/bin/env python3 +"""Generate OpenSearch Dashboards saved objects and render them into ConfigMaps. + +Usage: + scripts/dashboards_render_logs.py --build # rebuild NDJSON + ConfigMap + scripts/dashboards_render_logs.py # re-render ConfigMap from NDJSON +""" + +from __future__ import annotations + +import argparse +import json +import textwrap +from dataclasses import dataclass +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +DASHBOARD_DIR = ROOT / "services" / "logging" / "dashboards" +NDJSON_PATH = DASHBOARD_DIR / "logs.ndjson" +CONFIG_PATH = ROOT / "services" / "logging" / "opensearch-dashboards-objects.yaml" + +CONFIG_TEMPLATE = textwrap.dedent( + """# {relative_path} +# Generated by scripts/dashboards_render_logs.py --build +apiVersion: v1 +kind: ConfigMap +metadata: + name: opensearch-dashboards-objects + namespace: logging +data: + objects.ndjson: | +{payload} +""" +) + +DASHBOARD_VERSION = "7.10.0" +GRID_COLUMNS = 48 +H_CHART = 10 +H_ERRORS = 8 +H_TABLE = 16 +H_SEARCH = 18 +TABLE_SIZE = 15 +TABLE_PER_PAGE = 15 + +ERROR_TERMS = ("*error*", "*exception*", "*fail*") + + +@dataclass(frozen=True) +class AppSpec: + slug: str + title: str + query: str + index_id: str = "kube-logs" + kind: str = "kube" + + +def error_query(base: str | None = None) -> str: + parts = [f'(log : "{term}" or message : "{term}")' for term in ERROR_TERMS] + expr = " or ".join(parts) + if base: + return f"({base}) and ({expr})" + return f"({expr})" + + +def 
json_line(obj: dict) -> str: + return json.dumps(obj, separators=(",", ":")) + + +def search_source(query: str) -> dict: + return { + "query": {"language": "kuery", "query": query}, + "filter": [], + "indexRefName": "kibanaSavedObjectMeta.searchSourceJSON.index", + } + + +def index_pattern(object_id: str, title: str, time_field: str = "@timestamp") -> dict: + return { + "type": "index-pattern", + "id": object_id, + "attributes": {"title": title, "timeFieldName": time_field}, + } + + +def histogram_vis(object_id: str, title: str, query: str, index_id: str) -> dict: + vis_state = { + "title": title, + "type": "histogram", + "aggs": [ + {"id": "1", "enabled": True, "type": "count", "schema": "metric"}, + { + "id": "2", + "enabled": True, + "type": "date_histogram", + "schema": "segment", + "params": {"field": "@timestamp", "interval": "auto", "min_doc_count": 1}, + }, + ], + "params": {"addTooltip": True, "addLegend": False, "scale": "linear", "interpolate": "linear"}, + } + return { + "type": "visualization", + "id": object_id, + "attributes": { + "title": title, + "visState": json.dumps(vis_state, separators=(",", ":")), + "uiStateJSON": "{}", + "description": "", + "version": 1, + "kibanaSavedObjectMeta": { + "searchSourceJSON": json.dumps(search_source(query), separators=(",", ":")) + }, + }, + "references": [ + { + "name": "kibanaSavedObjectMeta.searchSourceJSON.index", + "type": "index-pattern", + "id": index_id, + } + ], + } + + +def table_vis(object_id: str, title: str, field: str, query: str, index_id: str) -> dict: + vis_state = { + "title": title, + "type": "table", + "aggs": [ + {"id": "1", "enabled": True, "type": "count", "schema": "metric"}, + { + "id": "2", + "enabled": True, + "type": "terms", + "schema": "bucket", + "params": {"field": field, "size": TABLE_SIZE, "order": "desc", "orderBy": "1"}, + }, + ], + "params": { + "perPage": TABLE_PER_PAGE, + "showPartialRows": False, + "showMetricsAtAllLevels": False, + "sort": {"columnIndex": 1, "direction": 
"desc"}, + }, + } + return { + "type": "visualization", + "id": object_id, + "attributes": { + "title": title, + "visState": json.dumps(vis_state, separators=(",", ":")), + "uiStateJSON": "{}", + "description": "", + "version": 1, + "kibanaSavedObjectMeta": { + "searchSourceJSON": json.dumps(search_source(query), separators=(",", ":")) + }, + }, + "references": [ + { + "name": "kibanaSavedObjectMeta.searchSourceJSON.index", + "type": "index-pattern", + "id": index_id, + } + ], + } + + +def search_object(object_id: str, title: str, columns: list[str], query: str, index_id: str) -> dict: + return { + "type": "search", + "id": object_id, + "attributes": { + "title": title, + "description": "", + "columns": columns, + "sort": [["@timestamp", "desc"]], + "kibanaSavedObjectMeta": { + "searchSourceJSON": json.dumps(search_source(query), separators=(",", ":")) + }, + }, + "references": [ + { + "name": "kibanaSavedObjectMeta.searchSourceJSON.index", + "type": "index-pattern", + "id": index_id, + } + ], + } + + +def grid(x: int, y: int, w: int, h: int, i: int) -> dict: + return {"x": x, "y": y, "w": w, "h": h, "i": str(i)} + + +def panel(panel_id: str, panel_type: str, grid_data: dict, index: int) -> dict: + return { + "panelIndex": str(index), + "gridData": grid_data, + "id": panel_id, + "type": panel_type, + "version": DASHBOARD_VERSION, + "embeddableConfig": {}, + } + + +def full_width_panels(specs: list[tuple[str, str, int]]) -> list[dict]: + panels = [] + y = 0 + for index, (panel_id, panel_type, height) in enumerate(specs, start=1): + panels.append(panel(panel_id, panel_type, grid(0, y, GRID_COLUMNS, height, index), index)) + y += height + return panels + + +def dashboard_object(object_id: str, title: str, panels: list[dict]) -> dict: + return { + "type": "dashboard", + "id": object_id, + "attributes": { + "title": title, + "description": "", + "hits": 0, + "panelsJSON": json.dumps(panels, separators=(",", ":")), + "optionsJSON": json.dumps({"useMargins": True, 
"hidePanelTitles": False}, separators=(",", ":")), + "version": 1, + "timeRestore": False, + "kibanaSavedObjectMeta": { + "searchSourceJSON": json.dumps({"query": {"language": "kuery", "query": ""}, "filter": []}) + }, + }, + } + + +def app_dashboard_objects(app: AppSpec) -> list[dict]: + prefix = f"logs-{app.slug}" + objects = [] + + if app.kind == "journald": + columns = ["@timestamp", "_HOSTNAME", "_SYSTEMD_UNIT", "MESSAGE"] + objects.append(histogram_vis(f"{prefix}-volume", f"{app.title} logs", app.query, app.index_id)) + objects.append(histogram_vis(f"{prefix}-errors", f"{app.title} errors", error_query(app.query), app.index_id)) + objects.append(table_vis(f"{prefix}-top-units", "Top units", "_SYSTEMD_UNIT.keyword", app.query, app.index_id)) + objects.append(search_object(f"{prefix}-recent", "Recent logs", columns, app.query, app.index_id)) + objects.append( + search_object( + f"{prefix}-recent-errors", + "Recent errors", + columns, + error_query(app.query), + app.index_id, + ) + ) + panels = full_width_panels( + [ + (f"{prefix}-volume", "visualization", H_CHART), + (f"{prefix}-errors", "visualization", H_ERRORS), + (f"{prefix}-top-units", "visualization", H_TABLE), + (f"{prefix}-recent", "search", H_SEARCH), + (f"{prefix}-recent-errors", "search", H_SEARCH), + ] + ) + objects.append(dashboard_object(prefix, f"{app.title} Logs", panels)) + return objects + + columns = ["@timestamp", "kubernetes.pod_name", "kubernetes.container_name", "log", "message"] + objects.append(histogram_vis(f"{prefix}-volume", f"{app.title} logs", app.query, app.index_id)) + objects.append(histogram_vis(f"{prefix}-errors", f"{app.title} errors", error_query(app.query), app.index_id)) + objects.append(table_vis(f"{prefix}-top-pods", "Top pods", "kubernetes.pod_name.keyword", app.query, app.index_id)) + objects.append( + table_vis(f"{prefix}-top-containers", "Top containers", "kubernetes.container_name.keyword", app.query, app.index_id) + ) + 
objects.append(search_object(f"{prefix}-recent", "Recent logs", columns, app.query, app.index_id)) + objects.append( + search_object( + f"{prefix}-recent-errors", + "Recent errors", + columns, + error_query(app.query), + app.index_id, + ) + ) + panels = full_width_panels( + [ + (f"{prefix}-volume", "visualization", H_CHART), + (f"{prefix}-errors", "visualization", H_ERRORS), + (f"{prefix}-top-pods", "visualization", H_TABLE), + (f"{prefix}-top-containers", "visualization", H_TABLE), + (f"{prefix}-recent", "search", H_SEARCH), + (f"{prefix}-recent-errors", "search", H_SEARCH), + ] + ) + objects.append(dashboard_object(prefix, f"{app.title} Logs", panels)) + return objects + + +def overview_objects() -> list[dict]: + objects = [] + objects.append(histogram_vis("logs-overview-volume", "Logs per minute", "*", "kube-logs")) + objects.append(histogram_vis("logs-overview-errors", "Errors per minute", error_query(), "kube-logs")) + objects.append( + table_vis( + "logs-overview-top-ns", + "Top namespaces", + "kubernetes.namespace_name.keyword", + "*", + "kube-logs", + ) + ) + objects.append( + table_vis( + "logs-overview-top-error-ns", + "Top error namespaces", + "kubernetes.namespace_name.keyword", + error_query(), + "kube-logs", + ) + ) + objects.append(table_vis("logs-overview-top-pods", "Top pods", "kubernetes.pod_name.keyword", "*", "kube-logs")) + objects.append( + table_vis( + "logs-overview-top-nodes", + "Top nodes", + "kubernetes.node_name.keyword", + "*", + "kube-logs", + ) + ) + objects.append( + search_object( + "logs-overview-recent-errors", + "Recent errors", + ["@timestamp", "kubernetes.namespace_name", "kubernetes.pod_name", "log", "message"], + error_query(), + "kube-logs", + ) + ) + panels = full_width_panels( + [ + ("logs-overview-volume", "visualization", H_CHART), + ("logs-overview-errors", "visualization", H_ERRORS), + ("logs-overview-top-ns", "visualization", H_TABLE), + ("logs-overview-top-error-ns", "visualization", H_TABLE), + 
("logs-overview-top-pods", "visualization", H_TABLE), + ("logs-overview-top-nodes", "visualization", H_TABLE), + ("logs-overview-recent-errors", "search", H_SEARCH), + ] + ) + objects.append(dashboard_object("logs-overview", "Atlas Logs Overview", panels)) + return objects + + +def build_objects() -> list[dict]: + objects = [ + index_pattern("kube-logs", "kube-*"), + index_pattern("journald-logs", "journald-*"), + ] + + objects.extend(overview_objects()) + + apps = [ + AppSpec("bstein-dev-home", "bstein-dev-home", 'kubernetes.namespace_name: "bstein-dev-home"'), + AppSpec( + "pegasus", + "pegasus", + 'kubernetes.namespace_name: "jellyfin" and kubernetes.labels.app: "pegasus"', + ), + AppSpec( + "jellyfin", + "jellyfin", + 'kubernetes.namespace_name: "jellyfin" and kubernetes.labels.app: "jellyfin"', + ), + AppSpec("vaultwarden", "vaultwarden", 'kubernetes.namespace_name: "vaultwarden"'), + AppSpec("mailu", "mailu", 'kubernetes.namespace_name: "mailu-mailserver"'), + AppSpec("nextcloud", "nextcloud", 'kubernetes.namespace_name: "nextcloud"'), + AppSpec("gitea", "gitea", 'kubernetes.namespace_name: "gitea"'), + AppSpec("jenkins", "jenkins", 'kubernetes.namespace_name: "jenkins"'), + AppSpec("harbor", "harbor", 'kubernetes.namespace_name: "harbor"'), + AppSpec("vault", "vault", 'kubernetes.namespace_name: "vault"'), + AppSpec("keycloak", "keycloak", 'kubernetes.namespace_name: "sso"'), + AppSpec("flux-system", "flux-system", 'kubernetes.namespace_name: "flux-system"'), + AppSpec("comms", "comms", 'kubernetes.namespace_name: "comms"'), + AppSpec( + "element-web", + "element-web", + 'kubernetes.namespace_name: "comms" and kubernetes.container_name: "element-web"', + ), + AppSpec( + "element-call", + "element-call", + 'kubernetes.namespace_name: "comms" and kubernetes.labels.app: "element-call"', + ), + AppSpec( + "matrix-synapse", + "matrix-synapse", + 'kubernetes.namespace_name: "comms" and kubernetes.container_name: "synapse"', + ), + AppSpec( + "livekit", + 
"livekit", + 'kubernetes.namespace_name: "comms" and kubernetes.labels.app: "livekit"', + ), + AppSpec( + "coturn", + "coturn", + 'kubernetes.namespace_name: "comms" and kubernetes.labels.app: "coturn"', + ), + AppSpec("lesavka", "lesavka", '_HOSTNAME: "titan-jh"', index_id="journald-logs", kind="journald"), + ] + + for app in apps: + objects.extend(app_dashboard_objects(app)) + + return objects + + +def write_ndjson(objects: list[dict], path: Path) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + payload = "\n".join(json_line(obj) for obj in objects) + path.write_text(payload + "\n") + + +def render_configmap(ndjson_path: Path, output_path: Path) -> None: + payload_lines = ndjson_path.read_text().splitlines() + payload = "\n".join(" " + line for line in payload_lines) + relative_path = output_path.relative_to(ROOT) + output_path.write_text(CONFIG_TEMPLATE.format(relative_path=relative_path, payload=payload)) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--build", action="store_true", help="Regenerate saved object NDJSON and ConfigMap") + args = parser.parse_args() + + if args.build: + objects = build_objects() + write_ndjson(objects, NDJSON_PATH) + + if not NDJSON_PATH.exists(): + raise SystemExit(f"Missing NDJSON file: {NDJSON_PATH}. Run with --build first.") + + render_configmap(NDJSON_PATH, CONFIG_PATH) + + +if __name__ == "__main__": + main() diff --git a/scripts/knowledge_render_atlas.py b/scripts/knowledge_render_atlas.py new file mode 100644 index 0000000..50ac84c --- /dev/null +++ b/scripts/knowledge_render_atlas.py @@ -0,0 +1,554 @@ +#!/usr/bin/env python3 +"""Render Atlas knowledge artifacts from Flux + kustomize manifests. 
+ +Outputs (committed to git for stable diffs + RAG): +- knowledge/catalog/*.yaml +- knowledge/diagrams/*.mmd + +This is intentionally conservative: +- never includes Secret objects +- never includes secret values +- keeps output deterministic (sorted) +""" + +from __future__ import annotations + +import argparse +import json +import re +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Iterable + +import yaml + +REPO_ROOT = Path(__file__).resolve().parents[1] + +CLUSTER_SCOPED_KINDS = { + "Namespace", + "Node", + "CustomResourceDefinition", + "ClusterRole", + "ClusterRoleBinding", + "StorageClass", + "PersistentVolume", + "MutatingWebhookConfiguration", + "ValidatingWebhookConfiguration", + "APIService", +} + +INCLUDED_KINDS = { + "Namespace", + "Deployment", + "StatefulSet", + "DaemonSet", + "Service", + "Ingress", + "IngressRoute", # traefik + "HelmRelease", # only to harvest ingress hostnames from values +} + + +def _run(cmd: list[str], *, cwd: Path) -> str: + res = subprocess.run(cmd, cwd=cwd, capture_output=True, text=True, check=False) + if res.returncode != 0: + raise RuntimeError( + f"Command failed ({res.returncode}): {' '.join(cmd)}\n{res.stderr.strip()}" + ) + return res.stdout + + +def kustomize_build(path: Path) -> str: + rel = path.relative_to(REPO_ROOT) + try: + return _run(["kubectl", "kustomize", str(rel)], cwd=REPO_ROOT) + except Exception as e: + msg = str(e) + if "is not in or below" in msg: + # Repo uses configMapGenerators that reference ../../scripts/*.py. + # Kustomize load restriction must be disabled for a full render. 
+ try: + return _run( + ["kubectl", "kustomize", "--load-restrictor=LoadRestrictionsNone", str(rel)], + cwd=REPO_ROOT, + ) + except Exception: + pass + return _run(["kustomize", "build", "--load-restrictor=LoadRestrictionsNone", str(rel)], cwd=REPO_ROOT) + + +def _iter_docs(raw_yaml: str) -> Iterable[dict[str, Any]]: + for doc in yaml.safe_load_all(raw_yaml): + if not isinstance(doc, dict): + continue + kind = doc.get("kind") + if kind == "List" and isinstance(doc.get("items"), list): + for item in doc["items"]: + if isinstance(item, dict): + yield item + continue + if kind: + yield doc + + +def _meta(doc: dict[str, Any]) -> tuple[str, str | None]: + md = doc.get("metadata") or {} + name = md.get("name") or "" + namespace = md.get("namespace") + return name, namespace + + +def _is_namespaced(doc: dict[str, Any]) -> bool: + kind = doc.get("kind") or "" + return kind not in CLUSTER_SCOPED_KINDS + + +@dataclass(frozen=True) +class FluxKustomization: + name: str + path: str + target_namespace: str | None + + +def find_flux_kustomizations() -> list[FluxKustomization]: + """Find Flux Kustomization CRs under clusters/atlas/flux-system.""" + root = REPO_ROOT / "clusters" / "atlas" / "flux-system" + items: list[FluxKustomization] = [] + for file in sorted(root.rglob("*.yaml")): + raw = file.read_text() + for doc in _iter_docs(raw): + if doc.get("kind") != "Kustomization": + continue + api = str(doc.get("apiVersion") or "") + if not api.startswith("kustomize.toolkit.fluxcd.io/"): + continue + name, _ = _meta(doc) + spec = doc.get("spec") or {} + path = spec.get("path") + if not isinstance(path, str) or not path.strip(): + continue + items.append( + FluxKustomization( + name=name, + path=path.strip().lstrip("./"), + target_namespace=spec.get("targetNamespace"), + ) + ) + return sorted(items, key=lambda k: k.name) + + +def _safe_string_scan_for_hosts(value: Any) -> set[str]: + """Best-effort host scan from HelmRelease values without chart rendering.""" + hosts: set[str] = 
set() + if isinstance(value, str): + for m in re.finditer(r"(?i)([a-z0-9-]+(?:\.[a-z0-9-]+)+)", value): + host = m.group(1).lower() + if host.endswith("bstein.dev"): + hosts.add(host) + return hosts + if isinstance(value, list): + for item in value: + hosts |= _safe_string_scan_for_hosts(item) + return hosts + if isinstance(value, dict): + for item in value.values(): + hosts |= _safe_string_scan_for_hosts(item) + return hosts + return hosts + + +def _service_ports(svc: dict[str, Any]) -> list[dict[str, Any]]: + spec = svc.get("spec") or {} + out: list[dict[str, Any]] = [] + for p in spec.get("ports") or []: + if not isinstance(p, dict): + continue + out.append( + { + "name": p.get("name"), + "port": p.get("port"), + "targetPort": p.get("targetPort"), + "protocol": p.get("protocol", "TCP"), + } + ) + return out + + +def _workload_labels(doc: dict[str, Any]) -> dict[str, str]: + tpl = (doc.get("spec") or {}).get("template") or {} + md = tpl.get("metadata") or {} + labels = md.get("labels") or {} + return {str(k): str(v) for k, v in labels.items()} if isinstance(labels, dict) else {} + + +def _service_selector(doc: dict[str, Any]) -> dict[str, str]: + spec = doc.get("spec") or {} + sel = spec.get("selector") or {} + return {str(k): str(v) for k, v in sel.items()} if isinstance(sel, dict) else {} + + +def _selector_matches(selector: dict[str, str], labels: dict[str, str]) -> bool: + if not selector: + return False + return all(labels.get(k) == v for k, v in selector.items()) + + +def _sanitize_node_id(text: str) -> str: + return re.sub(r"[^a-zA-Z0-9_]", "_", text) + + +def extract_catalog( + rendered: list[tuple[FluxKustomization, list[dict[str, Any]]]], +) -> tuple[dict[str, Any], dict[str, Any], str]: + """Build knowledge catalog + mermaid diagram from rendered docs.""" + # Index workloads and services for mapping. 
+ workloads: dict[tuple[str, str], dict[str, Any]] = {} + services: dict[tuple[str, str], dict[str, Any]] = {} + ingresses: list[dict[str, Any]] = [] + ingressroutes: list[dict[str, Any]] = [] + helmrelease_hosts: dict[str, list[str]] = {} + + for src, docs in rendered: + for doc in docs: + kind = doc.get("kind") + if kind not in INCLUDED_KINDS: + continue + if kind == "Secret": + continue + + name, namespace = _meta(doc) + if _is_namespaced(doc) and not namespace and src.target_namespace: + namespace = src.target_namespace + doc = dict(doc) + doc.setdefault("metadata", {})["namespace"] = namespace + + if kind in ("Deployment", "StatefulSet", "DaemonSet"): + workloads[(namespace or "", name)] = { + "kind": kind, + "namespace": namespace or "", + "name": name, + "labels": _workload_labels(doc), + "serviceAccountName": ((doc.get("spec") or {}).get("template") or {}) + .get("spec", {}) + .get("serviceAccountName"), + "nodeSelector": ((doc.get("spec") or {}).get("template") or {}) + .get("spec", {}) + .get("nodeSelector", {}), + "images": sorted( + { + c.get("image") + for c in ( + (((doc.get("spec") or {}).get("template") or {}).get("spec") or {}).get( + "containers" + ) + or [] + ) + if isinstance(c, dict) and c.get("image") + } + ), + } + elif kind == "Service": + services[(namespace or "", name)] = { + "namespace": namespace or "", + "name": name, + "type": (doc.get("spec") or {}).get("type", "ClusterIP"), + "selector": _service_selector(doc), + "ports": _service_ports(doc), + } + elif kind == "Ingress": + ingresses.append({"source": src.name, "doc": doc}) + elif kind == "IngressRoute": + ingressroutes.append({"source": src.name, "doc": doc}) + elif kind == "HelmRelease": + spec = doc.get("spec") or {} + vals = spec.get("values") or {} + hosts = sorted(_safe_string_scan_for_hosts(vals)) + if hosts: + helmrelease_hosts[f"{src.name}:{namespace or ''}/{name}"] = hosts + + # Map services to workloads. 
+ service_to_workloads: dict[tuple[str, str], list[dict[str, str]]] = {} + for (ns, svc_name), svc in services.items(): + selector = svc.get("selector") or {} + matches: list[dict[str, str]] = [] + for (w_ns, w_name), w in workloads.items(): + if w_ns != ns: + continue + if _selector_matches(selector, w.get("labels") or {}): + matches.append({"kind": w["kind"], "name": w_name}) + service_to_workloads[(ns, svc_name)] = sorted(matches, key=lambda m: (m["kind"], m["name"])) + + # Extract HTTP endpoints. + endpoints: list[dict[str, Any]] = [] + + def add_endpoint( + *, + host: str, + path: str, + namespace: str, + service: str, + port: Any, + source: str, + kind: str, + obj_name: str, + ): + wk = service_to_workloads.get((namespace, service), []) + endpoints.append( + { + "host": host, + "path": path, + "backend": { + "namespace": namespace, + "service": service, + "port": port, + "workloads": wk, + }, + "via": {"kind": kind, "name": obj_name, "source": source}, + } + ) + + for item in ingresses: + doc = item["doc"] + source = item["source"] + name, namespace = _meta(doc) + namespace = namespace or "" + spec = doc.get("spec") or {} + for rule in spec.get("rules") or []: + if not isinstance(rule, dict): + continue + host = (rule.get("host") or "").strip() + http = rule.get("http") or {} + for p in http.get("paths") or []: + if not isinstance(p, dict): + continue + backend = (p.get("backend") or {}).get("service") or {} + svc_name = backend.get("name") + svc_port = (backend.get("port") or {}).get("number") or (backend.get("port") or {}).get("name") + if not host or not svc_name: + continue + add_endpoint( + host=host, + path=p.get("path") or "/", + namespace=namespace, + service=svc_name, + port=svc_port, + source=source, + kind="Ingress", + obj_name=name, + ) + + host_re = re.compile(r"Host\(`([^`]+)`\)") + pathprefix_re = re.compile(r"PathPrefix\(`([^`]+)`\)") + for item in ingressroutes: + doc = item["doc"] + source = item["source"] + name, namespace = _meta(doc) + 
namespace = namespace or "" + spec = doc.get("spec") or {} + for route in spec.get("routes") or []: + if not isinstance(route, dict): + continue + match = route.get("match") or "" + hosts = host_re.findall(match) + pathprefixes = pathprefix_re.findall(match) or ["/"] + for svc in route.get("services") or []: + if not isinstance(svc, dict): + continue + svc_name = svc.get("name") + svc_port = svc.get("port") + if not svc_name: + continue + for host in hosts: + for pp in pathprefixes: + add_endpoint( + host=host, + path=pp, + namespace=namespace, + service=svc_name, + port=svc_port, + source=source, + kind="IngressRoute", + obj_name=name, + ) + + endpoints = sorted( + endpoints, + key=lambda e: ( + e["host"], + e["path"], + e["backend"]["namespace"], + e["backend"]["service"], + ), + ) + + catalog = { + "cluster": "atlas", + "sources": [ + {"name": k.name, "path": k.path, "targetNamespace": k.target_namespace} + for k, _ in rendered + ], + "workloads": sorted( + list(workloads.values()), + key=lambda w: (w["namespace"], w["kind"], w["name"]), + ), + "services": sorted( + list(services.values()), + key=lambda s: (s["namespace"], s["name"]), + ), + "http_endpoints": endpoints, + "helmrelease_host_hints": {k: v for k, v in sorted(helmrelease_hosts.items())}, + } + + # Mermaid diagram: host -> service -> workload (grouped by namespace). 
+ ns_nodes: dict[str, list[str]] = {} + lines: list[str] = ["flowchart LR"] + edges: set[tuple[str, str]] = set() + + def ensure_ns_node(ns: str, node_id: str): + ns_nodes.setdefault(ns, []) + if node_id not in ns_nodes[ns]: + ns_nodes[ns].append(node_id) + + host_nodes: dict[str, str] = {} + + for ep in endpoints: + host = ep["host"] + host_id = host_nodes.get(host) + if not host_id: + host_id = f"host_{_sanitize_node_id(host)}" + host_nodes[host] = host_id + lines.append(f' {host_id}["{host}"]') + + ns = ep["backend"]["namespace"] + svc = ep["backend"]["service"] + svc_id = f"svc_{_sanitize_node_id(ns)}_{_sanitize_node_id(svc)}" + if svc_id not in ns_nodes.get(ns, []): + lines.append(f' {svc_id}["{ns}/{svc} (Service)"]') + ensure_ns_node(ns, svc_id) + + if (host_id, svc_id) not in edges: + edges.add((host_id, svc_id)) + lines.append(f" {host_id} --> {svc_id}") + + for w in ep["backend"]["workloads"]: + w_id = f"wl_{_sanitize_node_id(ns)}_{_sanitize_node_id(w['name'])}" + if w_id not in ns_nodes.get(ns, []): + lines.append(f' {w_id}["{ns}/{w["name"]} ({w["kind"]})"]') + ensure_ns_node(ns, w_id) + if (svc_id, w_id) not in edges: + edges.add((svc_id, w_id)) + lines.append(f" {svc_id} --> {w_id}") + + # Wrap namespace subgraphs at the end for stability (sorted namespaces). 
+ if ns_nodes: + lines.append("") + for ns in sorted(ns_nodes.keys()): + lines.append(f" subgraph { _sanitize_node_id(ns) }[{ns}]") + for node_id in ns_nodes[ns]: + lines.append(f" {node_id}") + lines.append(" end") + + diagram = "\n".join(lines).rstrip() + "\n" + + summary = { + "counts": { + "workloads": len(workloads), + "services": len(services), + "http_endpoints": len(endpoints), + "helmrelease_host_hints": sum(len(v) for v in helmrelease_hosts.values()), + } + } + + return catalog, summary, diagram + + +def main() -> int: + ap = argparse.ArgumentParser() + ap.add_argument("--out", default="knowledge", help="Output base directory (default: knowledge/)") + ap.add_argument( + "--write", + action="store_true", + help="Write generated files (otherwise just print a summary).", + ) + args = ap.parse_args() + + out_dir = REPO_ROOT / args.out + flux = find_flux_kustomizations() + if not flux: + print("No Flux Kustomizations found under clusters/atlas/flux-system.", file=sys.stderr) + return 2 + + rendered: list[tuple[FluxKustomization, list[dict[str, Any]]]] = [] + for k in flux: + path = REPO_ROOT / k.path + if not path.exists(): + continue + raw = kustomize_build(path) + docs = [d for d in _iter_docs(raw) if d.get("kind") != "Secret"] + rendered.append((k, docs)) + + rendered = sorted(rendered, key=lambda item: item[0].name) + catalog, summary, diagram = extract_catalog(rendered) + + if not args.write: + print(json.dumps(summary, indent=2, sort_keys=True)) + return 0 + + (out_dir / "catalog").mkdir(parents=True, exist_ok=True) + (out_dir / "diagrams").mkdir(parents=True, exist_ok=True) + + catalog_path = out_dir / "catalog" / "atlas.yaml" + catalog_json_path = out_dir / "catalog" / "atlas.json" + summary_path = out_dir / "catalog" / "atlas-summary.json" + diagram_path = out_dir / "diagrams" / "atlas-http.mmd" + runbooks_json_path = out_dir / "catalog" / "runbooks.json" + + catalog_path.write_text( + "# Generated by scripts/knowledge_render_atlas.py (do not edit by 
hand)\n" + + yaml.safe_dump(catalog, sort_keys=False), + encoding="utf-8", + ) + catalog_json_path.write_text(json.dumps(catalog, indent=2, sort_keys=False) + "\n", encoding="utf-8") + summary_path.write_text(json.dumps(summary, indent=2, sort_keys=True) + "\n", encoding="utf-8") + diagram_path.write_text(diagram, encoding="utf-8") + + # Render runbooks into JSON for lightweight, dependency-free consumption in-cluster. + runbooks_dir = out_dir / "runbooks" + runbooks: list[dict[str, Any]] = [] + if runbooks_dir.exists(): + for md_file in sorted(runbooks_dir.glob("*.md")): + raw = md_file.read_text(encoding="utf-8") + fm: dict[str, Any] = {} + body = raw + if raw.startswith("---\n"): + try: + _, rest = raw.split("---\n", 1) + fm_raw, body = rest.split("\n---\n", 1) + fm = yaml.safe_load(fm_raw) or {} + except Exception: + fm = {} + body = raw + runbooks.append( + { + "path": str(md_file.relative_to(out_dir)), + "title": fm.get("title") or md_file.stem, + "tags": fm.get("tags") or [], + "entrypoints": fm.get("entrypoints") or [], + "source_paths": fm.get("source_paths") or [], + "body": body.strip(), + } + ) + runbooks_json_path.write_text(json.dumps(runbooks, indent=2, sort_keys=False) + "\n", encoding="utf-8") + + print(f"Wrote {catalog_path.relative_to(REPO_ROOT)}") + print(f"Wrote {catalog_json_path.relative_to(REPO_ROOT)}") + print(f"Wrote {summary_path.relative_to(REPO_ROOT)}") + print(f"Wrote {diagram_path.relative_to(REPO_ROOT)}") + print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/logging_render_observability.py b/scripts/logging_render_observability.py new file mode 100755 index 0000000..679e340 --- /dev/null +++ b/scripts/logging_render_observability.py @@ -0,0 +1,313 @@ +#!/usr/bin/env python3 +"""Generate OpenSearch Observability seed objects and render them into ConfigMaps. 
+ +Usage: + scripts/logging_render_observability.py --build # rebuild JSON + ConfigMap + scripts/logging_render_observability.py # re-render ConfigMap from JSON +""" + +from __future__ import annotations + +import argparse +import json +import textwrap +from dataclasses import dataclass +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +OBS_DIR = ROOT / "services" / "logging" / "observability" +APPS_PATH = OBS_DIR / "applications.json" +QUERIES_PATH = OBS_DIR / "saved_queries.json" +VIS_PATH = OBS_DIR / "saved_visualizations.json" +CONFIG_PATH = ROOT / "services" / "logging" / "opensearch-observability-objects.yaml" + +CONFIG_TEMPLATE = textwrap.dedent( + """# {relative_path} +# Generated by scripts/logging_render_observability.py --build +apiVersion: v1 +kind: ConfigMap +metadata: + name: opensearch-observability-objects + namespace: logging +data: + applications.json: | +{applications} + saved_queries.json: | +{queries} + saved_visualizations.json: | +{visualizations} +""" +) + +DEFAULT_RANGE = {"start": "now-24h", "end": "now", "text": ""} +DEFAULT_TIMESTAMP = {"name": "@timestamp", "type": "timestamp"} +DEFAULT_FIELDS = {"text": "", "tokens": []} + + +@dataclass(frozen=True) +class AppSpec: + name: str + base_query: str + kind: str = "kube" + description: str = "" + + +@dataclass(frozen=True) +class QuerySpec: + name: str + query: str + description: str = "" + + +@dataclass(frozen=True) +class VisualizationSpec: + name: str + query: str + vis_type: str + description: str = "" + + +def source_query(index: str, where: str | None = None) -> str: + query = f"source = {index}" + if where: + query += f" | where {where}" + return query + + +def error_filter(fields: list[str]) -> str: + parts = [f"match({field}, 'error|exception|fail')" for field in fields] + return " or ".join(parts) + + +def saved_query(spec: QuerySpec) -> dict: + return { + "name": spec.name, + "description": spec.description, + "query": spec.query, + "selected_date_range": 
DEFAULT_RANGE, + "selected_timestamp": DEFAULT_TIMESTAMP, + "selected_fields": DEFAULT_FIELDS, + } + + +def saved_visualization(spec: VisualizationSpec) -> dict: + return { + "name": spec.name, + "description": spec.description, + "query": spec.query, + "type": spec.vis_type, + "selected_date_range": DEFAULT_RANGE, + "selected_timestamp": DEFAULT_TIMESTAMP, + "selected_fields": DEFAULT_FIELDS, + } + + +def build_objects() -> tuple[list[dict], list[dict], list[dict]]: + kube_error = error_filter(["log", "message"]) + journald_error = error_filter(["MESSAGE"]) + + apps = [ + AppSpec("bstein-dev-home", source_query("kube-*", "kubernetes.namespace_name = 'bstein-dev-home'")), + AppSpec( + "pegasus", + source_query( + "kube-*", + "kubernetes.namespace_name = 'jellyfin' and kubernetes.labels.app = 'pegasus'", + ), + ), + AppSpec( + "jellyfin", + source_query( + "kube-*", + "kubernetes.namespace_name = 'jellyfin' and kubernetes.labels.app = 'jellyfin'", + ), + ), + AppSpec("vaultwarden", source_query("kube-*", "kubernetes.namespace_name = 'vaultwarden'")), + AppSpec("mailu", source_query("kube-*", "kubernetes.namespace_name = 'mailu-mailserver'")), + AppSpec("nextcloud", source_query("kube-*", "kubernetes.namespace_name = 'nextcloud'")), + AppSpec("gitea", source_query("kube-*", "kubernetes.namespace_name = 'gitea'")), + AppSpec("jenkins", source_query("kube-*", "kubernetes.namespace_name = 'jenkins'")), + AppSpec("harbor", source_query("kube-*", "kubernetes.namespace_name = 'harbor'")), + AppSpec("vault", source_query("kube-*", "kubernetes.namespace_name = 'vault'")), + AppSpec("keycloak", source_query("kube-*", "kubernetes.namespace_name = 'sso'")), + AppSpec("flux-system", source_query("kube-*", "kubernetes.namespace_name = 'flux-system'")), + AppSpec("comms", source_query("kube-*", "kubernetes.namespace_name = 'comms'")), + AppSpec( + "element-web", + source_query( + "kube-*", + "kubernetes.namespace_name = 'comms' and kubernetes.container_name = 'element-web'", + ), 
+ ), + AppSpec( + "element-call", + source_query( + "kube-*", + "kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'element-call'", + ), + ), + AppSpec( + "matrix-synapse", + source_query( + "kube-*", + "kubernetes.namespace_name = 'comms' and kubernetes.container_name = 'synapse'", + ), + ), + AppSpec( + "livekit", + source_query( + "kube-*", + "kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'livekit'", + ), + ), + AppSpec( + "coturn", + source_query( + "kube-*", + "kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'coturn'", + ), + ), + AppSpec( + "lesavka", + source_query("journald-*", "_HOSTNAME = 'titan-jh'"), + kind="journald", + ), + ] + + applications = [ + { + "name": app.name, + "description": app.description, + "baseQuery": app.base_query, + "servicesEntities": [], + "traceGroups": [app.name], + } + for app in apps + ] + + queries = [ + saved_query(QuerySpec("kube logs", source_query("kube-*"))), + saved_query(QuerySpec("kube errors", f"{source_query('kube-*')} | where {kube_error}")), + saved_query(QuerySpec("journald logs", source_query("journald-*"))), + saved_query(QuerySpec("journald errors", f"{source_query('journald-*')} | where {journald_error}")), + ] + + for app in apps: + query_base = app.base_query + error_clause = journald_error if app.kind == "journald" else kube_error + queries.append(saved_query(QuerySpec(f"{app.name} logs", query_base))) + queries.append(saved_query(QuerySpec(f"{app.name} errors", f"{query_base} | where {error_clause}"))) + + visualizations = [ + saved_visualization( + VisualizationSpec( + "[Kube] Logs per hour", + "source = kube-* | stats count() as log_count by span(`@timestamp`, 1h)", + "line", + ) + ), + saved_visualization( + VisualizationSpec( + "[Kube] Errors per hour", + f"source = kube-* | where {kube_error} | stats count() as error_count by span(`@timestamp`, 1h)", + "line", + ) + ), + saved_visualization( + VisualizationSpec( + "[Kube] Top namespaces", + "source = 
kube-* | stats count() as log_count by kubernetes.namespace_name | sort - log_count", + "bar", + ) + ), + saved_visualization( + VisualizationSpec( + "[Kube] Top error namespaces", + f"source = kube-* | where {kube_error} | stats count() as error_count by kubernetes.namespace_name | sort - error_count", + "bar", + ) + ), + saved_visualization( + VisualizationSpec( + "[Kube] Top pods", + "source = kube-* | stats count() as log_count by kubernetes.pod_name | sort - log_count", + "bar", + ) + ), + saved_visualization( + VisualizationSpec( + "[Kube] Top error pods", + f"source = kube-* | where {kube_error} | stats count() as error_count by kubernetes.pod_name | sort - error_count", + "bar", + ) + ), + saved_visualization( + VisualizationSpec( + "[Kube] Top nodes", + "source = kube-* | stats count() as log_count by kubernetes.node_name | sort - log_count", + "bar", + ) + ), + saved_visualization( + VisualizationSpec( + "[Journald] Top units", + "source = journald-* | stats count() as log_count by _SYSTEMD_UNIT | sort - log_count", + "bar", + ) + ), + saved_visualization( + VisualizationSpec( + "[Journald] Top error units", + f"source = journald-* | where {journald_error} | stats count() as error_count by _SYSTEMD_UNIT | sort - error_count", + "bar", + ) + ), + ] + + return applications, queries, visualizations + + +def write_json(payload: list[dict], path: Path) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, indent=2) + "\n") + + +def render_configmap(apps_path: Path, queries_path: Path, vis_path: Path, output_path: Path) -> None: + relative_path = output_path.relative_to(ROOT) + applications = indent_payload(apps_path) + queries = indent_payload(queries_path) + visualizations = indent_payload(vis_path) + output_path.write_text( + CONFIG_TEMPLATE.format( + relative_path=relative_path, + applications=applications, + queries=queries, + visualizations=visualizations, + ) + ) + + +def indent_payload(path: Path) -> str: + 
lines = path.read_text().splitlines() + return "\n".join(" " + line for line in lines) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--build", action="store_true", help="Regenerate JSON payloads and ConfigMap") + args = parser.parse_args() + + if args.build: + applications, queries, visualizations = build_objects() + write_json(applications, APPS_PATH) + write_json(queries, QUERIES_PATH) + write_json(visualizations, VIS_PATH) + + if not (APPS_PATH.exists() and QUERIES_PATH.exists() and VIS_PATH.exists()): + raise SystemExit("Missing observability JSON payloads. Run with --build first.") + + render_configmap(APPS_PATH, QUERIES_PATH, VIS_PATH, CONFIG_PATH) + + +if __name__ == "__main__": + main() diff --git a/scripts/monitoring_postmark_exporter.py b/scripts/monitoring_postmark_exporter.py new file mode 100644 index 0000000..2a51a54 --- /dev/null +++ b/scripts/monitoring_postmark_exporter.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 + +import datetime as dt +import os +import time +from dataclasses import dataclass + +import requests +from prometheus_client import Gauge, Info, start_http_server + + +@dataclass(frozen=True) +class Window: + label: str + days: int + + +WINDOWS = [ + Window("today", 0), + Window("1d", 1), + Window("7d", 7), + Window("30d", 30), +] + +API_BASE = os.environ.get("POSTMARK_API_BASE", "https://api.postmarkapp.com").rstrip("/") +POLL_INTERVAL_SECONDS = int(os.environ.get("POLL_INTERVAL_SECONDS", "60")) +LISTEN_ADDRESS = os.environ.get("LISTEN_ADDRESS", "0.0.0.0") +LISTEN_PORT = int(os.environ.get("LISTEN_PORT", "8000")) + +PRIMARY_TOKEN = os.environ.get("POSTMARK_SERVER_TOKEN", "").strip() +FALLBACK_TOKEN = os.environ.get("POSTMARK_SERVER_TOKEN_FALLBACK", "").strip() +LIMIT_WINDOW = os.environ.get("POSTMARK_SENDING_LIMIT_WINDOW", "30d").strip() +LIMIT_RAW = os.environ.get("POSTMARK_SENDING_LIMIT", "").strip() +try: + SENDING_LIMIT = float(LIMIT_RAW) if LIMIT_RAW else 0.0 +except ValueError: + 
SENDING_LIMIT = 0.0 + +EXPORTER_INFO = Info("postmark_exporter", "Exporter build info") +EXPORTER_INFO.info( + { + "api_base": API_BASE, + "windows": ",".join(window.label for window in WINDOWS), + } +) + +POSTMARK_API_UP = Gauge("postmark_api_up", "Whether Postmark API is reachable (1) or not (0)") +POSTMARK_LAST_SUCCESS = Gauge( + "postmark_last_success_timestamp_seconds", + "Unix timestamp of the last successful Postmark stats refresh", +) +POSTMARK_REQUEST_ERRORS = Gauge( + "postmark_request_errors_total", + "Total Postmark stats request errors since exporter start", +) + +POSTMARK_OUTBOUND_SENT = Gauge( + "postmark_outbound_sent", + "Outbound emails sent within the selected window", + labelnames=("window",), +) +POSTMARK_OUTBOUND_BOUNCED = Gauge( + "postmark_outbound_bounced", + "Outbound emails bounced within the selected window", + labelnames=("window",), +) +POSTMARK_OUTBOUND_BOUNCE_RATE = Gauge( + "postmark_outbound_bounce_rate", + "Outbound bounce rate percentage within the selected window", + labelnames=("window",), +) +POSTMARK_SENDING_LIMIT_GAUGE = Gauge( + "postmark_sending_limit", + "Configured Postmark sending limit for the active account", +) +POSTMARK_SENDING_LIMIT_USED = Gauge( + "postmark_sending_limit_used", + "Messages sent within the configured send limit window", +) +POSTMARK_SENDING_LIMIT_USED_PERCENT = Gauge( + "postmark_sending_limit_used_percent", + "Percent of the configured send limit used within the limit window", +) + + +def fetch_outbound_stats(token: str, window: Window) -> dict: + today = dt.date.today() + fromdate = today - dt.timedelta(days=window.days) + params = {"fromdate": fromdate.isoformat(), "todate": today.isoformat()} + headers = { + "Accept": "application/json", + "X-Postmark-Server-Token": token, + } + response = requests.get( + f"{API_BASE}/stats/outbound", + headers=headers, + params=params, + timeout=15, + ) + response.raise_for_status() + return response.json() + + +def update_metrics(token: str) -> None: + 
sent_by_window = {} + for window in WINDOWS: + data = fetch_outbound_stats(token, window) + sent = int(data.get("Sent", 0) or 0) + bounced = int(data.get("Bounced", 0) or 0) + rate = (bounced / sent * 100.0) if sent else 0.0 + sent_by_window[window.label] = sent + POSTMARK_OUTBOUND_SENT.labels(window=window.label).set(sent) + POSTMARK_OUTBOUND_BOUNCED.labels(window=window.label).set(bounced) + POSTMARK_OUTBOUND_BOUNCE_RATE.labels(window=window.label).set(rate) + + POSTMARK_SENDING_LIMIT_GAUGE.set(SENDING_LIMIT) + limit_window_sent = sent_by_window.get(LIMIT_WINDOW, 0) + POSTMARK_SENDING_LIMIT_USED.set(limit_window_sent) + if SENDING_LIMIT: + POSTMARK_SENDING_LIMIT_USED_PERCENT.set(limit_window_sent / SENDING_LIMIT * 100.0) + else: + POSTMARK_SENDING_LIMIT_USED_PERCENT.set(0.0) + + +def main() -> None: + if not PRIMARY_TOKEN and not FALLBACK_TOKEN: + raise SystemExit("POSTMARK_SERVER_TOKEN or POSTMARK_SERVER_TOKEN_FALLBACK is required") + + start_http_server(LISTEN_PORT, addr=LISTEN_ADDRESS) + + tokens = [token for token in (PRIMARY_TOKEN, FALLBACK_TOKEN) if token] + token_index = 0 + + while True: + token = tokens[token_index % len(tokens)] + token_index += 1 + try: + update_metrics(token) + POSTMARK_API_UP.set(1) + POSTMARK_LAST_SUCCESS.set(time.time()) + except Exception as exc: # noqa: BLE001 + POSTMARK_API_UP.set(0) + POSTMARK_REQUEST_ERRORS.inc() + print(f"postmark_exporter: refresh failed: {exc}", flush=True) + time.sleep(POLL_INTERVAL_SECONDS) + + +if __name__ == "__main__": + main() diff --git a/scripts/monitoring_render_postmark_exporter.py b/scripts/monitoring_render_postmark_exporter.py new file mode 100644 index 0000000..b0a458a --- /dev/null +++ b/scripts/monitoring_render_postmark_exporter.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 + +from pathlib import Path + + +def indent(text: str, spaces: int) -> str: + prefix = " " * spaces + return "".join(prefix + line if line.strip("\n") else line for line in text.splitlines(keepends=True)) + + +def main() 
-> None: + root = Path(__file__).resolve().parents[1] + source = root / "scripts" / "monitoring_postmark_exporter.py" + target = root / "services" / "monitoring" / "postmark-exporter-script.yaml" + + payload = source.read_text(encoding="utf-8") + if not payload.endswith("\n"): + payload += "\n" + + yaml = ( + f"# services/monitoring/postmark-exporter-script.yaml\n" + f"apiVersion: v1\n" + f"kind: ConfigMap\n" + f"metadata:\n" + f" name: postmark-exporter-script\n" + f"data:\n" + f" monitoring_postmark_exporter.py: |\n" + f"{indent(payload, 4)}" + ) + + target.write_text(yaml, encoding="utf-8") + + +if __name__ == "__main__": + main() diff --git a/scripts/nextcloud-mail-sync.sh b/scripts/nextcloud-mail-sync.sh deleted file mode 100755 index 7feeec6..0000000 --- a/scripts/nextcloud-mail-sync.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash -set -euo pipefail - -KC_BASE="${KC_BASE:?}" -KC_REALM="${KC_REALM:?}" -KC_ADMIN_USER="${KC_ADMIN_USER:?}" -KC_ADMIN_PASS="${KC_ADMIN_PASS:?}" - -if ! command -v jq >/dev/null 2>&1; then - apt-get update && apt-get install -y jq curl >/dev/null -fi - -account_exists() { - # Skip if the account email is already present in the mail app. 
- runuser -u www-data -- php occ mail:account:list 2>/dev/null | grep -Fq " ${1}" || \ - runuser -u www-data -- php occ mail:account:list 2>/dev/null | grep -Fq "${1} " -} - -token=$( - curl -s -d "grant_type=password" \ - -d "client_id=admin-cli" \ - -d "username=${KC_ADMIN_USER}" \ - -d "password=${KC_ADMIN_PASS}" \ - "${KC_BASE}/realms/master/protocol/openid-connect/token" | jq -r '.access_token' -) - -if [[ -z "${token}" || "${token}" == "null" ]]; then - echo "Failed to obtain admin token" - exit 1 -fi - -users=$(curl -s -H "Authorization: Bearer ${token}" \ - "${KC_BASE}/admin/realms/${KC_REALM}/users?max=2000") - -echo "${users}" | jq -c '.[]' | while read -r user; do - username=$(echo "${user}" | jq -r '.username') - email=$(echo "${user}" | jq -r '.email // empty') - app_pw=$(echo "${user}" | jq -r '.attributes.mailu_app_password[0] // empty') - [[ -z "${email}" || -z "${app_pw}" ]] && continue - if account_exists "${email}"; then - echo "Skipping ${email}, already exists" - continue - fi - echo "Syncing ${email}" - runuser -u www-data -- php occ mail:account:create \ - "${username}" "${username}" "${email}" \ - mail.bstein.dev 993 ssl "${email}" "${app_pw}" \ - mail.bstein.dev 587 tls "${email}" "${app_pw}" login || true -done diff --git a/scripts/nextcloud-maintenance.sh b/scripts/nextcloud-maintenance.sh deleted file mode 100755 index af1694c..0000000 --- a/scripts/nextcloud-maintenance.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash -set -euo pipefail - -NC_URL="${NC_URL:-https://cloud.bstein.dev}" -ADMIN_USER="${ADMIN_USER:?}" -ADMIN_PASS="${ADMIN_PASS:?}" - -export DEBIAN_FRONTEND=noninteractive -apt-get update -qq -apt-get install -y -qq curl jq >/dev/null - -run_occ() { - runuser -u www-data -- php occ "$@" -} - -log() { echo "[$(date -Is)] $*"; } - -log "Applying Atlas theming" -run_occ theming:config name "Atlas Cloud" -run_occ theming:config slogan "Unified access to Atlas services" -run_occ theming:config url "https://cloud.bstein.dev" -run_occ 
theming:config color "#0f172a" -run_occ theming:config disable-user-theming yes - -log "Setting default quota to 200 GB" -run_occ config:app:set files default_quota --value "200 GB" - -API_BASE="${NC_URL}/ocs/v2.php/apps/external/api/v1" -AUTH=(-u "${ADMIN_USER}:${ADMIN_PASS}" -H "OCS-APIRequest: true") - -log "Removing existing external links" -existing=$(curl -sf "${AUTH[@]}" "${API_BASE}?format=json" | jq -r '.ocs.data[].id // empty') -for id in ${existing}; do - curl -sf "${AUTH[@]}" -X DELETE "${API_BASE}/sites/${id}?format=json" >/dev/null || true -done - -SITES=( - "Vaultwarden|https://vault.bstein.dev" - "Jellyfin|https://stream.bstein.dev" - "Gitea|https://scm.bstein.dev" - "Jenkins|https://ci.bstein.dev" - "Harbor|https://registry.bstein.dev" - "Vault|https://secret.bstein.dev" - "Jitsi|https://meet.bstein.dev" - "Grafana|https://metrics.bstein.dev" - "Chat LLM|https://chat.ai.bstein.dev" - "Vision|https://draw.ai.bstein.dev" - "STT/TTS|https://talk.ai.bstein.dev" -) - -log "Seeding external links" -for entry in "${SITES[@]}"; do - IFS="|" read -r name url <<<"${entry}" - curl -sf "${AUTH[@]}" -X POST "${API_BASE}/sites?format=json" \ - -d "name=${name}" \ - -d "url=${url}" \ - -d "lang=" \ - -d "type=link" \ - -d "device=" \ - -d "icon=" \ - -d "groups[]=" \ - -d "redirect=1" >/dev/null -done - -log "Maintenance run completed" diff --git a/scripts/test_atlas_user_cleanup.py b/scripts/test_atlas_user_cleanup.py new file mode 100755 index 0000000..41ba708 --- /dev/null +++ b/scripts/test_atlas_user_cleanup.py @@ -0,0 +1,509 @@ +#!/usr/bin/env python3 +"""Clean up Atlas test users and portal requests (manual-only). + +Default behavior is DRY RUN. This script is intended for operators to clean up +test accounts created via the bstein-dev-home onboarding portal. 
+ +Targets (best-effort): + - Keycloak users in realm "atlas" + - Atlas portal Postgres rows (access_requests + dependent tables) + - Vaultwarden users/invites created by the portal + +Safety: + - Requires an explicit username prefix (e.g. "test-") + - Dry-run unless --apply is set + - --apply requires an explicit --confirm guard + - Validates prefixes to a conservative charset +""" + +from __future__ import annotations + +import argparse +import base64 +import json +import os +import re +import subprocess +import sys +import time +import urllib.parse +import urllib.request +from dataclasses import dataclass +from typing import Any, Iterable + + +_SAFE_PREFIX_RE = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9._-]{0,63}$") + + +@dataclass(frozen=True) +class KeycloakUser: + user_id: str + username: str + email: str + + +@dataclass(frozen=True) +class PortalRequestRow: + request_code: str + username: str + status: str + + +@dataclass(frozen=True) +class VaultwardenUser: + user_id: str + email: str + status: int + + +def _run(cmd: list[str], *, input_bytes: bytes | None = None) -> str: + proc = subprocess.run( + cmd, + input=input_bytes, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=False, + ) + if proc.returncode != 0: + stderr = proc.stderr.decode("utf-8", errors="replace").strip() + raise RuntimeError(f"command failed ({proc.returncode}): {' '.join(cmd)}\n{stderr}") + return proc.stdout.decode("utf-8", errors="replace") + + +def _kubectl_get_secret_value(namespace: str, name: str, key: str) -> str: + raw_b64 = _run( + [ + "kubectl", + "-n", + namespace, + "get", + "secret", + name, + "-o", + f"jsonpath={{.data.{key}}}", + ] + ).strip() + if not raw_b64: + raise RuntimeError(f"secret {namespace}/{name} key {key} is empty") + return base64.b64decode(raw_b64).decode("utf-8").strip() + + +def _kubectl_first_pod(namespace: str) -> str: + raw = _run( + [ + "kubectl", + "-n", + namespace, + "get", + "pods", + "-o", + "json", + ] + ) + data = json.loads(raw) + items = 
data.get("items") or [] + if not isinstance(items, list) or not items: + raise RuntimeError(f"no pods found in namespace {namespace}") + pod_name = items[0].get("metadata", {}).get("name") + if not isinstance(pod_name, str) or not pod_name: + raise RuntimeError(f"unexpected pod list in namespace {namespace}") + return pod_name + + +def _validate_prefixes(prefixes: list[str]) -> list[str]: + cleaned: list[str] = [] + for prefix in prefixes: + prefix = prefix.strip() + if not prefix: + continue + if not _SAFE_PREFIX_RE.match(prefix): + raise SystemExit( + f"invalid prefix '{prefix}': must match {_SAFE_PREFIX_RE.pattern} (alnum plus ._-)" + ) + cleaned.append(prefix) + if not cleaned: + raise SystemExit("at least one --prefix is required") + return cleaned + + +def _starts_with_any(value: str, prefixes: Iterable[str]) -> bool: + return any(value.startswith(p) for p in prefixes) + + +def _keycloak_token(server: str, realm: str, client_id: str, client_secret: str) -> str: + data = urllib.parse.urlencode( + { + "grant_type": "client_credentials", + "client_id": client_id, + "client_secret": client_secret, + } + ).encode("utf-8") + req = urllib.request.Request( + f"{server}/realms/{realm}/protocol/openid-connect/token", + data=data, + method="POST", + ) + req.add_header("Content-Type", "application/x-www-form-urlencoded") + with urllib.request.urlopen(req, timeout=15) as resp: + payload = json.loads(resp.read().decode("utf-8")) + token = payload.get("access_token") + if not isinstance(token, str) or not token: + raise RuntimeError("failed to obtain keycloak access token") + return token + + +def _keycloak_list_users(server: str, realm: str, token: str, search: str) -> list[KeycloakUser]: + query = urllib.parse.urlencode({"max": "1000", "search": search}) + req = urllib.request.Request(f"{server}/admin/realms/{realm}/users?{query}", method="GET") + req.add_header("Authorization", f"Bearer {token}") + with urllib.request.urlopen(req, timeout=30) as resp: + payload = 
json.loads(resp.read().decode("utf-8")) + if not isinstance(payload, list): + raise RuntimeError("unexpected keycloak users response") + users: list[KeycloakUser] = [] + for item in payload: + if not isinstance(item, dict): + continue + user_id = item.get("id") + username = item.get("username") or "" + email = item.get("email") or "" + if not isinstance(user_id, str) or not user_id: + continue + if not isinstance(username, str): + continue + users.append(KeycloakUser(user_id=user_id, username=username, email=str(email))) + return users + + +def _keycloak_delete_user(server: str, realm: str, token: str, user_id: str) -> None: + req = urllib.request.Request(f"{server}/admin/realms/{realm}/users/{user_id}", method="DELETE") + req.add_header("Authorization", f"Bearer {token}") + try: + with urllib.request.urlopen(req, timeout=30) as resp: + _ = resp.read() + except urllib.error.HTTPError as exc: + if exc.code == 404: + return + raise + + +def _psql_json(portal_db_url: str, sql: str) -> list[dict[str, Any]]: + postgres_pod = _kubectl_first_pod("postgres") + out = _run( + [ + "kubectl", + "-n", + "postgres", + "exec", + "-i", + postgres_pod, + "--", + "psql", + portal_db_url, + "-At", + "-F", + "\t", + "-c", + sql, + ] + ) + rows: list[dict[str, Any]] = [] + for line in out.splitlines(): + parts = line.split("\t") + rows.append({"cols": parts}) + return rows + + +def _portal_list_requests(portal_db_url: str, prefixes: list[str]) -> list[PortalRequestRow]: + clauses = " OR ".join([f"username LIKE '{p}%'" for p in prefixes]) + sql = ( + "SELECT request_code, username, status " + "FROM access_requests " + f"WHERE {clauses} " + "ORDER BY created_at DESC;" + ) + raw_rows = _psql_json(portal_db_url, sql) + parsed: list[PortalRequestRow] = [] + for row in raw_rows: + cols = row.get("cols") or [] + if len(cols) < 3: + continue + parsed.append(PortalRequestRow(request_code=cols[0], username=cols[1], status=cols[2])) + return parsed + + +def _portal_delete_requests(portal_db_url: 
str, prefixes: list[str]) -> int: + clauses = " OR ".join([f"username LIKE '{p}%'" for p in prefixes]) + sql = f"DELETE FROM access_requests WHERE {clauses};" + postgres_pod = _kubectl_first_pod("postgres") + out = _run( + [ + "kubectl", + "-n", + "postgres", + "exec", + "-i", + postgres_pod, + "--", + "psql", + portal_db_url, + "-c", + sql, + ] + ) + # psql prints "DELETE " + match = re.search(r"DELETE\\s+(\\d+)", out) + return int(match.group(1)) if match else 0 + + +def _vaultwarden_admin_cookie(admin_token: str, base_url: str) -> str: + data = urllib.parse.urlencode({"token": admin_token}).encode("utf-8") + req = urllib.request.Request(f"{base_url}/admin", data=data, method="POST") + req.add_header("Content-Type", "application/x-www-form-urlencoded") + try: + with urllib.request.urlopen(req, timeout=10) as resp: + set_cookie = resp.headers.get("Set-Cookie") or "" + except urllib.error.HTTPError as exc: + if exc.code == 429: + raise RuntimeError("vaultwarden admin rate limited (HTTP 429)") from exc + raise + cookie = set_cookie.split(";", 1)[0].strip() + if not cookie: + raise RuntimeError("vaultwarden admin cookie missing") + return cookie + + +def _vaultwarden_list_users(base_url: str, cookie: str) -> list[VaultwardenUser]: + req = urllib.request.Request(f"{base_url}/admin/users", method="GET") + req.add_header("Cookie", cookie) + try: + with urllib.request.urlopen(req, timeout=30) as resp: + payload = json.loads(resp.read().decode("utf-8")) + except urllib.error.HTTPError as exc: + if exc.code == 429: + raise RuntimeError("vaultwarden admin rate limited (HTTP 429)") from exc + raise + if not isinstance(payload, list): + raise RuntimeError("unexpected vaultwarden /admin/users response") + users: list[VaultwardenUser] = [] + for item in payload: + if not isinstance(item, dict): + continue + user_id = item.get("id") + email = item.get("email") + status = item.get("_status") + if not isinstance(user_id, str) or not user_id: + continue + if not isinstance(email, 
str) or not email: + continue + if not isinstance(status, int): + status = -1 + users.append(VaultwardenUser(user_id=user_id, email=email, status=status)) + return users + + +def _vaultwarden_delete_user(base_url: str, cookie: str, user_id: str) -> None: + req = urllib.request.Request(f"{base_url}/admin/users/{user_id}", method="DELETE") + req.add_header("Cookie", cookie) + try: + with urllib.request.urlopen(req, timeout=30) as resp: + _ = resp.read() + except urllib.error.HTTPError as exc: + if exc.code in {404}: + return + if exc.code == 429: + raise RuntimeError("vaultwarden admin rate limited (HTTP 429)") from exc + raise + + +def _port_forward(namespace: str, target: str, local_port: int, remote_port: int) -> subprocess.Popen[bytes]: + # Keep stdout/stderr muted to avoid leaking internal details in output. + return subprocess.Popen( + [ + "kubectl", + "-n", + namespace, + "port-forward", + target, + f"{local_port}:{remote_port}", + "--address", + "127.0.0.1", + ], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--prefix", + action="append", + default=[], + help="Username prefix to match (repeatable). Example: --prefix test-", + ) + parser.add_argument( + "--apply", + action="store_true", + help="Actually delete; otherwise dry-run only.", + ) + parser.add_argument( + "--confirm", + default="", + help=( + "Required when using --apply. Must exactly equal the comma-separated " + "sorted prefix list (e.g. 'atlas-,bob-,e2e-,test-')." 
+ ), + ) + parser.add_argument("--skip-keycloak", action="store_true", help="Skip Keycloak user deletion.") + parser.add_argument("--skip-portal-db", action="store_true", help="Skip portal DB cleanup.") + parser.add_argument("--skip-vaultwarden", action="store_true", help="Skip Vaultwarden cleanup.") + parser.add_argument( + "--protect-keycloak-username", + action="append", + default=[], + help="Keycloak usernames that must never be deleted (repeatable).", + ) + parser.add_argument( + "--protect-vaultwarden-email", + action="append", + default=[], + help="Vaultwarden emails that must never be deleted (repeatable).", + ) + args = parser.parse_args() + + prefixes = sorted(set(_validate_prefixes(args.prefix))) + apply = bool(args.apply) + expected_confirm = ",".join(prefixes) + protected_keycloak = {"bstein", "robotuser", *[u.strip() for u in args.protect_keycloak_username if u.strip()]} + protected_vaultwarden = {e.strip() for e in args.protect_vaultwarden_email if e.strip()} + + if apply and args.confirm != expected_confirm: + raise SystemExit( + f"refusing to apply without --confirm '{expected_confirm}' (got '{args.confirm}')" + ) + + print("Atlas test-user cleanup") + print("prefixes:", expected_confirm) + print("mode:", "APPLY (destructive)" if apply else "DRY RUN (no changes)") + if protected_keycloak: + print("protected keycloak usernames:", ", ".join(sorted(protected_keycloak))) + if protected_vaultwarden: + print("protected vaultwarden emails:", ", ".join(sorted(protected_vaultwarden))) + print() + + if not args.skip_portal_db: + portal_db_url = _kubectl_get_secret_value("bstein-dev-home", "atlas-portal-db", "PORTAL_DATABASE_URL") + requests = _portal_list_requests(portal_db_url, prefixes) + print(f"Portal DB: {len(requests)} access_requests matched") + for row in requests[:50]: + print(f" {row.request_code}\t{row.status}\t{row.username}") + if len(requests) > 50: + print(f" ... 
and {len(requests) - 50} more") + if apply and requests: + deleted = _portal_delete_requests(portal_db_url, prefixes) + print(f"Portal DB: deleted {deleted} access_requests (cascade removes tasks/steps/artifacts).") + print() + + if not args.skip_keycloak: + kc_server = os.getenv("KEYCLOAK_PUBLIC_URL", "https://sso.bstein.dev").rstrip("/") + kc_realm = os.getenv("KEYCLOAK_REALM", "atlas") + kc_client_id = os.getenv("KEYCLOAK_ADMIN_CLIENT_ID", "bstein-dev-home-admin") + kc_client_secret = _kubectl_get_secret_value( + "bstein-dev-home", "bstein-dev-home-keycloak-admin", "client_secret" + ) + token = _keycloak_token(kc_server, kc_realm, kc_client_id, kc_client_secret) + found: dict[str, KeycloakUser] = {} + for prefix in prefixes: + for user in _keycloak_list_users(kc_server, kc_realm, token, prefix): + if not _starts_with_any(user.username, prefixes): + continue + if user.username in protected_keycloak: + continue + found[user.user_id] = user + users = list(found.values()) + users.sort(key=lambda u: u.username) + print(f"Keycloak: {len(users)} users matched") + for user in users[:50]: + email = user.email or "-" + print(f" {user.username}\t{email}\t{user.user_id}") + if len(users) > 50: + print(f" ... 
and {len(users) - 50} more") + if apply and users: + for user in users: + _keycloak_delete_user(kc_server, kc_realm, token, user.user_id) + print(f"Keycloak: deleted {len(users)} users.") + print() + + if not args.skip_vaultwarden: + pf = _port_forward("vaultwarden", "svc/vaultwarden-service", 18081, 80) + try: + # wait briefly for the port-forward to come up + for _ in range(30): + try: + urllib.request.urlopen("http://127.0.0.1:18081/", timeout=1).read(1) + break + except Exception: + time.sleep(0.2) + + admin_token = _kubectl_get_secret_value("vaultwarden", "vaultwarden-admin", "ADMIN_TOKEN") + base_url = "http://127.0.0.1:18081" + try: + cookie = "" + for attempt in range(7): + try: + cookie = _vaultwarden_admin_cookie(admin_token, base_url) + break + except RuntimeError as exc: + if "rate limited" in str(exc).lower(): + time.sleep(min(60.0, 2.0**attempt)) + continue + raise + if not cookie: + raise RuntimeError("vaultwarden admin login repeatedly rate limited") + + users: list[VaultwardenUser] = [] + for attempt in range(7): + try: + users = _vaultwarden_list_users(base_url, cookie) + break + except RuntimeError as exc: + if "rate limited" in str(exc).lower(): + time.sleep(min(60.0, 2.0**attempt)) + continue + raise + if not users: + raise RuntimeError("vaultwarden user list unavailable (possibly rate limited)") + except RuntimeError as exc: + print(f"Vaultwarden: ERROR: {exc}") + print() + return 1 + matched: list[VaultwardenUser] = [] + for user in users: + local = user.email.split("@", 1)[0] + if _starts_with_any(local, prefixes): + if user.email in protected_vaultwarden: + continue + matched.append(user) + matched.sort(key=lambda u: u.email) + print(f"Vaultwarden: {len(matched)} users matched") + for user in matched[:50]: + print(f" {user.email}\tstatus={user.status}\t{user.user_id}") + if len(matched) > 50: + print(f" ... 
and {len(matched) - 50} more") + if apply and matched: + for user in matched: + _vaultwarden_delete_user(base_url, cookie, user.user_id) + print(f"Vaultwarden: deleted {len(matched)} users.") + print() + finally: + pf.terminate() + try: + pf.wait(timeout=3) + except Exception: + pf.kill() + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/test_user_cleanup.py b/scripts/test_user_cleanup.py new file mode 100755 index 0000000..d29b775 --- /dev/null +++ b/scripts/test_user_cleanup.py @@ -0,0 +1,276 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import argparse +import sys +from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Iterable +from urllib.parse import quote + +import httpx + +from atlas_portal import db, settings +from atlas_portal.keycloak import admin_client + + +@dataclass(frozen=True) +class KeycloakUser: + id: str + username: str + + +@dataclass(frozen=True) +class PortalRequest: + request_code: str + username: str + status: str + + +def _dedupe_by_id(users: Iterable[KeycloakUser]) -> list[KeycloakUser]: + seen: set[str] = set() + out: list[KeycloakUser] = [] + for user in users: + if user.id in seen: + continue + seen.add(user.id) + out.append(user) + return out + + +def _iter_keycloak_users_for_prefix(prefix: str, max_results: int) -> list[KeycloakUser]: + client = admin_client() + if not client.ready(): + raise RuntimeError("keycloak admin client not configured in this environment") + + url = f"{settings.KEYCLOAK_ADMIN_URL}/admin/realms/{settings.KEYCLOAK_REALM}/users" + # Keycloak can return false positives for search; we do a strict prefix match client-side. 
+ params = {"search": prefix, "max": str(max_results), "briefRepresentation": "true"} + with httpx.Client(timeout=settings.HTTP_CHECK_TIMEOUT_SEC) as http: + resp = http.get(url, params=params, headers=client.headers()) + resp.raise_for_status() + payload = resp.json() + + if not isinstance(payload, list): + return [] + + found: list[KeycloakUser] = [] + for item in payload: + if not isinstance(item, dict): + continue + username = item.get("username") + user_id = item.get("id") + if not isinstance(username, str) or not isinstance(user_id, str): + continue + if not username.startswith(prefix): + continue + if username.startswith("service-account-"): + continue + found.append(KeycloakUser(id=user_id, username=username)) + return found + + +def _find_keycloak_users(prefixes: list[str], max_results: int, protected: set[str]) -> list[KeycloakUser]: + matches: list[KeycloakUser] = [] + for prefix in prefixes: + matches.extend(_iter_keycloak_users_for_prefix(prefix, max_results=max_results)) + + deduped = _dedupe_by_id(matches) + return [user for user in deduped if user.username not in protected] + + +def _delete_keycloak_users(users: list[KeycloakUser]) -> None: + if not users: + return + + client = admin_client() + if not client.ready(): + raise RuntimeError("keycloak admin client not configured in this environment") + + base = f"{settings.KEYCLOAK_ADMIN_URL}/admin/realms/{settings.KEYCLOAK_REALM}/users" + with httpx.Client(timeout=settings.HTTP_CHECK_TIMEOUT_SEC) as http: + for user in users: + url = f"{base}/{quote(user.id, safe='')}" + resp = http.delete(url, headers=client.headers()) + # Deleting a non-existent user is treated as success for idempotency. 
+ if resp.status_code == 404: + continue + resp.raise_for_status() + + +def _find_portal_requests(prefixes: list[str], max_results: int) -> list[PortalRequest]: + if not db.configured(): + return [] + + like_prefixes = [f"{prefix}%" for prefix in prefixes] + rows: list[dict[str, Any]] = [] + with db.connect() as conn: + for like in like_prefixes: + cursor = conn.execute( + """ + SELECT request_code, username, status + FROM access_requests + WHERE username LIKE %s + ORDER BY created_at DESC + LIMIT %s + """, + (like, max_results), + ) + batch = cursor.fetchall() + if isinstance(batch, list): + rows.extend([r for r in batch if isinstance(r, dict)]) + + out: list[PortalRequest] = [] + for row in rows: + request_code = row.get("request_code") + username = row.get("username") + status = row.get("status") + if not isinstance(request_code, str) or not isinstance(username, str) or not isinstance(status, str): + continue + out.append(PortalRequest(request_code=request_code, username=username, status=status)) + return out + + +def _delete_portal_requests(prefixes: list[str]) -> int: + if not db.configured(): + return 0 + + like_prefixes = [f"{prefix}%" for prefix in prefixes] + deleted = 0 + with db.connect() as conn: + for like in like_prefixes: + cursor = conn.execute("DELETE FROM access_requests WHERE username LIKE %s", (like,)) + deleted += cursor.rowcount or 0 + return deleted + + +def _summarize_portal_requests(rows: list[PortalRequest]) -> dict[str, int]: + counts: dict[str, int] = defaultdict(int) + for row in rows: + counts[row.status] += 1 + return dict(counts) + + +def _parse_args(argv: list[str]) -> argparse.Namespace: + parser = argparse.ArgumentParser( + prog="test_user_cleanup", + description=( + "Manual-only cleanup for test users/requests. " + "This script is intended to be run inside the bstein-dev-home backend container." + ), + ) + parser.add_argument( + "--prefix", + action="append", + required=True, + help="Username prefix to target (repeatable). 
Example: --prefix test-", + ) + parser.add_argument( + "--max", + type=int, + default=500, + help="Maximum users/requests to enumerate per prefix (default: 500).", + ) + parser.add_argument( + "--apply", + action="store_true", + help="Apply deletions (default is dry-run). Requires --confirm.", + ) + parser.add_argument( + "--confirm", + default="", + help="Required when using --apply. Must exactly equal the comma-separated prefix list.", + ) + parser.add_argument( + "--skip-keycloak", + action="store_true", + help="Skip deleting Keycloak users.", + ) + parser.add_argument( + "--skip-portal", + action="store_true", + help="Skip deleting portal (DB) access requests.", + ) + parser.add_argument( + "--protect", + action="append", + default=[], + help="Extra usernames to never delete (repeatable).", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="List matched usernames/request codes.", + ) + return parser.parse_args(argv) + + +def main(argv: list[str]) -> int: + args = _parse_args(argv) + prefixes = sorted({p.strip() for p in args.prefix if p.strip()}) + if not prefixes: + print("error: no valid --prefix values provided", file=sys.stderr) + return 2 + + expected_confirm = ",".join(prefixes) + protected = {"bstein", "robotuser", *[p.strip() for p in args.protect if p.strip()]} + + if args.apply and args.confirm != expected_confirm: + print( + f"error: refusing to apply without --confirm '{expected_confirm}' (got '{args.confirm}')", + file=sys.stderr, + ) + return 2 + + keycloak_users: list[KeycloakUser] = [] + portal_requests: list[PortalRequest] = [] + + if not args.skip_keycloak: + keycloak_users = _find_keycloak_users(prefixes, max_results=args.max, protected=protected) + + if not args.skip_portal: + portal_requests = _find_portal_requests(prefixes, max_results=args.max) + + print(f"prefixes: {expected_confirm}") + print(f"mode: {'APPLY' if args.apply else 'DRY-RUN'}") + if protected: + print(f"protected usernames: {', 
'.join(sorted(protected))}") + + if not args.skip_keycloak: + print(f"keycloak users matched: {len(keycloak_users)}") + if args.verbose and keycloak_users: + for user in sorted(keycloak_users, key=lambda u: u.username): + print(f" - {user.username}") + + if not args.skip_portal: + print(f"portal requests matched: {len(portal_requests)}") + if portal_requests: + summary = _summarize_portal_requests(portal_requests) + summary_str = ", ".join(f"{k}={v}" for k, v in sorted(summary.items())) + print(f" statuses: {summary_str}") + if args.verbose and portal_requests: + for req in portal_requests[: min(50, len(portal_requests))]: + print(f" - {req.request_code} ({req.status})") + if len(portal_requests) > 50: + print(f" ... and {len(portal_requests) - 50} more") + + if not args.apply: + print("dry-run complete (no changes made)") + return 0 + + if not args.skip_portal: + deleted = _delete_portal_requests(prefixes) + print(f"deleted portal requests: {deleted}") + + if not args.skip_keycloak: + _delete_keycloak_users(keycloak_users) + print(f"deleted keycloak users: {len(keycloak_users)}") + + print("done") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv[1:])) + diff --git a/scripts/test_user_cleanup.sh b/scripts/test_user_cleanup.sh new file mode 100755 index 0000000..346aedc --- /dev/null +++ b/scripts/test_user_cleanup.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Manual-only helper to run `scripts/test_user_cleanup.py` inside the portal backend container. 
+# +# Usage (dry-run): +# scripts/test_user_cleanup.sh --prefix test- +# +# Usage (apply): +# scripts/test_user_cleanup.sh --prefix test- --apply --confirm test- + +NS="${PORTAL_NAMESPACE:-bstein-dev-home}" +TARGET="${PORTAL_BACKEND_EXEC_TARGET:-deploy/bstein-dev-home-backend}" + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" + +cat "${SCRIPT_DIR}/test_user_cleanup.py" | kubectl -n "${NS}" exec -i "${TARGET}" -- python - "$@" + diff --git a/scripts/test_vaultwarden_user_cleanup.py b/scripts/test_vaultwarden_user_cleanup.py new file mode 100755 index 0000000..93b7aa5 --- /dev/null +++ b/scripts/test_vaultwarden_user_cleanup.py @@ -0,0 +1,318 @@ +#!/usr/bin/env python3 +"""Clean up Vaultwarden test users and invites (manual-only). + +This script deletes Vaultwarden rows directly from the Postgres database. It is +intended only for removing test fallout (e.g. e2e-*, test-*) and is deliberately +conservative: + +- Requires one or more explicit email prefixes (repeatable). +- Dry-run by default; --apply requires an exact --confirm guard. +- Refuses to delete any user with dependent data in Vaultwarden tables. +- Supports a protected email allowlist to prevent catastrophic mistakes. 
+ +Example (dry-run): + scripts/test_vaultwarden_user_cleanup.py --prefix e2e- + +Example (apply): + scripts/test_vaultwarden_user_cleanup.py --prefix e2e- --apply --confirm e2e- +""" + +from __future__ import annotations + +import argparse +import json +import re +import subprocess +import sys +from dataclasses import dataclass +from typing import Iterable, Sequence + + +_SAFE_PREFIX_RE = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9._-]{0,63}$") +_UUID_RE = re.compile(r"^[0-9a-fA-F-]{32,36}$") + + +@dataclass(frozen=True) +class VaultwardenUser: + uuid: str + email: str + dependent_rows: int + + +def _run(cmd: Sequence[str], *, input_bytes: bytes | None = None) -> str: + proc = subprocess.run( + list(cmd), + input=input_bytes, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=False, + ) + if proc.returncode != 0: + stderr = proc.stderr.decode("utf-8", errors="replace").strip() + raise RuntimeError(f"command failed ({proc.returncode}): {' '.join(cmd)}\n{stderr}") + return proc.stdout.decode("utf-8", errors="replace") + + +def _kubectl_first_pod(namespace: str) -> str: + raw = _run(["kubectl", "-n", namespace, "get", "pods", "-o", "json"]) + data = json.loads(raw) + items = data.get("items") or [] + if not isinstance(items, list) or not items: + raise RuntimeError(f"no pods found in namespace {namespace}") + name = items[0].get("metadata", {}).get("name") + if not isinstance(name, str) or not name: + raise RuntimeError(f"unexpected pod list in namespace {namespace}") + return name + + +def _psql(sql: str) -> str: + pod = _kubectl_first_pod("postgres") + return _run( + [ + "kubectl", + "-n", + "postgres", + "exec", + "-i", + pod, + "--", + "psql", + "-U", + "postgres", + "-d", + "vaultwarden", + "-At", + "-F", + "\t", + "-c", + sql, + ] + ) + + +def _validate_prefixes(prefixes: Iterable[str]) -> list[str]: + cleaned: list[str] = [] + for prefix in prefixes: + prefix = prefix.strip() + if not prefix: + continue + if not _SAFE_PREFIX_RE.match(prefix): + raise 
SystemExit( + f"invalid prefix '{prefix}': must match {_SAFE_PREFIX_RE.pattern} (alnum plus ._-)" + ) + if not prefix.endswith("-"): + raise SystemExit(f"refusing prefix '{prefix}': must end with '-' for safety") + cleaned.append(prefix) + if not cleaned: + raise SystemExit("at least one --prefix is required") + return sorted(set(cleaned)) + + +def _parse_rows(tsv: str) -> list[list[str]]: + rows: list[list[str]] = [] + for line in tsv.splitlines(): + line = line.strip() + if not line: + continue + rows.append(line.split("\t")) + return rows + + +def _sql_or_email_prefixes(prefixes: list[str]) -> str: + # prefixes validated to safe charset; safe to interpolate. + clauses = [f"email LIKE '{p}%'" for p in prefixes] + return " OR ".join(clauses) if clauses else "FALSE" + + +def _sql_quote(value: str) -> str: + return "'" + value.replace("'", "''") + "'" + + +def _sql_text_array(values: Iterable[str]) -> str: + items = ",".join(_sql_quote(v) for v in values) + return f"ARRAY[{items}]::text[]" + + +def _list_users(prefixes: list[str], protected: set[str]) -> list[VaultwardenUser]: + clause = _sql_or_email_prefixes(prefixes) + sql = f""" + WITH candidates AS ( + SELECT uuid, email + FROM users + WHERE enabled + AND ({clause}) + AND email <> ALL({_sql_text_array(sorted(protected))}) + ) + SELECT + candidates.uuid, + candidates.email, + ( + (SELECT COUNT(*) FROM auth_requests WHERE user_uuid = candidates.uuid) + + (SELECT COUNT(*) FROM ciphers WHERE user_uuid = candidates.uuid) + + (SELECT COUNT(*) FROM devices WHERE user_uuid = candidates.uuid) + + (SELECT COUNT(*) FROM emergency_access WHERE grantor_uuid = candidates.uuid OR grantee_uuid = candidates.uuid) + + (SELECT COUNT(*) FROM favorites WHERE user_uuid = candidates.uuid) + + (SELECT COUNT(*) FROM folders WHERE user_uuid = candidates.uuid) + + (SELECT COUNT(*) FROM sends WHERE user_uuid = candidates.uuid) + + (SELECT COUNT(*) FROM twofactor WHERE user_uuid = candidates.uuid) + + (SELECT COUNT(*) FROM 
twofactor_incomplete WHERE user_uuid = candidates.uuid) + + (SELECT COUNT(*) FROM users_collections WHERE user_uuid = candidates.uuid) + + (SELECT COUNT(*) FROM users_organizations WHERE user_uuid = candidates.uuid) + ) AS dependent_rows + FROM candidates + ORDER BY candidates.email; + """ + out = _psql(sql) + users: list[VaultwardenUser] = [] + for row in _parse_rows(out): + if len(row) < 3: + continue + uuid, email, dep_raw = row[0].strip(), row[1].strip(), row[2].strip() + if not uuid or not email: + continue + if not _UUID_RE.match(uuid): + continue + try: + dep = int(dep_raw) + except ValueError: + dep = 0 + users.append(VaultwardenUser(uuid=uuid, email=email, dependent_rows=dep)) + return users + + +def _list_invitations(prefixes: list[str], protected: set[str]) -> list[str]: + clause = _sql_or_email_prefixes(prefixes) + protected_clause = "" + if protected: + protected_clause = f"AND email <> ALL({_sql_text_array(sorted(protected))})" + sql = f"SELECT email FROM invitations WHERE ({clause}) {protected_clause} ORDER BY email;" + out = _psql(sql) + invites: list[str] = [] + for row in _parse_rows(out): + if not row: + continue + email = row[0].strip() + if email: + invites.append(email) + return invites + + +def _delete_invitations(emails: list[str]) -> int: + if not emails: + return 0 + email_list = ",".join(_sql_quote(e) for e in emails) + sql = f"DELETE FROM invitations WHERE email IN ({email_list});" + out = _psql(sql) + match = re.search(r"DELETE\s+(\d+)", out) + return int(match.group(1)) if match else 0 + + +def _delete_users(uuids: list[str]) -> int: + if not uuids: + return 0 + uuid_list = ",".join(_sql_quote(u) for u in uuids) + sql = f"DELETE FROM users WHERE uuid IN ({uuid_list});" + out = _psql(sql) + match = re.search(r"DELETE\s+(\d+)", out) + return int(match.group(1)) if match else 0 + + +def _parse_args(argv: list[str]) -> argparse.Namespace: + parser = argparse.ArgumentParser( + prog="test_vaultwarden_user_cleanup", + description="Manual-only 
cleanup for Vaultwarden test users/invites (DB-level).", + ) + parser.add_argument( + "--prefix", + action="append", + required=True, + help="Email prefix to target (repeatable). Example: --prefix e2e-", + ) + parser.add_argument( + "--apply", + action="store_true", + help="Apply deletions (default is dry-run). Requires --confirm.", + ) + parser.add_argument( + "--confirm", + default="", + help="Required when using --apply. Must exactly equal the comma-separated prefix list.", + ) + parser.add_argument( + "--protect-email", + action="append", + default=[], + help="Vaultwarden emails that must never be deleted (repeatable).", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="List matched emails (and invitation emails).", + ) + return parser.parse_args(argv) + + +def main(argv: list[str]) -> int: + args = _parse_args(argv) + prefixes = _validate_prefixes(args.prefix) + expected_confirm = ",".join(prefixes) + + protected = {e.strip() for e in args.protect_email if e.strip()} + protected |= { + "brad@bstein.dev", + "edstein87@outlook.com", + "indifox8@gmail.com", + "mgs.stein@gmail.com", + "patriot87@gmail.com", + } + + if args.apply and args.confirm != expected_confirm: + print( + f"error: refusing to apply without --confirm '{expected_confirm}' (got '{args.confirm}')", + file=sys.stderr, + ) + return 2 + + users = _list_users(prefixes, protected=protected) + invites = _list_invitations(prefixes, protected=protected) + + print(f"prefixes: {expected_confirm}") + print(f"mode: {'APPLY' if args.apply else 'DRY-RUN'}") + if protected: + print(f"protected emails: {', '.join(sorted(protected))}") + print(f"vaultwarden users matched: {len(users)}") + print(f"vaultwarden invitations matched: {len(invites)}") + + if args.verbose: + for user in users[: min(100, len(users))]: + print(f" user: {user.email} (deps={user.dependent_rows})") + if len(users) > 100: + print(f" ... 
and {len(users) - 100} more users") + for email in invites[: min(100, len(invites))]: + print(f" invite: {email}") + if len(invites) > 100: + print(f" ... and {len(invites) - 100} more invitations") + + unsafe = [u for u in users if u.dependent_rows > 0] + if unsafe: + print("refusing to delete users with dependent data:", file=sys.stderr) + for user in unsafe[: min(50, len(unsafe))]: + print(f" - {user.email} deps={user.dependent_rows}", file=sys.stderr) + if len(unsafe) > 50: + print(f" ... and {len(unsafe) - 50} more", file=sys.stderr) + return 2 + + if not args.apply: + print("dry-run complete (no changes made)") + return 0 + + deleted_invites = _delete_invitations(invites) + deleted_users = _delete_users([u.uuid for u in users]) + print(f"deleted vaultwarden invitations: {deleted_invites}") + print(f"deleted vaultwarden users: {deleted_users}") + print("done") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv[1:])) diff --git a/scripts/test_vaultwarden_user_cleanup.sh b/scripts/test_vaultwarden_user_cleanup.sh new file mode 100755 index 0000000..f21f85e --- /dev/null +++ b/scripts/test_vaultwarden_user_cleanup.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Manual-only helper to clean Vaultwarden test users and invites from Postgres. 
+# +# Usage (dry-run): +# scripts/test_vaultwarden_user_cleanup.sh --prefix e2e- +# +# Usage (apply): +# scripts/test_vaultwarden_user_cleanup.sh --prefix e2e- --apply --confirm e2e- + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" + +python3 "${SCRIPT_DIR}/test_vaultwarden_user_cleanup.py" "$@" + diff --git a/scripts/tests/test_mailu_sync.py b/scripts/tests/test_mailu_sync.py index 41616b2..49bd2e4 100644 --- a/scripts/tests/test_mailu_sync.py +++ b/scripts/tests/test_mailu_sync.py @@ -20,7 +20,13 @@ def load_sync_module(monkeypatch): } for k, v in env.items(): monkeypatch.setenv(k, v) - module_path = pathlib.Path(__file__).resolve().parents[1] / "mailu_sync.py" + module_path = ( + pathlib.Path(__file__).resolve().parents[2] + / "services" + / "mailu" + / "scripts" + / "mailu_sync.py" + ) spec = importlib.util.spec_from_file_location("mailu_sync_testmod", module_path) module = importlib.util.module_from_spec(spec) assert spec.loader is not None @@ -102,7 +108,8 @@ def test_kc_get_users_paginates(monkeypatch): sync.SESSION = _PagedSession() users = sync.kc_get_users("tok") assert [u["id"] for u in users] == ["u1", "u2"] - assert sync.SESSION.calls == 2 + # Pagination stops when results < page size. 
+ assert sync.SESSION.calls == 1 def test_ensure_mailu_user_skips_foreign_domain(monkeypatch): @@ -119,6 +126,7 @@ def test_ensure_mailu_user_skips_foreign_domain(monkeypatch): def test_ensure_mailu_user_upserts(monkeypatch): sync = load_sync_module(monkeypatch) + monkeypatch.setattr(sync.bcrypt_sha256, "hash", lambda password: f"hash:{password}") captured = {} class _Cursor: @@ -134,6 +142,7 @@ def test_ensure_mailu_user_upserts(monkeypatch): def test_main_generates_password_and_upserts(monkeypatch): sync = load_sync_module(monkeypatch) + monkeypatch.setattr(sync.bcrypt_sha256, "hash", lambda password: f"hash:{password}") users = [ {"id": "u1", "username": "user1", "email": "user1@example.com", "attributes": {}}, {"id": "u2", "username": "user2", "email": "user2@example.com", "attributes": {"mailu_app_password": ["keepme"]}}, @@ -176,6 +185,6 @@ def test_main_generates_password_and_upserts(monkeypatch): sync.main() - # Should attempt two inserts (third user skipped due to domain mismatch) - assert len(updated) == 1 # only one missing attr was backfilled - assert conns and len(conns[0]._cursor.executions) == 2 + # Always backfill mailu_email, even if Keycloak recovery email is external. 
+ assert len(updated) == 3 + assert conns and len(conns[0]._cursor.executions) == 3 diff --git a/services/ai-llm/deployment.yaml b/services/ai-llm/deployment.yaml new file mode 100644 index 0000000..b6e6701 --- /dev/null +++ b/services/ai-llm/deployment.yaml @@ -0,0 +1,105 @@ +# services/ai-llm/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ollama + namespace: ai +spec: + replicas: 1 + revisionHistoryLimit: 2 + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 0 + maxUnavailable: 1 + selector: + matchLabels: + app: ollama + template: + metadata: + labels: + app: ollama + annotations: + ai.bstein.dev/model: qwen2.5-coder:7b-instruct-q4_0 + ai.bstein.dev/gpu: GPU pool (titan-20/21/22/24) + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - titan-20 + - titan-21 + - titan-22 + - titan-24 + runtimeClassName: nvidia + volumes: + - name: models + persistentVolumeClaim: + claimName: ollama-models + initContainers: + - name: warm-model + image: ollama/ollama:latest + env: + - name: OLLAMA_HOST + value: 0.0.0.0 + - name: NVIDIA_VISIBLE_DEVICES + value: all + - name: NVIDIA_DRIVER_CAPABILITIES + value: compute,utility + - name: OLLAMA_MODELS + value: /root/.ollama + - name: OLLAMA_MODEL + value: qwen2.5-coder:7b-instruct-q4_0 + command: + - /bin/sh + - -c + - | + set -e + ollama serve >/tmp/ollama.log 2>&1 & + sleep 6 + ollama pull "${OLLAMA_MODEL}" + pkill ollama || true + volumeMounts: + - name: models + mountPath: /root/.ollama + resources: + requests: + cpu: 250m + memory: 1Gi + nvidia.com/gpu.shared: 1 + limits: + nvidia.com/gpu.shared: 1 + containers: + - name: ollama + image: ollama/ollama:latest + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 11434 + env: + - name: OLLAMA_HOST + value: 0.0.0.0 + - name: OLLAMA_KEEP_ALIVE + value: 6h + - name: OLLAMA_MODELS + value: 
/root/.ollama + - name: NVIDIA_VISIBLE_DEVICES + value: all + - name: NVIDIA_DRIVER_CAPABILITIES + value: compute,utility + volumeMounts: + - name: models + mountPath: /root/.ollama + resources: + requests: + cpu: "2" + memory: 8Gi + nvidia.com/gpu.shared: 1 + limits: + cpu: "4" + memory: 12Gi + nvidia.com/gpu.shared: 1 diff --git a/services/ai-llm/kustomization.yaml b/services/ai-llm/kustomization.yaml new file mode 100644 index 0000000..46ea286 --- /dev/null +++ b/services/ai-llm/kustomization.yaml @@ -0,0 +1,9 @@ +# services/ai-llm/kustomization.yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: ai +resources: + - namespace.yaml + - pvc.yaml + - deployment.yaml + - service.yaml diff --git a/services/ai-llm/namespace.yaml b/services/ai-llm/namespace.yaml new file mode 100644 index 0000000..96f5a81 --- /dev/null +++ b/services/ai-llm/namespace.yaml @@ -0,0 +1,5 @@ +# services/ai-llm/namespace.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: ai diff --git a/services/ai-llm/pvc.yaml b/services/ai-llm/pvc.yaml new file mode 100644 index 0000000..51c0384 --- /dev/null +++ b/services/ai-llm/pvc.yaml @@ -0,0 +1,13 @@ +# services/ai-llm/pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: ollama-models + namespace: ai +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 30Gi + storageClassName: astreae diff --git a/services/ai-llm/service.yaml b/services/ai-llm/service.yaml new file mode 100644 index 0000000..f086a90 --- /dev/null +++ b/services/ai-llm/service.yaml @@ -0,0 +1,14 @@ +# services/ai-llm/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: ollama + namespace: ai +spec: + type: ClusterIP + selector: + app: ollama + ports: + - name: http + port: 11434 + targetPort: 11434 diff --git a/services/bstein-dev-home/backend-deployment.yaml b/services/bstein-dev-home/backend-deployment.yaml index 1159487..2e92443 100644 --- a/services/bstein-dev-home/backend-deployment.yaml +++ 
b/services/bstein-dev-home/backend-deployment.yaml @@ -5,7 +5,7 @@ metadata: name: bstein-dev-home-backend namespace: bstein-dev-home spec: - replicas: 2 + replicas: 1 revisionHistoryLimit: 3 selector: matchLabels: @@ -15,6 +15,8 @@ spec: labels: app: bstein-dev-home-backend spec: + automountServiceAccountToken: true + serviceAccountName: bstein-dev-home nodeSelector: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: "true" @@ -22,8 +24,73 @@ spec: - name: harbor-bstein-robot containers: - name: backend - image: registry.bstein.dev/bstein/bstein-dev-home-backend:latest + image: registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"} imagePullPolicy: Always + command: ["gunicorn"] + args: + - -b + - 0.0.0.0:8080 + - --workers + - "2" + - --timeout + - "180" + - app:app + env: + - name: AI_CHAT_API + value: http://ollama.ai.svc.cluster.local:11434 + - name: AI_CHAT_MODEL + value: qwen2.5-coder:7b-instruct-q4_0 + - name: AI_CHAT_TIMEOUT_SEC + value: "60" + - name: AI_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: AI_NODE_GPU_MAP + value: | + {"titan-20": "Jetson Xavier (edge GPU)", "titan-21": "Jetson Xavier (edge GPU)", "titan-22": "RTX 3050 8GB (local GPU)", "titan-24": "RTX 3080 8GB (local GPU)"} + - name: KEYCLOAK_ENABLED + value: "true" + - name: KEYCLOAK_URL + value: https://sso.bstein.dev + - name: KEYCLOAK_REALM + value: atlas + - name: KEYCLOAK_CLIENT_ID + value: bstein-dev-home + - name: KEYCLOAK_ISSUER + value: https://sso.bstein.dev/realms/atlas + - name: KEYCLOAK_JWKS_URL + value: http://keycloak.sso.svc.cluster.local/realms/atlas/protocol/openid-connect/certs + - name: KEYCLOAK_ADMIN_URL + value: http://keycloak.sso.svc.cluster.local + - name: KEYCLOAK_ADMIN_REALM + value: atlas + - name: KEYCLOAK_ADMIN_CLIENT_ID + value: bstein-dev-home-admin + - name: KEYCLOAK_ADMIN_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: bstein-dev-home-keycloak-admin + key: 
client_secret + - name: ACCOUNT_ALLOWED_GROUPS + value: "" + - name: PORTAL_DATABASE_URL + valueFrom: + secretKeyRef: + name: atlas-portal-db + key: PORTAL_DATABASE_URL + - name: HTTP_CHECK_TIMEOUT_SEC + value: "2" + - name: ACCESS_REQUEST_SUBMIT_RATE_LIMIT + value: "30" + - name: ACCESS_REQUEST_SUBMIT_RATE_WINDOW_SEC + value: "3600" + - name: ACCESS_REQUEST_STATUS_RATE_LIMIT + value: "120" + - name: ACCESS_REQUEST_STATUS_RATE_WINDOW_SEC + value: "60" + - name: ACCESS_REQUEST_INTERNAL_EMAIL_ALLOWLIST + value: robotuser@bstein.dev ports: - name: http containerPort: 8080 @@ -33,16 +100,18 @@ spec: port: http initialDelaySeconds: 2 periodSeconds: 5 + timeoutSeconds: 3 livenessProbe: httpGet: path: /api/healthz port: http initialDelaySeconds: 10 periodSeconds: 10 + timeoutSeconds: 3 resources: requests: - cpu: 50m - memory: 64Mi + cpu: 100m + memory: 128Mi limits: - cpu: 300m - memory: 256Mi + cpu: 500m + memory: 512Mi diff --git a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml new file mode 100644 index 0000000..7ac6504 --- /dev/null +++ b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml @@ -0,0 +1,69 @@ +# services/bstein-dev-home/chat-ai-gateway-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chat-ai-gateway + namespace: bstein-dev-home +spec: + replicas: 1 + revisionHistoryLimit: 2 + selector: + matchLabels: + app: chat-ai-gateway + template: + metadata: + labels: + app: chat-ai-gateway + spec: + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" + containers: + - name: gateway + image: python:3.11-slim + command: ["/bin/sh","-c"] + args: + - python /app/gateway.py + env: + - name: UPSTREAM_URL + value: http://bstein-dev-home-backend/api/chat + - name: CHAT_KEY_MATRIX + valueFrom: + secretKeyRef: + name: chat-ai-keys-runtime + key: matrix + - name: CHAT_KEY_HOMEPAGE + valueFrom: + secretKeyRef: + name: chat-ai-keys-runtime + key: 
homepage + ports: + - name: http + containerPort: 8080 + readinessProbe: + httpGet: + path: /healthz + port: http + initialDelaySeconds: 2 + periodSeconds: 5 + livenessProbe: + httpGet: + path: /healthz + port: http + initialDelaySeconds: 10 + periodSeconds: 10 + resources: + requests: + cpu: 20m + memory: 64Mi + limits: + cpu: 200m + memory: 256Mi + volumeMounts: + - name: code + mountPath: /app/gateway.py + subPath: gateway.py + volumes: + - name: code + configMap: + name: chat-ai-gateway diff --git a/services/bstein-dev-home/chat-ai-gateway-service.yaml b/services/bstein-dev-home/chat-ai-gateway-service.yaml new file mode 100644 index 0000000..8a71d20 --- /dev/null +++ b/services/bstein-dev-home/chat-ai-gateway-service.yaml @@ -0,0 +1,13 @@ +# services/bstein-dev-home/chat-ai-gateway-service.yaml +apiVersion: v1 +kind: Service +metadata: + name: chat-ai-gateway + namespace: bstein-dev-home +spec: + selector: + app: chat-ai-gateway + ports: + - name: http + port: 80 + targetPort: 8080 diff --git a/services/bstein-dev-home/frontend-deployment.yaml b/services/bstein-dev-home/frontend-deployment.yaml index 7189bee..3092edb 100644 --- a/services/bstein-dev-home/frontend-deployment.yaml +++ b/services/bstein-dev-home/frontend-deployment.yaml @@ -5,7 +5,7 @@ metadata: name: bstein-dev-home-frontend namespace: bstein-dev-home spec: - replicas: 2 + replicas: 1 revisionHistoryLimit: 3 selector: matchLabels: @@ -22,7 +22,7 @@ spec: - name: harbor-bstein-robot containers: - name: frontend - image: registry.bstein.dev/bstein/bstein-dev-home-frontend:latest + image: registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"} imagePullPolicy: Always ports: - name: http diff --git a/services/bstein-dev-home/ingress.yaml b/services/bstein-dev-home/ingress.yaml index 471f1bc..1537c94 100644 --- a/services/bstein-dev-home/ingress.yaml +++ b/services/bstein-dev-home/ingress.yaml @@ -11,7 +11,7 @@ metadata: 
cert-manager.io/cluster-issuer: letsencrypt spec: tls: - - hosts: [ "bstein.dev" ] + - hosts: [ "bstein.dev", "chat.ai.bstein.dev" ] secretName: bstein-dev-home-tls rules: - host: bstein.dev @@ -29,3 +29,12 @@ spec: service: name: bstein-dev-home-frontend port: { number: 80 } + - host: chat.ai.bstein.dev + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: chat-ai-gateway + port: { number: 80 } diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 3268f6d..4847d2b 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -5,13 +5,34 @@ namespace: bstein-dev-home resources: - namespace.yaml - image.yaml + - rbac.yaml + - portal-e2e-client-secret-sync-rbac.yaml + - chat-ai-gateway-deployment.yaml + - chat-ai-gateway-service.yaml - frontend-deployment.yaml - frontend-service.yaml - backend-deployment.yaml - backend-service.yaml + - vaultwarden-cred-sync-cronjob.yaml + - portal-onboarding-e2e-test-job.yaml - ingress.yaml -images: - - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.0-11 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"} - - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.0-11 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"} + +configMapGenerator: + - name: chat-ai-gateway + namespace: bstein-dev-home + files: + - gateway.py=scripts/gateway.py + options: + disableNameSuffixHash: true + - name: vaultwarden-cred-sync-script + namespace: bstein-dev-home + files: + - vaultwarden_cred_sync.py=scripts/vaultwarden_cred_sync.py + options: + disableNameSuffixHash: true + - name: portal-onboarding-e2e-tests + namespace: bstein-dev-home + files: + - test_portal_onboarding_flow.py=scripts/test_portal_onboarding_flow.py + options: + disableNameSuffixHash: true diff --git a/services/bstein-dev-home/portal-e2e-client-secret-sync-rbac.yaml 
b/services/bstein-dev-home/portal-e2e-client-secret-sync-rbac.yaml new file mode 100644 index 0000000..045bd0a --- /dev/null +++ b/services/bstein-dev-home/portal-e2e-client-secret-sync-rbac.yaml @@ -0,0 +1,24 @@ +# services/bstein-dev-home/portal-e2e-client-secret-sync-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: portal-e2e-client-secret-sync-target + namespace: bstein-dev-home +rules: + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "create", "patch", "update"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: portal-e2e-client-secret-sync-target + namespace: bstein-dev-home +subjects: + - kind: ServiceAccount + name: portal-e2e-client-secret-sync + namespace: sso +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: portal-e2e-client-secret-sync-target diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml new file mode 100644 index 0000000..3170f86 --- /dev/null +++ b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml @@ -0,0 +1,66 @@ +# services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: portal-onboarding-e2e-test-11 + namespace: bstein-dev-home +spec: + backoffLimit: 0 + template: + spec: + restartPolicy: Never + containers: + - name: test + image: python:3.11-slim + env: + - name: PORTAL_BASE_URL + value: http://bstein-dev-home-backend.bstein-dev-home.svc.cluster.local + - name: KEYCLOAK_ADMIN_URL + value: https://sso.bstein.dev + - name: KEYCLOAK_REALM + value: atlas + - name: KEYCLOAK_ADMIN_CLIENT_ID + value: bstein-dev-home-admin + - name: KEYCLOAK_ADMIN_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: bstein-dev-home-keycloak-admin + key: client_secret + - name: PORTAL_E2E_CLIENT_ID + valueFrom: + secretKeyRef: + name: portal-e2e-client + key: client_id + - name: PORTAL_E2E_CLIENT_SECRET + valueFrom: + 
secretKeyRef: + name: portal-e2e-client + key: client_secret + - name: PORTAL_TARGET_CLIENT_ID + value: bstein-dev-home + - name: E2E_PORTAL_ADMIN_USERNAME + value: bstein + - name: E2E_USERNAME_PREFIX + value: e2e-portal + - name: E2E_CONTACT_EMAIL + value: robotuser@bstein.dev + - name: E2E_IMAP_KEYCLOAK_USERNAME + value: robotuser + - name: E2E_DEADLINE_SECONDS + value: "600" + - name: E2E_POLL_SECONDS + value: "10" + command: ["/bin/sh", "-c"] + args: + - | + set -euo pipefail + python /scripts/test_portal_onboarding_flow.py + volumeMounts: + - name: tests + mountPath: /scripts + readOnly: true + volumes: + - name: tests + configMap: + name: portal-onboarding-e2e-tests + defaultMode: 0555 diff --git a/services/bstein-dev-home/rbac.yaml b/services/bstein-dev-home/rbac.yaml new file mode 100644 index 0000000..f97ed24 --- /dev/null +++ b/services/bstein-dev-home/rbac.yaml @@ -0,0 +1,108 @@ +# services/bstein-dev-home/rbac.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: bstein-dev-home + namespace: bstein-dev-home +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: bstein-dev-home-ai-reader +rules: + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch"] + resourceNames: [] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: bstein-dev-home-ai-reader +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: bstein-dev-home-ai-reader +subjects: + - kind: ServiceAccount + name: bstein-dev-home + namespace: bstein-dev-home +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: bstein-dev-home-vaultwarden-admin-secret-reader +rules: + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get"] + resourceNames: ["vaultwarden-admin"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: bstein-dev-home-vaultwarden-admin-secret-reader +roleRef: + apiGroup: rbac.authorization.k8s.io + 
kind: ClusterRole + name: bstein-dev-home-vaultwarden-admin-secret-reader +subjects: + - kind: ServiceAccount + name: bstein-dev-home + namespace: bstein-dev-home +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: bstein-dev-home-vaultwarden-admin-token-reader + namespace: vaultwarden +rules: + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get"] + resourceNames: ["vaultwarden-admin"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: bstein-dev-home-vaultwarden-admin-token-reader + namespace: vaultwarden +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: bstein-dev-home-vaultwarden-admin-token-reader +subjects: + - kind: ServiceAccount + name: bstein-dev-home + namespace: bstein-dev-home +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: bstein-dev-home-nextcloud-mail-sync + namespace: nextcloud +rules: + - apiGroups: ["batch"] + resources: ["cronjobs"] + verbs: ["get"] + resourceNames: ["nextcloud-mail-sync"] + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["create", "get", "list", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: bstein-dev-home-nextcloud-mail-sync + namespace: nextcloud +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: bstein-dev-home-nextcloud-mail-sync +subjects: + - kind: ServiceAccount + name: bstein-dev-home + namespace: bstein-dev-home diff --git a/services/bstein-dev-home/scripts/gateway.py b/services/bstein-dev-home/scripts/gateway.py new file mode 100644 index 0000000..3ca2fa1 --- /dev/null +++ b/services/bstein-dev-home/scripts/gateway.py @@ -0,0 +1,70 @@ +import json +import os +from http.server import BaseHTTPRequestHandler, HTTPServer +from urllib import request, error + +UPSTREAM = os.environ.get("UPSTREAM_URL", "http://bstein-dev-home-backend/api/chat") +KEY_MATRIX = 
os.environ.get("CHAT_KEY_MATRIX", "") +KEY_HOMEPAGE = os.environ.get("CHAT_KEY_HOMEPAGE", "") + +ALLOWED = {k for k in (KEY_MATRIX, KEY_HOMEPAGE) if k} + +class Handler(BaseHTTPRequestHandler): + def _send_json(self, code: int, payload: dict): + body = json.dumps(payload).encode() + self.send_response(code) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def do_GET(self): # noqa: N802 + if self.path in ("/healthz", "/"): + return self._send_json(200, {"ok": True}) + return self._send_json(404, {"error": "not_found"}) + + def do_POST(self): # noqa: N802 + if self.path != "/": + return self._send_json(404, {"error": "not_found"}) + + key = self.headers.get("x-api-key", "") + if not key or key not in ALLOWED: + return self._send_json(401, {"error": "unauthorized"}) + + length = int(self.headers.get("content-length", "0") or "0") + raw = self.rfile.read(length) if length else b"{}" + + try: + upstream_req = request.Request( + UPSTREAM, + data=raw, + headers={"Content-Type": "application/json"}, + method="POST", + ) + with request.urlopen(upstream_req, timeout=90) as resp: + data = resp.read() + self.send_response(resp.status) + for k, v in resp.headers.items(): + if k.lower() in ("content-length", "connection", "server", "date"): + continue + self.send_header(k, v) + self.send_header("Content-Length", str(len(data))) + self.end_headers() + self.wfile.write(data) + except error.HTTPError as e: + data = e.read() if hasattr(e, "read") else b"" + self.send_response(e.code) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(data))) + self.end_headers() + self.wfile.write(data) + except Exception: + return self._send_json(502, {"error": "bad_gateway"}) + +def main(): + port = int(os.environ.get("PORT", "8080")) + httpd = HTTPServer(("0.0.0.0", port), Handler) + httpd.serve_forever() + +if __name__ == "__main__": + 
main() diff --git a/services/bstein-dev-home/scripts/test_portal_onboarding_flow.py b/services/bstein-dev-home/scripts/test_portal_onboarding_flow.py new file mode 100644 index 0000000..9c5124a --- /dev/null +++ b/services/bstein-dev-home/scripts/test_portal_onboarding_flow.py @@ -0,0 +1,428 @@ +#!/usr/bin/env python3 +import email +import http.client +import imaplib +import json +import os +import re +import ssl +import sys +import time +import urllib.error +import urllib.parse +import urllib.request + + +def _env(name: str, default: str | None = None) -> str: + value = os.environ.get(name, default) + if value is None or value == "": + raise SystemExit(f"missing required env var: {name}") + return value + + +def _post_json(url: str, payload: dict, timeout_s: int = 30) -> dict: + body = json.dumps(payload).encode() + req = urllib.request.Request( + url, + data=body, + headers={"Content-Type": "application/json"}, + method="POST", + ) + try: + with urllib.request.urlopen(req, timeout=timeout_s) as resp: + raw = resp.read().decode() + return json.loads(raw) if raw else {} + except urllib.error.HTTPError as exc: + raw = exc.read().decode(errors="replace") + raise SystemExit(f"HTTP {exc.code} from {url}: {raw}") + + +def _post_form(url: str, data: dict[str, str], timeout_s: int = 30) -> dict: + body = urllib.parse.urlencode(data).encode() + req = urllib.request.Request( + url, + data=body, + headers={"Content-Type": "application/x-www-form-urlencoded"}, + method="POST", + ) + try: + with urllib.request.urlopen(req, timeout=timeout_s) as resp: + raw = resp.read().decode() + return json.loads(raw) if raw else {} + except urllib.error.HTTPError as exc: + raw = exc.read().decode(errors="replace") + raise SystemExit(f"HTTP {exc.code} from {url}: {raw}") + + +def _get_json(url: str, headers: dict[str, str] | None = None, timeout_s: int = 30) -> object: + req = urllib.request.Request(url, headers=headers or {}, method="GET") + try: + with urllib.request.urlopen(req, 
timeout=timeout_s) as resp: + raw = resp.read().decode() + return json.loads(raw) if raw else None + except urllib.error.HTTPError as exc: + raw = exc.read().decode(errors="replace") + raise SystemExit(f"HTTP {exc.code} from {url}: {raw}") + + +def _request_json( + method: str, + url: str, + token: str, + payload: dict | None = None, + timeout_s: int = 30, +) -> dict: + data = None + headers = {"Authorization": f"Bearer {token}"} + if payload is not None: + data = json.dumps(payload).encode() + headers["Content-Type"] = "application/json" + req = urllib.request.Request(url, data=data, headers=headers, method=method) + try: + with urllib.request.urlopen(req, timeout=timeout_s) as resp: + raw = resp.read().decode() + return json.loads(raw) if raw else {} + except urllib.error.HTTPError as exc: + raw = exc.read().decode(errors="replace") + raise SystemExit(f"HTTP {exc.code} from {url}: {raw}") + + +def _keycloak_client_token(keycloak_base: str, realm: str, client_id: str, client_secret: str) -> str: + token_url = f"{keycloak_base.rstrip('/')}/realms/{realm}/protocol/openid-connect/token" + payload = _post_form( + token_url, + { + "grant_type": "client_credentials", + "client_id": client_id, + "client_secret": client_secret, + }, + timeout_s=20, + ) + token = payload.get("access_token") + if not isinstance(token, str) or not token: + raise SystemExit("keycloak token response missing access_token") + return token + + +def _keycloak_token_exchange( + *, + keycloak_base: str, + realm: str, + client_id: str, + client_secret: str, + subject_token: str, + requested_subject: str, + audience: str, +) -> str: + token_url = f"{keycloak_base.rstrip('/')}/realms/{realm}/protocol/openid-connect/token" + payload = _post_form( + token_url, + { + "grant_type": "urn:ietf:params:oauth:grant-type:token-exchange", + "client_id": client_id, + "client_secret": client_secret, + "subject_token": subject_token, + "requested_subject": requested_subject, + "audience": audience, + }, + 
timeout_s=20, + ) + token = payload.get("access_token") + if not isinstance(token, str) or not token: + raise SystemExit("keycloak token exchange response missing access_token") + return token + + +def _keycloak_find_user(keycloak_base: str, realm: str, token: str, username: str) -> dict | None: + url = f"{keycloak_base.rstrip('/')}/admin/realms/{realm}/users?{urllib.parse.urlencode({'username': username, 'exact': 'true', 'max': '1'})}" + users = _get_json(url, headers={"Authorization": f"Bearer {token}"}, timeout_s=20) + if not isinstance(users, list) or not users: + return None + user = users[0] + return user if isinstance(user, dict) else None + + +def _keycloak_get_user(keycloak_base: str, realm: str, token: str, user_id: str) -> dict: + url = f"{keycloak_base.rstrip('/')}/admin/realms/{realm}/users/{urllib.parse.quote(user_id, safe='')}" + data = _get_json(url, headers={"Authorization": f"Bearer {token}"}, timeout_s=20) + if not isinstance(data, dict): + raise SystemExit("unexpected keycloak user payload") + return data + + +def _extract_attr(attributes: object, key: str) -> str: + if not isinstance(attributes, dict): + return "" + value = attributes.get(key) + if isinstance(value, list) and value and isinstance(value[0], str): + return value[0] + if isinstance(value, str): + return value + return "" + + +def _imap_wait_for_verify_token( + *, + host: str, + port: int, + username: str, + password: str, + request_code: str, + deadline_sec: int, +) -> str: + ssl_context = ssl._create_unverified_context() + deadline_at = time.monotonic() + deadline_sec + + with imaplib.IMAP4_SSL(host, port, ssl_context=ssl_context) as client: + client.login(username, password) + client.select("INBOX") + + while time.monotonic() < deadline_at: + status, data = client.search(None, "TEXT", request_code) + if status == "OK" and data and data[0]: + ids = data[0].split() + msg_id = ids[-1] + fetch_status, msg_data = client.fetch(msg_id, "(RFC822)") + if fetch_status != "OK" or not 
msg_data: + time.sleep(2) + continue + + raw = msg_data[0][1] if isinstance(msg_data[0], tuple) and len(msg_data[0]) > 1 else None + if not isinstance(raw, (bytes, bytearray)): + time.sleep(2) + continue + + message = email.message_from_bytes(raw) + body = None + if message.is_multipart(): + for part in message.walk(): + if part.get_content_type() == "text/plain": + payload = part.get_payload(decode=True) + if isinstance(payload, (bytes, bytearray)): + body = payload.decode(errors="replace") + break + else: + payload = message.get_payload(decode=True) + if isinstance(payload, (bytes, bytearray)): + body = payload.decode(errors="replace") + + if not body: + time.sleep(2) + continue + + url = None + for line in body.splitlines(): + candidate = line.strip() + if "verify=" in candidate and candidate.startswith("http"): + url = candidate + break + if not url: + match = re.search(r"https?://\\S+verify=\\S+", body) + url = match.group(0) if match else None + if not url: + time.sleep(2) + continue + + parsed = urllib.parse.urlparse(url) + query = urllib.parse.parse_qs(parsed.query) + token = query.get("verify", [""])[0] + if isinstance(token, str) and token: + return token + time.sleep(2) + + raise SystemExit("verification email not found before deadline") + + +def main() -> int: + portal_base = _env("PORTAL_BASE_URL").rstrip("/") + + keycloak_base = _env("KEYCLOAK_ADMIN_URL").rstrip("/") + realm = _env("KEYCLOAK_REALM", "atlas") + kc_admin_client_id = _env("KEYCLOAK_ADMIN_CLIENT_ID") + kc_admin_client_secret = _env("KEYCLOAK_ADMIN_CLIENT_SECRET") + portal_e2e_client_id = _env("PORTAL_E2E_CLIENT_ID") + portal_e2e_client_secret = _env("PORTAL_E2E_CLIENT_SECRET") + portal_target_client_id = os.environ.get("PORTAL_TARGET_CLIENT_ID", "bstein-dev-home").strip() or "bstein-dev-home" + portal_admin_username = os.environ.get("E2E_PORTAL_ADMIN_USERNAME", "bstein").strip() or "bstein" + + contact_email = os.environ.get("E2E_CONTACT_EMAIL", "robotuser@bstein.dev").strip() + if not 
contact_email: + raise SystemExit("E2E_CONTACT_EMAIL must not be empty") + + imap_host = os.environ.get("E2E_IMAP_HOST", "mailu-front.mailu-mailserver.svc.cluster.local").strip() + imap_port = int(os.environ.get("E2E_IMAP_PORT", "993")) + imap_keycloak_username = os.environ.get("E2E_IMAP_KEYCLOAK_USERNAME", "robotuser").strip() + imap_wait_sec = int(os.environ.get("E2E_IMAP_WAIT_SECONDS", "90")) + + try: + token = _keycloak_client_token(keycloak_base, realm, kc_admin_client_id, kc_admin_client_secret) + except SystemExit as exc: + raise SystemExit(f"failed to fetch keycloak token for admin client {kc_admin_client_id!r}: {exc}") + mailbox_user = _keycloak_find_user(keycloak_base, realm, token, imap_keycloak_username) + if not mailbox_user: + raise SystemExit(f"unable to locate Keycloak mailbox user {imap_keycloak_username!r}") + mailbox_user_id = mailbox_user.get("id") + if not isinstance(mailbox_user_id, str) or not mailbox_user_id: + raise SystemExit("mailbox user missing id") + + mailbox_full = _keycloak_get_user(keycloak_base, realm, token, mailbox_user_id) + mailbox_attrs = mailbox_full.get("attributes") + mailu_email = _extract_attr(mailbox_attrs, "mailu_email") + if not mailu_email: + mailu_email = contact_email + mailu_password = _extract_attr(mailbox_attrs, "mailu_app_password") + if not mailu_password: + raise SystemExit(f"Keycloak user {imap_keycloak_username!r} missing mailu_app_password attribute") + + username_prefix = os.environ.get("E2E_USERNAME_PREFIX", "e2e-user") + now = int(time.time()) + username = f"{username_prefix}-{now}" + + submit_url = f"{portal_base}/api/access/request" + submit_payload = {"username": username, "email": contact_email, "note": "portal onboarding e2e"} + submit = None + for attempt in range(1, 6): + try: + submit = _post_json(submit_url, submit_payload, timeout_s=20) + break + except (http.client.RemoteDisconnected, TimeoutError, urllib.error.URLError) as exc: + if attempt == 5: + raise SystemExit(f"portal submit failed 
after {attempt} attempts: {exc}") + time.sleep(2) + if not isinstance(submit, dict): + raise SystemExit("portal submit did not return json") + + request_code = submit.get("request_code") + if not isinstance(request_code, str) or not request_code: + raise SystemExit(f"request submit did not return request_code: {submit}") + + verify_token = _imap_wait_for_verify_token( + host=imap_host, + port=imap_port, + username=mailu_email, + password=mailu_password, + request_code=request_code, + deadline_sec=imap_wait_sec, + ) + verify_resp = _post_json( + f"{portal_base}/api/access/request/verify", + {"request_code": request_code, "token": verify_token}, + timeout_s=30, + ) + if not isinstance(verify_resp, dict) or verify_resp.get("ok") is not True: + raise SystemExit(f"unexpected verify response: {verify_resp}") + + portal_admin = _keycloak_find_user(keycloak_base, realm, token, portal_admin_username) + if not portal_admin: + raise SystemExit(f"unable to locate portal admin user {portal_admin_username!r} via Keycloak admin API") + portal_admin_user_id = portal_admin.get("id") + if not isinstance(portal_admin_user_id, str) or not portal_admin_user_id: + raise SystemExit("portal admin user missing id") + + try: + e2e_subject_token = _keycloak_client_token(keycloak_base, realm, portal_e2e_client_id, portal_e2e_client_secret) + except SystemExit as exc: + raise SystemExit(f"failed to fetch keycloak token for E2E client {portal_e2e_client_id!r}: {exc}") + try: + portal_bearer = _keycloak_token_exchange( + keycloak_base=keycloak_base, + realm=realm, + client_id=portal_e2e_client_id, + client_secret=portal_e2e_client_secret, + subject_token=e2e_subject_token, + requested_subject=portal_admin_user_id, + audience=portal_target_client_id, + ) + except SystemExit as exc: + raise SystemExit(f"failed to exchange token for portal approval as {portal_admin_username!r}: {exc}") + + approve_url = f"{portal_base}/api/admin/access/requests/{urllib.parse.quote(username, safe='')}/approve" + 
approve_timeout_s = int(os.environ.get("E2E_APPROVE_TIMEOUT_SECONDS", "180")) + approve_attempts = int(os.environ.get("E2E_APPROVE_ATTEMPTS", "3")) + approve_resp = None + approve_error = None + for attempt in range(1, approve_attempts + 1): + try: + approve_resp = _request_json("POST", approve_url, portal_bearer, payload=None, timeout_s=approve_timeout_s) + approve_error = None + break + except (http.client.RemoteDisconnected, TimeoutError, urllib.error.URLError) as exc: + approve_error = str(exc) + if attempt == approve_attempts: + break + time.sleep(3) + if approve_resp is None: + print( + "WARNING: portal approval request did not return a response; " + f"continuing to poll status (last_error={approve_error})" + ) + elif not isinstance(approve_resp, dict) or approve_resp.get("ok") is not True: + raise SystemExit(f"unexpected approval response: {approve_resp}") + + status_url = f"{portal_base}/api/access/request/status" + deadline_s = int(os.environ.get("E2E_DEADLINE_SECONDS", "600")) + interval_s = int(os.environ.get("E2E_POLL_SECONDS", "10")) + deadline_at = time.monotonic() + deadline_s + + last_status = None + last_error = None + while True: + try: + status_payload = _post_json(status_url, {"request_code": request_code}, timeout_s=60) + last_error = None + except (http.client.RemoteDisconnected, TimeoutError, urllib.error.URLError) as exc: + last_error = str(exc) + if time.monotonic() >= deadline_at: + raise SystemExit(f"timed out waiting for provisioning to finish (last error={last_error})") + time.sleep(interval_s) + continue + status = status_payload.get("status") + if isinstance(status, str): + last_status = status + + if status in ("awaiting_onboarding", "ready"): + break + if status in ("denied", "unknown"): + raise SystemExit(f"request transitioned to unexpected terminal status: {status_payload}") + if time.monotonic() >= deadline_at: + suffix = f" (last error={last_error})" if last_error else "" + raise SystemExit(f"timed out waiting for provisioning 
to finish (last status={last_status}){suffix}") + time.sleep(interval_s) + + # Refresh admin token (it may expire during the provisioning wait). + token = _keycloak_client_token(keycloak_base, realm, kc_admin_client_id, kc_admin_client_secret) + + user = _keycloak_find_user(keycloak_base, realm, token, username) + if not user: + raise SystemExit("expected Keycloak user was not created") + user_id = user.get("id") + if not isinstance(user_id, str) or not user_id: + raise SystemExit("created user missing id") + + full = _keycloak_get_user(keycloak_base, realm, token, user_id) + required_actions = full.get("requiredActions") or [] + required: set[str] = set() + if isinstance(required_actions, list): + required = {a for a in required_actions if isinstance(a, str)} + + unexpected = sorted(required.intersection({"UPDATE_PASSWORD", "VERIFY_EMAIL", "CONFIGURE_TOTP"})) + if unexpected: + raise SystemExit( + "Keycloak user should not require actions at first login " + f"(Vaultwarden-first onboarding): unexpected requiredActions={unexpected} full={sorted(required)}" + ) + + email_verified = full.get("emailVerified") + if email_verified is not True: + raise SystemExit(f"Keycloak user should have emailVerified=true: emailVerified={email_verified!r}") + + kc_email = full.get("email") + if isinstance(kc_email, str) and contact_email and kc_email != contact_email: + raise SystemExit(f"Keycloak user email mismatch: expected {contact_email!r} got {kc_email!r}") + + print(f"PASS: onboarding provisioning completed for {request_code} ({username})") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/services/bstein-dev-home/scripts/vaultwarden_cred_sync.py b/services/bstein-dev-home/scripts/vaultwarden_cred_sync.py new file mode 100644 index 0000000..d259b31 --- /dev/null +++ b/services/bstein-dev-home/scripts/vaultwarden_cred_sync.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import sys +import time +from typing import 
# Keycloak user-attribute keys used to make the sync idempotent across runs.
VAULTWARDEN_EMAIL_ATTR = "vaultwarden_email"
VAULTWARDEN_STATUS_ATTR = "vaultwarden_status"
VAULTWARDEN_SYNCED_AT_ATTR = "vaultwarden_synced_at"

# Statuses meaning the Vaultwarden side is already settled; re-inviting would
# be a no-op at best (Vaultwarden answers 409) and can trip rate limits.
_SETTLED_STATUSES = frozenset({"invited", "already_present"})


def _iter_keycloak_users(page_size: int = 200) -> Iterable[dict[str, Any]]:
    """Yield every Keycloak user in the configured realm, page by page.

    Raises:
        RuntimeError: when the admin client has no usable configuration.
    """
    client = admin_client()
    if not client.ready():
        raise RuntimeError("keycloak admin client not configured")

    url = f"{settings.KEYCLOAK_ADMIN_URL}/admin/realms/{settings.KEYCLOAK_REALM}/users"
    first = 0
    while True:
        # Re-read headers each page: the admin token may be refreshed between pages.
        headers = client.headers()
        # We need attributes for idempotency (vaultwarden_status/vaultwarden_email).
        # Keycloak defaults to a brief representation which may omit these.
        params = {"first": str(first), "max": str(page_size), "briefRepresentation": "false"}
        with httpx.Client(timeout=settings.HTTP_CHECK_TIMEOUT_SEC) as http:
            resp = http.get(url, params=params, headers=headers)
            resp.raise_for_status()
            payload = resp.json()

        if not isinstance(payload, list) or not payload:
            return

        for item in payload:
            if isinstance(item, dict):
                yield item

        # A short page means we have seen the last user.
        if len(payload) < page_size:
            return
        first += page_size


def _extract_attr(attrs: Any, key: str) -> str:
    """Return the first non-empty string value for ``key`` from a Keycloak
    attribute mapping (values may be a list of strings or a plain string)."""
    if not isinstance(attrs, dict):
        return ""
    raw = attrs.get(key)
    if isinstance(raw, list):
        for item in raw:
            if isinstance(item, str) and item.strip():
                return item.strip()
        return ""
    if isinstance(raw, str) and raw.strip():
        return raw.strip()
    return ""


def _vaultwarden_email_for_user(user: dict[str, Any]) -> str:
    """Resolve the address to invite to Vaultwarden, or "" when unknown.

    Precedence: explicit vaultwarden_email attribute, then mailu_email, then
    the Keycloak email when it already lives on the internal Mailu domain.
    """
    username = (user.get("username") if isinstance(user.get("username"), str) else "") or ""
    username = username.strip()
    if not username:
        return ""

    attrs = user.get("attributes")
    vaultwarden_email = _extract_attr(attrs, VAULTWARDEN_EMAIL_ATTR)
    if vaultwarden_email:
        return vaultwarden_email

    mailu_email = _extract_attr(attrs, "mailu_email")
    if mailu_email:
        return mailu_email

    email = (user.get("email") if isinstance(user.get("email"), str) else "") or ""
    email = email.strip()
    if email and email.lower().endswith(f"@{settings.MAILU_DOMAIN.lower()}"):
        return email

    # Don't guess an internal mailbox address until Mailu sync has run and stored
    # mailu_email. This avoids spamming Vaultwarden invites that can never be
    # delivered (unknown recipient).
    return ""


def _set_user_attribute_if_missing(username: str, user: dict[str, Any], key: str, value: str) -> None:
    """Write ``key=value`` on the user only when the attribute is absent/empty."""
    value = (value or "").strip()
    if not value:
        return
    existing = _extract_attr(user.get("attributes"), key)
    if existing:
        return
    admin_client().set_user_attribute(username, key, value)


def _set_user_attribute(username: str, key: str, value: str) -> None:
    """Unconditionally write ``key=value`` on the user (empty values ignored)."""
    value = (value or "").strip()
    if not value:
        return
    admin_client().set_user_attribute(username, key, value)


def _utc_now_iso() -> str:
    """Current UTC time in the ISO-8601 'Z' form stored in sync attributes."""
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())


def _record_sync_status(username: str, status: str) -> None:
    """Best-effort: persist sync status + timestamp on the Keycloak user.

    Attribute bookkeeping must never fail the sync run, so errors are swallowed.
    """
    try:
        _set_user_attribute(username, VAULTWARDEN_STATUS_ATTR, status)
        _set_user_attribute(username, VAULTWARDEN_SYNCED_AT_ATTR, _utc_now_iso())
    except Exception:
        pass


def _fetch_full_user(user: dict[str, Any]) -> dict[str, Any]:
    """Re-fetch the user by id so attributes are reliably present.

    Falls back to the listing payload when the id is missing or the lookup fails.
    """
    user_id = (user.get("id") if isinstance(user.get("id"), str) else "") or ""
    user_id = user_id.strip()
    if not user_id:
        return user
    try:
        return admin_client().get_user(user_id)
    except Exception:
        return user


def main() -> int:
    """Invite every eligible Keycloak user to Vaultwarden, idempotently.

    Returns:
        0 when no invite failed, 2 otherwise. Progress lines go to
        stdout/stderr so CronJob logs stay scannable.
    """
    processed = 0
    created = 0
    skipped = 0
    failures = 0

    for user in _iter_keycloak_users():
        username = (user.get("username") if isinstance(user.get("username"), str) else "") or ""
        username = username.strip()
        if not username:
            skipped += 1
            continue

        # Disabled accounts keep their attributes but get no invite.
        if user.get("enabled") is False:
            skipped += 1
            continue

        # Never invite Keycloak service accounts.
        if user.get("serviceAccountClientId") or username.startswith("service-account-"):
            skipped += 1
            continue

        # Fetch the full user payload so we can reliably read attributes
        # (and skip re-invites).
        full_user = _fetch_full_user(user)

        attrs = full_user.get("attributes")
        current_status = _extract_attr(attrs, VAULTWARDEN_STATUS_ATTR)
        current_synced_at = _extract_attr(attrs, VAULTWARDEN_SYNCED_AT_ATTR)

        email = _vaultwarden_email_for_user(full_user)
        if not email:
            print(f"skip {username}: missing email", file=sys.stderr)
            skipped += 1
            continue

        # Best-effort: remember the resolved address for future runs.
        try:
            _set_user_attribute_if_missing(username, full_user, VAULTWARDEN_EMAIL_ATTR, email)
        except Exception:
            pass

        # If we've already successfully invited or confirmed presence, do not
        # re-invite on every cron run; only backfill a missing timestamp.
        if current_status in _SETTLED_STATUSES:
            if not current_synced_at:
                try:
                    _set_user_attribute(username, VAULTWARDEN_SYNCED_AT_ATTR, _utc_now_iso())
                except Exception:
                    pass
            skipped += 1
            continue

        processed += 1
        result = invite_user(email)
        if result.ok:
            created += 1
            print(f"ok {username}: {result.status}")
        else:
            failures += 1
            print(f"err {username}: {result.status} {result.detail}", file=sys.stderr)
        _record_sync_status(username, result.status)

    print(
        f"done processed={processed} created_or_present={created} skipped={skipped} failures={failures}",
        file=sys.stderr,
    )
    return 0 if failures == 0 else 2
+if __name__ == "__main__": + raise SystemExit(main()) diff --git a/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml b/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml new file mode 100644 index 0000000..5e7c779 --- /dev/null +++ b/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml @@ -0,0 +1,59 @@ +# services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml +apiVersion: batch/v1 +kind: CronJob +metadata: + name: vaultwarden-cred-sync + namespace: bstein-dev-home +spec: + schedule: "*/15 * * * *" + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 1 + failedJobsHistoryLimit: 3 + jobTemplate: + spec: + backoffLimit: 0 + template: + spec: + serviceAccountName: bstein-dev-home + restartPolicy: Never + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" + imagePullSecrets: + - name: harbor-bstein-robot + containers: + - name: sync + image: registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"} + imagePullPolicy: Always + command: + - python + - /scripts/vaultwarden_cred_sync.py + env: + - name: PYTHONPATH + value: /app + - name: KEYCLOAK_ENABLED + value: "true" + - name: KEYCLOAK_REALM + value: atlas + - name: KEYCLOAK_ADMIN_URL + value: http://keycloak.sso.svc.cluster.local + - name: KEYCLOAK_ADMIN_REALM + value: atlas + - name: KEYCLOAK_ADMIN_CLIENT_ID + value: bstein-dev-home-admin + - name: KEYCLOAK_ADMIN_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: bstein-dev-home-keycloak-admin + key: client_secret + - name: HTTP_CHECK_TIMEOUT_SEC + value: "20" + volumeMounts: + - name: vaultwarden-cred-sync-script + mountPath: /scripts + readOnly: true + volumes: + - name: vaultwarden-cred-sync-script + configMap: + name: vaultwarden-cred-sync-script + defaultMode: 0555 diff --git a/services/ci-demo/deployment.yaml b/services/ci-demo/deployment.yaml deleted file mode 100644 index df882f5..0000000 --- a/services/ci-demo/deployment.yaml +++ 
/dev/null @@ -1,31 +0,0 @@ -# services/ci-demo/deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: ci-demo - namespace: ci-demo -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: ci-demo - template: - metadata: - labels: - app.kubernetes.io/name: ci-demo - spec: - nodeSelector: - hardware: rpi4 - containers: - - name: ci-demo - image: registry.bstein.dev/infra/ci-demo:latest - ports: - - name: http - containerPort: 8080 - readinessProbe: - httpGet: - path: / - port: http - initialDelaySeconds: 2 - periodSeconds: 5 - diff --git a/services/ci-demo/image.yaml b/services/ci-demo/image.yaml deleted file mode 100644 index 333fa0a..0000000 --- a/services/ci-demo/image.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# services/ci-demo/image.yaml -apiVersion: image.toolkit.fluxcd.io/v1 -kind: ImageRepository -metadata: - name: ci-demo - namespace: flux-system -spec: - image: registry.bstein.dev/infra/ci-demo - interval: 1m0s ---- -apiVersion: image.toolkit.fluxcd.io/v1 -kind: ImagePolicy -metadata: - name: ci-demo - namespace: flux-system -spec: - imageRepositoryRef: - name: ci-demo - filterTags: - pattern: '^v(?P0\.0\.0-\d+)$' - extract: '$version' - policy: - semver: - range: ">=0.0.0-0" diff --git a/services/ci-demo/kustomization.yaml b/services/ci-demo/kustomization.yaml deleted file mode 100644 index 3eb503f..0000000 --- a/services/ci-demo/kustomization.yaml +++ /dev/null @@ -1,11 +0,0 @@ -# services/ci-demo/kustomization.yaml -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization -resources: - - namespace.yaml - - image.yaml - - deployment.yaml - - service.yaml -images: - - name: registry.bstein.dev/infra/ci-demo - newTag: registry.bstein.dev/infra/ci-demo:v0.0.0-3 # {"$imagepolicy": "flux-system:ci-demo"} diff --git a/services/ci-demo/namespace.yaml b/services/ci-demo/namespace.yaml deleted file mode 100644 index e661fc1..0000000 --- a/services/ci-demo/namespace.yaml +++ /dev/null @@ -1,6 +0,0 @@ -# 
services/ci-demo/namespace.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: ci-demo - diff --git a/services/comms/NOTES.md b/services/comms/NOTES.md new file mode 100644 index 0000000..39898da --- /dev/null +++ b/services/comms/NOTES.md @@ -0,0 +1,31 @@ +# services/comms/NOTES.md + +Purpose: Matrix + Element + LiveKit stack for Othrys (live.bstein.dev). + +Core flow +- Matrix Authentication Service (MAS) handles login/SSO and issues Matrix access tokens. +- Synapse is the homeserver; MAS fronts login, Synapse serves client/server APIs. +- Element Web provides the main UI; Element Call embeds LiveKit for group video. +- LiveKit handles SFU media; Coturn provides TURN for NAT traversal. +- matrix-guest-register provisions MAS guest accounts and performs MAS password login to mint device-bound guest tokens (no Keycloak). + +Operational jobs +- mas-db-ensure-job: ensures MAS database role/database + secret in comms. +- comms-secrets-ensure-job: creates runtime secrets (TURN, LiveKit, Synapse, atlasbot). +- synapse-signingkey-ensure-job: ensures Synapse signing key secret. +- synapse-seeder-admin-ensure-job: ensures Synapse admin user exists. +- synapse-user-seed-job: seeds atlasbot + othrys-seeder users/passwords. +- mas-local-users-ensure-job: ensures MAS local users exist (seeder/bot). +- seed-othrys-room: (suspended) creates Othrys + joins locals. +- reset-othrys-room: suspended CronJob for a manual room reset + pin invite. +- pin-othrys-invite: (suspended) pin invite message if missing. +- guest-name-randomizer: renames numeric/guest users to adj-noun names. +- bstein-force-leave: one-off room leave cleanup. + +Manual re-runs +- Unsuspend a CronJob only when needed; re-suspend after completion. + +Ports +- Traefik (HTTPS) via LB on 192.168.22.9. +- Coturn LB on 192.168.22.5 (3478/5349 + UDP range). +- LiveKit LB on 192.168.22.6 (7880/7881/7882/7883). 
diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml new file mode 100644 index 0000000..4d8bfc7 --- /dev/null +++ b/services/comms/atlasbot-deployment.yaml @@ -0,0 +1,87 @@ +# services/comms/atlasbot-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: atlasbot + namespace: comms + labels: + app: atlasbot +spec: + replicas: 1 + selector: + matchLabels: + app: atlasbot + template: + metadata: + labels: + app: atlasbot + annotations: + checksum/atlasbot-configmap: manual-atlasbot-3 + spec: + serviceAccountName: atlasbot + nodeSelector: + hardware: rpi5 + containers: + - name: atlasbot + image: python:3.11-slim + command: ["/bin/sh","-c"] + args: + - | + python /app/bot.py + env: + - name: MATRIX_BASE + value: http://othrys-synapse-matrix-synapse:8008 + - name: AUTH_BASE + value: http://matrix-authentication-service:8080 + - name: KB_DIR + value: /kb + - name: VM_URL + value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428 + - name: BOT_USER + value: atlasbot + - name: BOT_PASS + valueFrom: + secretKeyRef: + name: atlasbot-credentials-runtime + key: bot-password + - name: CHAT_API_KEY + valueFrom: + secretKeyRef: + name: chat-ai-keys-runtime + key: matrix + - name: OLLAMA_URL + value: https://chat.ai.bstein.dev/ + - name: OLLAMA_MODEL + value: qwen2.5-coder:7b-instruct-q4_0 + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + volumeMounts: + - name: code + mountPath: /app/bot.py + subPath: bot.py + - name: kb + mountPath: /kb + readOnly: true + volumes: + - name: code + configMap: + name: atlasbot + - name: kb + configMap: + name: atlas-kb + items: + - key: INDEX.md + path: INDEX.md + - key: atlas.json + path: catalog/atlas.json + - key: atlas-summary.json + path: catalog/atlas-summary.json + - key: runbooks.json + path: catalog/runbooks.json + - key: atlas-http.mmd + path: diagrams/atlas-http.mmd diff --git 
a/services/comms/atlasbot-rbac.yaml b/services/comms/atlasbot-rbac.yaml new file mode 100644 index 0000000..bc6623b --- /dev/null +++ b/services/comms/atlasbot-rbac.yaml @@ -0,0 +1,47 @@ +# services/comms/atlasbot-rbac.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: atlasbot + namespace: comms +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: atlasbot-readonly +rules: + - apiGroups: [""] + resources: ["namespaces", "nodes", "pods", "services", "endpoints", "events"] + verbs: ["get", "list", "watch"] + - apiGroups: ["apps"] + resources: ["deployments", "statefulsets", "daemonsets", "replicasets"] + verbs: ["get", "list", "watch"] + - apiGroups: ["networking.k8s.io"] + resources: ["ingresses"] + verbs: ["get", "list", "watch"] + - apiGroups: ["traefik.io"] + resources: ["ingressroutes", "middlewares", "serverstransports"] + verbs: ["get", "list", "watch"] + - apiGroups: ["kustomize.toolkit.fluxcd.io"] + resources: ["kustomizations"] + verbs: ["get", "list", "watch"] + - apiGroups: ["helm.toolkit.fluxcd.io"] + resources: ["helmreleases"] + verbs: ["get", "list", "watch"] + - apiGroups: ["source.toolkit.fluxcd.io"] + resources: ["gitrepositories", "helmrepositories", "buckets"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: atlasbot-readonly +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: atlasbot-readonly +subjects: + - kind: ServiceAccount + name: atlasbot + namespace: comms + diff --git a/services/comms/bstein-force-leave-job.yaml b/services/comms/bstein-force-leave-job.yaml new file mode 100644 index 0000000..956330b --- /dev/null +++ b/services/comms/bstein-force-leave-job.yaml @@ -0,0 +1,189 @@ +# services/comms/bstein-force-leave-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: bstein-leave-rooms-6 + namespace: comms +spec: + backoffLimit: 0 + template: + spec: + restartPolicy: Never + volumes: + 
- name: mas-admin-client + secret: + secretName: mas-admin-client-runtime + items: + - key: client_secret + path: client_secret + containers: + - name: leave + image: python:3.11-slim + volumeMounts: + - name: mas-admin-client + mountPath: /etc/mas-admin-client + readOnly: true + env: + - name: MAS_ADMIN_CLIENT_ID + value: 01KDXMVQBQ5JNY6SEJPZW6Z8BM + - name: MAS_ADMIN_CLIENT_SECRET_FILE + value: /etc/mas-admin-client/client_secret + - name: MAS_TOKEN_URL + value: http://matrix-authentication-service:8080/oauth2/token + - name: MAS_ADMIN_API_BASE + value: http://matrix-authentication-service:8081/api/admin/v1 + - name: SYNAPSE_BASE + value: http://matrix-authentication-service:8080 + - name: TARGET_USERNAME + value: bstein + - name: TARGET_ROOMS + value: "!OkltaJguODUnZrbcUp:live.bstein.dev,!pMKAVvSRheIOCPIjDM:live.bstein.dev" + command: + - /bin/sh + - -c + - | + set -euo pipefail + python - <<'PY' + import base64 + import json + import os + import urllib.error + import urllib.parse + import urllib.request + import time + + MAS_ADMIN_CLIENT_ID = os.environ["MAS_ADMIN_CLIENT_ID"] + MAS_ADMIN_CLIENT_SECRET_FILE = os.environ["MAS_ADMIN_CLIENT_SECRET_FILE"] + MAS_TOKEN_URL = os.environ["MAS_TOKEN_URL"] + MAS_ADMIN_API_BASE = os.environ["MAS_ADMIN_API_BASE"].rstrip("/") + SYNAPSE_BASE = os.environ["SYNAPSE_BASE"].rstrip("/") + TARGET_USERNAME = os.environ["TARGET_USERNAME"] + TARGET_ROOMS = [r.strip() for r in os.environ["TARGET_ROOMS"].split(",") if r.strip()] + + def http_json(method, url, *, headers=None, json_body=None, form=None, timeout=30): + req_headers = dict(headers or {}) + data = None + + if json_body is not None and form is not None: + raise ValueError("choose json_body or form, not both") + + if json_body is not None: + data = json.dumps(json_body).encode() + req_headers.setdefault("Content-Type", "application/json") + + if form is not None: + data = urllib.parse.urlencode(form).encode() + req_headers.setdefault("Content-Type", 
"application/x-www-form-urlencoded") + + req = urllib.request.Request(url, data=data, method=method, headers=req_headers) + try: + with urllib.request.urlopen(req, timeout=timeout) as resp: + raw = resp.read() + payload = json.loads(raw.decode("utf-8")) if raw else None + return resp.status, payload + except urllib.error.HTTPError as e: + raw = e.read() + try: + payload = json.loads(raw.decode("utf-8")) if raw else None + except Exception: + payload = None + return e.code, payload + except urllib.error.URLError: + return 0, None + + with open(MAS_ADMIN_CLIENT_SECRET_FILE, "r", encoding="utf-8") as f: + mas_admin_client_secret = f.read().strip() + if not mas_admin_client_secret: + raise RuntimeError("MAS admin client secret file is empty") + + basic = base64.b64encode(f"{MAS_ADMIN_CLIENT_ID}:{mas_admin_client_secret}".encode()).decode() + token_status = 0 + token_payload = None + for attempt in range(1, 6): + token_status, token_payload = http_json( + "POST", + MAS_TOKEN_URL, + headers={"Authorization": f"Basic {basic}"}, + form={"grant_type": "client_credentials", "scope": "urn:mas:admin"}, + timeout=30, + ) + if token_status == 200 and token_payload and "access_token" in token_payload: + break + time.sleep(attempt * 2) + if token_status != 200 or not token_payload or "access_token" not in token_payload: + raise RuntimeError(f"MAS admin token request failed (HTTP {token_status})") + mas_admin_token = token_payload["access_token"] + + user_status, user_payload = http_json( + "GET", + f"{MAS_ADMIN_API_BASE}/users/by-username/{urllib.parse.quote(TARGET_USERNAME)}", + headers={"Authorization": f"Bearer {mas_admin_token}"}, + timeout=30, + ) + if user_status != 200 or not user_payload or "data" not in user_payload or "id" not in user_payload["data"]: + raise RuntimeError(f"MAS user lookup failed (HTTP {user_status})") + actor_user_id = user_payload["data"]["id"] + + sess_status, sess_payload = http_json( + "POST", + f"{MAS_ADMIN_API_BASE}/personal-sessions", + 
headers={"Authorization": f"Bearer {mas_admin_token}"}, + json_body={ + "actor_user_id": actor_user_id, + "human_name": "bstein room cleanup", + "scope": "urn:matrix:client:api:*", + "expires_in": 300, + }, + timeout=30, + ) + if sess_status != 201 or not sess_payload or "data" not in sess_payload: + raise RuntimeError(f"MAS personal session create failed (HTTP {sess_status})") + + personal_session_id = sess_payload["data"]["id"] + personal_token = (sess_payload.get("data", {}).get("attributes", {}) or {}).get("access_token") + if not personal_token: + raise RuntimeError("MAS personal session did not return an access token") + + results = {"rooms": {}, "revoke": None} + failures = [] + + try: + for room_id in TARGET_ROOMS: + room_q = urllib.parse.quote(room_id, safe="") + leave_status = 0 + forget_status = 0 + for attempt in range(1, 6): + leave_status, _ = http_json( + "POST", + f"{SYNAPSE_BASE}/_matrix/client/v3/rooms/{room_q}/leave", + headers={"Authorization": f"Bearer {personal_token}"}, + json_body={}, + timeout=30, + ) + forget_status, _ = http_json( + "POST", + f"{SYNAPSE_BASE}/_matrix/client/v3/rooms/{room_q}/forget", + headers={"Authorization": f"Bearer {personal_token}"}, + json_body={}, + timeout=30, + ) + if leave_status in (200, 404) and forget_status in (200, 404): + break + time.sleep(attempt * 2) + results["rooms"][room_id] = {"leave": leave_status, "forget": forget_status} + if leave_status not in (200, 404) or forget_status not in (200, 404): + failures.append(room_id) + finally: + revoke_status, _ = http_json( + "POST", + f"{MAS_ADMIN_API_BASE}/personal-sessions/{urllib.parse.quote(personal_session_id)}/revoke", + headers={"Authorization": f"Bearer {mas_admin_token}"}, + json_body={}, + timeout=30, + ) + results["revoke"] = revoke_status + + print(json.dumps(results, indent=2, sort_keys=True)) + if failures: + raise SystemExit(f"failed to leave/forget rooms: {', '.join(failures)}") + PY diff --git a/services/comms/comms-secrets-ensure-job.yaml 
# services/comms/comms-secrets-ensure-job.yaml
apiVersion: batch/v1
kind: Job
metadata:
  # Bumped suffix: batch/v1 Job specs are immutable, so any change to the
  # container command requires a fresh Job name for Flux to apply it.
  name: comms-secrets-ensure-2
  namespace: comms
spec:
  backoffLimit: 1
  ttlSecondsAfterFinished: 3600
  template:
    spec:
      serviceAccountName: comms-secrets-ensure
      restartPolicy: Never
      containers:
        - name: ensure
          image: bitnami/kubectl:latest
          # bash, not /bin/sh: the script relies on `trap ... ERR`, a bash
          # feature that dash/ash (the usual /bin/sh in Debian-based images)
          # rejects at startup — under `set -eu` the job would die immediately.
          command: ["/bin/bash", "-c"]
          args:
            - |
              # -E (errtrace) so the ERR trap also fires for failures inside
              # the shell functions below, keeping the pod alive for debugging.
              set -euE
              trap 'echo "comms-secrets-ensure failed"; sleep 300' ERR
              umask 077

              # Random URL-safe secret (32 bytes of entropy, base64url, unpadded).
              safe_pass() {
                head -c 32 /dev/urandom | base64 | tr -d '\n' | tr '+/' '-_' | tr -d '='
              }

              # Decoded value of a secret key, or empty when absent.
              get_secret_value() {
                ns="$1"
                name="$2"
                key="$3"
                kubectl -n "${ns}" get secret "${name}" -o "jsonpath={.data.${key}}" 2>/dev/null | base64 -d 2>/dev/null || true
              }

              # Create the secret (or add the key) only when the key is missing;
              # existing values are never overwritten.
              ensure_secret_key() {
                ns="$1"
                name="$2"
                key="$3"
                value="$4"
                if ! kubectl -n "${ns}" get secret "${name}" >/dev/null 2>&1; then
                  kubectl -n "${ns}" create secret generic "${name}" --from-literal="${key}=${value}" >/dev/null
                  return
                fi
                existing="$(kubectl -n "${ns}" get secret "${name}" -o "jsonpath={.data.${key}}" 2>/dev/null || true)"
                if [ -z "${existing}" ]; then
                  b64="$(printf '%s' "${value}" | base64 | tr -d '\n')"
                  payload="$(printf '{"data":{"%s":"%s"}}' "${key}" "${b64}")"
                  kubectl -n "${ns}" patch secret "${name}" --type=merge -p "${payload}" >/dev/null
                fi
              }

              # Ensure the chat-ai-keys-runtime secret (matrix + homepage keys)
              # exists in the given namespace without clobbering existing keys.
              ensure_chat_secret() {
                ns="$1"
                if ! kubectl -n "${ns}" get secret chat-ai-keys-runtime >/dev/null 2>&1; then
                  kubectl -n "${ns}" create secret generic chat-ai-keys-runtime \
                    --from-literal=matrix="${CHAT_KEY_MATRIX}" \
                    --from-literal=homepage="${CHAT_KEY_HOMEPAGE}" >/dev/null
                  return
                fi
                ensure_secret_key "${ns}" chat-ai-keys-runtime matrix "${CHAT_KEY_MATRIX}"
                ensure_secret_key "${ns}" chat-ai-keys-runtime homepage "${CHAT_KEY_HOMEPAGE}"
              }

              # Reuse chat keys from either namespace so comms and
              # bstein-dev-home stay in sync; generate fresh ones only when
              # neither namespace has them yet.
              CHAT_KEY_MATRIX="$(get_secret_value comms chat-ai-keys-runtime matrix)"
              CHAT_KEY_HOMEPAGE="$(get_secret_value comms chat-ai-keys-runtime homepage)"
              if [ -z "${CHAT_KEY_MATRIX}" ] || [ -z "${CHAT_KEY_HOMEPAGE}" ]; then
                ALT_MATRIX="$(get_secret_value bstein-dev-home chat-ai-keys-runtime matrix)"
                ALT_HOMEPAGE="$(get_secret_value bstein-dev-home chat-ai-keys-runtime homepage)"
                [ -z "${CHAT_KEY_MATRIX}" ] && CHAT_KEY_MATRIX="${ALT_MATRIX}"
                [ -z "${CHAT_KEY_HOMEPAGE}" ] && CHAT_KEY_HOMEPAGE="${ALT_HOMEPAGE}"
              fi
              [ -z "${CHAT_KEY_MATRIX}" ] && CHAT_KEY_MATRIX="$(safe_pass)"
              [ -z "${CHAT_KEY_HOMEPAGE}" ] && CHAT_KEY_HOMEPAGE="$(safe_pass)"

              ensure_chat_secret comms
              ensure_chat_secret bstein-dev-home

              # Runtime secrets for the comms stack (generated once, then kept).
              ensure_secret_key comms turn-shared-secret TURN_STATIC_AUTH_SECRET "$(safe_pass)"
              ensure_secret_key comms livekit-api primary "$(safe_pass)"
              ensure_secret_key comms synapse-redis redis-password "$(safe_pass)"
              ensure_secret_key comms synapse-macaroon macaroon_secret_key "$(safe_pass)"
              ensure_secret_key comms atlasbot-credentials-runtime bot-password "$(safe_pass)"
              ensure_secret_key comms atlasbot-credentials-runtime seeder-password "$(safe_pass)"

              # Synapse database credentials: keep whatever is already stored.
              SYN_PASS="$(get_secret_value comms synapse-db POSTGRES_PASSWORD)"
              if [ -z "${SYN_PASS}" ]; then
                SYN_PASS="$(safe_pass)"
                kubectl -n comms create secret generic synapse-db --from-literal=POSTGRES_PASSWORD="${SYN_PASS}" >/dev/null
              fi

              # Align the Postgres role/database with the secret. CREATE calls
              # tolerate "already exists"; ALTER re-syncs the password either way.
              POD_NAME="$(kubectl -n postgres get pods -l app=postgres -o jsonpath='{.items[0].metadata.name}')"
              if [ -z "${POD_NAME}" ]; then
                echo "postgres pod not found" >&2
                exit 1
              fi
              # Escape single quotes for the SQL literal.
              SYN_PASS_SQL="$(printf '%s' "${SYN_PASS}" | sed "s/'/''/g")"
              kubectl -n postgres exec -i "${POD_NAME}" -- psql -U postgres -d postgres \
                -c "CREATE ROLE synapse LOGIN PASSWORD '${SYN_PASS_SQL}';" || true
              kubectl -n postgres exec -i "${POD_NAME}" -- psql -U postgres -d postgres \
                -c "ALTER ROLE synapse WITH PASSWORD '${SYN_PASS_SQL}';"
              kubectl -n postgres exec -i "${POD_NAME}" -- psql -U postgres -d postgres \
                -c "CREATE DATABASE synapse OWNER synapse;" || true
---
# services/comms/comms-secrets-ensure-rbac.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: comms-secrets-ensure
  namespace: comms
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: comms-secrets-ensure
rules:
  # Secrets across comms + bstein-dev-home (read, create, add missing keys).
  - apiGroups: [""]
    resources: ["secrets"]
    verbs: ["get", "create", "patch", "update"]
  # Locate the postgres pod for psql exec.
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "list"]
  - apiGroups: [""]
    resources: ["pods/exec"]
    verbs: ["create"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: comms-secrets-ensure
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: comms-secrets-ensure
subjects:
  - kind: ServiceAccount
    name: comms-secrets-ensure
    namespace: comms
nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 50 + preference: + matchExpressions: + - key: hardware + operator: In + values: ["rpi5","rpi4"] + containers: + - name: coturn + image: ghcr.io/coturn/coturn:4.6.2 + command: + - /bin/sh + - -c + - | + exec /usr/bin/turnserver \ + --no-cli \ + --fingerprint \ + --lt-cred-mech \ + --listening-ip=0.0.0.0 \ + --relay-ip="${POD_IP}" \ + --external-ip="${TURN_PUBLIC_IP}/${POD_IP}" \ + --user=livekit:"${TURN_STATIC_AUTH_SECRET}" \ + --realm=live.bstein.dev \ + --listening-port=3478 \ + --tls-listening-port=5349 \ + --min-port=50000 \ + --max-port=50050 \ + --cert=/etc/coturn/tls/tls.crt \ + --pkey=/etc/coturn/tls/tls.key \ + --log-file=stdout \ + --no-software-attribute + env: + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: TURN_PUBLIC_IP + value: "38.28.125.112" + - name: TURN_STATIC_AUTH_SECRET + valueFrom: + secretKeyRef: + name: turn-shared-secret + key: TURN_STATIC_AUTH_SECRET + ports: + - name: turn-udp + containerPort: 3478 + protocol: UDP + - name: turn-tcp + containerPort: 3478 + protocol: TCP + - name: turn-tls + containerPort: 5349 + protocol: TCP + volumeMounts: + - name: tls + mountPath: /etc/coturn/tls + readOnly: true + resources: + requests: + cpu: 200m + memory: 256Mi + limits: + cpu: "2" + memory: 512Mi + volumes: + - name: tls + secret: + secretName: turn-live-tls +--- +apiVersion: v1 +kind: Service +metadata: + name: coturn + annotations: + metallb.universe.tf/address-pool: communication-pool +spec: + type: LoadBalancer + loadBalancerClass: metallb + loadBalancerIP: 192.168.22.5 + externalTrafficPolicy: Local + selector: + app: coturn + ports: + - name: turn-udp + port: 3478 + targetPort: 3478 + protocol: UDP + - name: turn-tcp + port: 3478 + targetPort: 3478 + protocol: TCP + - name: turn-tls + port: 5349 + targetPort: 5349 + protocol: TCP + # Expose relay range for UDP media + - name: relay-50000 + port: 50000 + targetPort: 50000 + protocol: 
UDP + - name: relay-50001 + port: 50001 + targetPort: 50001 + protocol: UDP + - name: relay-50002 + port: 50002 + targetPort: 50002 + protocol: UDP + - name: relay-50003 + port: 50003 + targetPort: 50003 + protocol: UDP + - name: relay-50004 + port: 50004 + targetPort: 50004 + protocol: UDP + - name: relay-50005 + port: 50005 + targetPort: 50005 + protocol: UDP + - name: relay-50006 + port: 50006 + targetPort: 50006 + protocol: UDP + - name: relay-50007 + port: 50007 + targetPort: 50007 + protocol: UDP + - name: relay-50008 + port: 50008 + targetPort: 50008 + protocol: UDP + - name: relay-50009 + port: 50009 + targetPort: 50009 + protocol: UDP + - name: relay-50010 + port: 50010 + targetPort: 50010 + protocol: UDP + - name: relay-50011 + port: 50011 + targetPort: 50011 + protocol: UDP + - name: relay-50012 + port: 50012 + targetPort: 50012 + protocol: UDP + - name: relay-50013 + port: 50013 + targetPort: 50013 + protocol: UDP + - name: relay-50014 + port: 50014 + targetPort: 50014 + protocol: UDP + - name: relay-50015 + port: 50015 + targetPort: 50015 + protocol: UDP + - name: relay-50016 + port: 50016 + targetPort: 50016 + protocol: UDP + - name: relay-50017 + port: 50017 + targetPort: 50017 + protocol: UDP + - name: relay-50018 + port: 50018 + targetPort: 50018 + protocol: UDP + - name: relay-50019 + port: 50019 + targetPort: 50019 + protocol: UDP + - name: relay-50020 + port: 50020 + targetPort: 50020 + protocol: UDP + - name: relay-50021 + port: 50021 + targetPort: 50021 + protocol: UDP + - name: relay-50022 + port: 50022 + targetPort: 50022 + protocol: UDP + - name: relay-50023 + port: 50023 + targetPort: 50023 + protocol: UDP + - name: relay-50024 + port: 50024 + targetPort: 50024 + protocol: UDP + - name: relay-50025 + port: 50025 + targetPort: 50025 + protocol: UDP + - name: relay-50026 + port: 50026 + targetPort: 50026 + protocol: UDP + - name: relay-50027 + port: 50027 + targetPort: 50027 + protocol: UDP + - name: relay-50028 + port: 50028 + targetPort: 
50028 + protocol: UDP + - name: relay-50029 + port: 50029 + targetPort: 50029 + protocol: UDP + - name: relay-50030 + port: 50030 + targetPort: 50030 + protocol: UDP + - name: relay-50031 + port: 50031 + targetPort: 50031 + protocol: UDP + - name: relay-50032 + port: 50032 + targetPort: 50032 + protocol: UDP + - name: relay-50033 + port: 50033 + targetPort: 50033 + protocol: UDP + - name: relay-50034 + port: 50034 + targetPort: 50034 + protocol: UDP + - name: relay-50035 + port: 50035 + targetPort: 50035 + protocol: UDP + - name: relay-50036 + port: 50036 + targetPort: 50036 + protocol: UDP + - name: relay-50037 + port: 50037 + targetPort: 50037 + protocol: UDP + - name: relay-50038 + port: 50038 + targetPort: 50038 + protocol: UDP + - name: relay-50039 + port: 50039 + targetPort: 50039 + protocol: UDP + - name: relay-50040 + port: 50040 + targetPort: 50040 + protocol: UDP + - name: relay-50041 + port: 50041 + targetPort: 50041 + protocol: UDP + - name: relay-50042 + port: 50042 + targetPort: 50042 + protocol: UDP + - name: relay-50043 + port: 50043 + targetPort: 50043 + protocol: UDP + - name: relay-50044 + port: 50044 + targetPort: 50044 + protocol: UDP + - name: relay-50045 + port: 50045 + targetPort: 50045 + protocol: UDP + - name: relay-50046 + port: 50046 + targetPort: 50046 + protocol: UDP + - name: relay-50047 + port: 50047 + targetPort: 50047 + protocol: UDP + - name: relay-50048 + port: 50048 + targetPort: 50048 + protocol: UDP + - name: relay-50049 + port: 50049 + targetPort: 50049 + protocol: UDP + - name: relay-50050 + port: 50050 + targetPort: 50050 + protocol: UDP +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: turn-live-cert +spec: + secretName: turn-live-tls + issuerRef: + name: letsencrypt + kind: ClusterIssuer + dnsNames: + - turn.live.bstein.dev diff --git a/services/comms/element-call-config.yaml b/services/comms/element-call-config.yaml new file mode 100644 index 0000000..85368f2 --- /dev/null +++ 
b/services/comms/element-call-config.yaml @@ -0,0 +1,24 @@ +# services/comms/element-call-config.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: element-call-config +data: + config.json: | + { + "default_server_config": { + "m.homeserver": { + "base_url": "https://matrix.live.bstein.dev", + "server_name": "live.bstein.dev" + }, + "m.identity_server": { + "base_url": "https://vector.im" + } + }, + "livekit": { + "livekit_service_url": "https://kit.live.bstein.dev/livekit/jwt" + }, + "branding": { + "app_name": "Othrys Call" + } + } diff --git a/services/comms/element-call-deployment.yaml b/services/comms/element-call-deployment.yaml new file mode 100644 index 0000000..7f3581d --- /dev/null +++ b/services/comms/element-call-deployment.yaml @@ -0,0 +1,75 @@ +# services/comms/element-call-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: element-call + labels: + app: element-call +spec: + replicas: 1 + selector: + matchLabels: + app: element-call + template: + metadata: + labels: + app: element-call + spec: + nodeSelector: + hardware: rpi5 + containers: + - name: element-call + image: ghcr.io/element-hq/element-call:latest + ports: + - containerPort: 8080 + name: http + volumeMounts: + - name: config + mountPath: /app/config.json + subPath: config.json + volumes: + - name: config + configMap: + name: element-call-config + items: + - key: config.json + path: config.json + optional: false +--- +apiVersion: v1 +kind: Service +metadata: + name: element-call +spec: + selector: + app: element-call + ports: + - name: http + port: 80 + targetPort: 8080 +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: element-call + annotations: + kubernetes.io/ingress.class: traefik + traefik.ingress.kubernetes.io/router.entrypoints: websecure + traefik.ingress.kubernetes.io/router.tls: "true" + cert-manager.io/cluster-issuer: letsencrypt +spec: + tls: + - hosts: + - call.live.bstein.dev + secretName: call-live-tls + rules: + - host: 
call.live.bstein.dev + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: element-call + port: + number: 80 diff --git a/services/comms/element-rendered.yaml b/services/comms/element-rendered.yaml new file mode 100644 index 0000000..0d3200e --- /dev/null +++ b/services/comms/element-rendered.yaml @@ -0,0 +1,202 @@ +--- +# Source: element-web/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: othrys-element-element-web + labels: + helm.sh/chart: element-web-1.4.26 + app.kubernetes.io/name: element-web + app.kubernetes.io/instance: othrys-element + app.kubernetes.io/version: "1.12.6" + app.kubernetes.io/managed-by: Helm +--- +# Source: element-web/templates/configuration-nginx.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: othrys-element-element-web-nginx + labels: + helm.sh/chart: element-web-1.4.26 + app.kubernetes.io/name: element-web + app.kubernetes.io/instance: othrys-element + app.kubernetes.io/version: "1.12.6" + app.kubernetes.io/managed-by: Helm +data: + default.conf: | + server { + listen 8080; + listen [::]:8080; + server_name localhost; + + root /usr/share/nginx/html; + index index.html; + + add_header X-Frame-Options SAMEORIGIN; + add_header X-Content-Type-Options nosniff; + add_header X-XSS-Protection "1; mode=block"; + add_header Content-Security-Policy "frame-ancestors 'self'"; + + # Set no-cache for the index.html only so that browsers always check for a new copy of Element Web. 
+ location = /index.html { + add_header Cache-Control "no-cache"; + } + + # redirect server error pages to the static page /50x.html + # + error_page 500 502 503 504 /50x.html; + } +--- +# Source: element-web/templates/configuration.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: othrys-element-element-web + labels: + helm.sh/chart: element-web-1.4.26 + app.kubernetes.io/name: element-web + app.kubernetes.io/instance: othrys-element + app.kubernetes.io/version: "1.12.6" + app.kubernetes.io/managed-by: Helm +data: + config.json: | + {"brand":"Othrys","default_server_config":{"m.homeserver":{"base_url":"https://matrix.live.bstein.dev","server_name":"live.bstein.dev"},"m.identity_server":{"base_url":"https://vector.im"}},"default_theme":"dark","disable_custom_urls":true,"disable_login_language_selector":true,"disable_guests":false,"registration_url":"https://bstein.dev/request-access","show_labs_settings":true,"features":{"feature_group_calls":true,"feature_video_rooms":true,"feature_element_call_video_rooms":true},"room_directory":{"servers":["live.bstein.dev"]},"jitsi":{},"element_call":{"url":"https://call.live.bstein.dev","participant_limit":16,"brand":"Othrys Call"}} +--- +# Source: element-web/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: othrys-element-element-web + labels: + helm.sh/chart: element-web-1.4.26 + app.kubernetes.io/name: element-web + app.kubernetes.io/instance: othrys-element + app.kubernetes.io/version: "1.12.6" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 80 + targetPort: http + protocol: TCP + name: http + selector: + app.kubernetes.io/name: element-web + app.kubernetes.io/instance: othrys-element +--- +# Source: element-web/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: othrys-element-element-web + labels: + helm.sh/chart: element-web-1.4.26 + app.kubernetes.io/name: element-web + app.kubernetes.io/instance: othrys-element + 
app.kubernetes.io/version: "1.12.6" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: element-web + app.kubernetes.io/instance: othrys-element + template: + metadata: + annotations: + checksum/config: manual-rtc-enable-1 + checksum/config-nginx: 085061d0925f4840c3770233509dc0b00fe8fa1a5fef8bf282a514fd101c76fa + labels: + app.kubernetes.io/name: element-web + app.kubernetes.io/instance: othrys-element + spec: + serviceAccountName: othrys-element-element-web + securityContext: + {} + containers: + - name: element-web + securityContext: + {} + image: "ghcr.io/element-hq/element-web:v1.12.6" + imagePullPolicy: IfNotPresent + env: + - name: ELEMENT_WEB_PORT + value: '8080' + ports: + - name: http + containerPort: 8080 + protocol: TCP + livenessProbe: + httpGet: + path: / + port: http + readinessProbe: + httpGet: + path: / + port: http + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 100m + memory: 256Mi + volumeMounts: + - mountPath: /app/config.json + name: config + subPath: config.json + - mountPath: /etc/nginx/conf.d/config.json + name: config-nginx + subPath: config.json + volumes: + - name: config + configMap: + name: othrys-element-element-web + - name: config-nginx + configMap: + name: othrys-element-element-web-nginx + nodeSelector: + hardware: rpi5 + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - preference: + matchExpressions: + - key: hardware + operator: In + values: + - rpi5 + - rpi4 + weight: 50 +--- +# Source: element-web/templates/ingress.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: othrys-element-element-web + labels: + helm.sh/chart: element-web-1.4.26 + app.kubernetes.io/name: element-web + app.kubernetes.io/instance: othrys-element + app.kubernetes.io/version: "1.12.6" + app.kubernetes.io/managed-by: Helm + annotations: + cert-manager.io/cluster-issuer: letsencrypt + 
traefik.ingress.kubernetes.io/router.entrypoints: websecure +spec: + ingressClassName: traefik + tls: + - hosts: + - "live.bstein.dev" + secretName: live-othrys-tls + rules: + - host: "live.bstein.dev" + http: + paths: + - path: / + backend: + service: + name: othrys-element-element-web + port: + number: 80 + pathType: Prefix diff --git a/services/comms/guest-name-job.yaml b/services/comms/guest-name-job.yaml new file mode 100644 index 0000000..156617d --- /dev/null +++ b/services/comms/guest-name-job.yaml @@ -0,0 +1,401 @@ +# services/comms/guest-name-job.yaml +apiVersion: batch/v1 +kind: CronJob +metadata: + name: guest-name-randomizer + namespace: comms +spec: + schedule: "*/1 * * * *" + suspend: false + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 1 + failedJobsHistoryLimit: 1 + jobTemplate: + spec: + backoffLimit: 0 + template: + spec: + restartPolicy: Never + volumes: + - name: mas-admin-client + secret: + secretName: mas-admin-client-runtime + items: + - key: client_secret + path: client_secret + containers: + - name: rename + image: python:3.11-slim + volumeMounts: + - name: mas-admin-client + mountPath: /etc/mas-admin-client + readOnly: true + env: + - name: SYNAPSE_BASE + value: http://othrys-synapse-matrix-synapse:8008 + - name: MAS_ADMIN_CLIENT_ID + value: 01KDXMVQBQ5JNY6SEJPZW6Z8BM + - name: MAS_ADMIN_CLIENT_SECRET_FILE + value: /etc/mas-admin-client/client_secret + - name: MAS_ADMIN_API_BASE + value: http://matrix-authentication-service:8081/api/admin/v1 + - name: MAS_TOKEN_URL + value: http://matrix-authentication-service:8080/oauth2/token + - name: SEEDER_USER + value: othrys-seeder + - name: PGHOST + value: postgres-service.postgres.svc.cluster.local + - name: PGPORT + value: "5432" + - name: PGDATABASE + value: synapse + - name: PGUSER + value: synapse + - name: PGPASSWORD + valueFrom: + secretKeyRef: + name: synapse-db + key: POSTGRES_PASSWORD + command: + - /bin/sh + - -c + - | + set -euo pipefail + pip install --no-cache-dir requests 
psycopg2-binary >/dev/null + python - <<'PY' + import base64 + import os + import random + import requests + import time + import urllib.parse + import psycopg2 + + ADJ = [ + "brisk","calm","eager","gentle","merry","nifty","rapid","sunny","witty","zesty", + "amber","bold","bright","crisp","daring","frosty","glad","jolly","lively","mellow", + "quiet","ripe","serene","spry","tidy","vivid","warm","wild","clever","kind", + ] + NOUN = [ + "otter","falcon","comet","ember","grove","harbor","meadow","raven","river","summit", + "breeze","cedar","cinder","cove","delta","forest","glade","lark","marsh","peak", + "pine","quartz","reef","ridge","sable","sage","shore","thunder","vale","zephyr", + ] + + BASE = os.environ["SYNAPSE_BASE"] + MAS_ADMIN_CLIENT_ID = os.environ["MAS_ADMIN_CLIENT_ID"] + MAS_ADMIN_CLIENT_SECRET_FILE = os.environ["MAS_ADMIN_CLIENT_SECRET_FILE"] + MAS_ADMIN_API_BASE = os.environ["MAS_ADMIN_API_BASE"].rstrip("/") + MAS_TOKEN_URL = os.environ["MAS_TOKEN_URL"] + SEEDER_USER = os.environ["SEEDER_USER"] + ROOM_ALIAS = "#othrys:live.bstein.dev" + SERVER_NAME = "live.bstein.dev" + + def mas_admin_token(): + with open(MAS_ADMIN_CLIENT_SECRET_FILE, "r", encoding="utf-8") as f: + secret = f.read().strip() + basic = base64.b64encode(f"{MAS_ADMIN_CLIENT_ID}:{secret}".encode()).decode() + last_err = None + for attempt in range(5): + try: + r = requests.post( + MAS_TOKEN_URL, + headers={"Authorization": f"Basic {basic}"}, + data={"grant_type": "client_credentials", "scope": "urn:mas:admin"}, + timeout=30, + ) + r.raise_for_status() + return r.json()["access_token"] + except Exception as exc: # noqa: BLE001 + last_err = exc + time.sleep(2 ** attempt) + raise last_err + + def mas_user_id(token, username): + r = requests.get( + f"{MAS_ADMIN_API_BASE}/users/by-username/{urllib.parse.quote(username)}", + headers={"Authorization": f"Bearer {token}"}, + timeout=30, + ) + r.raise_for_status() + return r.json()["data"]["id"] + + def mas_personal_session(token, user_id): + r = 
requests.post( + f"{MAS_ADMIN_API_BASE}/personal-sessions", + headers={"Authorization": f"Bearer {token}"}, + json={ + "actor_user_id": user_id, + "human_name": "guest-name-randomizer", + "scope": "urn:matrix:client:api:*", + "expires_in": 300, + }, + timeout=30, + ) + r.raise_for_status() + data = r.json().get("data", {}).get("attributes", {}) or {} + return data["access_token"], r.json()["data"]["id"] + + def mas_revoke_session(token, session_id): + requests.post( + f"{MAS_ADMIN_API_BASE}/personal-sessions/{urllib.parse.quote(session_id)}/revoke", + headers={"Authorization": f"Bearer {token}"}, + json={}, + timeout=30, + ) + + def resolve_alias(token, alias): + headers = {"Authorization": f"Bearer {token}"} + enc = urllib.parse.quote(alias) + r = requests.get(f"{BASE}/_matrix/client/v3/directory/room/{enc}", headers=headers) + r.raise_for_status() + return r.json()["room_id"] + + def room_members(token, room_id): + headers = {"Authorization": f"Bearer {token}"} + r = requests.get(f"{BASE}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/members", headers=headers) + r.raise_for_status() + members = set() + existing_names = set() + for ev in r.json().get("chunk", []): + user_id = ev.get("state_key") + if user_id: + members.add(user_id) + disp = (ev.get("content") or {}).get("displayname") + if disp: + existing_names.add(disp) + return members, existing_names + + def mas_list_users(token): + headers = {"Authorization": f"Bearer {token}"} + users = [] + cursor = None + while True: + url = f"{MAS_ADMIN_API_BASE}/users?page[size]=100" + if cursor: + url += f"&page[after]={urllib.parse.quote(cursor)}" + r = requests.get(url, headers=headers, timeout=30) + r.raise_for_status() + data = r.json().get("data", []) + if not data: + break + users.extend(data) + cursor = data[-1].get("meta", {}).get("page", {}).get("cursor") + if not cursor: + break + return users + + def synapse_list_users(token): + headers = {"Authorization": f"Bearer {token}"} + users = [] + from_token 
= None + while True: + url = f"{BASE}/_synapse/admin/v2/users?local=true&deactivated=false&limit=100" + if from_token: + url += f"&from={urllib.parse.quote(from_token)}" + r = requests.get(url, headers=headers, timeout=30) + r.raise_for_status() + payload = r.json() + users.extend(payload.get("users", [])) + from_token = payload.get("next_token") + if not from_token: + break + return users + + def user_id_for_username(username): + return f"@{username}:live.bstein.dev" + + def get_displayname(token, user_id): + headers = {"Authorization": f"Bearer {token}"} + r = requests.get(f"{BASE}/_matrix/client/v3/profile/{urllib.parse.quote(user_id)}", headers=headers) + r.raise_for_status() + return r.json().get("displayname") + + def get_displayname_admin(token, user_id): + headers = {"Authorization": f"Bearer {token}"} + r = requests.get( + f"{BASE}/_synapse/admin/v2/users/{urllib.parse.quote(user_id)}", + headers=headers, + timeout=30, + ) + if r.status_code == 404: + return None + r.raise_for_status() + return r.json().get("displayname") + + def set_displayname(token, room_id, user_id, name, in_room): + headers = {"Authorization": f"Bearer {token}"} + payload = {"displayname": name} + r = requests.put( + f"{BASE}/_matrix/client/v3/profile/{urllib.parse.quote(user_id)}/displayname", + headers=headers, + json=payload, + ) + r.raise_for_status() + if not in_room: + return + state_url = f"{BASE}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/state/m.room.member/{urllib.parse.quote(user_id)}" + content = {"membership": "join", "displayname": name} + requests.put(state_url, headers=headers, json=content, timeout=30) + + def set_displayname_admin(token, user_id, name): + headers = {"Authorization": f"Bearer {token}"} + payload = {"displayname": name} + r = requests.put( + f"{BASE}/_synapse/admin/v2/users/{urllib.parse.quote(user_id)}", + headers=headers, + json=payload, + timeout=30, + ) + if r.status_code in (200, 201, 204): + return True + return False + + def 
needs_rename_username(username): + return username.isdigit() or username.startswith("guest-") + + def needs_rename_display(display): + return not display or display.isdigit() or display.startswith("guest-") + + def db_rename_numeric(existing_names): + profile_rows = [] + profile_index = {} + users = [] + conn = psycopg2.connect( + host=os.environ["PGHOST"], + port=int(os.environ["PGPORT"]), + dbname=os.environ["PGDATABASE"], + user=os.environ["PGUSER"], + password=os.environ["PGPASSWORD"], + ) + try: + with conn: + with conn.cursor() as cur: + cur.execute( + "SELECT user_id, full_user_id, displayname FROM profiles WHERE full_user_id ~ %s", + (f"^@\\d+:{SERVER_NAME}$",), + ) + profile_rows = cur.fetchall() + profile_index = {row[1]: row for row in profile_rows} + for user_id, full_user_id, display in profile_rows: + if display and not needs_rename_display(display): + continue + new = None + for _ in range(30): + candidate = f"{random.choice(ADJ)}-{random.choice(NOUN)}" + if candidate not in existing_names: + new = candidate + existing_names.add(candidate) + break + if not new: + continue + cur.execute( + "UPDATE profiles SET displayname = %s WHERE full_user_id = %s", + (new, full_user_id), + ) + + cur.execute( + "SELECT name FROM users WHERE name ~ %s", + (f"^@\\d+:{SERVER_NAME}$",), + ) + users = [row[0] for row in cur.fetchall()] + if not users: + return + cur.execute( + "SELECT user_id, full_user_id FROM profiles WHERE full_user_id = ANY(%s)", + (users,), + ) + for existing_full in cur.fetchall(): + profile_index.setdefault(existing_full[1], existing_full) + + for full_user_id in users: + if full_user_id in profile_index: + continue + localpart = full_user_id.split(":", 1)[0].lstrip("@") + new = None + for _ in range(30): + candidate = f"{random.choice(ADJ)}-{random.choice(NOUN)}" + if candidate not in existing_names: + new = candidate + existing_names.add(candidate) + break + if not new: + continue + cur.execute( + "INSERT INTO profiles (user_id, displayname, 
full_user_id) VALUES (%s, %s, %s) " + "ON CONFLICT (full_user_id) DO UPDATE SET displayname = EXCLUDED.displayname", + (localpart, new, full_user_id), + ) + finally: + conn.close() + + admin_token = mas_admin_token() + seeder_id = mas_user_id(admin_token, SEEDER_USER) + seeder_token, seeder_session = mas_personal_session(admin_token, seeder_id) + try: + room_id = resolve_alias(seeder_token, ROOM_ALIAS) + members, existing = room_members(seeder_token, room_id) + users = mas_list_users(admin_token) + mas_usernames = set() + for user in users: + attrs = user.get("attributes") or {} + username = attrs.get("username") or "" + if username: + mas_usernames.add(username) + legacy_guest = attrs.get("legacy_guest") + if not username: + continue + if not (legacy_guest or needs_rename_username(username)): + continue + user_id = user_id_for_username(username) + access_token, session_id = mas_personal_session(admin_token, user["id"]) + try: + display = get_displayname(access_token, user_id) + if display and not needs_rename_display(display): + continue + new = None + for _ in range(30): + candidate = f"{random.choice(ADJ)}-{random.choice(NOUN)}" + if candidate not in existing: + new = candidate + existing.add(candidate) + break + if not new: + continue + set_displayname(access_token, room_id, user_id, new, user_id in members) + finally: + mas_revoke_session(admin_token, session_id) + + try: + entries = synapse_list_users(seeder_token) + except Exception as exc: # noqa: BLE001 + print(f"synapse admin list skipped: {exc}") + entries = [] + for entry in entries: + user_id = entry.get("name") or "" + if not user_id.startswith("@"): + continue + localpart = user_id.split(":", 1)[0].lstrip("@") + if localpart in mas_usernames: + continue + is_guest = entry.get("is_guest") + if not (is_guest or needs_rename_username(localpart)): + continue + display = get_displayname_admin(seeder_token, user_id) + if display and not needs_rename_display(display): + continue + new = None + for _ in 
range(30): + candidate = f"{random.choice(ADJ)}-{random.choice(NOUN)}" + if candidate not in existing: + new = candidate + existing.add(candidate) + break + if not new: + continue + if not set_displayname_admin(seeder_token, user_id, new): + continue + db_rename_numeric(existing) + finally: + mas_revoke_session(admin_token, seeder_session) + PY diff --git a/services/comms/guest-register-deployment.yaml b/services/comms/guest-register-deployment.yaml new file mode 100644 index 0000000..284cc42 --- /dev/null +++ b/services/comms/guest-register-deployment.yaml @@ -0,0 +1,104 @@ +# services/comms/guest-register-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: matrix-guest-register + labels: + app.kubernetes.io/name: matrix-guest-register +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: matrix-guest-register + template: + metadata: + annotations: + checksum/config: guest-register-proxy-5 + labels: + app.kubernetes.io/name: matrix-guest-register + spec: + securityContext: + runAsNonRoot: true + runAsUser: 10001 + runAsGroup: 10001 + containers: + - name: guest-register + image: python:3.11-slim + imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + env: + - name: PYTHONDONTWRITEBYTECODE + value: "1" + - name: PYTHONUNBUFFERED + value: "1" + - name: PORT + value: "8080" + - name: MAS_BASE + value: http://matrix-authentication-service:8080 + - name: MAS_ADMIN_CLIENT_ID + value: 01KDXMVQBQ5JNY6SEJPZW6Z8BM + - name: MAS_ADMIN_CLIENT_SECRET_FILE + value: /etc/mas/admin-client/client_secret + - name: MAS_ADMIN_API_BASE + value: http://matrix-authentication-service:8081/api/admin/v1 + - name: SYNAPSE_BASE + value: http://othrys-synapse-matrix-synapse:8008 + - name: MATRIX_SERVER_NAME + value: live.bstein.dev + - name: RATE_WINDOW_SEC + value: "60" + - name: RATE_MAX + value: "30" + ports: + - name: http + containerPort: 8080 + protocol: TCP + readinessProbe: + 
httpGet: + path: /healthz + port: http + initialDelaySeconds: 2 + periodSeconds: 10 + timeoutSeconds: 2 + livenessProbe: + httpGet: + path: /healthz + port: http + initialDelaySeconds: 10 + periodSeconds: 20 + timeoutSeconds: 2 + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 250m + memory: 256Mi + volumeMounts: + - name: app + mountPath: /app/server.py + subPath: server.py + readOnly: true + - name: mas-admin-client + mountPath: /etc/mas/admin-client + readOnly: true + command: + - python + - /app/server.py + volumes: + - name: app + configMap: + name: matrix-guest-register + items: + - key: server.py + path: server.py + - name: mas-admin-client + secret: + secretName: mas-admin-client-runtime + items: + - key: client_secret + path: client_secret diff --git a/services/comms/guest-register-service.yaml b/services/comms/guest-register-service.yaml new file mode 100644 index 0000000..5bb740a --- /dev/null +++ b/services/comms/guest-register-service.yaml @@ -0,0 +1,16 @@ +# services/comms/guest-register-service.yaml +apiVersion: v1 +kind: Service +metadata: + name: matrix-guest-register + labels: + app.kubernetes.io/name: matrix-guest-register +spec: + selector: + app.kubernetes.io/name: matrix-guest-register + ports: + - name: http + port: 8080 + targetPort: http + protocol: TCP + diff --git a/services/comms/knowledge/INDEX.md b/services/comms/knowledge/INDEX.md new file mode 100644 index 0000000..fac9153 --- /dev/null +++ b/services/comms/knowledge/INDEX.md @@ -0,0 +1,22 @@ +Atlas Knowledge Base (KB) + +This folder is the source-of-truth “memory” for Atlas/Titan assistants (and for humans). It is designed to be: +- Accurate (grounded in GitOps + read-only cluster tools) +- Maintainable (small docs + deterministic generators) +- Safe (no secrets; refer to Secret/Vault paths by name only) + +Layout +- `knowledge/runbooks/`: human-written docs (short, chunkable Markdown). +- `knowledge/catalog/`: generated machine-readable facts (YAML/JSON). 
+- `knowledge/diagrams/`: generated Mermaid diagrams (`.mmd`) derived from the catalog. + +Regeneration +- Update manifests/docs, then regenerate generated artifacts: + - `python scripts/knowledge_render_atlas.py --write` + +Authoring rules +- Never include secret values. Prefer `secretRef` names or Vault paths like `kv/atlas/...`. +- Prefer stable identifiers: Kubernetes `namespace/name`, DNS hostnames, Flux kustomization paths. +- Keep each runbook small; one topic per file; use headings. +- When in doubt, link to the exact file path in this repo that configures the behavior. + diff --git a/services/comms/knowledge/catalog/atlas-summary.json b/services/comms/knowledge/catalog/atlas-summary.json new file mode 100644 index 0000000..2139e29 --- /dev/null +++ b/services/comms/knowledge/catalog/atlas-summary.json @@ -0,0 +1,8 @@ +{ + "counts": { + "helmrelease_host_hints": 7, + "http_endpoints": 35, + "services": 44, + "workloads": 49 + } +} diff --git a/services/comms/knowledge/catalog/atlas.json b/services/comms/knowledge/catalog/atlas.json new file mode 100644 index 0000000..92f08f4 --- /dev/null +++ b/services/comms/knowledge/catalog/atlas.json @@ -0,0 +1,2771 @@ +{ + "cluster": "atlas", + "sources": [ + { + "name": "ai-llm", + "path": "services/ai-llm", + "targetNamespace": "ai" + }, + { + "name": "bstein-dev-home", + "path": "services/bstein-dev-home", + "targetNamespace": "bstein-dev-home" + }, + { + "name": "ci-demo", + "path": "services/ci-demo", + "targetNamespace": null + }, + { + "name": "communication", + "path": "services/comms", + "targetNamespace": "comms" + }, + { + "name": "core", + "path": "infrastructure/core", + "targetNamespace": null + }, + { + "name": "crypto", + "path": "services/crypto", + "targetNamespace": "crypto" + }, + { + "name": "flux-system", + "path": "clusters/atlas/flux-system", + "targetNamespace": null + }, + { + "name": "gitea", + "path": "services/gitea", + "targetNamespace": "gitea" + }, + { + "name": "gitops-ui", + "path": 
"services/gitops-ui", + "targetNamespace": "flux-system" + }, + { + "name": "harbor", + "path": "services/harbor", + "targetNamespace": "harbor" + }, + { + "name": "helm", + "path": "infrastructure/sources/helm", + "targetNamespace": "flux-system" + }, + { + "name": "jellyfin", + "path": "services/jellyfin", + "targetNamespace": "jellyfin" + }, + { + "name": "jenkins", + "path": "services/jenkins", + "targetNamespace": "jenkins" + }, + { + "name": "keycloak", + "path": "services/keycloak", + "targetNamespace": "sso" + }, + { + "name": "longhorn-ui", + "path": "infrastructure/longhorn/ui-ingress", + "targetNamespace": "longhorn-system" + }, + { + "name": "mailu", + "path": "services/mailu", + "targetNamespace": "mailu-mailserver" + }, + { + "name": "metallb", + "path": "infrastructure/metallb", + "targetNamespace": "metallb-system" + }, + { + "name": "monerod", + "path": "services/crypto/monerod", + "targetNamespace": "crypto" + }, + { + "name": "monitoring", + "path": "services/monitoring", + "targetNamespace": null + }, + { + "name": "nextcloud", + "path": "services/nextcloud", + "targetNamespace": "nextcloud" + }, + { + "name": "nextcloud-mail-sync", + "path": "services/nextcloud-mail-sync", + "targetNamespace": "nextcloud" + }, + { + "name": "oauth2-proxy", + "path": "services/oauth2-proxy", + "targetNamespace": "sso" + }, + { + "name": "openldap", + "path": "services/openldap", + "targetNamespace": "sso" + }, + { + "name": "pegasus", + "path": "services/pegasus", + "targetNamespace": "jellyfin" + }, + { + "name": "sui-metrics", + "path": "services/sui-metrics/overlays/atlas", + "targetNamespace": "sui-metrics" + }, + { + "name": "traefik", + "path": "infrastructure/traefik", + "targetNamespace": "traefik" + }, + { + "name": "vault", + "path": "services/vault", + "targetNamespace": "vault" + }, + { + "name": "vault-csi", + "path": "infrastructure/vault-csi", + "targetNamespace": "kube-system" + }, + { + "name": "vaultwarden", + "path": "services/vaultwarden", + 
"targetNamespace": "vaultwarden" + }, + { + "name": "xmr-miner", + "path": "services/crypto/xmr-miner", + "targetNamespace": "crypto" + } + ], + "workloads": [ + { + "kind": "Deployment", + "namespace": "ai", + "name": "ollama", + "labels": { + "app": "ollama" + }, + "serviceAccountName": null, + "nodeSelector": {}, + "images": [ + "ollama/ollama:latest" + ] + }, + { + "kind": "Deployment", + "namespace": "bstein-dev-home", + "name": "bstein-dev-home-backend", + "labels": { + "app": "bstein-dev-home-backend" + }, + "serviceAccountName": "bstein-dev-home", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-84" + ] + }, + { + "kind": "Deployment", + "namespace": "bstein-dev-home", + "name": "bstein-dev-home-frontend", + "labels": { + "app": "bstein-dev-home-frontend" + }, + "serviceAccountName": null, + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-84" + ] + }, + { + "kind": "Deployment", + "namespace": "bstein-dev-home", + "name": "chat-ai-gateway", + "labels": { + "app": "chat-ai-gateway" + }, + "serviceAccountName": null, + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "python:3.11-slim" + ] + }, + { + "kind": "Deployment", + "namespace": "ci-demo", + "name": "ci-demo", + "labels": { + "app.kubernetes.io/name": "ci-demo" + }, + "serviceAccountName": null, + "nodeSelector": { + "hardware": "rpi4" + }, + "images": [ + "registry.bstein.dev/infra/ci-demo:v0.0.0-3" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "atlasbot", + "labels": { + "app": "atlasbot" + }, + "serviceAccountName": "atlasbot", + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "python:3.11-slim" + ] + }, + { + "kind": "Deployment", + "namespace": 
"comms", + "name": "coturn", + "labels": { + "app": "coturn" + }, + "serviceAccountName": null, + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "ghcr.io/coturn/coturn:4.6.2" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "element-call", + "labels": { + "app": "element-call" + }, + "serviceAccountName": null, + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "ghcr.io/element-hq/element-call:latest" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "livekit", + "labels": { + "app": "livekit" + }, + "serviceAccountName": null, + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "livekit/livekit-server:v1.9.0" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "livekit-token-service", + "labels": { + "app": "livekit-token-service" + }, + "serviceAccountName": null, + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "ghcr.io/element-hq/lk-jwt-service:0.3.0" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "matrix-authentication-service", + "labels": { + "app": "matrix-authentication-service" + }, + "serviceAccountName": null, + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "ghcr.io/element-hq/matrix-authentication-service:1.8.0" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "matrix-guest-register", + "labels": { + "app.kubernetes.io/name": "matrix-guest-register" + }, + "serviceAccountName": null, + "nodeSelector": {}, + "images": [ + "python:3.11-slim" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "matrix-wellknown", + "labels": { + "app": "matrix-wellknown" + }, + "serviceAccountName": null, + "nodeSelector": {}, + "images": [ + "nginx:1.27-alpine" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "othrys-element-element-web", + "labels": { + "app.kubernetes.io/instance": "othrys-element", + "app.kubernetes.io/name": "element-web" + }, + 
"serviceAccountName": "othrys-element-element-web", + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "ghcr.io/element-hq/element-web:v1.12.6" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "othrys-synapse-matrix-synapse", + "labels": { + "app.kubernetes.io/component": "synapse", + "app.kubernetes.io/instance": "othrys-synapse", + "app.kubernetes.io/name": "matrix-synapse" + }, + "serviceAccountName": "default", + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "ghcr.io/element-hq/synapse:v1.144.0" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "othrys-synapse-redis-master", + "labels": { + "app.kubernetes.io/component": "master", + "app.kubernetes.io/instance": "othrys-synapse", + "app.kubernetes.io/managed-by": "Helm", + "app.kubernetes.io/name": "redis", + "helm.sh/chart": "redis-17.17.1" + }, + "serviceAccountName": "othrys-synapse-redis", + "nodeSelector": {}, + "images": [ + "docker.io/bitnamilegacy/redis:7.0.12-debian-11-r34" + ] + }, + { + "kind": "DaemonSet", + "namespace": "crypto", + "name": "monero-xmrig", + "labels": { + "app": "monero-xmrig" + }, + "serviceAccountName": null, + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "ghcr.io/tari-project/xmrig:latest" + ] + }, + { + "kind": "Deployment", + "namespace": "crypto", + "name": "monero-p2pool", + "labels": { + "app": "monero-p2pool" + }, + "serviceAccountName": null, + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "debian:bookworm-slim" + ] + }, + { + "kind": "Deployment", + "namespace": "crypto", + "name": "monerod", + "labels": { + "app": "monerod" + }, + "serviceAccountName": null, + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/crypto/monerod:0.18.4.1" + ] + }, + { + "kind": "Deployment", + "namespace": "flux-system", + "name": "helm-controller", + "labels": { + "app": "helm-controller", + 
"app.kubernetes.io/component": "helm-controller", + "app.kubernetes.io/instance": "flux-system", + "app.kubernetes.io/part-of": "flux", + "app.kubernetes.io/version": "v2.7.5" + }, + "serviceAccountName": "helm-controller", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "ghcr.io/fluxcd/helm-controller:v1.4.5" + ] + }, + { + "kind": "Deployment", + "namespace": "flux-system", + "name": "image-automation-controller", + "labels": { + "app": "image-automation-controller", + "app.kubernetes.io/component": "image-automation-controller", + "app.kubernetes.io/instance": "flux-system", + "app.kubernetes.io/part-of": "flux", + "app.kubernetes.io/version": "v2.7.5" + }, + "serviceAccountName": "image-automation-controller", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "ghcr.io/fluxcd/image-automation-controller:v1.0.4" + ] + }, + { + "kind": "Deployment", + "namespace": "flux-system", + "name": "image-reflector-controller", + "labels": { + "app": "image-reflector-controller", + "app.kubernetes.io/component": "image-reflector-controller", + "app.kubernetes.io/instance": "flux-system", + "app.kubernetes.io/part-of": "flux", + "app.kubernetes.io/version": "v2.7.5" + }, + "serviceAccountName": "image-reflector-controller", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "ghcr.io/fluxcd/image-reflector-controller:v1.0.4" + ] + }, + { + "kind": "Deployment", + "namespace": "flux-system", + "name": "kustomize-controller", + "labels": { + "app": "kustomize-controller", + "app.kubernetes.io/component": "kustomize-controller", + "app.kubernetes.io/instance": "flux-system", + "app.kubernetes.io/part-of": "flux", + "app.kubernetes.io/version": "v2.7.5" + }, + "serviceAccountName": "kustomize-controller", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "ghcr.io/fluxcd/kustomize-controller:v1.7.3" + ] + }, + { + "kind": "Deployment", + "namespace": "flux-system", + "name": 
"notification-controller", + "labels": { + "app": "notification-controller", + "app.kubernetes.io/component": "notification-controller", + "app.kubernetes.io/instance": "flux-system", + "app.kubernetes.io/part-of": "flux", + "app.kubernetes.io/version": "v2.7.5" + }, + "serviceAccountName": "notification-controller", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "ghcr.io/fluxcd/notification-controller:v1.7.5" + ] + }, + { + "kind": "Deployment", + "namespace": "flux-system", + "name": "source-controller", + "labels": { + "app": "source-controller", + "app.kubernetes.io/component": "source-controller", + "app.kubernetes.io/instance": "flux-system", + "app.kubernetes.io/part-of": "flux", + "app.kubernetes.io/version": "v2.7.5" + }, + "serviceAccountName": "source-controller", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "ghcr.io/fluxcd/source-controller:v1.7.4" + ] + }, + { + "kind": "Deployment", + "namespace": "gitea", + "name": "gitea", + "labels": { + "app": "gitea" + }, + "serviceAccountName": null, + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "gitea/gitea:1.23" + ] + }, + { + "kind": "Deployment", + "namespace": "jellyfin", + "name": "jellyfin", + "labels": { + "app": "jellyfin" + }, + "serviceAccountName": null, + "nodeSelector": {}, + "images": [ + "docker.io/jellyfin/jellyfin:10.11.5" + ] + }, + { + "kind": "Deployment", + "namespace": "jellyfin", + "name": "pegasus", + "labels": { + "app": "pegasus" + }, + "serviceAccountName": null, + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "alpine:3.20", + "registry.bstein.dev/streaming/pegasus:1.2.32" + ] + }, + { + "kind": "Deployment", + "namespace": "jenkins", + "name": "jenkins", + "labels": { + "app": "jenkins" + }, + "serviceAccountName": "jenkins", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + 
"images": [ + "jenkins/jenkins:2.528.3-jdk21" + ] + }, + { + "kind": "DaemonSet", + "namespace": "kube-system", + "name": "nvidia-device-plugin-jetson", + "labels": { + "app.kubernetes.io/instance": "jetson", + "app.kubernetes.io/name": "nvidia-device-plugin" + }, + "serviceAccountName": null, + "nodeSelector": { + "jetson": "true", + "kubernetes.io/arch": "arm64" + }, + "images": [ + "nvcr.io/nvidia/k8s-device-plugin:v0.16.2" + ] + }, + { + "kind": "DaemonSet", + "namespace": "kube-system", + "name": "nvidia-device-plugin-minipc", + "labels": { + "app.kubernetes.io/instance": "titan22", + "app.kubernetes.io/name": "nvidia-device-plugin" + }, + "serviceAccountName": null, + "nodeSelector": { + "kubernetes.io/arch": "amd64", + "kubernetes.io/hostname": "titan-22" + }, + "images": [ + "nvcr.io/nvidia/k8s-device-plugin:v0.16.2" + ] + }, + { + "kind": "DaemonSet", + "namespace": "kube-system", + "name": "nvidia-device-plugin-tethys", + "labels": { + "app.kubernetes.io/instance": "titan24", + "app.kubernetes.io/name": "nvidia-device-plugin" + }, + "serviceAccountName": null, + "nodeSelector": { + "kubernetes.io/arch": "amd64", + "kubernetes.io/hostname": "titan-24" + }, + "images": [ + "nvcr.io/nvidia/k8s-device-plugin:v0.16.2" + ] + }, + { + "kind": "DaemonSet", + "namespace": "kube-system", + "name": "vault-csi-provider", + "labels": { + "app.kubernetes.io/name": "vault-csi-provider" + }, + "serviceAccountName": "vault-csi-provider", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "hashicorp/vault-csi-provider:1.7.0" + ] + }, + { + "kind": "Deployment", + "namespace": "longhorn-system", + "name": "oauth2-proxy-longhorn", + "labels": { + "app": "oauth2-proxy-longhorn" + }, + "serviceAccountName": null, + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "quay.io/oauth2-proxy/oauth2-proxy:v7.6.0" + ] + }, + { + "kind": "DaemonSet", + "namespace": "mailu-mailserver", + "name": "vip-controller", + "labels": { + "app": 
"vip-controller" + }, + "serviceAccountName": "vip-controller", + "nodeSelector": { + "mailu.bstein.dev/vip": "true" + }, + "images": [ + "lachlanevenson/k8s-kubectl:latest" + ] + }, + { + "kind": "Deployment", + "namespace": "mailu-mailserver", + "name": "mailu-sync-listener", + "labels": { + "app": "mailu-sync-listener" + }, + "serviceAccountName": null, + "nodeSelector": {}, + "images": [ + "python:3.11-alpine" + ] + }, + { + "kind": "DaemonSet", + "namespace": "metallb-system", + "name": "metallb-speaker", + "labels": { + "app.kubernetes.io/component": "speaker", + "app.kubernetes.io/instance": "metallb", + "app.kubernetes.io/name": "metallb" + }, + "serviceAccountName": "metallb-speaker", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "quay.io/frrouting/frr:10.4.1", + "quay.io/metallb/speaker:v0.15.3" + ] + }, + { + "kind": "Deployment", + "namespace": "metallb-system", + "name": "metallb-controller", + "labels": { + "app.kubernetes.io/component": "controller", + "app.kubernetes.io/instance": "metallb", + "app.kubernetes.io/name": "metallb" + }, + "serviceAccountName": "metallb-controller", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "quay.io/metallb/controller:v0.15.3" + ] + }, + { + "kind": "DaemonSet", + "namespace": "monitoring", + "name": "dcgm-exporter", + "labels": { + "app": "dcgm-exporter" + }, + "serviceAccountName": "default", + "nodeSelector": {}, + "images": [ + "registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04" + ] + }, + { + "kind": "Deployment", + "namespace": "monitoring", + "name": "postmark-exporter", + "labels": { + "app": "postmark-exporter" + }, + "serviceAccountName": null, + "nodeSelector": {}, + "images": [ + "python:3.12-alpine" + ] + }, + { + "kind": "Deployment", + "namespace": "nextcloud", + "name": "collabora", + "labels": { + "app": "collabora" + }, + "serviceAccountName": null, + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + 
"collabora/code:latest" + ] + }, + { + "kind": "Deployment", + "namespace": "nextcloud", + "name": "nextcloud", + "labels": { + "app": "nextcloud" + }, + "serviceAccountName": null, + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "nextcloud:29-apache" + ] + }, + { + "kind": "Deployment", + "namespace": "sso", + "name": "keycloak", + "labels": { + "app": "keycloak" + }, + "serviceAccountName": null, + "nodeSelector": {}, + "images": [ + "quay.io/keycloak/keycloak:26.0.7" + ] + }, + { + "kind": "Deployment", + "namespace": "sso", + "name": "oauth2-proxy", + "labels": { + "app": "oauth2-proxy" + }, + "serviceAccountName": null, + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "quay.io/oauth2-proxy/oauth2-proxy:v7.6.0" + ] + }, + { + "kind": "StatefulSet", + "namespace": "sso", + "name": "openldap", + "labels": { + "app": "openldap" + }, + "serviceAccountName": null, + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "docker.io/osixia/openldap:1.5.0" + ] + }, + { + "kind": "Deployment", + "namespace": "sui-metrics", + "name": "sui-metrics", + "labels": { + "app": "sui-metrics" + }, + "serviceAccountName": "sui-metrics", + "nodeSelector": { + "kubernetes.io/hostname": "titan-24" + }, + "images": [ + "victoriametrics/vmagent:v1.103.0" + ] + }, + { + "kind": "Deployment", + "namespace": "traefik", + "name": "traefik", + "labels": { + "app": "traefik" + }, + "serviceAccountName": "traefik-ingress-controller", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "traefik:v3.3.3" + ] + }, + { + "kind": "StatefulSet", + "namespace": "vault", + "name": "vault", + "labels": { + "app": "vault" + }, + "serviceAccountName": "vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "hashicorp/vault:1.17.6" + ] + }, + { + "kind": "Deployment", + "namespace": "vaultwarden", 
+ "name": "vaultwarden", + "labels": { + "app": "vaultwarden" + }, + "serviceAccountName": null, + "nodeSelector": {}, + "images": [ + "vaultwarden/server:1.33.2" + ] + } + ], + "services": [ + { + "namespace": "ai", + "name": "ollama", + "type": "ClusterIP", + "selector": { + "app": "ollama" + }, + "ports": [ + { + "name": "http", + "port": 11434, + "targetPort": 11434, + "protocol": "TCP" + } + ] + }, + { + "namespace": "bstein-dev-home", + "name": "bstein-dev-home-backend", + "type": "ClusterIP", + "selector": { + "app": "bstein-dev-home-backend" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 8080, + "protocol": "TCP" + } + ] + }, + { + "namespace": "bstein-dev-home", + "name": "bstein-dev-home-frontend", + "type": "ClusterIP", + "selector": { + "app": "bstein-dev-home-frontend" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 80, + "protocol": "TCP" + } + ] + }, + { + "namespace": "bstein-dev-home", + "name": "chat-ai-gateway", + "type": "ClusterIP", + "selector": { + "app": "chat-ai-gateway" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 8080, + "protocol": "TCP" + } + ] + }, + { + "namespace": "ci-demo", + "name": "ci-demo", + "type": "ClusterIP", + "selector": { + "app.kubernetes.io/name": "ci-demo" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "comms", + "name": "coturn", + "type": "LoadBalancer", + "selector": { + "app": "coturn" + }, + "ports": [ + { + "name": "turn-udp", + "port": 3478, + "targetPort": 3478, + "protocol": "UDP" + }, + { + "name": "turn-tcp", + "port": 3478, + "targetPort": 3478, + "protocol": "TCP" + }, + { + "name": "turn-tls", + "port": 5349, + "targetPort": 5349, + "protocol": "TCP" + }, + { + "name": "relay-50000", + "port": 50000, + "targetPort": 50000, + "protocol": "UDP" + }, + { + "name": "relay-50001", + "port": 50001, + "targetPort": 50001, + "protocol": "UDP" + }, + { + "name": 
"relay-50002", + "port": 50002, + "targetPort": 50002, + "protocol": "UDP" + }, + { + "name": "relay-50003", + "port": 50003, + "targetPort": 50003, + "protocol": "UDP" + }, + { + "name": "relay-50004", + "port": 50004, + "targetPort": 50004, + "protocol": "UDP" + }, + { + "name": "relay-50005", + "port": 50005, + "targetPort": 50005, + "protocol": "UDP" + }, + { + "name": "relay-50006", + "port": 50006, + "targetPort": 50006, + "protocol": "UDP" + }, + { + "name": "relay-50007", + "port": 50007, + "targetPort": 50007, + "protocol": "UDP" + }, + { + "name": "relay-50008", + "port": 50008, + "targetPort": 50008, + "protocol": "UDP" + }, + { + "name": "relay-50009", + "port": 50009, + "targetPort": 50009, + "protocol": "UDP" + }, + { + "name": "relay-50010", + "port": 50010, + "targetPort": 50010, + "protocol": "UDP" + }, + { + "name": "relay-50011", + "port": 50011, + "targetPort": 50011, + "protocol": "UDP" + }, + { + "name": "relay-50012", + "port": 50012, + "targetPort": 50012, + "protocol": "UDP" + }, + { + "name": "relay-50013", + "port": 50013, + "targetPort": 50013, + "protocol": "UDP" + }, + { + "name": "relay-50014", + "port": 50014, + "targetPort": 50014, + "protocol": "UDP" + }, + { + "name": "relay-50015", + "port": 50015, + "targetPort": 50015, + "protocol": "UDP" + }, + { + "name": "relay-50016", + "port": 50016, + "targetPort": 50016, + "protocol": "UDP" + }, + { + "name": "relay-50017", + "port": 50017, + "targetPort": 50017, + "protocol": "UDP" + }, + { + "name": "relay-50018", + "port": 50018, + "targetPort": 50018, + "protocol": "UDP" + }, + { + "name": "relay-50019", + "port": 50019, + "targetPort": 50019, + "protocol": "UDP" + }, + { + "name": "relay-50020", + "port": 50020, + "targetPort": 50020, + "protocol": "UDP" + }, + { + "name": "relay-50021", + "port": 50021, + "targetPort": 50021, + "protocol": "UDP" + }, + { + "name": "relay-50022", + "port": 50022, + "targetPort": 50022, + "protocol": "UDP" + }, + { + "name": "relay-50023", + "port": 
50023, + "targetPort": 50023, + "protocol": "UDP" + }, + { + "name": "relay-50024", + "port": 50024, + "targetPort": 50024, + "protocol": "UDP" + }, + { + "name": "relay-50025", + "port": 50025, + "targetPort": 50025, + "protocol": "UDP" + }, + { + "name": "relay-50026", + "port": 50026, + "targetPort": 50026, + "protocol": "UDP" + }, + { + "name": "relay-50027", + "port": 50027, + "targetPort": 50027, + "protocol": "UDP" + }, + { + "name": "relay-50028", + "port": 50028, + "targetPort": 50028, + "protocol": "UDP" + }, + { + "name": "relay-50029", + "port": 50029, + "targetPort": 50029, + "protocol": "UDP" + }, + { + "name": "relay-50030", + "port": 50030, + "targetPort": 50030, + "protocol": "UDP" + }, + { + "name": "relay-50031", + "port": 50031, + "targetPort": 50031, + "protocol": "UDP" + }, + { + "name": "relay-50032", + "port": 50032, + "targetPort": 50032, + "protocol": "UDP" + }, + { + "name": "relay-50033", + "port": 50033, + "targetPort": 50033, + "protocol": "UDP" + }, + { + "name": "relay-50034", + "port": 50034, + "targetPort": 50034, + "protocol": "UDP" + }, + { + "name": "relay-50035", + "port": 50035, + "targetPort": 50035, + "protocol": "UDP" + }, + { + "name": "relay-50036", + "port": 50036, + "targetPort": 50036, + "protocol": "UDP" + }, + { + "name": "relay-50037", + "port": 50037, + "targetPort": 50037, + "protocol": "UDP" + }, + { + "name": "relay-50038", + "port": 50038, + "targetPort": 50038, + "protocol": "UDP" + }, + { + "name": "relay-50039", + "port": 50039, + "targetPort": 50039, + "protocol": "UDP" + }, + { + "name": "relay-50040", + "port": 50040, + "targetPort": 50040, + "protocol": "UDP" + }, + { + "name": "relay-50041", + "port": 50041, + "targetPort": 50041, + "protocol": "UDP" + }, + { + "name": "relay-50042", + "port": 50042, + "targetPort": 50042, + "protocol": "UDP" + }, + { + "name": "relay-50043", + "port": 50043, + "targetPort": 50043, + "protocol": "UDP" + }, + { + "name": "relay-50044", + "port": 50044, + "targetPort": 
50044, + "protocol": "UDP" + }, + { + "name": "relay-50045", + "port": 50045, + "targetPort": 50045, + "protocol": "UDP" + }, + { + "name": "relay-50046", + "port": 50046, + "targetPort": 50046, + "protocol": "UDP" + }, + { + "name": "relay-50047", + "port": 50047, + "targetPort": 50047, + "protocol": "UDP" + }, + { + "name": "relay-50048", + "port": 50048, + "targetPort": 50048, + "protocol": "UDP" + }, + { + "name": "relay-50049", + "port": 50049, + "targetPort": 50049, + "protocol": "UDP" + }, + { + "name": "relay-50050", + "port": 50050, + "targetPort": 50050, + "protocol": "UDP" + } + ] + }, + { + "namespace": "comms", + "name": "element-call", + "type": "ClusterIP", + "selector": { + "app": "element-call" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 8080, + "protocol": "TCP" + } + ] + }, + { + "namespace": "comms", + "name": "livekit", + "type": "LoadBalancer", + "selector": { + "app": "livekit" + }, + "ports": [ + { + "name": "http", + "port": 7880, + "targetPort": 7880, + "protocol": "TCP" + }, + { + "name": "rtc-tcp", + "port": 7881, + "targetPort": 7881, + "protocol": "TCP" + }, + { + "name": "rtc-udp-7882", + "port": 7882, + "targetPort": 7882, + "protocol": "UDP" + }, + { + "name": "rtc-udp-7883", + "port": 7883, + "targetPort": 7883, + "protocol": "UDP" + } + ] + }, + { + "namespace": "comms", + "name": "livekit-token-service", + "type": "ClusterIP", + "selector": { + "app": "livekit-token-service" + }, + "ports": [ + { + "name": "http", + "port": 8080, + "targetPort": 8080, + "protocol": "TCP" + } + ] + }, + { + "namespace": "comms", + "name": "matrix-authentication-service", + "type": "ClusterIP", + "selector": { + "app": "matrix-authentication-service" + }, + "ports": [ + { + "name": "http", + "port": 8080, + "targetPort": "http", + "protocol": "TCP" + }, + { + "name": "internal", + "port": 8081, + "targetPort": "internal", + "protocol": "TCP" + } + ] + }, + { + "namespace": "comms", + "name": "matrix-guest-register", + 
"type": "ClusterIP", + "selector": { + "app.kubernetes.io/name": "matrix-guest-register" + }, + "ports": [ + { + "name": "http", + "port": 8080, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "comms", + "name": "matrix-wellknown", + "type": "ClusterIP", + "selector": { + "app": "matrix-wellknown" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 80, + "protocol": "TCP" + } + ] + }, + { + "namespace": "comms", + "name": "othrys-element-element-web", + "type": "ClusterIP", + "selector": { + "app.kubernetes.io/instance": "othrys-element", + "app.kubernetes.io/name": "element-web" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "comms", + "name": "othrys-synapse-matrix-synapse", + "type": "ClusterIP", + "selector": { + "app.kubernetes.io/component": "synapse", + "app.kubernetes.io/instance": "othrys-synapse", + "app.kubernetes.io/name": "matrix-synapse" + }, + "ports": [ + { + "name": "http", + "port": 8008, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "comms", + "name": "othrys-synapse-redis-headless", + "type": "ClusterIP", + "selector": { + "app.kubernetes.io/instance": "othrys-synapse", + "app.kubernetes.io/name": "redis" + }, + "ports": [ + { + "name": "tcp-redis", + "port": 6379, + "targetPort": "redis", + "protocol": "TCP" + } + ] + }, + { + "namespace": "comms", + "name": "othrys-synapse-redis-master", + "type": "ClusterIP", + "selector": { + "app.kubernetes.io/component": "master", + "app.kubernetes.io/instance": "othrys-synapse", + "app.kubernetes.io/name": "redis" + }, + "ports": [ + { + "name": "tcp-redis", + "port": 6379, + "targetPort": "redis", + "protocol": "TCP" + } + ] + }, + { + "namespace": "comms", + "name": "othrys-synapse-replication", + "type": "ClusterIP", + "selector": { + "app.kubernetes.io/component": "synapse", + "app.kubernetes.io/instance": "othrys-synapse", + 
"app.kubernetes.io/name": "matrix-synapse" + }, + "ports": [ + { + "name": "replication", + "port": 9093, + "targetPort": "replication", + "protocol": "TCP" + } + ] + }, + { + "namespace": "crypto", + "name": "monerod", + "type": "ClusterIP", + "selector": { + "app": "monerod" + }, + "ports": [ + { + "name": "rpc", + "port": 18081, + "targetPort": 18081, + "protocol": "TCP" + }, + { + "name": "p2p", + "port": 18080, + "targetPort": 18080, + "protocol": "TCP" + }, + { + "name": "zmq", + "port": 18083, + "targetPort": 18083, + "protocol": "TCP" + } + ] + }, + { + "namespace": "crypto", + "name": "p2pool", + "type": "ClusterIP", + "selector": { + "app": "p2pool" + }, + "ports": [ + { + "name": "stratum", + "port": 3333, + "targetPort": 3333, + "protocol": "TCP" + } + ] + }, + { + "namespace": "flux-system", + "name": "notification-controller", + "type": "ClusterIP", + "selector": { + "app": "notification-controller" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "flux-system", + "name": "source-controller", + "type": "ClusterIP", + "selector": { + "app": "source-controller" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "flux-system", + "name": "webhook-receiver", + "type": "ClusterIP", + "selector": { + "app": "notification-controller" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http-webhook", + "protocol": "TCP" + } + ] + }, + { + "namespace": "gitea", + "name": "gitea", + "type": "ClusterIP", + "selector": { + "app": "gitea" + }, + "ports": [ + { + "name": "http", + "port": 3000, + "targetPort": 3000, + "protocol": "TCP" + } + ] + }, + { + "namespace": "gitea", + "name": "gitea-ssh", + "type": "NodePort", + "selector": { + "app": "gitea" + }, + "ports": [ + { + "name": "ssh", + "port": 2242, + "targetPort": 2242, + "protocol": "TCP" + } + ] + }, + { + "namespace": "jellyfin", 
+ "name": "jellyfin", + "type": "ClusterIP", + "selector": { + "app": "jellyfin" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 8096, + "protocol": "TCP" + } + ] + }, + { + "namespace": "jellyfin", + "name": "pegasus", + "type": "ClusterIP", + "selector": { + "app": "pegasus" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "jenkins", + "name": "jenkins", + "type": "ClusterIP", + "selector": { + "app": "jenkins" + }, + "ports": [ + { + "name": "http", + "port": 8080, + "targetPort": 8080, + "protocol": "TCP" + }, + { + "name": "agent-listener", + "port": 50000, + "targetPort": 50000, + "protocol": "TCP" + } + ] + }, + { + "namespace": "kube-system", + "name": "traefik", + "type": "LoadBalancer", + "selector": { + "app.kubernetes.io/instance": "traefik-kube-system", + "app.kubernetes.io/name": "traefik" + }, + "ports": [ + { + "name": "web", + "port": 80, + "targetPort": "web", + "protocol": "TCP" + }, + { + "name": "websecure", + "port": 443, + "targetPort": "websecure", + "protocol": "TCP" + } + ] + }, + { + "namespace": "longhorn-system", + "name": "oauth2-proxy-longhorn", + "type": "ClusterIP", + "selector": { + "app": "oauth2-proxy-longhorn" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 4180, + "protocol": "TCP" + } + ] + }, + { + "namespace": "mailu-mailserver", + "name": "mailu-front-lb", + "type": "LoadBalancer", + "selector": { + "app.kubernetes.io/component": "front", + "app.kubernetes.io/instance": "mailu", + "app.kubernetes.io/name": "mailu" + }, + "ports": [ + { + "name": "smtp", + "port": 25, + "targetPort": 25, + "protocol": "TCP" + }, + { + "name": "smtps", + "port": 465, + "targetPort": 465, + "protocol": "TCP" + }, + { + "name": "submission", + "port": 587, + "targetPort": 587, + "protocol": "TCP" + }, + { + "name": "imaps", + "port": 993, + "targetPort": 993, + "protocol": "TCP" + }, + { + "name": "pop3s", + "port": 995, 
+ "targetPort": 995, + "protocol": "TCP" + }, + { + "name": "sieve", + "port": 4190, + "targetPort": 4190, + "protocol": "TCP" + } + ] + }, + { + "namespace": "mailu-mailserver", + "name": "mailu-sync-listener", + "type": "ClusterIP", + "selector": { + "app": "mailu-sync-listener" + }, + "ports": [ + { + "name": "http", + "port": 8080, + "targetPort": 8080, + "protocol": "TCP" + } + ] + }, + { + "namespace": "metallb-system", + "name": "metallb-webhook-service", + "type": "ClusterIP", + "selector": { + "app.kubernetes.io/component": "controller", + "app.kubernetes.io/instance": "metallb", + "app.kubernetes.io/name": "metallb" + }, + "ports": [ + { + "name": null, + "port": 443, + "targetPort": 9443, + "protocol": "TCP" + } + ] + }, + { + "namespace": "monitoring", + "name": "dcgm-exporter", + "type": "ClusterIP", + "selector": { + "app": "dcgm-exporter" + }, + "ports": [ + { + "name": "metrics", + "port": 9400, + "targetPort": "metrics", + "protocol": "TCP" + } + ] + }, + { + "namespace": "monitoring", + "name": "postmark-exporter", + "type": "ClusterIP", + "selector": { + "app": "postmark-exporter" + }, + "ports": [ + { + "name": "http", + "port": 8000, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "nextcloud", + "name": "collabora", + "type": "ClusterIP", + "selector": { + "app": "collabora" + }, + "ports": [ + { + "name": "http", + "port": 9980, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "nextcloud", + "name": "nextcloud", + "type": "ClusterIP", + "selector": { + "app": "nextcloud" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "sso", + "name": "keycloak", + "type": "ClusterIP", + "selector": { + "app": "keycloak" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "sso", + "name": "oauth2-proxy", + "type": "ClusterIP", + "selector": { + 
"app": "oauth2-proxy" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 4180, + "protocol": "TCP" + } + ] + }, + { + "namespace": "sso", + "name": "openldap", + "type": "ClusterIP", + "selector": { + "app": "openldap" + }, + "ports": [ + { + "name": "ldap", + "port": 389, + "targetPort": "ldap", + "protocol": "TCP" + }, + { + "name": "ldaps", + "port": 636, + "targetPort": "ldaps", + "protocol": "TCP" + } + ] + }, + { + "namespace": "sui-metrics", + "name": "sui-metrics", + "type": "ClusterIP", + "selector": { + "app": "sui-metrics" + }, + "ports": [ + { + "name": "http", + "port": 8429, + "targetPort": 8429, + "protocol": "TCP" + } + ] + }, + { + "namespace": "traefik", + "name": "traefik-metrics", + "type": "ClusterIP", + "selector": { + "app": "traefik" + }, + "ports": [ + { + "name": "metrics", + "port": 9100, + "targetPort": "metrics", + "protocol": "TCP" + } + ] + }, + { + "namespace": "vault", + "name": "vault", + "type": "ClusterIP", + "selector": { + "app": "vault" + }, + "ports": [ + { + "name": "api", + "port": 8200, + "targetPort": 8200, + "protocol": "TCP" + }, + { + "name": "cluster", + "port": 8201, + "targetPort": 8201, + "protocol": "TCP" + } + ] + }, + { + "namespace": "vault", + "name": "vault-internal", + "type": "ClusterIP", + "selector": { + "app": "vault" + }, + "ports": [ + { + "name": "api", + "port": 8200, + "targetPort": 8200, + "protocol": "TCP" + }, + { + "name": "cluster", + "port": 8201, + "targetPort": 8201, + "protocol": "TCP" + } + ] + }, + { + "namespace": "vaultwarden", + "name": "vaultwarden-service", + "type": "ClusterIP", + "selector": { + "app": "vaultwarden" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + } + ], + "http_endpoints": [ + { + "host": "auth.bstein.dev", + "path": "/", + "backend": { + "namespace": "sso", + "service": "oauth2-proxy", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "oauth2-proxy" + } + ] + }, + 
"via": { + "kind": "Ingress", + "name": "oauth2-proxy", + "source": "oauth2-proxy" + } + }, + { + "host": "bstein.dev", + "path": "/", + "backend": { + "namespace": "bstein-dev-home", + "service": "bstein-dev-home-frontend", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "bstein-dev-home-frontend" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "bstein-dev-home", + "source": "bstein-dev-home" + } + }, + { + "host": "bstein.dev", + "path": "/.well-known/matrix/client", + "backend": { + "namespace": "comms", + "service": "matrix-wellknown", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-wellknown" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-wellknown-bstein-dev", + "source": "communication" + } + }, + { + "host": "bstein.dev", + "path": "/.well-known/matrix/server", + "backend": { + "namespace": "comms", + "service": "matrix-wellknown", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-wellknown" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-wellknown-bstein-dev", + "source": "communication" + } + }, + { + "host": "bstein.dev", + "path": "/api", + "backend": { + "namespace": "bstein-dev-home", + "service": "bstein-dev-home-backend", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "bstein-dev-home-backend" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "bstein-dev-home", + "source": "bstein-dev-home" + } + }, + { + "host": "call.live.bstein.dev", + "path": "/", + "backend": { + "namespace": "comms", + "service": "element-call", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "element-call" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "element-call", + "source": "communication" + } + }, + { + "host": "chat.ai.bstein.dev", + "path": "/", + "backend": { + "namespace": "bstein-dev-home", + "service": "chat-ai-gateway", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": 
"chat-ai-gateway" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "bstein-dev-home", + "source": "bstein-dev-home" + } + }, + { + "host": "ci.bstein.dev", + "path": "/", + "backend": { + "namespace": "jenkins", + "service": "jenkins", + "port": "http", + "workloads": [ + { + "kind": "Deployment", + "name": "jenkins" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "jenkins", + "source": "jenkins" + } + }, + { + "host": "cloud.bstein.dev", + "path": "/", + "backend": { + "namespace": "nextcloud", + "service": "nextcloud", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "nextcloud" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "nextcloud", + "source": "nextcloud" + } + }, + { + "host": "kit.live.bstein.dev", + "path": "/livekit/jwt", + "backend": { + "namespace": "comms", + "service": "livekit-token-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "livekit-token-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "livekit-jwt-ingress", + "source": "communication" + } + }, + { + "host": "kit.live.bstein.dev", + "path": "/livekit/sfu", + "backend": { + "namespace": "comms", + "service": "livekit", + "port": 7880, + "workloads": [ + { + "kind": "Deployment", + "name": "livekit" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "livekit-ingress", + "source": "communication" + } + }, + { + "host": "live.bstein.dev", + "path": "/", + "backend": { + "namespace": "comms", + "service": "othrys-element-element-web", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "othrys-element-element-web" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "othrys-element-element-web", + "source": "communication" + } + }, + { + "host": "live.bstein.dev", + "path": "/.well-known/matrix/client", + "backend": { + "namespace": "comms", + "service": "matrix-wellknown", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-wellknown" + } + ] + }, + "via": { + 
"kind": "Ingress", + "name": "matrix-wellknown", + "source": "communication" + } + }, + { + "host": "live.bstein.dev", + "path": "/.well-known/matrix/server", + "backend": { + "namespace": "comms", + "service": "matrix-wellknown", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-wellknown" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-wellknown", + "source": "communication" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix", + "backend": { + "namespace": "comms", + "service": "othrys-synapse-matrix-synapse", + "port": 8008, + "workloads": [ + { + "kind": "Deployment", + "name": "othrys-synapse-matrix-synapse" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "communication" + } + }, + { + "host": "longhorn.bstein.dev", + "path": "/", + "backend": { + "namespace": "longhorn-system", + "service": "oauth2-proxy-longhorn", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "oauth2-proxy-longhorn" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "longhorn-ingress", + "source": "longhorn-ui" + } + }, + { + "host": "mail.bstein.dev", + "path": "/", + "backend": { + "namespace": "mailu-mailserver", + "service": "mailu-front", + "port": 443, + "workloads": [] + }, + "via": { + "kind": "IngressRoute", + "name": "mailu", + "source": "mailu" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "communication" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/.well-known/matrix/client", + "backend": { + "namespace": "comms", + "service": "matrix-wellknown", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-wellknown" + } + ] + }, + "via": { + 
"kind": "Ingress", + "name": "matrix-wellknown-matrix-live", + "source": "communication" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/.well-known/matrix/server", + "backend": { + "namespace": "comms", + "service": "matrix-wellknown", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-wellknown" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-wellknown-matrix-live", + "source": "communication" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/_matrix", + "backend": { + "namespace": "comms", + "service": "othrys-synapse-matrix-synapse", + "port": 8008, + "workloads": [ + { + "kind": "Deployment", + "name": "othrys-synapse-matrix-synapse" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "communication" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/_matrix/client/r0/register", + "backend": { + "namespace": "comms", + "service": "matrix-guest-register", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-guest-register" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "communication" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/_matrix/client/v3/login", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "communication" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/_matrix/client/v3/logout", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "communication" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": 
"/_matrix/client/v3/refresh", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "communication" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/_matrix/client/v3/register", + "backend": { + "namespace": "comms", + "service": "matrix-guest-register", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-guest-register" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "communication" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/_synapse", + "backend": { + "namespace": "comms", + "service": "othrys-synapse-matrix-synapse", + "port": 8008, + "workloads": [ + { + "kind": "Deployment", + "name": "othrys-synapse-matrix-synapse" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "communication" + } + }, + { + "host": "monero.bstein.dev", + "path": "/", + "backend": { + "namespace": "crypto", + "service": "monerod", + "port": 18081, + "workloads": [ + { + "kind": "Deployment", + "name": "monerod" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "monerod", + "source": "monerod" + } + }, + { + "host": "office.bstein.dev", + "path": "/", + "backend": { + "namespace": "nextcloud", + "service": "collabora", + "port": 9980, + "workloads": [ + { + "kind": "Deployment", + "name": "collabora" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "collabora", + "source": "nextcloud" + } + }, + { + "host": "pegasus.bstein.dev", + "path": "/", + "backend": { + "namespace": "jellyfin", + "service": "pegasus", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "pegasus" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "pegasus", + "source": "pegasus" + } + }, + { + "host": "scm.bstein.dev", + "path": "/", + 
"backend": { + "namespace": "gitea", + "service": "gitea", + "port": 3000, + "workloads": [ + { + "kind": "Deployment", + "name": "gitea" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "gitea-ingress", + "source": "gitea" + } + }, + { + "host": "secret.bstein.dev", + "path": "/", + "backend": { + "namespace": "vault", + "service": "vault", + "port": 8200, + "workloads": [ + { + "kind": "StatefulSet", + "name": "vault" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "vault", + "source": "vault" + } + }, + { + "host": "sso.bstein.dev", + "path": "/", + "backend": { + "namespace": "sso", + "service": "keycloak", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "keycloak" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "keycloak", + "source": "keycloak" + } + }, + { + "host": "stream.bstein.dev", + "path": "/", + "backend": { + "namespace": "jellyfin", + "service": "jellyfin", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "jellyfin" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "jellyfin", + "source": "jellyfin" + } + }, + { + "host": "vault.bstein.dev", + "path": "/", + "backend": { + "namespace": "vaultwarden", + "service": "vaultwarden-service", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "vaultwarden" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "vaultwarden-ingress", + "source": "vaultwarden" + } + } + ], + "helmrelease_host_hints": { + "gitops-ui:flux-system/weave-gitops": [ + "cd.bstein.dev" + ], + "harbor:harbor/harbor": [ + "registry.bstein.dev" + ], + "mailu:mailu-mailserver/mailu": [ + "bstein.dev", + "mail.bstein.dev" + ], + "monitoring:monitoring/alertmanager": [ + "alerts.bstein.dev" + ], + "monitoring:monitoring/grafana": [ + "metrics.bstein.dev", + "sso.bstein.dev" + ] + } +} diff --git a/services/comms/knowledge/catalog/atlas.yaml b/services/comms/knowledge/catalog/atlas.yaml new file mode 100644 index 0000000..06e2469 --- /dev/null +++ 
b/services/comms/knowledge/catalog/atlas.yaml @@ -0,0 +1,1799 @@ +# Generated by scripts/knowledge_render_atlas.py (do not edit by hand) +cluster: atlas +sources: +- name: ai-llm + path: services/ai-llm + targetNamespace: ai +- name: bstein-dev-home + path: services/bstein-dev-home + targetNamespace: bstein-dev-home +- name: ci-demo + path: services/ci-demo + targetNamespace: null +- name: communication + path: services/comms + targetNamespace: comms +- name: core + path: infrastructure/core + targetNamespace: null +- name: crypto + path: services/crypto + targetNamespace: crypto +- name: flux-system + path: clusters/atlas/flux-system + targetNamespace: null +- name: gitea + path: services/gitea + targetNamespace: gitea +- name: gitops-ui + path: services/gitops-ui + targetNamespace: flux-system +- name: harbor + path: services/harbor + targetNamespace: harbor +- name: helm + path: infrastructure/sources/helm + targetNamespace: flux-system +- name: jellyfin + path: services/jellyfin + targetNamespace: jellyfin +- name: jenkins + path: services/jenkins + targetNamespace: jenkins +- name: keycloak + path: services/keycloak + targetNamespace: sso +- name: longhorn-ui + path: infrastructure/longhorn/ui-ingress + targetNamespace: longhorn-system +- name: mailu + path: services/mailu + targetNamespace: mailu-mailserver +- name: metallb + path: infrastructure/metallb + targetNamespace: metallb-system +- name: monerod + path: services/crypto/monerod + targetNamespace: crypto +- name: monitoring + path: services/monitoring + targetNamespace: null +- name: nextcloud + path: services/nextcloud + targetNamespace: nextcloud +- name: nextcloud-mail-sync + path: services/nextcloud-mail-sync + targetNamespace: nextcloud +- name: oauth2-proxy + path: services/oauth2-proxy + targetNamespace: sso +- name: openldap + path: services/openldap + targetNamespace: sso +- name: pegasus + path: services/pegasus + targetNamespace: jellyfin +- name: sui-metrics + path: 
services/sui-metrics/overlays/atlas + targetNamespace: sui-metrics +- name: traefik + path: infrastructure/traefik + targetNamespace: traefik +- name: vault + path: services/vault + targetNamespace: vault +- name: vault-csi + path: infrastructure/vault-csi + targetNamespace: kube-system +- name: vaultwarden + path: services/vaultwarden + targetNamespace: vaultwarden +- name: xmr-miner + path: services/crypto/xmr-miner + targetNamespace: crypto +workloads: +- kind: Deployment + namespace: ai + name: ollama + labels: + app: ollama + serviceAccountName: null + nodeSelector: {} + images: + - ollama/ollama:latest +- kind: Deployment + namespace: bstein-dev-home + name: bstein-dev-home-backend + labels: + app: bstein-dev-home-backend + serviceAccountName: bstein-dev-home + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-84 +- kind: Deployment + namespace: bstein-dev-home + name: bstein-dev-home-frontend + labels: + app: bstein-dev-home-frontend + serviceAccountName: null + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-84 +- kind: Deployment + namespace: bstein-dev-home + name: chat-ai-gateway + labels: + app: chat-ai-gateway + serviceAccountName: null + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - python:3.11-slim +- kind: Deployment + namespace: ci-demo + name: ci-demo + labels: + app.kubernetes.io/name: ci-demo + serviceAccountName: null + nodeSelector: + hardware: rpi4 + images: + - registry.bstein.dev/infra/ci-demo:v0.0.0-3 +- kind: Deployment + namespace: comms + name: atlasbot + labels: + app: atlasbot + serviceAccountName: atlasbot + nodeSelector: + hardware: rpi5 + images: + - python:3.11-slim +- kind: Deployment + namespace: comms + name: coturn + labels: + app: coturn + serviceAccountName: null + 
nodeSelector: + hardware: rpi5 + images: + - ghcr.io/coturn/coturn:4.6.2 +- kind: Deployment + namespace: comms + name: element-call + labels: + app: element-call + serviceAccountName: null + nodeSelector: + hardware: rpi5 + images: + - ghcr.io/element-hq/element-call:latest +- kind: Deployment + namespace: comms + name: livekit + labels: + app: livekit + serviceAccountName: null + nodeSelector: + hardware: rpi5 + images: + - livekit/livekit-server:v1.9.0 +- kind: Deployment + namespace: comms + name: livekit-token-service + labels: + app: livekit-token-service + serviceAccountName: null + nodeSelector: + hardware: rpi5 + images: + - ghcr.io/element-hq/lk-jwt-service:0.3.0 +- kind: Deployment + namespace: comms + name: matrix-authentication-service + labels: + app: matrix-authentication-service + serviceAccountName: null + nodeSelector: + hardware: rpi5 + images: + - ghcr.io/element-hq/matrix-authentication-service:1.8.0 +- kind: Deployment + namespace: comms + name: matrix-guest-register + labels: + app.kubernetes.io/name: matrix-guest-register + serviceAccountName: null + nodeSelector: {} + images: + - python:3.11-slim +- kind: Deployment + namespace: comms + name: matrix-wellknown + labels: + app: matrix-wellknown + serviceAccountName: null + nodeSelector: {} + images: + - nginx:1.27-alpine +- kind: Deployment + namespace: comms + name: othrys-element-element-web + labels: + app.kubernetes.io/instance: othrys-element + app.kubernetes.io/name: element-web + serviceAccountName: othrys-element-element-web + nodeSelector: + hardware: rpi5 + images: + - ghcr.io/element-hq/element-web:v1.12.6 +- kind: Deployment + namespace: comms + name: othrys-synapse-matrix-synapse + labels: + app.kubernetes.io/component: synapse + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/name: matrix-synapse + serviceAccountName: default + nodeSelector: + hardware: rpi5 + images: + - ghcr.io/element-hq/synapse:v1.144.0 +- kind: Deployment + namespace: comms + name: 
othrys-synapse-redis-master + labels: + app.kubernetes.io/component: master + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: redis + helm.sh/chart: redis-17.17.1 + serviceAccountName: othrys-synapse-redis + nodeSelector: {} + images: + - docker.io/bitnamilegacy/redis:7.0.12-debian-11-r34 +- kind: DaemonSet + namespace: crypto + name: monero-xmrig + labels: + app: monero-xmrig + serviceAccountName: null + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - ghcr.io/tari-project/xmrig:latest +- kind: Deployment + namespace: crypto + name: monero-p2pool + labels: + app: monero-p2pool + serviceAccountName: null + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - debian:bookworm-slim +- kind: Deployment + namespace: crypto + name: monerod + labels: + app: monerod + serviceAccountName: null + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - registry.bstein.dev/crypto/monerod:0.18.4.1 +- kind: Deployment + namespace: flux-system + name: helm-controller + labels: + app: helm-controller + app.kubernetes.io/component: helm-controller + app.kubernetes.io/instance: flux-system + app.kubernetes.io/part-of: flux + app.kubernetes.io/version: v2.7.5 + serviceAccountName: helm-controller + nodeSelector: + kubernetes.io/os: linux + images: + - ghcr.io/fluxcd/helm-controller:v1.4.5 +- kind: Deployment + namespace: flux-system + name: image-automation-controller + labels: + app: image-automation-controller + app.kubernetes.io/component: image-automation-controller + app.kubernetes.io/instance: flux-system + app.kubernetes.io/part-of: flux + app.kubernetes.io/version: v2.7.5 + serviceAccountName: image-automation-controller + nodeSelector: + kubernetes.io/os: linux + images: + - ghcr.io/fluxcd/image-automation-controller:v1.0.4 +- kind: Deployment + namespace: flux-system + name: image-reflector-controller + labels: + app: image-reflector-controller + 
app.kubernetes.io/component: image-reflector-controller + app.kubernetes.io/instance: flux-system + app.kubernetes.io/part-of: flux + app.kubernetes.io/version: v2.7.5 + serviceAccountName: image-reflector-controller + nodeSelector: + kubernetes.io/os: linux + images: + - ghcr.io/fluxcd/image-reflector-controller:v1.0.4 +- kind: Deployment + namespace: flux-system + name: kustomize-controller + labels: + app: kustomize-controller + app.kubernetes.io/component: kustomize-controller + app.kubernetes.io/instance: flux-system + app.kubernetes.io/part-of: flux + app.kubernetes.io/version: v2.7.5 + serviceAccountName: kustomize-controller + nodeSelector: + kubernetes.io/os: linux + images: + - ghcr.io/fluxcd/kustomize-controller:v1.7.3 +- kind: Deployment + namespace: flux-system + name: notification-controller + labels: + app: notification-controller + app.kubernetes.io/component: notification-controller + app.kubernetes.io/instance: flux-system + app.kubernetes.io/part-of: flux + app.kubernetes.io/version: v2.7.5 + serviceAccountName: notification-controller + nodeSelector: + kubernetes.io/os: linux + images: + - ghcr.io/fluxcd/notification-controller:v1.7.5 +- kind: Deployment + namespace: flux-system + name: source-controller + labels: + app: source-controller + app.kubernetes.io/component: source-controller + app.kubernetes.io/instance: flux-system + app.kubernetes.io/part-of: flux + app.kubernetes.io/version: v2.7.5 + serviceAccountName: source-controller + nodeSelector: + kubernetes.io/os: linux + images: + - ghcr.io/fluxcd/source-controller:v1.7.4 +- kind: Deployment + namespace: gitea + name: gitea + labels: + app: gitea + serviceAccountName: null + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - gitea/gitea:1.23 +- kind: Deployment + namespace: jellyfin + name: jellyfin + labels: + app: jellyfin + serviceAccountName: null + nodeSelector: {} + images: + - docker.io/jellyfin/jellyfin:10.11.5 +- kind: Deployment + namespace: jellyfin + name: 
pegasus + labels: + app: pegasus + serviceAccountName: null + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - alpine:3.20 + - registry.bstein.dev/streaming/pegasus:1.2.32 +- kind: Deployment + namespace: jenkins + name: jenkins + labels: + app: jenkins + serviceAccountName: jenkins + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - jenkins/jenkins:2.528.3-jdk21 +- kind: DaemonSet + namespace: kube-system + name: nvidia-device-plugin-jetson + labels: + app.kubernetes.io/instance: jetson + app.kubernetes.io/name: nvidia-device-plugin + serviceAccountName: null + nodeSelector: + jetson: 'true' + kubernetes.io/arch: arm64 + images: + - nvcr.io/nvidia/k8s-device-plugin:v0.16.2 +- kind: DaemonSet + namespace: kube-system + name: nvidia-device-plugin-minipc + labels: + app.kubernetes.io/instance: titan22 + app.kubernetes.io/name: nvidia-device-plugin + serviceAccountName: null + nodeSelector: + kubernetes.io/arch: amd64 + kubernetes.io/hostname: titan-22 + images: + - nvcr.io/nvidia/k8s-device-plugin:v0.16.2 +- kind: DaemonSet + namespace: kube-system + name: nvidia-device-plugin-tethys + labels: + app.kubernetes.io/instance: titan24 + app.kubernetes.io/name: nvidia-device-plugin + serviceAccountName: null + nodeSelector: + kubernetes.io/arch: amd64 + kubernetes.io/hostname: titan-24 + images: + - nvcr.io/nvidia/k8s-device-plugin:v0.16.2 +- kind: DaemonSet + namespace: kube-system + name: vault-csi-provider + labels: + app.kubernetes.io/name: vault-csi-provider + serviceAccountName: vault-csi-provider + nodeSelector: + kubernetes.io/os: linux + images: + - hashicorp/vault-csi-provider:1.7.0 +- kind: Deployment + namespace: longhorn-system + name: oauth2-proxy-longhorn + labels: + app: oauth2-proxy-longhorn + serviceAccountName: null + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 +- kind: DaemonSet + namespace: 
mailu-mailserver + name: vip-controller + labels: + app: vip-controller + serviceAccountName: vip-controller + nodeSelector: + mailu.bstein.dev/vip: 'true' + images: + - lachlanevenson/k8s-kubectl:latest +- kind: Deployment + namespace: mailu-mailserver + name: mailu-sync-listener + labels: + app: mailu-sync-listener + serviceAccountName: null + nodeSelector: {} + images: + - python:3.11-alpine +- kind: DaemonSet + namespace: metallb-system + name: metallb-speaker + labels: + app.kubernetes.io/component: speaker + app.kubernetes.io/instance: metallb + app.kubernetes.io/name: metallb + serviceAccountName: metallb-speaker + nodeSelector: + kubernetes.io/os: linux + images: + - quay.io/frrouting/frr:10.4.1 + - quay.io/metallb/speaker:v0.15.3 +- kind: Deployment + namespace: metallb-system + name: metallb-controller + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/instance: metallb + app.kubernetes.io/name: metallb + serviceAccountName: metallb-controller + nodeSelector: + kubernetes.io/os: linux + images: + - quay.io/metallb/controller:v0.15.3 +- kind: DaemonSet + namespace: monitoring + name: dcgm-exporter + labels: + app: dcgm-exporter + serviceAccountName: default + nodeSelector: {} + images: + - registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04 +- kind: Deployment + namespace: monitoring + name: postmark-exporter + labels: + app: postmark-exporter + serviceAccountName: null + nodeSelector: {} + images: + - python:3.12-alpine +- kind: Deployment + namespace: nextcloud + name: collabora + labels: + app: collabora + serviceAccountName: null + nodeSelector: + hardware: rpi5 + images: + - collabora/code:latest +- kind: Deployment + namespace: nextcloud + name: nextcloud + labels: + app: nextcloud + serviceAccountName: null + nodeSelector: + hardware: rpi5 + images: + - nextcloud:29-apache +- kind: Deployment + namespace: sso + name: keycloak + labels: + app: keycloak + serviceAccountName: null + nodeSelector: {} + images: + - 
quay.io/keycloak/keycloak:26.0.7 +- kind: Deployment + namespace: sso + name: oauth2-proxy + labels: + app: oauth2-proxy + serviceAccountName: null + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 +- kind: StatefulSet + namespace: sso + name: openldap + labels: + app: openldap + serviceAccountName: null + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - docker.io/osixia/openldap:1.5.0 +- kind: Deployment + namespace: sui-metrics + name: sui-metrics + labels: + app: sui-metrics + serviceAccountName: sui-metrics + nodeSelector: + kubernetes.io/hostname: titan-24 + images: + - victoriametrics/vmagent:v1.103.0 +- kind: Deployment + namespace: traefik + name: traefik + labels: + app: traefik + serviceAccountName: traefik-ingress-controller + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - traefik:v3.3.3 +- kind: StatefulSet + namespace: vault + name: vault + labels: + app: vault + serviceAccountName: vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - hashicorp/vault:1.17.6 +- kind: Deployment + namespace: vaultwarden + name: vaultwarden + labels: + app: vaultwarden + serviceAccountName: null + nodeSelector: {} + images: + - vaultwarden/server:1.33.2 +services: +- namespace: ai + name: ollama + type: ClusterIP + selector: + app: ollama + ports: + - name: http + port: 11434 + targetPort: 11434 + protocol: TCP +- namespace: bstein-dev-home + name: bstein-dev-home-backend + type: ClusterIP + selector: + app: bstein-dev-home-backend + ports: + - name: http + port: 80 + targetPort: 8080 + protocol: TCP +- namespace: bstein-dev-home + name: bstein-dev-home-frontend + type: ClusterIP + selector: + app: bstein-dev-home-frontend + ports: + - name: http + port: 80 + targetPort: 80 + protocol: TCP +- namespace: bstein-dev-home + name: chat-ai-gateway + type: ClusterIP + selector: + app: 
chat-ai-gateway + ports: + - name: http + port: 80 + targetPort: 8080 + protocol: TCP +- namespace: ci-demo + name: ci-demo + type: ClusterIP + selector: + app.kubernetes.io/name: ci-demo + ports: + - name: http + port: 80 + targetPort: http + protocol: TCP +- namespace: comms + name: coturn + type: LoadBalancer + selector: + app: coturn + ports: + - name: turn-udp + port: 3478 + targetPort: 3478 + protocol: UDP + - name: turn-tcp + port: 3478 + targetPort: 3478 + protocol: TCP + - name: turn-tls + port: 5349 + targetPort: 5349 + protocol: TCP + - name: relay-50000 + port: 50000 + targetPort: 50000 + protocol: UDP + - name: relay-50001 + port: 50001 + targetPort: 50001 + protocol: UDP + - name: relay-50002 + port: 50002 + targetPort: 50002 + protocol: UDP + - name: relay-50003 + port: 50003 + targetPort: 50003 + protocol: UDP + - name: relay-50004 + port: 50004 + targetPort: 50004 + protocol: UDP + - name: relay-50005 + port: 50005 + targetPort: 50005 + protocol: UDP + - name: relay-50006 + port: 50006 + targetPort: 50006 + protocol: UDP + - name: relay-50007 + port: 50007 + targetPort: 50007 + protocol: UDP + - name: relay-50008 + port: 50008 + targetPort: 50008 + protocol: UDP + - name: relay-50009 + port: 50009 + targetPort: 50009 + protocol: UDP + - name: relay-50010 + port: 50010 + targetPort: 50010 + protocol: UDP + - name: relay-50011 + port: 50011 + targetPort: 50011 + protocol: UDP + - name: relay-50012 + port: 50012 + targetPort: 50012 + protocol: UDP + - name: relay-50013 + port: 50013 + targetPort: 50013 + protocol: UDP + - name: relay-50014 + port: 50014 + targetPort: 50014 + protocol: UDP + - name: relay-50015 + port: 50015 + targetPort: 50015 + protocol: UDP + - name: relay-50016 + port: 50016 + targetPort: 50016 + protocol: UDP + - name: relay-50017 + port: 50017 + targetPort: 50017 + protocol: UDP + - name: relay-50018 + port: 50018 + targetPort: 50018 + protocol: UDP + - name: relay-50019 + port: 50019 + targetPort: 50019 + protocol: UDP + - name: 
relay-50020 + port: 50020 + targetPort: 50020 + protocol: UDP + - name: relay-50021 + port: 50021 + targetPort: 50021 + protocol: UDP + - name: relay-50022 + port: 50022 + targetPort: 50022 + protocol: UDP + - name: relay-50023 + port: 50023 + targetPort: 50023 + protocol: UDP + - name: relay-50024 + port: 50024 + targetPort: 50024 + protocol: UDP + - name: relay-50025 + port: 50025 + targetPort: 50025 + protocol: UDP + - name: relay-50026 + port: 50026 + targetPort: 50026 + protocol: UDP + - name: relay-50027 + port: 50027 + targetPort: 50027 + protocol: UDP + - name: relay-50028 + port: 50028 + targetPort: 50028 + protocol: UDP + - name: relay-50029 + port: 50029 + targetPort: 50029 + protocol: UDP + - name: relay-50030 + port: 50030 + targetPort: 50030 + protocol: UDP + - name: relay-50031 + port: 50031 + targetPort: 50031 + protocol: UDP + - name: relay-50032 + port: 50032 + targetPort: 50032 + protocol: UDP + - name: relay-50033 + port: 50033 + targetPort: 50033 + protocol: UDP + - name: relay-50034 + port: 50034 + targetPort: 50034 + protocol: UDP + - name: relay-50035 + port: 50035 + targetPort: 50035 + protocol: UDP + - name: relay-50036 + port: 50036 + targetPort: 50036 + protocol: UDP + - name: relay-50037 + port: 50037 + targetPort: 50037 + protocol: UDP + - name: relay-50038 + port: 50038 + targetPort: 50038 + protocol: UDP + - name: relay-50039 + port: 50039 + targetPort: 50039 + protocol: UDP + - name: relay-50040 + port: 50040 + targetPort: 50040 + protocol: UDP + - name: relay-50041 + port: 50041 + targetPort: 50041 + protocol: UDP + - name: relay-50042 + port: 50042 + targetPort: 50042 + protocol: UDP + - name: relay-50043 + port: 50043 + targetPort: 50043 + protocol: UDP + - name: relay-50044 + port: 50044 + targetPort: 50044 + protocol: UDP + - name: relay-50045 + port: 50045 + targetPort: 50045 + protocol: UDP + - name: relay-50046 + port: 50046 + targetPort: 50046 + protocol: UDP + - name: relay-50047 + port: 50047 + targetPort: 50047 + 
protocol: UDP + - name: relay-50048 + port: 50048 + targetPort: 50048 + protocol: UDP + - name: relay-50049 + port: 50049 + targetPort: 50049 + protocol: UDP + - name: relay-50050 + port: 50050 + targetPort: 50050 + protocol: UDP +- namespace: comms + name: element-call + type: ClusterIP + selector: + app: element-call + ports: + - name: http + port: 80 + targetPort: 8080 + protocol: TCP +- namespace: comms + name: livekit + type: LoadBalancer + selector: + app: livekit + ports: + - name: http + port: 7880 + targetPort: 7880 + protocol: TCP + - name: rtc-tcp + port: 7881 + targetPort: 7881 + protocol: TCP + - name: rtc-udp-7882 + port: 7882 + targetPort: 7882 + protocol: UDP + - name: rtc-udp-7883 + port: 7883 + targetPort: 7883 + protocol: UDP +- namespace: comms + name: livekit-token-service + type: ClusterIP + selector: + app: livekit-token-service + ports: + - name: http + port: 8080 + targetPort: 8080 + protocol: TCP +- namespace: comms + name: matrix-authentication-service + type: ClusterIP + selector: + app: matrix-authentication-service + ports: + - name: http + port: 8080 + targetPort: http + protocol: TCP + - name: internal + port: 8081 + targetPort: internal + protocol: TCP +- namespace: comms + name: matrix-guest-register + type: ClusterIP + selector: + app.kubernetes.io/name: matrix-guest-register + ports: + - name: http + port: 8080 + targetPort: http + protocol: TCP +- namespace: comms + name: matrix-wellknown + type: ClusterIP + selector: + app: matrix-wellknown + ports: + - name: http + port: 80 + targetPort: 80 + protocol: TCP +- namespace: comms + name: othrys-element-element-web + type: ClusterIP + selector: + app.kubernetes.io/instance: othrys-element + app.kubernetes.io/name: element-web + ports: + - name: http + port: 80 + targetPort: http + protocol: TCP +- namespace: comms + name: othrys-synapse-matrix-synapse + type: ClusterIP + selector: + app.kubernetes.io/component: synapse + app.kubernetes.io/instance: othrys-synapse + 
app.kubernetes.io/name: matrix-synapse + ports: + - name: http + port: 8008 + targetPort: http + protocol: TCP +- namespace: comms + name: othrys-synapse-redis-headless + type: ClusterIP + selector: + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/name: redis + ports: + - name: tcp-redis + port: 6379 + targetPort: redis + protocol: TCP +- namespace: comms + name: othrys-synapse-redis-master + type: ClusterIP + selector: + app.kubernetes.io/component: master + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/name: redis + ports: + - name: tcp-redis + port: 6379 + targetPort: redis + protocol: TCP +- namespace: comms + name: othrys-synapse-replication + type: ClusterIP + selector: + app.kubernetes.io/component: synapse + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/name: matrix-synapse + ports: + - name: replication + port: 9093 + targetPort: replication + protocol: TCP +- namespace: crypto + name: monerod + type: ClusterIP + selector: + app: monerod + ports: + - name: rpc + port: 18081 + targetPort: 18081 + protocol: TCP + - name: p2p + port: 18080 + targetPort: 18080 + protocol: TCP + - name: zmq + port: 18083 + targetPort: 18083 + protocol: TCP +- namespace: crypto + name: p2pool + type: ClusterIP + selector: + app: p2pool + ports: + - name: stratum + port: 3333 + targetPort: 3333 + protocol: TCP +- namespace: flux-system + name: notification-controller + type: ClusterIP + selector: + app: notification-controller + ports: + - name: http + port: 80 + targetPort: http + protocol: TCP +- namespace: flux-system + name: source-controller + type: ClusterIP + selector: + app: source-controller + ports: + - name: http + port: 80 + targetPort: http + protocol: TCP +- namespace: flux-system + name: webhook-receiver + type: ClusterIP + selector: + app: notification-controller + ports: + - name: http + port: 80 + targetPort: http-webhook + protocol: TCP +- namespace: gitea + name: gitea + type: ClusterIP + selector: + app: gitea 
+ ports: + - name: http + port: 3000 + targetPort: 3000 + protocol: TCP +- namespace: gitea + name: gitea-ssh + type: NodePort + selector: + app: gitea + ports: + - name: ssh + port: 2242 + targetPort: 2242 + protocol: TCP +- namespace: jellyfin + name: jellyfin + type: ClusterIP + selector: + app: jellyfin + ports: + - name: http + port: 80 + targetPort: 8096 + protocol: TCP +- namespace: jellyfin + name: pegasus + type: ClusterIP + selector: + app: pegasus + ports: + - name: http + port: 80 + targetPort: http + protocol: TCP +- namespace: jenkins + name: jenkins + type: ClusterIP + selector: + app: jenkins + ports: + - name: http + port: 8080 + targetPort: 8080 + protocol: TCP + - name: agent-listener + port: 50000 + targetPort: 50000 + protocol: TCP +- namespace: kube-system + name: traefik + type: LoadBalancer + selector: + app.kubernetes.io/instance: traefik-kube-system + app.kubernetes.io/name: traefik + ports: + - name: web + port: 80 + targetPort: web + protocol: TCP + - name: websecure + port: 443 + targetPort: websecure + protocol: TCP +- namespace: longhorn-system + name: oauth2-proxy-longhorn + type: ClusterIP + selector: + app: oauth2-proxy-longhorn + ports: + - name: http + port: 80 + targetPort: 4180 + protocol: TCP +- namespace: mailu-mailserver + name: mailu-front-lb + type: LoadBalancer + selector: + app.kubernetes.io/component: front + app.kubernetes.io/instance: mailu + app.kubernetes.io/name: mailu + ports: + - name: smtp + port: 25 + targetPort: 25 + protocol: TCP + - name: smtps + port: 465 + targetPort: 465 + protocol: TCP + - name: submission + port: 587 + targetPort: 587 + protocol: TCP + - name: imaps + port: 993 + targetPort: 993 + protocol: TCP + - name: pop3s + port: 995 + targetPort: 995 + protocol: TCP + - name: sieve + port: 4190 + targetPort: 4190 + protocol: TCP +- namespace: mailu-mailserver + name: mailu-sync-listener + type: ClusterIP + selector: + app: mailu-sync-listener + ports: + - name: http + port: 8080 + targetPort: 8080 
+ protocol: TCP +- namespace: metallb-system + name: metallb-webhook-service + type: ClusterIP + selector: + app.kubernetes.io/component: controller + app.kubernetes.io/instance: metallb + app.kubernetes.io/name: metallb + ports: + - name: null + port: 443 + targetPort: 9443 + protocol: TCP +- namespace: monitoring + name: dcgm-exporter + type: ClusterIP + selector: + app: dcgm-exporter + ports: + - name: metrics + port: 9400 + targetPort: metrics + protocol: TCP +- namespace: monitoring + name: postmark-exporter + type: ClusterIP + selector: + app: postmark-exporter + ports: + - name: http + port: 8000 + targetPort: http + protocol: TCP +- namespace: nextcloud + name: collabora + type: ClusterIP + selector: + app: collabora + ports: + - name: http + port: 9980 + targetPort: http + protocol: TCP +- namespace: nextcloud + name: nextcloud + type: ClusterIP + selector: + app: nextcloud + ports: + - name: http + port: 80 + targetPort: http + protocol: TCP +- namespace: sso + name: keycloak + type: ClusterIP + selector: + app: keycloak + ports: + - name: http + port: 80 + targetPort: http + protocol: TCP +- namespace: sso + name: oauth2-proxy + type: ClusterIP + selector: + app: oauth2-proxy + ports: + - name: http + port: 80 + targetPort: 4180 + protocol: TCP +- namespace: sso + name: openldap + type: ClusterIP + selector: + app: openldap + ports: + - name: ldap + port: 389 + targetPort: ldap + protocol: TCP + - name: ldaps + port: 636 + targetPort: ldaps + protocol: TCP +- namespace: sui-metrics + name: sui-metrics + type: ClusterIP + selector: + app: sui-metrics + ports: + - name: http + port: 8429 + targetPort: 8429 + protocol: TCP +- namespace: traefik + name: traefik-metrics + type: ClusterIP + selector: + app: traefik + ports: + - name: metrics + port: 9100 + targetPort: metrics + protocol: TCP +- namespace: vault + name: vault + type: ClusterIP + selector: + app: vault + ports: + - name: api + port: 8200 + targetPort: 8200 + protocol: TCP + - name: cluster + 
port: 8201 + targetPort: 8201 + protocol: TCP +- namespace: vault + name: vault-internal + type: ClusterIP + selector: + app: vault + ports: + - name: api + port: 8200 + targetPort: 8200 + protocol: TCP + - name: cluster + port: 8201 + targetPort: 8201 + protocol: TCP +- namespace: vaultwarden + name: vaultwarden-service + type: ClusterIP + selector: + app: vaultwarden + ports: + - name: http + port: 80 + targetPort: http + protocol: TCP +http_endpoints: +- host: auth.bstein.dev + path: / + backend: + namespace: sso + service: oauth2-proxy + port: 80 + workloads: + - kind: Deployment + name: oauth2-proxy + via: + kind: Ingress + name: oauth2-proxy + source: oauth2-proxy +- host: bstein.dev + path: / + backend: + namespace: bstein-dev-home + service: bstein-dev-home-frontend + port: 80 + workloads: + - kind: Deployment + name: bstein-dev-home-frontend + via: + kind: Ingress + name: bstein-dev-home + source: bstein-dev-home +- host: bstein.dev + path: /.well-known/matrix/client + backend: + namespace: comms + service: matrix-wellknown + port: 80 + workloads: &id001 + - kind: Deployment + name: matrix-wellknown + via: + kind: Ingress + name: matrix-wellknown-bstein-dev + source: communication +- host: bstein.dev + path: /.well-known/matrix/server + backend: + namespace: comms + service: matrix-wellknown + port: 80 + workloads: *id001 + via: + kind: Ingress + name: matrix-wellknown-bstein-dev + source: communication +- host: bstein.dev + path: /api + backend: + namespace: bstein-dev-home + service: bstein-dev-home-backend + port: 80 + workloads: + - kind: Deployment + name: bstein-dev-home-backend + via: + kind: Ingress + name: bstein-dev-home + source: bstein-dev-home +- host: call.live.bstein.dev + path: / + backend: + namespace: comms + service: element-call + port: 80 + workloads: + - kind: Deployment + name: element-call + via: + kind: Ingress + name: element-call + source: communication +- host: chat.ai.bstein.dev + path: / + backend: + namespace: bstein-dev-home 
+ service: chat-ai-gateway + port: 80 + workloads: + - kind: Deployment + name: chat-ai-gateway + via: + kind: Ingress + name: bstein-dev-home + source: bstein-dev-home +- host: ci.bstein.dev + path: / + backend: + namespace: jenkins + service: jenkins + port: http + workloads: + - kind: Deployment + name: jenkins + via: + kind: Ingress + name: jenkins + source: jenkins +- host: cloud.bstein.dev + path: / + backend: + namespace: nextcloud + service: nextcloud + port: 80 + workloads: + - kind: Deployment + name: nextcloud + via: + kind: Ingress + name: nextcloud + source: nextcloud +- host: kit.live.bstein.dev + path: /livekit/jwt + backend: + namespace: comms + service: livekit-token-service + port: 8080 + workloads: + - kind: Deployment + name: livekit-token-service + via: + kind: Ingress + name: livekit-jwt-ingress + source: communication +- host: kit.live.bstein.dev + path: /livekit/sfu + backend: + namespace: comms + service: livekit + port: 7880 + workloads: + - kind: Deployment + name: livekit + via: + kind: Ingress + name: livekit-ingress + source: communication +- host: live.bstein.dev + path: / + backend: + namespace: comms + service: othrys-element-element-web + port: 80 + workloads: + - kind: Deployment + name: othrys-element-element-web + via: + kind: Ingress + name: othrys-element-element-web + source: communication +- host: live.bstein.dev + path: /.well-known/matrix/client + backend: + namespace: comms + service: matrix-wellknown + port: 80 + workloads: *id001 + via: + kind: Ingress + name: matrix-wellknown + source: communication +- host: live.bstein.dev + path: /.well-known/matrix/server + backend: + namespace: comms + service: matrix-wellknown + port: 80 + workloads: *id001 + via: + kind: Ingress + name: matrix-wellknown + source: communication +- host: live.bstein.dev + path: /_matrix + backend: + namespace: comms + service: othrys-synapse-matrix-synapse + port: 8008 + workloads: &id002 + - kind: Deployment + name: othrys-synapse-matrix-synapse + 
via: + kind: Ingress + name: matrix-routing + source: communication +- host: longhorn.bstein.dev + path: / + backend: + namespace: longhorn-system + service: oauth2-proxy-longhorn + port: 80 + workloads: + - kind: Deployment + name: oauth2-proxy-longhorn + via: + kind: Ingress + name: longhorn-ingress + source: longhorn-ui +- host: mail.bstein.dev + path: / + backend: + namespace: mailu-mailserver + service: mailu-front + port: 443 + workloads: [] + via: + kind: IngressRoute + name: mailu + source: mailu +- host: matrix.live.bstein.dev + path: / + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: &id003 + - kind: Deployment + name: matrix-authentication-service + via: + kind: Ingress + name: matrix-routing + source: communication +- host: matrix.live.bstein.dev + path: /.well-known/matrix/client + backend: + namespace: comms + service: matrix-wellknown + port: 80 + workloads: *id001 + via: + kind: Ingress + name: matrix-wellknown-matrix-live + source: communication +- host: matrix.live.bstein.dev + path: /.well-known/matrix/server + backend: + namespace: comms + service: matrix-wellknown + port: 80 + workloads: *id001 + via: + kind: Ingress + name: matrix-wellknown-matrix-live + source: communication +- host: matrix.live.bstein.dev + path: /_matrix + backend: + namespace: comms + service: othrys-synapse-matrix-synapse + port: 8008 + workloads: *id002 + via: + kind: Ingress + name: matrix-routing + source: communication +- host: matrix.live.bstein.dev + path: /_matrix/client/r0/register + backend: + namespace: comms + service: matrix-guest-register + port: 8080 + workloads: &id004 + - kind: Deployment + name: matrix-guest-register + via: + kind: Ingress + name: matrix-routing + source: communication +- host: matrix.live.bstein.dev + path: /_matrix/client/v3/login + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: *id003 + via: + kind: Ingress + name: matrix-routing + source: 
communication +- host: matrix.live.bstein.dev + path: /_matrix/client/v3/logout + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: *id003 + via: + kind: Ingress + name: matrix-routing + source: communication +- host: matrix.live.bstein.dev + path: /_matrix/client/v3/refresh + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: *id003 + via: + kind: Ingress + name: matrix-routing + source: communication +- host: matrix.live.bstein.dev + path: /_matrix/client/v3/register + backend: + namespace: comms + service: matrix-guest-register + port: 8080 + workloads: *id004 + via: + kind: Ingress + name: matrix-routing + source: communication +- host: matrix.live.bstein.dev + path: /_synapse + backend: + namespace: comms + service: othrys-synapse-matrix-synapse + port: 8008 + workloads: *id002 + via: + kind: Ingress + name: matrix-routing + source: communication +- host: monero.bstein.dev + path: / + backend: + namespace: crypto + service: monerod + port: 18081 + workloads: + - kind: Deployment + name: monerod + via: + kind: Ingress + name: monerod + source: monerod +- host: office.bstein.dev + path: / + backend: + namespace: nextcloud + service: collabora + port: 9980 + workloads: + - kind: Deployment + name: collabora + via: + kind: Ingress + name: collabora + source: nextcloud +- host: pegasus.bstein.dev + path: / + backend: + namespace: jellyfin + service: pegasus + port: 80 + workloads: + - kind: Deployment + name: pegasus + via: + kind: Ingress + name: pegasus + source: pegasus +- host: scm.bstein.dev + path: / + backend: + namespace: gitea + service: gitea + port: 3000 + workloads: + - kind: Deployment + name: gitea + via: + kind: Ingress + name: gitea-ingress + source: gitea +- host: secret.bstein.dev + path: / + backend: + namespace: vault + service: vault + port: 8200 + workloads: + - kind: StatefulSet + name: vault + via: + kind: Ingress + name: vault + source: vault +- host: 
sso.bstein.dev + path: / + backend: + namespace: sso + service: keycloak + port: 80 + workloads: + - kind: Deployment + name: keycloak + via: + kind: Ingress + name: keycloak + source: keycloak +- host: stream.bstein.dev + path: / + backend: + namespace: jellyfin + service: jellyfin + port: 80 + workloads: + - kind: Deployment + name: jellyfin + via: + kind: Ingress + name: jellyfin + source: jellyfin +- host: vault.bstein.dev + path: / + backend: + namespace: vaultwarden + service: vaultwarden-service + port: 80 + workloads: + - kind: Deployment + name: vaultwarden + via: + kind: Ingress + name: vaultwarden-ingress + source: vaultwarden +helmrelease_host_hints: + gitops-ui:flux-system/weave-gitops: + - cd.bstein.dev + harbor:harbor/harbor: + - registry.bstein.dev + mailu:mailu-mailserver/mailu: + - bstein.dev + - mail.bstein.dev + monitoring:monitoring/alertmanager: + - alerts.bstein.dev + monitoring:monitoring/grafana: + - metrics.bstein.dev + - sso.bstein.dev diff --git a/services/comms/knowledge/catalog/runbooks.json b/services/comms/knowledge/catalog/runbooks.json new file mode 100644 index 0000000..d7356ca --- /dev/null +++ b/services/comms/knowledge/catalog/runbooks.json @@ -0,0 +1,73 @@ +[ + { + "path": "runbooks/ci-gitea-jenkins.md", + "title": "CI: Gitea \u2192 Jenkins pipeline", + "tags": [ + "atlas", + "ci", + "gitea", + "jenkins" + ], + "entrypoints": [ + "scm.bstein.dev", + "ci.bstein.dev" + ], + "source_paths": [ + "services/gitea", + "services/jenkins", + "scripts/jenkins_cred_sync.sh", + "scripts/gitea_cred_sync.sh" + ], + "body": "# CI: Gitea \u2192 Jenkins pipeline\n\n## What this is\nAtlas uses Gitea for source control and Jenkins for CI. 
Authentication is via Keycloak (SSO).\n\n## Where it is configured\n- Gitea manifests: `services/gitea/`\n- Jenkins manifests: `services/jenkins/`\n- Credential sync helpers: `scripts/gitea_cred_sync.sh`, `scripts/jenkins_cred_sync.sh`\n\n## What users do (typical flow)\n- Create a repo in Gitea.\n- Create/update a Jenkins job/pipeline that can fetch the repo.\n- Configure a webhook (or SCM polling) so pushes trigger builds.\n\n## Troubleshooting (common)\n- \u201cWebhook not firing\u201d: confirm ingress host, webhook URL, and Jenkins job is reachable.\n- \u201cAuth denied cloning\u201d: confirm Keycloak group membership and that Jenkins has a valid token/credential configured." + }, + { + "path": "runbooks/kb-authoring.md", + "title": "KB authoring: what to write (and what not to)", + "tags": [ + "atlas", + "kb", + "runbooks" + ], + "entrypoints": [], + "source_paths": [ + "knowledge/runbooks", + "scripts/knowledge_render_atlas.py" + ], + "body": "# KB authoring: what to write (and what not to)\n\n## The goal\nGive Atlas assistants enough grounded, Atlas-specific context to answer \u201chow do I\u2026?\u201d questions without guessing.\n\n## What to capture (high value)\n- User workflows: \u201cclick here, set X, expected result\u201d\n- Operator workflows: \u201cedit these files, reconcile this kustomization, verify with these commands\u201d\n- Wiring: \u201cthis host routes to this service; this service depends on Postgres/Vault/etc\u201d\n- Failure modes: exact error messages + the 2\u20135 checks that usually resolve them\n- Permissions: Keycloak groups/roles and what they unlock\n\n## What to avoid (low value / fluff)\n- Generic Kubernetes explanations (link to upstream docs instead)\n- Copy-pasting large manifests (prefer file paths + small snippets)\n- Anything that will drift quickly (render it from GitOps instead)\n- Any secret values (reference Secret/Vault locations by name only)\n\n## Document pattern (recommended)\nEach runbook should answer:\n- 
\u201cWhat is this?\u201d\n- \u201cWhat do users do?\u201d\n- \u201cWhat do operators change (where in Git)?\u201d\n- \u201cHow do we verify it works?\u201d\n- \u201cWhat breaks and how to debug it?\u201d" + }, + { + "path": "runbooks/observability.md", + "title": "Observability: Grafana + VictoriaMetrics (how to query safely)", + "tags": [ + "atlas", + "monitoring", + "grafana", + "victoriametrics" + ], + "entrypoints": [ + "metrics.bstein.dev", + "alerts.bstein.dev" + ], + "source_paths": [ + "services/monitoring" + ], + "body": "# Observability: Grafana + VictoriaMetrics (how to query safely)\n\n## Where it is configured\n- `services/monitoring/helmrelease.yaml` (Grafana + Alertmanager + VM values)\n- `services/monitoring/grafana-dashboard-*.yaml` (dashboards and their PromQL)\n\n## Using metrics as a \u201ctool\u201d for Atlas assistants\nThe safest pattern is: map a small set of intents \u2192 fixed PromQL queries, then summarize results.\n\nExamples (intents)\n- \u201cIs the cluster healthy?\u201d \u2192 node readiness + pod restart rate\n- \u201cWhy is Element Call failing?\u201d \u2192 LiveKit/coturn pod restarts + synapse errors + ingress 5xx\n- \u201cIs Jenkins slow?\u201d \u2192 pod CPU/memory + HTTP latency metrics (if exported)\n\n## Why dashboards are not the KB\nDashboards are great references, but the assistant should query VictoriaMetrics directly for live answers and keep the\nKB focused on wiring, runbooks, and stable conventions." 
+ }, + { + "path": "runbooks/template.md", + "title": "", + "tags": [ + "atlas", + "", + "" + ], + "entrypoints": [ + "" + ], + "source_paths": [ + "services/", + "clusters/atlas/<...>" + ], + "body": "# \n\n## What this is\n\n## For users (how to)\n\n## For operators (where configured)\n\n## Troubleshooting (symptoms \u2192 checks)" + } +] diff --git a/services/comms/knowledge/diagrams/atlas-http.mmd b/services/comms/knowledge/diagrams/atlas-http.mmd new file mode 100644 index 0000000..ddd33d8 --- /dev/null +++ b/services/comms/knowledge/diagrams/atlas-http.mmd @@ -0,0 +1,189 @@ +flowchart LR + host_auth_bstein_dev["auth.bstein.dev"] + svc_sso_oauth2_proxy["sso/oauth2-proxy (Service)"] + host_auth_bstein_dev --> svc_sso_oauth2_proxy + wl_sso_oauth2_proxy["sso/oauth2-proxy (Deployment)"] + svc_sso_oauth2_proxy --> wl_sso_oauth2_proxy + host_bstein_dev["bstein.dev"] + svc_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Service)"] + host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_frontend + wl_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Deployment)"] + svc_bstein_dev_home_bstein_dev_home_frontend --> wl_bstein_dev_home_bstein_dev_home_frontend + svc_comms_matrix_wellknown["comms/matrix-wellknown (Service)"] + host_bstein_dev --> svc_comms_matrix_wellknown + wl_comms_matrix_wellknown["comms/matrix-wellknown (Deployment)"] + svc_comms_matrix_wellknown --> wl_comms_matrix_wellknown + svc_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Service)"] + host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend + wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"] + svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend + host_call_live_bstein_dev["call.live.bstein.dev"] + svc_comms_element_call["comms/element-call (Service)"] + host_call_live_bstein_dev --> 
svc_comms_element_call + wl_comms_element_call["comms/element-call (Deployment)"] + svc_comms_element_call --> wl_comms_element_call + host_chat_ai_bstein_dev["chat.ai.bstein.dev"] + svc_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Service)"] + host_chat_ai_bstein_dev --> svc_bstein_dev_home_chat_ai_gateway + wl_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Deployment)"] + svc_bstein_dev_home_chat_ai_gateway --> wl_bstein_dev_home_chat_ai_gateway + host_ci_bstein_dev["ci.bstein.dev"] + svc_jenkins_jenkins["jenkins/jenkins (Service)"] + host_ci_bstein_dev --> svc_jenkins_jenkins + wl_jenkins_jenkins["jenkins/jenkins (Deployment)"] + svc_jenkins_jenkins --> wl_jenkins_jenkins + host_cloud_bstein_dev["cloud.bstein.dev"] + svc_nextcloud_nextcloud["nextcloud/nextcloud (Service)"] + host_cloud_bstein_dev --> svc_nextcloud_nextcloud + wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"] + svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud + host_kit_live_bstein_dev["kit.live.bstein.dev"] + svc_comms_livekit_token_service["comms/livekit-token-service (Service)"] + host_kit_live_bstein_dev --> svc_comms_livekit_token_service + wl_comms_livekit_token_service["comms/livekit-token-service (Deployment)"] + svc_comms_livekit_token_service --> wl_comms_livekit_token_service + svc_comms_livekit["comms/livekit (Service)"] + host_kit_live_bstein_dev --> svc_comms_livekit + wl_comms_livekit["comms/livekit (Deployment)"] + svc_comms_livekit --> wl_comms_livekit + host_live_bstein_dev["live.bstein.dev"] + svc_comms_othrys_element_element_web["comms/othrys-element-element-web (Service)"] + host_live_bstein_dev --> svc_comms_othrys_element_element_web + wl_comms_othrys_element_element_web["comms/othrys-element-element-web (Deployment)"] + svc_comms_othrys_element_element_web --> wl_comms_othrys_element_element_web + host_live_bstein_dev --> svc_comms_matrix_wellknown + 
svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"] + host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse + wl_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Deployment)"] + svc_comms_othrys_synapse_matrix_synapse --> wl_comms_othrys_synapse_matrix_synapse + host_longhorn_bstein_dev["longhorn.bstein.dev"] + svc_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Service)"] + host_longhorn_bstein_dev --> svc_longhorn_system_oauth2_proxy_longhorn + wl_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Deployment)"] + svc_longhorn_system_oauth2_proxy_longhorn --> wl_longhorn_system_oauth2_proxy_longhorn + host_mail_bstein_dev["mail.bstein.dev"] + svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"] + host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front + host_matrix_live_bstein_dev["matrix.live.bstein.dev"] + svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"] + host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service + wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"] + svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service + host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown + host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse + svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"] + host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register + wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"] + svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register + host_monero_bstein_dev["monero.bstein.dev"] + svc_crypto_monerod["crypto/monerod (Service)"] + host_monero_bstein_dev --> svc_crypto_monerod + wl_crypto_monerod["crypto/monerod (Deployment)"] + svc_crypto_monerod --> wl_crypto_monerod + 
host_office_bstein_dev["office.bstein.dev"] + svc_nextcloud_collabora["nextcloud/collabora (Service)"] + host_office_bstein_dev --> svc_nextcloud_collabora + wl_nextcloud_collabora["nextcloud/collabora (Deployment)"] + svc_nextcloud_collabora --> wl_nextcloud_collabora + host_pegasus_bstein_dev["pegasus.bstein.dev"] + svc_jellyfin_pegasus["jellyfin/pegasus (Service)"] + host_pegasus_bstein_dev --> svc_jellyfin_pegasus + wl_jellyfin_pegasus["jellyfin/pegasus (Deployment)"] + svc_jellyfin_pegasus --> wl_jellyfin_pegasus + host_scm_bstein_dev["scm.bstein.dev"] + svc_gitea_gitea["gitea/gitea (Service)"] + host_scm_bstein_dev --> svc_gitea_gitea + wl_gitea_gitea["gitea/gitea (Deployment)"] + svc_gitea_gitea --> wl_gitea_gitea + host_secret_bstein_dev["secret.bstein.dev"] + svc_vault_vault["vault/vault (Service)"] + host_secret_bstein_dev --> svc_vault_vault + wl_vault_vault["vault/vault (StatefulSet)"] + svc_vault_vault --> wl_vault_vault + host_sso_bstein_dev["sso.bstein.dev"] + svc_sso_keycloak["sso/keycloak (Service)"] + host_sso_bstein_dev --> svc_sso_keycloak + wl_sso_keycloak["sso/keycloak (Deployment)"] + svc_sso_keycloak --> wl_sso_keycloak + host_stream_bstein_dev["stream.bstein.dev"] + svc_jellyfin_jellyfin["jellyfin/jellyfin (Service)"] + host_stream_bstein_dev --> svc_jellyfin_jellyfin + wl_jellyfin_jellyfin["jellyfin/jellyfin (Deployment)"] + svc_jellyfin_jellyfin --> wl_jellyfin_jellyfin + host_vault_bstein_dev["vault.bstein.dev"] + svc_vaultwarden_vaultwarden_service["vaultwarden/vaultwarden-service (Service)"] + host_vault_bstein_dev --> svc_vaultwarden_vaultwarden_service + wl_vaultwarden_vaultwarden["vaultwarden/vaultwarden (Deployment)"] + svc_vaultwarden_vaultwarden_service --> wl_vaultwarden_vaultwarden + + subgraph bstein_dev_home[bstein-dev-home] + svc_bstein_dev_home_bstein_dev_home_frontend + wl_bstein_dev_home_bstein_dev_home_frontend + svc_bstein_dev_home_bstein_dev_home_backend + wl_bstein_dev_home_bstein_dev_home_backend + 
svc_bstein_dev_home_chat_ai_gateway + wl_bstein_dev_home_chat_ai_gateway + end + subgraph comms[comms] + svc_comms_matrix_wellknown + wl_comms_matrix_wellknown + svc_comms_element_call + wl_comms_element_call + svc_comms_livekit_token_service + wl_comms_livekit_token_service + svc_comms_livekit + wl_comms_livekit + svc_comms_othrys_element_element_web + wl_comms_othrys_element_element_web + svc_comms_othrys_synapse_matrix_synapse + wl_comms_othrys_synapse_matrix_synapse + svc_comms_matrix_authentication_service + wl_comms_matrix_authentication_service + svc_comms_matrix_guest_register + wl_comms_matrix_guest_register + end + subgraph crypto[crypto] + svc_crypto_monerod + wl_crypto_monerod + end + subgraph gitea[gitea] + svc_gitea_gitea + wl_gitea_gitea + end + subgraph jellyfin[jellyfin] + svc_jellyfin_pegasus + wl_jellyfin_pegasus + svc_jellyfin_jellyfin + wl_jellyfin_jellyfin + end + subgraph jenkins[jenkins] + svc_jenkins_jenkins + wl_jenkins_jenkins + end + subgraph longhorn_system[longhorn-system] + svc_longhorn_system_oauth2_proxy_longhorn + wl_longhorn_system_oauth2_proxy_longhorn + end + subgraph mailu_mailserver[mailu-mailserver] + svc_mailu_mailserver_mailu_front + end + subgraph nextcloud[nextcloud] + svc_nextcloud_nextcloud + wl_nextcloud_nextcloud + svc_nextcloud_collabora + wl_nextcloud_collabora + end + subgraph sso[sso] + svc_sso_oauth2_proxy + wl_sso_oauth2_proxy + svc_sso_keycloak + wl_sso_keycloak + end + subgraph vault[vault] + svc_vault_vault + wl_vault_vault + end + subgraph vaultwarden[vaultwarden] + svc_vaultwarden_vaultwarden_service + wl_vaultwarden_vaultwarden + end diff --git a/services/comms/knowledge/runbooks/ci-gitea-jenkins.md b/services/comms/knowledge/runbooks/ci-gitea-jenkins.md new file mode 100644 index 0000000..48dc91f --- /dev/null +++ b/services/comms/knowledge/runbooks/ci-gitea-jenkins.md @@ -0,0 +1,27 @@ +--- +title: "CI: Gitea → Jenkins pipeline" +tags: ["atlas", "ci", "gitea", "jenkins"] +owners: ["brad"] +entrypoints: 
["scm.bstein.dev", "ci.bstein.dev"] +source_paths: ["services/gitea", "services/jenkins", "scripts/jenkins_cred_sync.sh", "scripts/gitea_cred_sync.sh"] +--- + +# CI: Gitea → Jenkins pipeline + +## What this is +Atlas uses Gitea for source control and Jenkins for CI. Authentication is via Keycloak (SSO). + +## Where it is configured +- Gitea manifests: `services/gitea/` +- Jenkins manifests: `services/jenkins/` +- Credential sync helpers: `scripts/gitea_cred_sync.sh`, `scripts/jenkins_cred_sync.sh` + +## What users do (typical flow) +- Create a repo in Gitea. +- Create/update a Jenkins job/pipeline that can fetch the repo. +- Configure a webhook (or SCM polling) so pushes trigger builds. + +## Troubleshooting (common) +- “Webhook not firing”: confirm ingress host, webhook URL, and Jenkins job is reachable. +- “Auth denied cloning”: confirm Keycloak group membership and that Jenkins has a valid token/credential configured. + diff --git a/services/comms/knowledge/runbooks/kb-authoring.md b/services/comms/knowledge/runbooks/kb-authoring.md new file mode 100644 index 0000000..9378d1d --- /dev/null +++ b/services/comms/knowledge/runbooks/kb-authoring.md @@ -0,0 +1,34 @@ +--- +title: "KB authoring: what to write (and what not to)" +tags: ["atlas", "kb", "runbooks"] +owners: ["brad"] +entrypoints: [] +source_paths: ["knowledge/runbooks", "scripts/knowledge_render_atlas.py"] +--- + +# KB authoring: what to write (and what not to) + +## The goal +Give Atlas assistants enough grounded, Atlas-specific context to answer “how do I…?” questions without guessing. 
+ +## What to capture (high value) +- User workflows: “click here, set X, expected result” +- Operator workflows: “edit these files, reconcile this kustomization, verify with these commands” +- Wiring: “this host routes to this service; this service depends on Postgres/Vault/etc” +- Failure modes: exact error messages + the 2–5 checks that usually resolve them +- Permissions: Keycloak groups/roles and what they unlock + +## What to avoid (low value / fluff) +- Generic Kubernetes explanations (link to upstream docs instead) +- Copy-pasting large manifests (prefer file paths + small snippets) +- Anything that will drift quickly (render it from GitOps instead) +- Any secret values (reference Secret/Vault locations by name only) + +## Document pattern (recommended) +Each runbook should answer: +- “What is this?” +- “What do users do?” +- “What do operators change (where in Git)?” +- “How do we verify it works?” +- “What breaks and how to debug it?” + diff --git a/services/comms/knowledge/runbooks/observability.md b/services/comms/knowledge/runbooks/observability.md new file mode 100644 index 0000000..4c5be6e --- /dev/null +++ b/services/comms/knowledge/runbooks/observability.md @@ -0,0 +1,26 @@ +--- +title: "Observability: Grafana + VictoriaMetrics (how to query safely)" +tags: ["atlas", "monitoring", "grafana", "victoriametrics"] +owners: ["brad"] +entrypoints: ["metrics.bstein.dev", "alerts.bstein.dev"] +source_paths: ["services/monitoring"] +--- + +# Observability: Grafana + VictoriaMetrics (how to query safely) + +## Where it is configured +- `services/monitoring/helmrelease.yaml` (Grafana + Alertmanager + VM values) +- `services/monitoring/grafana-dashboard-*.yaml` (dashboards and their PromQL) + +## Using metrics as a “tool” for Atlas assistants +The safest pattern is: map a small set of intents → fixed PromQL queries, then summarize results. 
+ +Examples (intents) +- “Is the cluster healthy?” → node readiness + pod restart rate +- “Why is Element Call failing?” → LiveKit/coturn pod restarts + synapse errors + ingress 5xx +- “Is Jenkins slow?” → pod CPU/memory + HTTP latency metrics (if exported) + +## Why dashboards are not the KB +Dashboards are great references, but the assistant should query VictoriaMetrics directly for live answers and keep the +KB focused on wiring, runbooks, and stable conventions. + diff --git a/services/comms/knowledge/runbooks/template.md b/services/comms/knowledge/runbooks/template.md new file mode 100644 index 0000000..086c65f --- /dev/null +++ b/services/comms/knowledge/runbooks/template.md @@ -0,0 +1,18 @@ +--- +title: "" +tags: ["atlas", "", ""] +owners: ["brad"] +entrypoints: [""] +source_paths: ["services/", "clusters/atlas/<...>"] +--- + +# + +## What this is + +## For users (how to) + +## For operators (where configured) + +## Troubleshooting (symptoms → checks) + diff --git a/services/comms/kustomization.yaml b/services/comms/kustomization.yaml new file mode 100644 index 0000000..2008843 --- /dev/null +++ b/services/comms/kustomization.yaml @@ -0,0 +1,82 @@ +# services/comms/kustomization.yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: comms +resources: + - namespace.yaml + - mas-configmap.yaml + - element-rendered.yaml + - livekit-config.yaml + - element-call-config.yaml + - element-call-deployment.yaml + - guest-register-deployment.yaml + - guest-register-service.yaml + - atlasbot-deployment.yaml + - wellknown.yaml + - atlasbot-rbac.yaml + - mas-secrets-ensure-rbac.yaml + - comms-secrets-ensure-rbac.yaml + - mas-db-ensure-rbac.yaml + - mas-admin-client-secret-ensure-job.yaml + - mas-db-ensure-job.yaml + - comms-secrets-ensure-job.yaml + - synapse-signingkey-ensure-job.yaml + - synapse-seeder-admin-ensure-job.yaml + - synapse-user-seed-job.yaml + - mas-local-users-ensure-job.yaml + - synapse-rendered.yaml + - mas-deployment.yaml + - 
livekit-token-deployment.yaml + - livekit.yaml + - coturn.yaml + - seed-othrys-room.yaml + - guest-name-job.yaml + - othrys-kick-numeric-job.yaml + - pin-othrys-job.yaml + - reset-othrys-room-job.yaml + - bstein-force-leave-job.yaml + - livekit-ingress.yaml + - livekit-middlewares.yaml + - matrix-ingress.yaml + +patches: + - path: synapse-deployment-strategy-patch.yaml + +configMapGenerator: + - name: matrix-guest-register + files: + - server.py=scripts/guest-register/server.py + options: + disableNameSuffixHash: true + - name: atlasbot + files: + - bot.py=scripts/atlasbot/bot.py + options: + disableNameSuffixHash: true + - name: othrys-synapse-redis-health + files: + - ping_readiness_local.sh=scripts/synapse/redis/ping_readiness_local.sh + - ping_liveness_local.sh=scripts/synapse/redis/ping_liveness_local.sh + - ping_readiness_master.sh=scripts/synapse/redis/ping_readiness_master.sh + - ping_liveness_master.sh=scripts/synapse/redis/ping_liveness_master.sh + - ping_readiness_local_and_master.sh=scripts/synapse/redis/ping_readiness_local_and_master.sh + - ping_liveness_local_and_master.sh=scripts/synapse/redis/ping_liveness_local_and_master.sh + options: + disableNameSuffixHash: true + - name: othrys-synapse-redis-scripts + files: + - start-master.sh=scripts/synapse/redis/start-master.sh + options: + disableNameSuffixHash: true + - name: othrys-synapse-matrix-synapse-scripts + files: + - signing-key.sh=scripts/synapse/signing-key.sh + options: + disableNameSuffixHash: true + - name: atlas-kb + files: + - INDEX.md=knowledge/INDEX.md + - atlas.json=knowledge/catalog/atlas.json + - atlas-summary.json=knowledge/catalog/atlas-summary.json + - runbooks.json=knowledge/catalog/runbooks.json + - atlas-http.mmd=knowledge/diagrams/atlas-http.mmd diff --git a/services/comms/livekit-config.yaml b/services/comms/livekit-config.yaml new file mode 100644 index 0000000..8b977a4 --- /dev/null +++ b/services/comms/livekit-config.yaml @@ -0,0 +1,30 @@ +# 
services/comms/livekit-config.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: livekit-config +data: + livekit.yaml: | + port: 7880 + rtc: + udp_port: "7882-7883" + tcp_port: 7881 + use_external_ip: true + turn_servers: + - host: turn.live.bstein.dev + port: 5349 + protocol: tls + username: livekit + credential: "@@TURN_PASSWORD@@" + - host: turn.live.bstein.dev + port: 3478 + protocol: tcp + username: livekit + credential: "@@TURN_PASSWORD@@" + - host: turn.live.bstein.dev + port: 3478 + protocol: udp + username: livekit + credential: "@@TURN_PASSWORD@@" + room: + auto_create: false diff --git a/services/comms/livekit-ingress.yaml b/services/comms/livekit-ingress.yaml new file mode 100644 index 0000000..ba30ae3 --- /dev/null +++ b/services/comms/livekit-ingress.yaml @@ -0,0 +1,27 @@ +# services/comms/livekit-ingress.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: livekit-ingress + annotations: + kubernetes.io/ingress.class: traefik + traefik.ingress.kubernetes.io/router.entrypoints: websecure + traefik.ingress.kubernetes.io/router.tls: "true" + traefik.ingress.kubernetes.io/router.middlewares: comms-livekit-sfu-strip@kubernetescrd + cert-manager.io/cluster-issuer: letsencrypt +spec: + tls: + - hosts: + - kit.live.bstein.dev + secretName: kit-live-tls + rules: + - host: kit.live.bstein.dev + http: + paths: + - path: /livekit/sfu + pathType: Prefix + backend: + service: + name: livekit + port: + number: 7880 diff --git a/services/comms/livekit-middlewares.yaml b/services/comms/livekit-middlewares.yaml new file mode 100644 index 0000000..f1b74ed --- /dev/null +++ b/services/comms/livekit-middlewares.yaml @@ -0,0 +1,45 @@ +# services/comms/livekit-middlewares.yaml +apiVersion: traefik.io/v1alpha1 +kind: Middleware +metadata: + name: livekit-sfu-strip +spec: + stripPrefix: + prefixes: + - /livekit/sfu +--- +apiVersion: traefik.io/v1alpha1 +kind: Middleware +metadata: + name: livekit-jwt-strip +spec: + stripPrefix: + prefixes: + - 
/livekit/jwt +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: livekit-jwt-ingress + annotations: + kubernetes.io/ingress.class: traefik + traefik.ingress.kubernetes.io/router.entrypoints: websecure + traefik.ingress.kubernetes.io/router.tls: "true" + traefik.ingress.kubernetes.io/router.middlewares: comms-livekit-jwt-strip@kubernetescrd + cert-manager.io/cluster-issuer: letsencrypt +spec: + tls: + - hosts: + - kit.live.bstein.dev + secretName: kit-live-tls + rules: + - host: kit.live.bstein.dev + http: + paths: + - path: /livekit/jwt + pathType: Prefix + backend: + service: + name: livekit-token-service + port: + number: 8080 diff --git a/services/comms/livekit-token-deployment.yaml b/services/comms/livekit-token-deployment.yaml new file mode 100644 index 0000000..1b4cdca --- /dev/null +++ b/services/comms/livekit-token-deployment.yaml @@ -0,0 +1,69 @@ +# services/comms/livekit-token-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: livekit-token-service + labels: + app: livekit-token-service +spec: + replicas: 1 + selector: + matchLabels: + app: livekit-token-service + template: + metadata: + labels: + app: livekit-token-service + spec: + nodeSelector: + hardware: rpi5 + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 50 + preference: + matchExpressions: + - key: hardware + operator: In + values: ["rpi5","rpi4"] + hostAliases: + - ip: 10.43.60.6 + hostnames: + - live.bstein.dev + containers: + - name: token-service + image: ghcr.io/element-hq/lk-jwt-service:0.3.0 + env: + - name: LIVEKIT_URL + value: wss://kit.live.bstein.dev/livekit/sfu + - name: LIVEKIT_KEY + value: primary + - name: LIVEKIT_SECRET + valueFrom: + secretKeyRef: + name: livekit-api + key: primary + - name: LIVEKIT_FULL_ACCESS_HOMESERVERS + value: live.bstein.dev + ports: + - containerPort: 8080 + name: http + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + cpu: 300m + memory: 256Mi +--- 
+apiVersion: v1 +kind: Service +metadata: + name: livekit-token-service +spec: + selector: + app: livekit-token-service + ports: + - name: http + port: 8080 + targetPort: 8080 diff --git a/services/comms/livekit.yaml b/services/comms/livekit.yaml new file mode 100644 index 0000000..46d57f8 --- /dev/null +++ b/services/comms/livekit.yaml @@ -0,0 +1,143 @@ +# services/comms/livekit.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: livekit + labels: + app: livekit +spec: + replicas: 1 + selector: + matchLabels: + app: livekit + template: + metadata: + annotations: + checksum/config: livekit-config-v5 + labels: + app: livekit + spec: + enableServiceLinks: false + nodeSelector: + hardware: rpi5 + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 50 + preference: + matchExpressions: + - key: hardware + operator: In + values: ["rpi5","rpi4"] + initContainers: + - name: render-config + image: alpine:3.20 + command: ["/bin/sh","-c"] + args: + - | + set -euo pipefail + umask 077 + TURN_PASSWORD_ESCAPED="$(printf '%s' "${TURN_PASSWORD}" | sed 's/[\\/&]/\\&/g')" + sed "s/@@TURN_PASSWORD@@/${TURN_PASSWORD_ESCAPED}/g" /etc/livekit-template/livekit.yaml > /etc/livekit/livekit.yaml + chmod 0644 /etc/livekit/livekit.yaml + env: + - name: TURN_PASSWORD + valueFrom: + secretKeyRef: + name: turn-shared-secret + key: TURN_STATIC_AUTH_SECRET + volumeMounts: + - name: config-template + mountPath: /etc/livekit-template + readOnly: true + - name: config + mountPath: /etc/livekit + readOnly: false + containers: + - name: livekit + image: livekit/livekit-server:v1.9.0 + command: + - /bin/sh + - -c + - | + set -euo pipefail + umask 077 + printf "%s: %s\n" "${LIVEKIT_API_KEY_ID}" "${LIVEKIT_API_SECRET}" > /var/run/livekit/keys + chmod 600 /var/run/livekit/keys + exec /livekit-server --config /etc/livekit/livekit.yaml --key-file /var/run/livekit/keys + env: + - name: LIVEKIT_API_KEY_ID + value: primary + - name: LIVEKIT_API_SECRET + valueFrom: 
+ secretKeyRef: + name: livekit-api + key: primary + ports: + - containerPort: 7880 + name: http + protocol: TCP + - containerPort: 7881 + name: rtc-tcp + protocol: TCP + - containerPort: 7882 + name: rtc-udp + protocol: UDP + - containerPort: 7883 + name: rtc-udp2 + protocol: UDP + volumeMounts: + - name: config + mountPath: /etc/livekit + readOnly: true + - name: runtime-keys + mountPath: /var/run/livekit + resources: + requests: + cpu: 500m + memory: 512Mi + limits: + cpu: "2" + memory: 1Gi + volumes: + - name: config-template + configMap: + name: livekit-config + items: + - key: livekit.yaml + path: livekit.yaml + - name: config + emptyDir: {} + - name: runtime-keys + emptyDir: {} +--- +apiVersion: v1 +kind: Service +metadata: + name: livekit + annotations: + metallb.universe.tf/address-pool: communication-pool +spec: + type: LoadBalancer + loadBalancerClass: metallb + loadBalancerIP: 192.168.22.6 + externalTrafficPolicy: Local + selector: + app: livekit + ports: + - name: http + port: 7880 + targetPort: 7880 + protocol: TCP + - name: rtc-tcp + port: 7881 + targetPort: 7881 + protocol: TCP + - name: rtc-udp-7882 + port: 7882 + targetPort: 7882 + protocol: UDP + - name: rtc-udp-7883 + port: 7883 + targetPort: 7883 + protocol: UDP diff --git a/services/comms/mas-admin-client-secret-ensure-job.yaml b/services/comms/mas-admin-client-secret-ensure-job.yaml new file mode 100644 index 0000000..3843877 --- /dev/null +++ b/services/comms/mas-admin-client-secret-ensure-job.yaml @@ -0,0 +1,84 @@ +# services/comms/mas-admin-client-secret-ensure-job.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: mas-admin-client-secret-writer + namespace: comms +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: mas-admin-client-secret-writer + namespace: comms +rules: + - apiGroups: [""] + resources: ["secrets"] + resourceNames: ["mas-admin-client-runtime"] + verbs: ["get", "patch", "update"] + - apiGroups: [""] + resources: ["secrets"] + verbs: 
["create"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: mas-admin-client-secret-writer + namespace: comms +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: mas-admin-client-secret-writer +subjects: + - kind: ServiceAccount + name: mas-admin-client-secret-writer + namespace: comms +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: mas-admin-client-secret-ensure-7 + namespace: comms +spec: + backoffLimit: 2 + template: + spec: + serviceAccountName: mas-admin-client-secret-writer + restartPolicy: OnFailure + volumes: + - name: work + emptyDir: {} + initContainers: + - name: generate + image: alpine:3.20 + command: ["/bin/sh", "-c"] + args: + - | + set -euo pipefail + umask 077 + dd if=/dev/urandom bs=32 count=1 2>/dev/null | od -An -tx1 | tr -d ' \n' > /work/client_secret + chmod 0644 /work/client_secret + volumeMounts: + - name: work + mountPath: /work + containers: + - name: patch + image: bitnami/kubectl:latest + command: ["/bin/sh", "-c"] + args: + - | + set -euo pipefail + if kubectl -n comms get secret mas-admin-client-runtime >/dev/null 2>&1; then + if kubectl -n comms get secret mas-admin-client-runtime -o jsonpath='{.data.client_secret}' 2>/dev/null | grep -q .; then + exit 0 + fi + else + kubectl -n comms create secret generic mas-admin-client-runtime \ + --from-file=client_secret=/work/client_secret >/dev/null + exit 0 + fi + secret_b64="$(base64 /work/client_secret | tr -d '\n')" + payload="$(printf '{"data":{"client_secret":"%s"}}' "${secret_b64}")" + kubectl -n comms patch secret mas-admin-client-runtime --type=merge -p "${payload}" >/dev/null + volumeMounts: + - name: work + mountPath: /work diff --git a/services/comms/mas-configmap.yaml b/services/comms/mas-configmap.yaml new file mode 100644 index 0000000..a41ebeb --- /dev/null +++ b/services/comms/mas-configmap.yaml @@ -0,0 +1,84 @@ +# services/comms/mas-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: 
matrix-authentication-service-config +data: + config.yaml: | + http: + public_base: "https://matrix.live.bstein.dev/" + listeners: + - name: web + resources: + - name: discovery + - name: human + - name: oauth + - name: compat + - name: graphql + - name: assets + binds: + - address: "0.0.0.0:8080" + - name: internal + resources: + - name: health + - name: adminapi + binds: + - address: "0.0.0.0:8081" + + database: + uri: "postgresql://mas:@@MAS_DB_PASSWORD@@@postgres-service.postgres.svc.cluster.local:5432/mas?sslmode=prefer" + + clients: + - client_id: 01KDXMVQBQ5JNY6SEJPZW6Z8BM + client_auth_method: client_secret_basic + client_secret_file: /etc/mas/admin-client/client_secret + + secrets: + encryption_file: /etc/mas/secrets/encryption + keys: + - kid: "othrys-rsa-1" + key_file: /etc/mas/keys/rsa_key + + passwords: + enabled: true + schemes: + - version: 1 + algorithm: bcrypt + + matrix: + kind: synapse + homeserver: live.bstein.dev + endpoint: "http://othrys-synapse-matrix-synapse:8008/" + secret: "@@MATRIX_SHARED_SECRET@@" + + upstream_oauth2: + providers: + - id: 01KDTTKYCYTAAAQKMAKZZ5CPW3 + synapse_idp_id: oidc-keycloak + issuer: "https://sso.bstein.dev/realms/atlas" + human_name: "Keycloak" + brand_name: "keycloak" + client_id: "othrys-mas" + client_secret: "@@KEYCLOAK_CLIENT_SECRET@@" + token_endpoint_auth_method: client_secret_post + scope: "openid profile email" + claims_imports: + localpart: + action: require + template: "{{ user.preferred_username }}" + on_conflict: add + displayname: + action: force + template: "{{ user.name }}" + email: + action: force + template: "{{ user.email }}" + + policy: + data: + admin_clients: + - 01KDXMVQBQ5JNY6SEJPZW6Z8BM + client_registration: + allow_insecure_uris: true + allow_host_mismatch: true + allow_missing_client_uri: true diff --git a/services/comms/mas-db-ensure-job.yaml b/services/comms/mas-db-ensure-job.yaml new file mode 100644 index 0000000..1c8b5c4 --- /dev/null +++ b/services/comms/mas-db-ensure-job.yaml @@ 
-0,0 +1,55 @@ +# services/comms/mas-db-ensure-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: mas-db-ensure-16 + namespace: comms +spec: + backoffLimit: 1 + ttlSecondsAfterFinished: 600 + template: + spec: + serviceAccountName: mas-db-ensure + restartPolicy: Never + containers: + - name: ensure + image: bitnami/kubectl:latest + command: ["/bin/sh", "-c"] + args: + - | + set -eu + trap 'rc=$?; if [ "$rc" -ne 0 ]; then echo "mas-db-ensure failed" >&2; sleep 300; fi' EXIT + umask 077 + safe_pass() { + head -c 32 /dev/urandom | base64 | tr -d '\n' | tr '+/' '-_' | tr -d '=' + } + + EXISTING_B64="$(kubectl -n comms get secret mas-db -o jsonpath='{.data.password}' 2>/dev/null || true)" + if [ -n "${EXISTING_B64}" ]; then + MAS_PASS="$(printf '%s' "${EXISTING_B64}" | base64 -d)" + if printf '%s' "${MAS_PASS}" | grep -Eq '[^A-Za-z0-9_-]'; then + MAS_PASS="$(safe_pass)" + MAS_B64="$(printf '%s' "${MAS_PASS}" | base64 | tr -d '\n')" + payload="$(printf '{"data":{"password":"%s"}}' "${MAS_B64}")" + kubectl -n comms patch secret mas-db --type=merge -p "${payload}" >/dev/null + fi + else + MAS_PASS="$(safe_pass)" + kubectl -n comms create secret generic mas-db --from-literal=password="${MAS_PASS}" >/dev/null + fi + + POD_NAME="$(kubectl -n postgres get pods -l app=postgres -o jsonpath='{.items[0].metadata.name}')" + if [ -z "${POD_NAME}" ]; then + echo "postgres pod not found" >&2 + exit 1 + fi + + MAS_PASS_SQL="$(printf '%s' "${MAS_PASS}" | sed "s/'/''/g")" + kubectl -n postgres exec -i "${POD_NAME}" -- psql -U postgres -d postgres \ + -c "CREATE ROLE mas LOGIN PASSWORD '${MAS_PASS_SQL}';" || true + kubectl -n postgres exec -i "${POD_NAME}" -- psql -U postgres -d postgres \ + -c "ALTER ROLE mas WITH PASSWORD '${MAS_PASS_SQL}';" + kubectl -n postgres exec -i "${POD_NAME}" -- psql -U postgres -d postgres \ + -c "CREATE DATABASE mas OWNER mas;" || true + kubectl -n postgres exec -i "${POD_NAME}" -- /bin/sh -c \ + "PGPASSWORD='${MAS_PASS_SQL}' psql -U mas -d mas -c 'select 1;'" diff --git 
a/services/comms/mas-db-ensure-rbac.yaml b/services/comms/mas-db-ensure-rbac.yaml new file mode 100644 index 0000000..19691d7 --- /dev/null +++ b/services/comms/mas-db-ensure-rbac.yaml @@ -0,0 +1,38 @@ +# services/comms/mas-db-ensure-rbac.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: mas-db-ensure + namespace: comms +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: mas-db-ensure +rules: + - apiGroups: [""] + resources: ["secrets"] + resourceNames: ["mas-db"] + verbs: ["get", "patch", "update"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["create"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list"] + - apiGroups: [""] + resources: ["pods/exec"] + verbs: ["create"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: mas-db-ensure +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: mas-db-ensure +subjects: + - kind: ServiceAccount + name: mas-db-ensure + namespace: comms diff --git a/services/comms/mas-deployment.yaml b/services/comms/mas-deployment.yaml new file mode 100644 index 0000000..2117c17 --- /dev/null +++ b/services/comms/mas-deployment.yaml @@ -0,0 +1,152 @@ +# services/comms/mas-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: matrix-authentication-service + labels: + app: matrix-authentication-service +spec: + replicas: 1 + selector: + matchLabels: + app: matrix-authentication-service + template: + metadata: + annotations: + checksum/config: v5-adminapi-7 + labels: + app: matrix-authentication-service + spec: + enableServiceLinks: false + nodeSelector: + hardware: rpi5 + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 50 + preference: + matchExpressions: + - key: hardware + operator: In + values: ["rpi5","rpi4"] + initContainers: + - name: render-config + image: alpine:3.20 + command: ["/bin/sh","-c"] + args: + - | + set -euo pipefail + umask 077 + 
DB_PASS_ESCAPED="$(printf '%s' "${MAS_DB_PASSWORD}" | sed 's/[\\/&]/\\&/g')" + MATRIX_SECRET_ESCAPED="$(printf '%s' "${MATRIX_SHARED_SECRET}" | sed 's/[\\/&]/\\&/g')" + KC_SECRET_ESCAPED="$(printf '%s' "${KEYCLOAK_CLIENT_SECRET}" | sed 's/[\\/&]/\\&/g')" + + sed \ + -e "s/@@MAS_DB_PASSWORD@@/${DB_PASS_ESCAPED}/g" \ + -e "s/@@MATRIX_SHARED_SECRET@@/${MATRIX_SECRET_ESCAPED}/g" \ + -e "s/@@KEYCLOAK_CLIENT_SECRET@@/${KC_SECRET_ESCAPED}/g" \ + /etc/mas/config.yaml > /rendered/config.yaml + chmod 0644 /rendered/config.yaml + env: + - name: MAS_DB_PASSWORD + valueFrom: + secretKeyRef: + name: mas-db + key: password + - name: MATRIX_SHARED_SECRET + valueFrom: + secretKeyRef: + name: mas-secrets-runtime + key: matrix_shared_secret + - name: KEYCLOAK_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: mas-secrets-runtime + key: keycloak_client_secret + volumeMounts: + - name: config + mountPath: /etc/mas/config.yaml + subPath: config.yaml + readOnly: true + - name: rendered + mountPath: /rendered + readOnly: false + containers: + - name: mas + image: ghcr.io/element-hq/matrix-authentication-service:1.8.0 + args: ["server","--config","/rendered/config.yaml"] + ports: + - name: http + containerPort: 8080 + protocol: TCP + - name: internal + containerPort: 8081 + protocol: TCP + volumeMounts: + - name: rendered + mountPath: /rendered + readOnly: true + - name: secrets + mountPath: /etc/mas/secrets + readOnly: true + - name: admin-client + mountPath: /etc/mas/admin-client + readOnly: true + - name: keys + mountPath: /etc/mas/keys + readOnly: true + resources: + requests: + cpu: 200m + memory: 256Mi + limits: + cpu: "2" + memory: 1Gi + volumes: + - name: config + configMap: + name: matrix-authentication-service-config + items: + - key: config.yaml + path: config.yaml + - name: rendered + emptyDir: {} + - name: secrets + secret: + secretName: mas-secrets-runtime + items: + - key: encryption + path: encryption + - key: matrix_shared_secret + path: matrix_shared_secret + - key: 
keycloak_client_secret + path: keycloak_client_secret + - name: keys + secret: + secretName: mas-secrets-runtime + items: + - key: rsa_key + path: rsa_key + - name: admin-client + secret: + secretName: mas-admin-client-runtime + items: + - key: client_secret + path: client_secret +--- +apiVersion: v1 +kind: Service +metadata: + name: matrix-authentication-service +spec: + selector: + app: matrix-authentication-service + ports: + - name: http + port: 8080 + targetPort: http + protocol: TCP + - name: internal + port: 8081 + targetPort: internal + protocol: TCP diff --git a/services/comms/mas-local-users-ensure-job.yaml b/services/comms/mas-local-users-ensure-job.yaml new file mode 100644 index 0000000..e462426 --- /dev/null +++ b/services/comms/mas-local-users-ensure-job.yaml @@ -0,0 +1,162 @@ +# services/comms/mas-local-users-ensure-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: mas-local-users-ensure-5 + namespace: comms +spec: + backoffLimit: 1 + ttlSecondsAfterFinished: 3600 + template: + spec: + restartPolicy: Never + volumes: + - name: mas-admin-client + secret: + secretName: mas-admin-client-runtime + items: + - key: client_secret + path: client_secret + containers: + - name: ensure + image: python:3.11-slim + volumeMounts: + - name: mas-admin-client + mountPath: /etc/mas-admin-client + readOnly: true + env: + - name: MAS_ADMIN_CLIENT_ID + value: 01KDXMVQBQ5JNY6SEJPZW6Z8BM + - name: MAS_ADMIN_CLIENT_SECRET_FILE + value: /etc/mas-admin-client/client_secret + - name: MAS_TOKEN_URL + value: http://matrix-authentication-service:8080/oauth2/token + - name: MAS_ADMIN_API_BASE + value: http://matrix-authentication-service:8081/api/admin/v1 + - name: SEEDER_USER + value: othrys-seeder + - name: SEEDER_PASS + valueFrom: + secretKeyRef: + name: atlasbot-credentials-runtime + key: seeder-password + - name: BOT_USER + value: atlasbot + - name: BOT_PASS + valueFrom: + secretKeyRef: + name: atlasbot-credentials-runtime + key: bot-password + command: + - 
/bin/sh + - -c + - | + set -euo pipefail + pip install --no-cache-dir requests >/dev/null + python - <<'PY' + import base64 + import os + import time + import requests + import urllib.parse + + MAS_ADMIN_CLIENT_ID = os.environ["MAS_ADMIN_CLIENT_ID"] + MAS_ADMIN_CLIENT_SECRET_FILE = os.environ["MAS_ADMIN_CLIENT_SECRET_FILE"] + MAS_TOKEN_URL = os.environ["MAS_TOKEN_URL"] + MAS_ADMIN_API_BASE = os.environ["MAS_ADMIN_API_BASE"].rstrip("/") + AUTH_BASE = "http://matrix-authentication-service:8080" + SERVER_NAME = "live.bstein.dev" + + def admin_token(): + with open(MAS_ADMIN_CLIENT_SECRET_FILE, "r", encoding="utf-8") as f: + secret = f.read().strip() + basic = base64.b64encode(f"{MAS_ADMIN_CLIENT_ID}:{secret}".encode()).decode() + last = None + for attempt in range(1, 6): + try: + r = requests.post( + MAS_TOKEN_URL, + headers={"Authorization": f"Basic {basic}"}, + data={"grant_type": "client_credentials", "scope": "urn:mas:admin"}, + timeout=30, + ) + if r.status_code == 200: + return r.json()["access_token"] + except Exception as exc: # noqa: BLE001 + last = exc + time.sleep(attempt * 2) + raise RuntimeError(f"MAS admin token request failed: {last}") + + def get_user(token, username): + r = requests.get( + f"{MAS_ADMIN_API_BASE}/users/by-username/{urllib.parse.quote(username)}", + headers={"Authorization": f"Bearer {token}"}, + timeout=30, + ) + if r.status_code == 404: + return None + r.raise_for_status() + return r.json()["data"] + + def create_user(token, username, password): + payloads = [ + { + "data": { + "type": "user", + "attributes": { + "username": username, + "password": password, + }, + } + }, + {"username": username, "password": password}, + ] + for payload in payloads: + r = requests.post( + f"{MAS_ADMIN_API_BASE}/users", + headers={"Authorization": f"Bearer {token}"}, + json=payload, + timeout=30, + ) + if r.status_code in (200, 201): + return r.json().get("data") or {} + if r.status_code == 409: + return None + return None + + def update_password(token, 
user_id, password): + r = requests.post( + f"{MAS_ADMIN_API_BASE}/users/{urllib.parse.quote(user_id)}/set-password", + headers={"Authorization": f"Bearer {token}"}, + json={"password": password}, + timeout=30, + ) + return r.status_code in (200, 204) + + def ensure_user(token, username, password): + user = get_user(token, username) + if user is None: + user = create_user(token, username, password) + user = get_user(token, username) + if user is None: + raise RuntimeError(f"failed to ensure user {username}") + update_password(token, user["id"], password) + login_name = username + if not login_name.startswith("@"): + login_name = f"@{login_name}:{SERVER_NAME}" + r = requests.post( + f"{AUTH_BASE}/_matrix/client/v3/login", + json={ + "type": "m.login.password", + "identifier": {"type": "m.id.user", "user": login_name}, + "password": password, + }, + timeout=30, + ) + if r.status_code != 200: + raise RuntimeError(f"login failed for {username}: {r.status_code} {r.text}") + + token = admin_token() + ensure_user(token, os.environ["SEEDER_USER"], os.environ["SEEDER_PASS"]) + ensure_user(token, os.environ["BOT_USER"], os.environ["BOT_PASS"]) + PY diff --git a/services/comms/mas-secrets-ensure-rbac.yaml b/services/comms/mas-secrets-ensure-rbac.yaml new file mode 100644 index 0000000..22ff987 --- /dev/null +++ b/services/comms/mas-secrets-ensure-rbac.yaml @@ -0,0 +1,22 @@ +# services/comms/mas-secrets-ensure-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: mas-secrets-ensure +rules: + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "create", "patch", "update"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: mas-secrets-ensure +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: mas-secrets-ensure +subjects: + - kind: ServiceAccount + name: mas-secrets-ensure + namespace: sso diff --git a/services/comms/matrix-ingress.yaml b/services/comms/matrix-ingress.yaml 
new file mode 100644 index 0000000..caaa593 --- /dev/null +++ b/services/comms/matrix-ingress.yaml @@ -0,0 +1,90 @@ +# services/comms/matrix-ingress.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: matrix-routing + annotations: + kubernetes.io/ingress.class: traefik + traefik.ingress.kubernetes.io/router.entrypoints: websecure + traefik.ingress.kubernetes.io/router.tls: "true" + cert-manager.io/cluster-issuer: letsencrypt +spec: + ingressClassName: traefik + tls: + - hosts: + - matrix.live.bstein.dev + secretName: matrix-live-tls + - hosts: + - live.bstein.dev + secretName: live-othrys-tls + # Consolidated Matrix routing: MAS for auth/UI, Synapse for Matrix APIs, guest-register for guest joins. + rules: + - host: matrix.live.bstein.dev + http: + paths: + - path: /_matrix/client/v3/register + pathType: Prefix + backend: + service: + name: matrix-guest-register + port: + number: 8080 + - path: /_matrix/client/r0/register + pathType: Prefix + backend: + service: + name: matrix-guest-register + port: + number: 8080 + - path: /_matrix/client/v3/login + pathType: Prefix + backend: + service: + name: matrix-authentication-service + port: + number: 8080 + - path: /_matrix/client/v3/logout + pathType: Exact + backend: + service: + name: matrix-authentication-service + port: + number: 8080 + - path: /_matrix/client/v3/refresh + pathType: Exact + backend: + service: + name: matrix-authentication-service + port: + number: 8080 + - path: /_matrix + pathType: Prefix + backend: + service: + name: othrys-synapse-matrix-synapse + port: + number: 8008 + - path: /_synapse + pathType: Prefix + backend: + service: + name: othrys-synapse-matrix-synapse + port: + number: 8008 + - path: / + pathType: Prefix + backend: + service: + name: matrix-authentication-service + port: + number: 8080 + - host: live.bstein.dev + http: + paths: + - path: /_matrix + pathType: Prefix + backend: + service: + name: othrys-synapse-matrix-synapse + port: + number: 8008 diff --git 
a/services/comms/namespace.yaml b/services/comms/namespace.yaml new file mode 100644 index 0000000..9d44af2 --- /dev/null +++ b/services/comms/namespace.yaml @@ -0,0 +1,5 @@ +# services/comms/namespace.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: comms diff --git a/services/comms/othrys-kick-numeric-job.yaml b/services/comms/othrys-kick-numeric-job.yaml new file mode 100644 index 0000000..8f02bbb --- /dev/null +++ b/services/comms/othrys-kick-numeric-job.yaml @@ -0,0 +1,115 @@ +# services/comms/othrys-kick-numeric-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: othrys-kick-numeric-1 + namespace: comms +spec: + backoffLimit: 0 + template: + spec: + restartPolicy: Never + containers: + - name: kick + image: python:3.11-slim + env: + - name: SYNAPSE_BASE + value: http://othrys-synapse-matrix-synapse:8008 + - name: AUTH_BASE + value: http://matrix-authentication-service:8080 + - name: SERVER_NAME + value: live.bstein.dev + - name: ROOM_ALIAS + value: "#othrys:live.bstein.dev" + - name: SEEDER_USER + value: othrys-seeder + - name: SEEDER_PASS + valueFrom: + secretKeyRef: + name: atlasbot-credentials-runtime + key: seeder-password + command: + - /bin/sh + - -c + - | + set -euo pipefail + pip install --no-cache-dir requests >/dev/null + python - <<'PY' + import os + import urllib.parse + import requests + + BASE = os.environ["SYNAPSE_BASE"] + AUTH_BASE = os.environ.get("AUTH_BASE", BASE) + SERVER_NAME = os.environ.get("SERVER_NAME", "live.bstein.dev") + ROOM_ALIAS = os.environ.get("ROOM_ALIAS", "#othrys:live.bstein.dev") + SEEDER_USER = os.environ["SEEDER_USER"] + SEEDER_PASS = os.environ["SEEDER_PASS"] + + def canon_user(user): + u = (user or "").strip() + if u.startswith("@") and ":" in u: + return u + u = u.lstrip("@") + if ":" in u: + return f"@{u}" + return f"@{u}:{SERVER_NAME}" + + def auth(token): + return {"Authorization": f"Bearer {token}"} + + def login(user, password): + r = requests.post( + f"{AUTH_BASE}/_matrix/client/v3/login", + 
json={ + "type": "m.login.password", + "identifier": {"type": "m.id.user", "user": canon_user(user)}, + "password": password, + }, + timeout=30, + ) + r.raise_for_status() + return r.json()["access_token"] + + def resolve_alias(token, alias): + enc = urllib.parse.quote(alias) + r = requests.get(f"{BASE}/_matrix/client/v3/directory/room/{enc}", headers=auth(token), timeout=30) + r.raise_for_status() + return r.json()["room_id"] + + def list_members(token, room_id): + r = requests.get( + f"{BASE}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/members?membership=join", + headers=auth(token), + timeout=30, + ) + r.raise_for_status() + members = [] + for ev in r.json().get("chunk", []): + uid = ev.get("state_key") + if isinstance(uid, str) and uid.startswith("@"): + members.append(uid) + return members + + def is_numeric(user_id): + localpart = user_id.split(":", 1)[0].lstrip("@") + return localpart.isdigit() + + def kick(token, room_id, user_id): + r = requests.post( + f"{BASE}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/kick", + headers=auth(token), + json={"user_id": user_id, "reason": "cleanup numeric guest"}, + timeout=30, + ) + if r.status_code not in (200, 202): + raise SystemExit(f"kick {user_id} failed: {r.status_code} {r.text}") + + token = login(SEEDER_USER, SEEDER_PASS) + room_id = resolve_alias(token, ROOM_ALIAS) + for user_id in list_members(token, room_id): + if user_id == canon_user(SEEDER_USER): + continue + if is_numeric(user_id): + kick(token, room_id, user_id) + PY diff --git a/services/comms/pin-othrys-job.yaml b/services/comms/pin-othrys-job.yaml new file mode 100644 index 0000000..3639194 --- /dev/null +++ b/services/comms/pin-othrys-job.yaml @@ -0,0 +1,123 @@ +# services/comms/pin-othrys-job.yaml +apiVersion: batch/v1 +kind: CronJob +metadata: + name: pin-othrys-invite + namespace: comms +spec: + schedule: "*/30 * * * *" + suspend: true + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 1 + failedJobsHistoryLimit: 1 + 
jobTemplate: + spec: + backoffLimit: 0 + template: + spec: + restartPolicy: Never + containers: + - name: pin + image: python:3.11-slim + env: + - name: SYNAPSE_BASE + value: http://othrys-synapse-matrix-synapse:8008 + - name: AUTH_BASE + value: http://matrix-authentication-service:8080 + - name: SEEDER_USER + value: othrys-seeder + - name: SEEDER_PASS + valueFrom: + secretKeyRef: + name: atlasbot-credentials-runtime + key: seeder-password + command: + - /bin/sh + - -c + - | + set -euo pipefail + pip install --no-cache-dir requests >/dev/null + python - <<'PY' + import os, requests, urllib.parse + + BASE = os.environ["SYNAPSE_BASE"] + AUTH_BASE = os.environ.get("AUTH_BASE", BASE) + ROOM_ALIAS = "#othrys:live.bstein.dev" + MESSAGE = ( + "Invite guests: share https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join " + "and choose 'Continue' -> 'Join as guest'." + ) + + def auth(token): return {"Authorization": f"Bearer {token}"} + + def canon_user(user): + u = (user or "").strip() + if u.startswith("@") and ":" in u: + return u + u = u.lstrip("@") + if ":" in u: + return f"@{u}" + return f"@{u}:live.bstein.dev" + + def login(user, password): + r = requests.post(f"{AUTH_BASE}/_matrix/client/v3/login", json={ + "type": "m.login.password", + "identifier": {"type": "m.id.user", "user": canon_user(user)}, + "password": password, + }, timeout=30) + r.raise_for_status() + return r.json()["access_token"] + + def resolve(alias, token): + enc = urllib.parse.quote(alias) + r = requests.get(f"{BASE}/_matrix/client/v3/directory/room/{enc}", headers=auth(token), timeout=30) + r.raise_for_status() + return r.json()["room_id"] + + def get_pinned(room_id, token): + r = requests.get( + f"{BASE}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/state/m.room.pinned_events", + headers=auth(token), timeout=30, + ) + if r.status_code == 404: + return [] + r.raise_for_status() + return r.json().get("pinned", []) + + def get_event(room_id, event_id, token): + r = requests.get( + 
f"{BASE}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/event/{urllib.parse.quote(event_id)}", + headers=auth(token), + ) + if r.status_code == 404: + return None + r.raise_for_status() + return r.json() + + def send(room_id, token, body): + r = requests.post( + f"{BASE}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/send/m.room.message", + headers=auth(token), + json={"msgtype": "m.text", "body": body}, + ) + r.raise_for_status() + return r.json()["event_id"] + + def pin(room_id, token, event_id): + r = requests.put( + f"{BASE}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/state/m.room.pinned_events", + headers=auth(token), + json={"pinned": [event_id]}, + ) + r.raise_for_status() + + token = login(os.environ["SEEDER_USER"], os.environ["SEEDER_PASS"]) + room_id = resolve(ROOM_ALIAS, token) + for event_id in get_pinned(room_id, token): + ev = get_event(room_id, event_id, token) + if ev and ev.get("content", {}).get("body") == MESSAGE: + raise SystemExit(0) + + eid = send(room_id, token, MESSAGE) + pin(room_id, token, eid) + PY diff --git a/services/comms/reset-othrys-room-job.yaml b/services/comms/reset-othrys-room-job.yaml new file mode 100644 index 0000000..dd056c3 --- /dev/null +++ b/services/comms/reset-othrys-room-job.yaml @@ -0,0 +1,266 @@ +# services/comms/reset-othrys-room-job.yaml +apiVersion: batch/v1 +kind: CronJob +metadata: + name: othrys-room-reset + namespace: comms +spec: + schedule: "0 0 1 1 *" + suspend: true + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 1 + failedJobsHistoryLimit: 1 + jobTemplate: + spec: + backoffLimit: 0 + template: + spec: + restartPolicy: Never + containers: + - name: reset + image: python:3.11-slim + env: + - name: SYNAPSE_BASE + value: http://othrys-synapse-matrix-synapse:8008 + - name: AUTH_BASE + value: http://matrix-authentication-service:8080 + - name: SERVER_NAME + value: live.bstein.dev + - name: ROOM_ALIAS + value: "#othrys:live.bstein.dev" + - name: ROOM_NAME + value: Othrys + - 
name: PIN_MESSAGE + value: "Invite guests: share https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join and choose 'Continue' -> 'Join as guest'." + - name: SEEDER_USER + value: othrys-seeder + - name: SEEDER_PASS + valueFrom: + secretKeyRef: + name: atlasbot-credentials-runtime + key: seeder-password + - name: BOT_USER + value: atlasbot + command: + - /bin/sh + - -c + - | + set -euo pipefail + pip install --no-cache-dir requests >/dev/null + python - <<'PY' + import os + import time + import urllib.parse + import requests + + BASE = os.environ["SYNAPSE_BASE"] + AUTH_BASE = os.environ.get("AUTH_BASE", BASE) + SERVER_NAME = os.environ.get("SERVER_NAME", "live.bstein.dev") + ROOM_ALIAS = os.environ.get("ROOM_ALIAS", "#othrys:live.bstein.dev") + ROOM_NAME = os.environ.get("ROOM_NAME", "Othrys") + PIN_MESSAGE = os.environ["PIN_MESSAGE"] + SEEDER_USER = os.environ["SEEDER_USER"] + SEEDER_PASS = os.environ["SEEDER_PASS"] + BOT_USER = os.environ["BOT_USER"] + + POWER_LEVELS = { + "ban": 50, + "events": { + "m.room.avatar": 50, + "m.room.canonical_alias": 50, + "m.room.encryption": 100, + "m.room.history_visibility": 100, + "m.room.name": 50, + "m.room.power_levels": 100, + "m.room.server_acl": 100, + "m.room.tombstone": 100, + }, + "events_default": 0, + "historical": 100, + "invite": 50, + "kick": 50, + "m.call.invite": 50, + "redact": 50, + "state_default": 50, + "users": {f"@{SEEDER_USER}:{SERVER_NAME}": 100}, + "users_default": 0, + } + + def auth(token): + return {"Authorization": f"Bearer {token}"} + + def canon_user(user): + u = (user or "").strip() + if u.startswith("@") and ":" in u: + return u + u = u.lstrip("@") + if ":" in u: + return f"@{u}" + return f"@{u}:{SERVER_NAME}" + + def login(user, password): + r = requests.post( + f"{AUTH_BASE}/_matrix/client/v3/login", + json={ + "type": "m.login.password", + "identifier": {"type": "m.id.user", "user": canon_user(user)}, + "password": password, + }, + ) + if r.status_code != 200: + raise 
SystemExit(f"login failed: {r.status_code} {r.text}") + return r.json()["access_token"] + + def resolve_alias(token, alias): + enc = urllib.parse.quote(alias) + r = requests.get(f"{BASE}/_matrix/client/v3/directory/room/{enc}", headers=auth(token)) + if r.status_code == 404: + return None + r.raise_for_status() + return r.json()["room_id"] + + def create_room(token): + r = requests.post( + f"{BASE}/_matrix/client/v3/createRoom", + headers=auth(token), + json={ + "preset": "public_chat", + "name": ROOM_NAME, + "room_version": "11", + }, + ) + r.raise_for_status() + return r.json()["room_id"] + + def put_state(token, room_id, ev_type, content): + r = requests.put( + f"{BASE}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/state/{ev_type}", + headers=auth(token), + json=content, + ) + r.raise_for_status() + + def set_directory_visibility(token, room_id, visibility): + r = requests.put( + f"{BASE}/_matrix/client/v3/directory/list/room/{urllib.parse.quote(room_id)}", + headers=auth(token), + json={"visibility": visibility}, + ) + r.raise_for_status() + + def delete_alias(token, alias): + enc = urllib.parse.quote(alias) + r = requests.delete(f"{BASE}/_matrix/client/v3/directory/room/{enc}", headers=auth(token)) + if r.status_code in (200, 202, 404): + return + r.raise_for_status() + + def put_alias(token, alias, room_id): + enc = urllib.parse.quote(alias) + r = requests.put( + f"{BASE}/_matrix/client/v3/directory/room/{enc}", + headers=auth(token), + json={"room_id": room_id}, + ) + r.raise_for_status() + + def list_joined_members(token, room_id): + r = requests.get( + f"{BASE}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/members?membership=join", + headers=auth(token), + ) + r.raise_for_status() + members = [] + for ev in r.json().get("chunk", []): + if ev.get("type") != "m.room.member": + continue + uid = ev.get("state_key") + if not isinstance(uid, str) or not uid.startswith("@"): + continue + members.append(uid) + return members + + def 
invite_user(token, room_id, user_id): + r = requests.post( + f"{BASE}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/invite", + headers=auth(token), + json={"user_id": user_id}, + ) + if r.status_code in (200, 202): + return + r.raise_for_status() + + def send_message(token, room_id, body): + r = requests.post( + f"{BASE}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/send/m.room.message", + headers=auth(token), + json={"msgtype": "m.text", "body": body}, + ) + r.raise_for_status() + return r.json()["event_id"] + + def login_with_retry(): + last = None + for attempt in range(1, 6): + try: + return login(SEEDER_USER, SEEDER_PASS) + except Exception as exc: # noqa: BLE001 + last = exc + time.sleep(attempt * 2) + raise last + + token = login_with_retry() + + old_room_id = resolve_alias(token, ROOM_ALIAS) + if not old_room_id: + raise SystemExit(f"alias {ROOM_ALIAS} not found; refusing to proceed") + + new_room_id = create_room(token) + + # Configure the new room. + put_state(token, new_room_id, "m.room.join_rules", {"join_rule": "public"}) + put_state(token, new_room_id, "m.room.guest_access", {"guest_access": "can_join"}) + put_state(token, new_room_id, "m.room.history_visibility", {"history_visibility": "shared"}) + put_state(token, new_room_id, "m.room.power_levels", POWER_LEVELS) + + # Move the alias. + delete_alias(token, ROOM_ALIAS) + put_alias(token, ROOM_ALIAS, new_room_id) + put_state(token, new_room_id, "m.room.canonical_alias", {"alias": ROOM_ALIAS}) + + set_directory_visibility(token, new_room_id, "public") + + # Invite the bot and all joined members of the old room. + bot_user_id = f"@{BOT_USER}:{SERVER_NAME}" + invite_user(token, new_room_id, bot_user_id) + for uid in list_joined_members(token, old_room_id): + if uid == f"@{SEEDER_USER}:{SERVER_NAME}": + continue + localpart = uid.split(":", 1)[0].lstrip("@") + if localpart.isdigit(): + continue + invite_user(token, new_room_id, uid) + + # Pin the guest invite message in the new room. 
+ event_id = send_message(token, new_room_id, PIN_MESSAGE) + put_state(token, new_room_id, "m.room.pinned_events", {"pinned": [event_id]}) + + # De-list and tombstone the old room. + set_directory_visibility(token, old_room_id, "private") + put_state(token, old_room_id, "m.room.join_rules", {"join_rule": "invite"}) + put_state(token, old_room_id, "m.room.guest_access", {"guest_access": "forbidden"}) + put_state( + token, + old_room_id, + "m.room.tombstone", + {"body": "Othrys has been reset. Please join the new room.", "replacement_room": new_room_id}, + ) + send_message( + token, + old_room_id, + "Othrys was reset. Join the new room at https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join", + ) + + print(f"old_room_id={old_room_id}") + print(f"new_room_id={new_room_id}") + PY diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py new file mode 100644 index 0000000..e8bd1a8 --- /dev/null +++ b/services/comms/scripts/atlasbot/bot.py @@ -0,0 +1,622 @@ +import collections +import json +import os +import re +import ssl +import time +from typing import Any +from urllib import error, parse, request + +BASE = os.environ.get("MATRIX_BASE", "http://othrys-synapse-matrix-synapse:8008") +AUTH_BASE = os.environ.get("AUTH_BASE", "http://matrix-authentication-service:8080") +USER = os.environ["BOT_USER"] +PASSWORD = os.environ["BOT_PASS"] +ROOM_ALIAS = "#othrys:live.bstein.dev" + +OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/") +MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b-instruct-q4_0") +API_KEY = os.environ.get("CHAT_API_KEY", "") + +KB_DIR = os.environ.get("KB_DIR", "") +VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428") + +BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{USER},atlas") +SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev") + +MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500")) 
+MAX_TOOL_CHARS = int(os.environ.get("ATLASBOT_MAX_TOOL_CHARS", "2500")) + +TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9_.-]{1,}", re.IGNORECASE) +HOST_RE = re.compile(r"(?i)([a-z0-9-]+(?:\\.[a-z0-9-]+)+)") +STOPWORDS = { + "the", + "and", + "for", + "with", + "this", + "that", + "from", + "into", + "what", + "how", + "why", + "when", + "where", + "which", + "who", + "can", + "could", + "should", + "would", + "please", + "help", + "atlas", + "othrys", +} + +METRIC_HINT_WORDS = { + "health", + "status", + "down", + "slow", + "error", + "unknown_error", + "timeout", + "crash", + "crashloop", + "restart", + "restarts", + "pending", + "unreachable", + "latency", +} + +def _tokens(text: str) -> list[str]: + toks = [t.lower() for t in TOKEN_RE.findall(text or "")] + return [t for t in toks if t not in STOPWORDS and len(t) >= 2] + + +# Mention detection (Matrix rich mentions + plain @atlas). +MENTION_TOKENS = [m.strip() for m in BOT_MENTIONS.split(",") if m.strip()] +MENTION_LOCALPARTS = [m.lstrip("@").split(":", 1)[0] for m in MENTION_TOKENS] +MENTION_RE = re.compile( + r"(? str: + t = token.strip() + if not t: + return "" + if t.startswith("@") and ":" in t: + return t + t = t.lstrip("@") + if ":" in t: + return f"@{t}" + return f"@{t}:{SERVER_NAME}" + +MENTION_USER_IDS = {normalize_user_id(t).lower() for t in MENTION_TOKENS if normalize_user_id(t)} + +def is_mentioned(content: dict, body: str) -> bool: + if MENTION_RE.search(body or "") is not None: + return True + mentions = content.get("m.mentions", {}) + user_ids = mentions.get("user_ids", []) + if not isinstance(user_ids, list): + return False + return any(isinstance(uid, str) and uid.lower() in MENTION_USER_IDS for uid in user_ids) + + +# Matrix HTTP helper. 
+def req(method: str, path: str, token: str | None = None, body=None, timeout=60, base: str | None = None): + url = (base or BASE) + path + data = None + headers = {} + if body is not None: + data = json.dumps(body).encode() + headers["Content-Type"] = "application/json" + if token: + headers["Authorization"] = f"Bearer {token}" + r = request.Request(url, data=data, headers=headers, method=method) + with request.urlopen(r, timeout=timeout) as resp: + raw = resp.read() + return json.loads(raw.decode()) if raw else {} + +def login() -> str: + login_user = normalize_user_id(USER) + payload = { + "type": "m.login.password", + "identifier": {"type": "m.id.user", "user": login_user}, + "password": PASSWORD, + } + res = req("POST", "/_matrix/client/v3/login", body=payload, base=AUTH_BASE) + return res["access_token"] + +def resolve_alias(token: str, alias: str) -> str: + enc = parse.quote(alias) + res = req("GET", f"/_matrix/client/v3/directory/room/{enc}", token) + return res["room_id"] + +def join_room(token: str, room: str): + req("POST", f"/_matrix/client/v3/rooms/{parse.quote(room)}/join", token, body={}) + +def send_msg(token: str, room: str, text: str): + path = f"/_matrix/client/v3/rooms/{parse.quote(room)}/send/m.room.message" + req("POST", path, token, body={"msgtype": "m.text", "body": text}) + + +# Atlas KB loader (no external deps; files are pre-rendered JSON via scripts/knowledge_render_atlas.py). 
+KB = {"catalog": {}, "runbooks": []} +_HOST_INDEX: dict[str, list[dict]] = {} +_NAME_INDEX: set[str] = set() + +def _load_json_file(path: str) -> Any | None: + try: + with open(path, "rb") as f: + return json.loads(f.read().decode("utf-8")) + except Exception: + return None + +def load_kb(): + global KB, _HOST_INDEX, _NAME_INDEX + if not KB_DIR: + return + catalog = _load_json_file(os.path.join(KB_DIR, "catalog", "atlas.json")) or {} + runbooks = _load_json_file(os.path.join(KB_DIR, "catalog", "runbooks.json")) or [] + KB = {"catalog": catalog, "runbooks": runbooks} + + host_index: dict[str, list[dict]] = collections.defaultdict(list) + for ep in catalog.get("http_endpoints", []) if isinstance(catalog, dict) else []: + host = (ep.get("host") or "").lower() + if host: + host_index[host].append(ep) + _HOST_INDEX = {k: host_index[k] for k in sorted(host_index.keys())} + + names: set[str] = set() + for s in catalog.get("services", []) if isinstance(catalog, dict) else []: + if isinstance(s, dict) and s.get("name"): + names.add(str(s["name"]).lower()) + for w in catalog.get("workloads", []) if isinstance(catalog, dict) else []: + if isinstance(w, dict) and w.get("name"): + names.add(str(w["name"]).lower()) + _NAME_INDEX = names + +def kb_retrieve(query: str, *, limit: int = 3) -> str: + q = (query or "").strip() + if not q or not KB.get("runbooks"): + return "" + ql = q.lower() + q_tokens = _tokens(q) + if not q_tokens: + return "" + + scored: list[tuple[int, dict]] = [] + for doc in KB.get("runbooks", []): + if not isinstance(doc, dict): + continue + title = str(doc.get("title") or "") + body = str(doc.get("body") or "") + tags = doc.get("tags") or [] + entrypoints = doc.get("entrypoints") or [] + hay = (title + "\n" + " ".join(tags) + "\n" + " ".join(entrypoints) + "\n" + body).lower() + score = 0 + for t in set(q_tokens): + if t in hay: + score += 3 if t in title.lower() else 1 + for h in entrypoints: + if isinstance(h, str) and h.lower() in ql: + score += 4 + if 
score: + scored.append((score, doc)) + + scored.sort(key=lambda x: x[0], reverse=True) + picked = [d for _, d in scored[:limit]] + if not picked: + return "" + + parts: list[str] = ["Atlas KB (retrieved):"] + used = 0 + for d in picked: + path = d.get("path") or "" + title = d.get("title") or path + body = (d.get("body") or "").strip() + snippet = body[:900].strip() + chunk = f"- {title} ({path})\n{snippet}" + if used + len(chunk) > MAX_KB_CHARS: + break + parts.append(chunk) + used += len(chunk) + return "\n".join(parts).strip() + +def catalog_hints(query: str) -> tuple[str, list[tuple[str, str]]]: + q = (query or "").strip() + if not q or not KB.get("catalog"): + return "", [] + ql = q.lower() + hosts = {m.group(1).lower() for m in HOST_RE.finditer(ql) if m.group(1).lower().endswith("bstein.dev")} + + # Also match by known workload/service names. + for t in _tokens(ql): + if t in _NAME_INDEX: + hosts |= {ep["host"].lower() for ep in KB["catalog"].get("http_endpoints", []) if isinstance(ep, dict) and ep.get("backend", {}).get("service") == t} + + edges: list[tuple[str, str]] = [] + lines: list[str] = [] + for host in sorted(hosts): + for ep in _HOST_INDEX.get(host, []): + backend = ep.get("backend") or {} + ns = backend.get("namespace") or "" + svc = backend.get("service") or "" + path = ep.get("path") or "/" + if not svc: + continue + wk = backend.get("workloads") or [] + wk_str = ", ".join(f"{w.get('kind')}:{w.get('name')}" for w in wk if isinstance(w, dict) and w.get("name")) or "unknown" + lines.append(f"- {host}{path} → {ns}/{svc} → {wk_str}") + for w in wk: + if isinstance(w, dict) and w.get("name"): + edges.append((ns, str(w["name"]))) + if not lines: + return "", [] + return "Atlas endpoints (from GitOps):\n" + "\n".join(lines[:20]), edges + + +# Kubernetes API (read-only). RBAC is provided via ServiceAccount atlasbot. 
+_K8S_TOKEN: str | None = None +_K8S_CTX: ssl.SSLContext | None = None + +def _k8s_context() -> ssl.SSLContext: + global _K8S_CTX + if _K8S_CTX is not None: + return _K8S_CTX + ca_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + ctx = ssl.create_default_context(cafile=ca_path) + _K8S_CTX = ctx + return ctx + +def _k8s_token() -> str: + global _K8S_TOKEN + if _K8S_TOKEN: + return _K8S_TOKEN + token_path = "/var/run/secrets/kubernetes.io/serviceaccount/token" + with open(token_path, "r", encoding="utf-8") as f: + _K8S_TOKEN = f.read().strip() + return _K8S_TOKEN + +def k8s_get(path: str, timeout: int = 8) -> dict: + host = os.environ.get("KUBERNETES_SERVICE_HOST") + port = os.environ.get("KUBERNETES_SERVICE_PORT_HTTPS") or os.environ.get("KUBERNETES_SERVICE_PORT") or "443" + if not host: + raise RuntimeError("k8s host missing") + url = f"https://{host}:{port}{path}" + headers = {"Authorization": f"Bearer {_k8s_token()}"} + r = request.Request(url, headers=headers, method="GET") + with request.urlopen(r, timeout=timeout, context=_k8s_context()) as resp: + raw = resp.read() + return json.loads(raw.decode()) if raw else {} + +def k8s_pods(namespace: str) -> list[dict]: + data = k8s_get(f"/api/v1/namespaces/{parse.quote(namespace)}/pods?limit=500") + items = data.get("items") or [] + return items if isinstance(items, list) else [] + +def summarize_pods(namespace: str, prefixes: set[str] | None = None) -> str: + try: + pods = k8s_pods(namespace) + except Exception: + return "" + out: list[str] = [] + for p in pods: + md = p.get("metadata") or {} + st = p.get("status") or {} + name = md.get("name") or "" + if prefixes and not any(name.startswith(pref + "-") or name == pref or name.startswith(pref) for pref in prefixes): + continue + phase = st.get("phase") or "?" 
+ cs = st.get("containerStatuses") or [] + restarts = 0 + ready = 0 + total = 0 + reason = st.get("reason") or "" + for c in cs if isinstance(cs, list) else []: + if not isinstance(c, dict): + continue + total += 1 + restarts += int(c.get("restartCount") or 0) + if c.get("ready"): + ready += 1 + state = c.get("state") or {} + if not reason and isinstance(state, dict): + waiting = state.get("waiting") or {} + if isinstance(waiting, dict) and waiting.get("reason"): + reason = waiting.get("reason") + extra = f" ({reason})" if reason else "" + out.append(f"- {namespace}/{name}: {phase} {ready}/{total} restarts={restarts}{extra}") + return "\n".join(out[:20]) + +def flux_not_ready() -> str: + try: + data = k8s_get( + "/apis/kustomize.toolkit.fluxcd.io/v1/namespaces/flux-system/kustomizations?limit=200" + ) + except Exception: + return "" + items = data.get("items") or [] + bad: list[str] = [] + for it in items if isinstance(items, list) else []: + md = it.get("metadata") or {} + st = it.get("status") or {} + name = md.get("name") or "" + conds = st.get("conditions") or [] + ready = None + msg = "" + for c in conds if isinstance(conds, list) else []: + if isinstance(c, dict) and c.get("type") == "Ready": + ready = c.get("status") + msg = c.get("message") or "" + if ready not in ("True", True): + bad.append(f"- flux kustomization/{name}: Ready={ready} {msg}".strip()) + return "\n".join(bad[:10]) + + +# VictoriaMetrics (PromQL) helpers. +def vm_query(query: str, timeout: int = 8) -> dict | None: + try: + url = VM_URL.rstrip("/") + "/api/v1/query?" 
+ parse.urlencode({"query": query}) + with request.urlopen(url, timeout=timeout) as resp: + return json.loads(resp.read().decode()) + except Exception: + return None + +def _vm_value_series(res: dict) -> list[dict]: + if not res or (res.get("status") != "success"): + return [] + data = res.get("data") or {} + result = data.get("result") or [] + return result if isinstance(result, list) else [] + +def vm_render_result(res: dict | None, limit: int = 12) -> str: + if not res: + return "" + series = _vm_value_series(res) + if not series: + return "" + out: list[str] = [] + for r in series[:limit]: + if not isinstance(r, dict): + continue + metric = r.get("metric") or {} + value = r.get("value") or [] + val = value[1] if isinstance(value, list) and len(value) > 1 else "" + # Prefer common labels if present. + label_parts = [] + for k in ("namespace", "pod", "container", "node", "instance", "job", "phase"): + if isinstance(metric, dict) and metric.get(k): + label_parts.append(f"{k}={metric.get(k)}") + if not label_parts and isinstance(metric, dict): + for k in sorted(metric.keys()): + if k.startswith("__"): + continue + label_parts.append(f"{k}={metric.get(k)}") + if len(label_parts) >= 4: + break + labels = ", ".join(label_parts) if label_parts else "series" + out.append(f"- {labels}: {val}") + return "\n".join(out) + +def vm_top_restarts(hours: int = 1) -> str: + q = f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{hours}h])))" + res = vm_query(q) + if not res or (res.get("status") != "success"): + return "" + out: list[str] = [] + for r in (res.get("data") or {}).get("result") or []: + if not isinstance(r, dict): + continue + m = r.get("metric") or {} + v = r.get("value") or [] + ns = (m.get("namespace") or "").strip() + pod = (m.get("pod") or "").strip() + val = v[1] if isinstance(v, list) and len(v) > 1 else "" + if pod: + out.append(f"- restarts({hours}h): {ns}/{pod} = {val}") + return "\n".join(out) + +def vm_cluster_snapshot() 
-> str: + parts: list[str] = [] + # Node readiness (kube-state-metrics). + ready = vm_query('sum(kube_node_status_condition{condition="Ready",status="true"})') + not_ready = vm_query('sum(kube_node_status_condition{condition="Ready",status="false"})') + if ready and not_ready: + try: + r = _vm_value_series(ready)[0]["value"][1] + nr = _vm_value_series(not_ready)[0]["value"][1] + parts.append(f"- nodes ready: {r} (not ready: {nr})") + except Exception: + pass + + phases = vm_query("sum by (phase) (kube_pod_status_phase)") + pr = vm_render_result(phases, limit=8) + if pr: + parts.append("Pod phases:") + parts.append(pr) + return "\n".join(parts).strip() + + +# Conversation state. +history = collections.defaultdict(list) # (room_id, sender|None) -> list[str] (short transcript) + +def key_for(room_id: str, sender: str, is_dm: bool): + return (room_id, None) if is_dm else (room_id, sender) + +def build_context(prompt: str, *, allow_tools: bool, targets: list[tuple[str, str]]) -> str: + parts: list[str] = [] + + kb = kb_retrieve(prompt) + if kb: + parts.append(kb) + + endpoints, edges = catalog_hints(prompt) + if endpoints: + parts.append(endpoints) + + if allow_tools: + # Scope pod summaries to relevant namespaces/workloads when possible. 
+ prefixes_by_ns: dict[str, set[str]] = collections.defaultdict(set) + for ns, name in (targets or []) + (edges or []): + if ns and name: + prefixes_by_ns[ns].add(name) + pod_lines: list[str] = [] + for ns in sorted(prefixes_by_ns.keys()): + summary = summarize_pods(ns, prefixes_by_ns[ns]) + if summary: + pod_lines.append(f"Pods (live):\n{summary}") + if pod_lines: + parts.append("\n".join(pod_lines)[:MAX_TOOL_CHARS]) + + flux_bad = flux_not_ready() + if flux_bad: + parts.append("Flux (not ready):\n" + flux_bad) + + p_l = (prompt or "").lower() + if any(w in p_l for w in METRIC_HINT_WORDS): + restarts = vm_top_restarts(1) + if restarts: + parts.append("VictoriaMetrics (top restarts 1h):\n" + restarts) + snap = vm_cluster_snapshot() + if snap: + parts.append("VictoriaMetrics (cluster snapshot):\n" + snap) + + return "\n\n".join([p for p in parts if p]).strip() + +def ollama_reply(hist_key, prompt: str, *, context: str) -> str: + try: + system = ( + "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " + "Be helpful, direct, and concise. " + "Prefer answering with exact repo paths and Kubernetes resource names. " + "Never include or request secret values." + ) + transcript_parts = [system] + if context: + transcript_parts.append("Context (grounded):\n" + context[:MAX_KB_CHARS]) + transcript_parts.extend(history[hist_key][-24:]) + transcript_parts.append(f"User: {prompt}") + transcript = "\n".join(transcript_parts) + + payload = {"model": MODEL, "message": transcript} + headers = {"Content-Type": "application/json"} + if API_KEY: + headers["x-api-key"] = API_KEY + r = request.Request(OLLAMA_URL, data=json.dumps(payload).encode(), headers=headers) + with request.urlopen(r, timeout=20) as resp: + data = json.loads(resp.read().decode()) + reply = data.get("message") or data.get("response") or data.get("reply") or "I'm here to help." 
+ history[hist_key].append(f"Atlas: {reply}") + return reply + except Exception: + return "I’m here — but I couldn’t reach the model backend." + +def sync_loop(token: str, room_id: str): + since = None + try: + res = req("GET", "/_matrix/client/v3/sync?timeout=0", token, timeout=10) + since = res.get("next_batch") + except Exception: + pass + + while True: + params = {"timeout": 30000} + if since: + params["since"] = since + query = parse.urlencode(params) + try: + res = req("GET", f"/_matrix/client/v3/sync?{query}", token, timeout=35) + except Exception: + time.sleep(5) + continue + since = res.get("next_batch", since) + + # invites + for rid, data in res.get("rooms", {}).get("invite", {}).items(): + try: + join_room(token, rid) + except Exception: + pass + + # messages + for rid, data in res.get("rooms", {}).get("join", {}).items(): + timeline = data.get("timeline", {}).get("events", []) + joined_count = data.get("summary", {}).get("m.joined_member_count") + is_dm = joined_count is not None and joined_count <= 2 + + for ev in timeline: + if ev.get("type") != "m.room.message": + continue + content = ev.get("content", {}) + body = (content.get("body", "") or "").strip() + if not body: + continue + sender = ev.get("sender", "") + if sender == f"@{USER}:live.bstein.dev": + continue + + mentioned = is_mentioned(content, body) + hist_key = key_for(rid, sender, is_dm) + history[hist_key].append(f"{sender}: {body}") + history[hist_key] = history[hist_key][-80:] + + if not (is_dm or mentioned): + continue + + # Only do live cluster/metrics introspection in DMs. + allow_tools = is_dm + + promql = "" + if allow_tools: + m = re.match(r"(?is)^\\s*promql\\s*(?:\\:|\\s)\\s*(.+?)\\s*$", body) + if m: + promql = m.group(1).strip() + + # Attempt to scope tools to the most likely workloads when hostnames are mentioned. 
+ targets: list[tuple[str, str]] = [] + for m in HOST_RE.finditer(body.lower()): + host = m.group(1).lower() + for ep in _HOST_INDEX.get(host, []): + backend = ep.get("backend") or {} + ns = backend.get("namespace") or "" + for w in backend.get("workloads") or []: + if isinstance(w, dict) and w.get("name"): + targets.append((ns, str(w["name"]))) + + context = build_context(body, allow_tools=allow_tools, targets=targets) + if allow_tools and promql: + res = vm_query(promql, timeout=20) + rendered = vm_render_result(res, limit=15) or "(no results)" + extra = "VictoriaMetrics (PromQL result):\n" + rendered + context = (context + "\n\n" + extra).strip() if context else extra + reply = ollama_reply(hist_key, body, context=context) + send_msg(token, rid, reply) + +def login_with_retry(): + last_err = None + for attempt in range(10): + try: + return login() + except Exception as exc: # noqa: BLE001 + last_err = exc + time.sleep(min(30, 2 ** attempt)) + raise last_err + +def main(): + load_kb() + token = login_with_retry() + try: + room_id = resolve_alias(token, ROOM_ALIAS) + join_room(token, room_id) + except Exception: + room_id = None + sync_loop(token, room_id) + +if __name__ == "__main__": + main() diff --git a/services/comms/scripts/guest-register/server.py b/services/comms/scripts/guest-register/server.py new file mode 100644 index 0000000..0e1fb4c --- /dev/null +++ b/services/comms/scripts/guest-register/server.py @@ -0,0 +1,264 @@ +import base64 +import json +import os +import random +import secrets +from http.server import BaseHTTPRequestHandler, HTTPServer +from urllib import error, parse, request + +MAS_BASE = os.environ.get("MAS_BASE", "http://matrix-authentication-service:8080").rstrip("/") +MAS_ADMIN_API_BASE = os.environ.get("MAS_ADMIN_API_BASE", "http://matrix-authentication-service:8081/api/admin/v1").rstrip("/") +SYNAPSE_BASE = os.environ.get("SYNAPSE_BASE", "http://othrys-synapse-matrix-synapse:8008").rstrip("/") +SERVER_NAME = 
os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev") + +MAS_ADMIN_CLIENT_ID = os.environ["MAS_ADMIN_CLIENT_ID"] +MAS_ADMIN_CLIENT_SECRET_FILE = os.environ.get("MAS_ADMIN_CLIENT_SECRET_FILE", "/etc/mas/admin-client/client_secret") +MAS_ADMIN_SCOPE = os.environ.get("MAS_ADMIN_SCOPE", "urn:mas:admin") +RATE_WINDOW_SEC = int(os.environ.get("RATE_WINDOW_SEC", "60")) +RATE_MAX = int(os.environ.get("RATE_MAX", "30")) +_rate = {} # ip -> [window_start, count] + +ADJ = [ + "brisk","calm","eager","gentle","merry","nifty","rapid","sunny","witty","zesty", + "amber","bold","bright","crisp","daring","frosty","glad","jolly","lively","mellow", + "quiet","ripe","serene","spry","tidy","vivid","warm","wild","clever","kind", +] +NOUN = [ + "otter","falcon","comet","ember","grove","harbor","meadow","raven","river","summit", + "breeze","cedar","cinder","cove","delta","forest","glade","lark","marsh","peak", + "pine","quartz","reef","ridge","sable","sage","shore","thunder","vale","zephyr", +] + +def _json(method, url, *, headers=None, body=None, timeout=20): + hdrs = {"Content-Type": "application/json"} + if headers: + hdrs.update(headers) + data = None + if body is not None: + data = json.dumps(body).encode() + req = request.Request(url, data=data, headers=hdrs, method=method) + try: + with request.urlopen(req, timeout=timeout) as resp: + raw = resp.read() + payload = json.loads(raw.decode()) if raw else {} + return resp.status, payload + except error.HTTPError as e: + raw = e.read() + try: + payload = json.loads(raw.decode()) if raw else {} + except Exception: + payload = {} + return e.code, payload + +def _form(method, url, *, headers=None, fields=None, timeout=20): + hdrs = {"Content-Type": "application/x-www-form-urlencoded"} + if headers: + hdrs.update(headers) + data = parse.urlencode(fields or {}).encode() + req = request.Request(url, data=data, headers=hdrs, method=method) + try: + with request.urlopen(req, timeout=timeout) as resp: + raw = resp.read() + payload = 
json.loads(raw.decode()) if raw else {} + return resp.status, payload + except error.HTTPError as e: + raw = e.read() + try: + payload = json.loads(raw.decode()) if raw else {} + except Exception: + payload = {} + return e.code, payload + +_admin_token = None +_admin_token_at = 0.0 + +def _mas_admin_access_token(now): + global _admin_token, _admin_token_at + if _admin_token and (now - _admin_token_at) < 300: + return _admin_token + + with open(MAS_ADMIN_CLIENT_SECRET_FILE, encoding="utf-8") as fh: + client_secret = fh.read().strip() + basic = base64.b64encode(f"{MAS_ADMIN_CLIENT_ID}:{client_secret}".encode()).decode() + + status, payload = _form( + "POST", + f"{MAS_BASE}/oauth2/token", + headers={"Authorization": f"Basic {basic}"}, + fields={"grant_type": "client_credentials", "scope": MAS_ADMIN_SCOPE}, + timeout=20, + ) + if status != 200 or "access_token" not in payload: + raise RuntimeError("mas_admin_token_failed") + + _admin_token = payload["access_token"] + _admin_token_at = now + return _admin_token + +def _generate_localpart(): + return "guest-" + secrets.token_hex(6) + +def _generate_displayname(): + return f"{random.choice(ADJ)}-{random.choice(NOUN)}" + +def _admin_api(admin_token, method, path, body=None): + return _json( + method, + f"{MAS_ADMIN_API_BASE}{path}", + headers={"Authorization": f"Bearer {admin_token}"}, + body=body, + timeout=20, + ) + +def _create_user(admin_token, username): + status, payload = _admin_api(admin_token, "POST", "/users", {"username": username}) + if status != 201: + return status, None + user = payload.get("data") or {} + return status, user.get("id") + +def _set_password(admin_token, user_id, password): + status, _payload = _admin_api( + admin_token, + "POST", + f"/users/{parse.quote(user_id)}/set-password", + {"password": password}, + ) + return status in (200, 204) + +def _login_password(username, password): + payload = { + "type": "m.login.password", + "identifier": {"type": "m.id.user", "user": 
f"@{username}:{SERVER_NAME}"}, + "password": password, + } + status, data = _json( + "POST", + f"{MAS_BASE}/_matrix/client/v3/login", + body=payload, + timeout=20, + ) + if status != 200: + return None, None + return data.get("access_token"), data.get("device_id") + +def _set_display_name(access_token, user_id, displayname): + _json( + "PUT", + f"{SYNAPSE_BASE}/_matrix/client/v3/profile/{parse.quote(user_id, safe='')}/displayname", + headers={"Authorization": f"Bearer {access_token}"}, + body={"displayname": displayname}, + timeout=20, + ) + +def _rate_check(ip, now): + win, cnt = _rate.get(ip, (now, 0)) + if now - win > RATE_WINDOW_SEC: + _rate[ip] = (now, 1) + return True + if cnt >= RATE_MAX: + return False + _rate[ip] = (win, cnt + 1) + return True + +class Handler(BaseHTTPRequestHandler): + server_version = "matrix-guest-register" + + def _send_json(self, code, payload): + body = json.dumps(payload).encode() + self.send_response(code) + self.send_header("Content-Type", "application/json") + self.send_header("Access-Control-Allow-Origin", "*") + self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS") + self.send_header("Access-Control-Allow-Headers", "Content-Type, Authorization, X-Requested-With") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def do_OPTIONS(self): # noqa: N802 + self.send_response(204) + self.send_header("Access-Control-Allow-Origin", "*") + self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS") + self.send_header("Access-Control-Allow-Headers", "Content-Type, Authorization, X-Requested-With") + self.end_headers() + + def do_GET(self): # noqa: N802 + parsed = parse.urlparse(self.path) + if parsed.path in ("/healthz", "/"): + return self._send_json(200, {"ok": True}) + if parsed.path in ("/_matrix/client/v3/register", "/_matrix/client/r0/register"): + return self._send_json(200, {"flows": [{"stages": []}]}) + return self._send_json(404, {"errcode": 
"M_NOT_FOUND", "error": "not_found"}) + + def do_POST(self): # noqa: N802 + parsed = parse.urlparse(self.path) + if parsed.path not in ("/_matrix/client/v3/register", "/_matrix/client/r0/register"): + return self._send_json(404, {"errcode": "M_NOT_FOUND", "error": "not_found"}) + + qs = parse.parse_qs(parsed.query) + kind = (qs.get("kind") or ["user"])[0] + if kind != "guest": + return self._send_json( + 403, + { + "errcode": "M_FORBIDDEN", + "error": "Registration is disabled; use https://bstein.dev/request-access for accounts.", + }, + ) + + xfwd = self.headers.get("x-forwarded-for", "") + ip = (xfwd.split(",")[0].strip() if xfwd else "") or self.client_address[0] + now = __import__("time").time() + if not _rate_check(ip, now): + return self._send_json(429, {"errcode": "M_LIMIT_EXCEEDED", "error": "rate_limited"}) + + length = int(self.headers.get("content-length", "0") or "0") + raw = self.rfile.read(length) if length else b"{}" + try: + body = json.loads(raw.decode()) if raw else {} + if not isinstance(body, dict): + body = {} + except Exception: + body = {} + try: + admin_token = _mas_admin_access_token(now) + displayname = _generate_displayname() + + localpart = None + mas_user_id = None + for _ in range(5): + localpart = _generate_localpart() + status, mas_user_id = _create_user(admin_token, localpart) + if status == 201 and mas_user_id: + break + mas_user_id = None + if not mas_user_id or not localpart: + raise RuntimeError("add_user_failed") + + password = secrets.token_urlsafe(18) + if not _set_password(admin_token, mas_user_id, password): + raise RuntimeError("set_password_failed") + access_token, device_id = _login_password(localpart, password) + if not access_token: + raise RuntimeError("login_failed") + try: + _set_display_name(access_token, f"@{localpart}:{SERVER_NAME}", displayname) + except Exception: + pass + except Exception: + return self._send_json(502, {"errcode": "M_UNKNOWN", "error": "guest_provision_failed"}) + + resp = { + "user_id": 
f"@{localpart}:{SERVER_NAME}", + "access_token": access_token, + "device_id": device_id or "guest_device", + "home_server": SERVER_NAME, + } + return self._send_json(200, resp) + +def main(): + port = int(os.environ.get("PORT", "8080")) + HTTPServer(("0.0.0.0", port), Handler).serve_forever() + +if __name__ == "__main__": + main() diff --git a/services/comms/scripts/synapse/redis/ping_liveness_local.sh b/services/comms/scripts/synapse/redis/ping_liveness_local.sh new file mode 100644 index 0000000..964e552 --- /dev/null +++ b/services/comms/scripts/synapse/redis/ping_liveness_local.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +[[ -f $REDIS_PASSWORD_FILE ]] && export REDIS_PASSWORD="$(< "${REDIS_PASSWORD_FILE}")" +[[ -n "$REDIS_PASSWORD" ]] && export REDISCLI_AUTH="$REDIS_PASSWORD" +response=$( + timeout -s 15 $1 \ + redis-cli \ + -h localhost \ + -p $REDIS_PORT \ + ping +) +if [ "$?" -eq "124" ]; then + echo "Timed out" + exit 1 +fi +responseFirstWord=$(echo $response | head -n1 | awk '{print $1;}') +if [ "$response" != "PONG" ] && [ "$responseFirstWord" != "LOADING" ] && [ "$responseFirstWord" != "MASTERDOWN" ]; then + echo "$response" + exit 1 +fi \ No newline at end of file diff --git a/services/comms/scripts/synapse/redis/ping_liveness_local_and_master.sh b/services/comms/scripts/synapse/redis/ping_liveness_local_and_master.sh new file mode 100644 index 0000000..c343f82 --- /dev/null +++ b/services/comms/scripts/synapse/redis/ping_liveness_local_and_master.sh @@ -0,0 +1,5 @@ +script_dir="$(dirname "$0")" +exit_status=0 +"$script_dir/ping_liveness_local.sh" $1 || exit_status=$? +"$script_dir/ping_liveness_master.sh" $1 || exit_status=$? 
+exit $exit_status \ No newline at end of file diff --git a/services/comms/scripts/synapse/redis/ping_liveness_master.sh b/services/comms/scripts/synapse/redis/ping_liveness_master.sh new file mode 100644 index 0000000..849982a --- /dev/null +++ b/services/comms/scripts/synapse/redis/ping_liveness_master.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +[[ -f $REDIS_MASTER_PASSWORD_FILE ]] && export REDIS_MASTER_PASSWORD="$(< "${REDIS_MASTER_PASSWORD_FILE}")" +[[ -n "$REDIS_MASTER_PASSWORD" ]] && export REDISCLI_AUTH="$REDIS_MASTER_PASSWORD" +response=$( + timeout -s 15 $1 \ + redis-cli \ + -h $REDIS_MASTER_HOST \ + -p $REDIS_MASTER_PORT_NUMBER \ + ping +) +if [ "$?" -eq "124" ]; then + echo "Timed out" + exit 1 +fi +responseFirstWord=$(echo $response | head -n1 | awk '{print $1;}') +if [ "$response" != "PONG" ] && [ "$responseFirstWord" != "LOADING" ]; then + echo "$response" + exit 1 +fi \ No newline at end of file diff --git a/services/comms/scripts/synapse/redis/ping_readiness_local.sh b/services/comms/scripts/synapse/redis/ping_readiness_local.sh new file mode 100644 index 0000000..080273f --- /dev/null +++ b/services/comms/scripts/synapse/redis/ping_readiness_local.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +[[ -f $REDIS_PASSWORD_FILE ]] && export REDIS_PASSWORD="$(< "${REDIS_PASSWORD_FILE}")" +[[ -n "$REDIS_PASSWORD" ]] && export REDISCLI_AUTH="$REDIS_PASSWORD" +response=$( + timeout -s 15 $1 \ + redis-cli \ + -h localhost \ + -p $REDIS_PORT \ + ping +) +if [ "$?" 
-eq "124" ]; then + echo "Timed out" + exit 1 +fi +if [ "$response" != "PONG" ]; then + echo "$response" + exit 1 +fi \ No newline at end of file diff --git a/services/comms/scripts/synapse/redis/ping_readiness_local_and_master.sh b/services/comms/scripts/synapse/redis/ping_readiness_local_and_master.sh new file mode 100644 index 0000000..0ba63cc --- /dev/null +++ b/services/comms/scripts/synapse/redis/ping_readiness_local_and_master.sh @@ -0,0 +1,5 @@ +script_dir="$(dirname "$0")" +exit_status=0 +"$script_dir/ping_readiness_local.sh" $1 || exit_status=$? +"$script_dir/ping_readiness_master.sh" $1 || exit_status=$? +exit $exit_status \ No newline at end of file diff --git a/services/comms/scripts/synapse/redis/ping_readiness_master.sh b/services/comms/scripts/synapse/redis/ping_readiness_master.sh new file mode 100644 index 0000000..95ced76 --- /dev/null +++ b/services/comms/scripts/synapse/redis/ping_readiness_master.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +[[ -f $REDIS_MASTER_PASSWORD_FILE ]] && export REDIS_MASTER_PASSWORD="$(< "${REDIS_MASTER_PASSWORD_FILE}")" +[[ -n "$REDIS_MASTER_PASSWORD" ]] && export REDISCLI_AUTH="$REDIS_MASTER_PASSWORD" +response=$( + timeout -s 15 $1 \ + redis-cli \ + -h $REDIS_MASTER_HOST \ + -p $REDIS_MASTER_PORT_NUMBER \ + ping +) +if [ "$?" 
-eq "124" ]; then + echo "Timed out" + exit 1 +fi +if [ "$response" != "PONG" ]; then + echo "$response" + exit 1 +fi \ No newline at end of file diff --git a/services/comms/scripts/synapse/redis/start-master.sh b/services/comms/scripts/synapse/redis/start-master.sh new file mode 100644 index 0000000..4284839 --- /dev/null +++ b/services/comms/scripts/synapse/redis/start-master.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +[[ -f $REDIS_PASSWORD_FILE ]] && export REDIS_PASSWORD="$(< "${REDIS_PASSWORD_FILE}")" +if [[ -f /opt/bitnami/redis/mounted-etc/master.conf ]];then + cp /opt/bitnami/redis/mounted-etc/master.conf /opt/bitnami/redis/etc/master.conf +fi +if [[ -f /opt/bitnami/redis/mounted-etc/redis.conf ]];then + cp /opt/bitnami/redis/mounted-etc/redis.conf /opt/bitnami/redis/etc/redis.conf +fi +ARGS=("--port" "${REDIS_PORT}") +ARGS+=("--requirepass" "${REDIS_PASSWORD}") +ARGS+=("--masterauth" "${REDIS_PASSWORD}") +ARGS+=("--include" "/opt/bitnami/redis/etc/redis.conf") +ARGS+=("--include" "/opt/bitnami/redis/etc/master.conf") +exec redis-server "${ARGS[@]}" diff --git a/services/comms/scripts/synapse/signing-key.sh b/services/comms/scripts/synapse/signing-key.sh new file mode 100644 index 0000000..5d1b941 --- /dev/null +++ b/services/comms/scripts/synapse/signing-key.sh @@ -0,0 +1,41 @@ +#!/bin/sh + +set -eu + +check_key() { + set +e + + echo "Checking for existing signing key..." + key="$(kubectl get secret "$SECRET_NAME" -o jsonpath="{.data['signing\.key']}" 2> /dev/null)" + [ $? -ne 0 ] && return 1 + [ -z "$key" ] && return 2 + return 0 +} + +create_key() { + echo "Waiting for new signing key to be generated..." + begin=$(date +%s) + end=$((begin + 300)) # 5 minutes + while true; do + [ -f /synapse/keys/signing.key ] && return 0 + [ "$(date +%s)" -gt $end ] && return 1 + sleep 5 + done +} + +store_key() { + echo "Storing signing key in Kubernetes secret..." 
+ kubectl patch secret "$SECRET_NAME" -p "{\"data\":{\"signing.key\":\"$(base64 /synapse/keys/signing.key | tr -d '\n')\"}}" +} + +if check_key; then + echo "Key already in place, exiting." + exit +fi + +if ! create_key; then + echo "Timed out waiting for a signing key to appear." + exit 1 +fi + +store_key diff --git a/services/comms/seed-othrys-room.yaml b/services/comms/seed-othrys-room.yaml new file mode 100644 index 0000000..901f14d --- /dev/null +++ b/services/comms/seed-othrys-room.yaml @@ -0,0 +1,146 @@ +# services/comms/seed-othrys-room.yaml +apiVersion: batch/v1 +kind: CronJob +metadata: + name: seed-othrys-room + namespace: comms +spec: + schedule: "*/10 * * * *" + suspend: true + concurrencyPolicy: Forbid + jobTemplate: + spec: + backoffLimit: 0 + template: + spec: + restartPolicy: Never + containers: + - name: seed + image: python:3.11-slim + env: + - name: SYNAPSE_BASE + value: http://othrys-synapse-matrix-synapse:8008 + - name: AUTH_BASE + value: http://matrix-authentication-service:8080 + - name: SEEDER_USER + value: othrys-seeder + - name: SEEDER_PASS + valueFrom: + secretKeyRef: + name: atlasbot-credentials-runtime + key: seeder-password + - name: BOT_USER + value: atlasbot + - name: BOT_PASS + valueFrom: + secretKeyRef: + name: atlasbot-credentials-runtime + key: bot-password + command: + - /bin/sh + - -c + - | + set -euo pipefail + pip install --no-cache-dir requests pyyaml >/dev/null + python - <<'PY' + import os, requests, urllib.parse + + BASE = os.environ["SYNAPSE_BASE"] + AUTH_BASE = os.environ.get("AUTH_BASE", BASE) + + def canon_user(user): + u = (user or "").strip() + if u.startswith("@") and ":" in u: + return u + u = u.lstrip("@") + if ":" in u: + return f"@{u}" + return f"@{u}:live.bstein.dev" + + def login(user, password): + r = requests.post(f"{AUTH_BASE}/_matrix/client/v3/login", json={ + "type": "m.login.password", + "identifier": {"type": "m.id.user", "user": canon_user(user)}, + "password": password, + }) + if r.status_code != 
200: + raise SystemExit(f"login failed: {r.status_code} {r.text}") + return r.json()["access_token"] + + def ensure_user(token, localpart, password, admin): + headers = {"Authorization": f"Bearer {token}"} + user_id = f"@{localpart}:live.bstein.dev" + url = f"{BASE}/_synapse/admin/v2/users/{urllib.parse.quote(user_id)}" + res = requests.get(url, headers=headers) + if res.status_code == 200: + return + payload = {"password": password, "admin": admin, "deactivated": False} + create = requests.put(url, headers=headers, json=payload) + if create.status_code not in (200, 201): + raise SystemExit(f"create user {user_id} failed: {create.status_code} {create.text}") + + def ensure_room(token): + headers = {"Authorization": f"Bearer {token}"} + alias = "#othrys:live.bstein.dev" + alias_enc = "%23othrys%3Alive.bstein.dev" + exists = requests.get(f"{BASE}/_matrix/client/v3/directory/room/{alias_enc}", headers=headers) + if exists.status_code == 200: + room_id = exists.json()["room_id"] + else: + create = requests.post(f"{BASE}/_matrix/client/v3/createRoom", headers=headers, json={ + "preset": "public_chat", + "name": "Othrys", + "room_alias_name": "othrys", + "initial_state": [], + "power_level_content_override": {"events_default": 0, "users_default": 0, "state_default": 50}, + }) + if create.status_code not in (200, 409): + raise SystemExit(f"create room failed: {create.status_code} {create.text}") + exists = requests.get(f"{BASE}/_matrix/client/v3/directory/room/{alias_enc}", headers=headers) + room_id = exists.json()["room_id"] + state_events = [ + ("m.room.join_rules", {"join_rule": "public"}), + ("m.room.guest_access", {"guest_access": "can_join"}), + ("m.room.history_visibility", {"history_visibility": "shared"}), + ("m.room.canonical_alias", {"alias": alias}), + ] + for ev_type, content in state_events: + requests.put(f"{BASE}/_matrix/client/v3/rooms/{room_id}/state/{ev_type}", headers=headers, json=content) + 
requests.put(f"{BASE}/_matrix/client/v3/directory/list/room/{room_id}", headers=headers, json={"visibility": "public"}) + return room_id + + def join_user(token, room_id, user_id): + headers = {"Authorization": f"Bearer {token}"} + requests.post(f"{BASE}/_synapse/admin/v1/join/{urllib.parse.quote(room_id)}", headers=headers, json={"user_id": user_id}) + + def join_all_locals(token, room_id): + headers = {"Authorization": f"Bearer {token}"} + users = [] + from_token = None + while True: + url = f"{BASE}/_synapse/admin/v2/users?local=true&deactivated=false&limit=100" + if from_token: + url += f"&from={from_token}" + res = requests.get(url, headers=headers).json() + users.extend([u["name"] for u in res.get("users", [])]) + from_token = res.get("next_token") + if not from_token: + break + for uid in users: + join_user(token, room_id, uid) + + token = login(os.environ["SEEDER_USER"], os.environ["SEEDER_PASS"]) + ensure_user(token, os.environ["SEEDER_USER"], os.environ["SEEDER_PASS"], admin=True) + ensure_user(token, os.environ["BOT_USER"], os.environ["BOT_PASS"], admin=False) + room_id = ensure_room(token) + join_user(token, room_id, f"@{os.environ['BOT_USER']}:live.bstein.dev") + join_all_locals(token, room_id) + PY + volumeMounts: + - name: synapse-config + mountPath: /config + readOnly: true + volumes: + - name: synapse-config + secret: + secretName: othrys-synapse-matrix-synapse diff --git a/services/comms/synapse-deployment-strategy-patch.yaml b/services/comms/synapse-deployment-strategy-patch.yaml new file mode 100644 index 0000000..59b8e32 --- /dev/null +++ b/services/comms/synapse-deployment-strategy-patch.yaml @@ -0,0 +1,11 @@ +# services/comms/synapse-deployment-strategy-patch.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: othrys-synapse-matrix-synapse +spec: + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 0 + maxUnavailable: 1 diff --git a/services/comms/synapse-rendered.yaml b/services/comms/synapse-rendered.yaml new file mode 
100644 index 0000000..83fce79 --- /dev/null +++ b/services/comms/synapse-rendered.yaml @@ -0,0 +1,895 @@ +--- +# Source: matrix-synapse/charts/redis/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +automountServiceAccountToken: true +metadata: + name: othrys-synapse-redis + labels: + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: redis + helm.sh/chart: redis-17.17.1 +--- +# Source: matrix-synapse/templates/secrets.yaml +apiVersion: v1 +kind: Secret +metadata: + name: othrys-synapse-matrix-synapse + labels: + helm.sh/chart: matrix-synapse-3.12.17 + app.kubernetes.io/name: matrix-synapse + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/version: "1.144.0" + app.kubernetes.io/managed-by: Helm +stringData: + config.yaml: | + ## Registration ## + + ## API Configuration ## + + ## Database configuration ## + + database: + name: "psycopg2" + args: + user: "synapse" + password: "@@POSTGRES_PASSWORD@@" + database: "synapse" + host: "postgres-service.postgres.svc.cluster.local" + port: 5432 + sslmode: "prefer" + cp_min: 5 + cp_max: 10 + + + ## Redis configuration ## + + redis: + enabled: true + host: "othrys-synapse-redis-master" + port: 6379 + password: "@@REDIS_PASSWORD@@" +--- +# Source: matrix-synapse/charts/redis/templates/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: othrys-synapse-redis-configuration + labels: + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: redis + helm.sh/chart: redis-17.17.1 +data: + redis.conf: |- + # User-supplied common configuration: + # Enable AOF https://redis.io/topics/persistence#append-only-file + appendonly yes + # Disable RDB persistence, AOF persistence already enabled. 
+ save "" + # End of common configuration + master.conf: |- + dir /data + # User-supplied master configuration: + rename-command FLUSHDB "" + rename-command FLUSHALL "" + # End of master configuration + replica.conf: |- + dir /data + # User-supplied replica configuration: + rename-command FLUSHDB "" + rename-command FLUSHALL "" + # End of replica configuration +--- +# Source: matrix-synapse/templates/configuration.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: othrys-synapse-matrix-synapse + labels: + helm.sh/chart: matrix-synapse-3.12.17 + app.kubernetes.io/name: matrix-synapse + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/version: "1.144.0" + app.kubernetes.io/managed-by: Helm +data: + log.yaml: | + version: 1 + formatters: + precise: + format: '%(asctime)s - %(name)s - %(lineno)d - %(levelname)s - %(request)s- %(message)s' + filters: + context: + (): synapse.util.logcontext.LoggingContextFilter + request: "" + handlers: + console: + class: logging.StreamHandler + formatter: precise + filters: [context] + level: INFO + loggers: + synapse: + level: INFO + root: + level: INFO + handlers: [console] + homeserver.yaml: | + # NOTE: + # Secrets are stored in separate configs to better fit K8s concepts + + ## Server ## + + server_name: "live.bstein.dev" + public_baseurl: "https://matrix.live.bstein.dev" + pid_file: /homeserver.pid + web_client: False + soft_file_limit: 0 + log_config: "/synapse/config/log.yaml" + report_stats: false + + instance_map: + main: + host: othrys-synapse-replication + port: 9093 + + ## Ports ## + + listeners: + - port: 8008 + tls: false + bind_addresses: ["0.0.0.0"] + type: http + x_forwarded: true + + resources: + - names: + - client + - federation + compress: false + + - port: 9090 + tls: false + bind_addresses: ["::"] + type: http + + resources: + - names: [metrics] + compress: false + + - port: 9093 + tls: false + bind_addresses: ["::"] + type: http + + resources: + - names: [replication] + compress: false + + 
## Files ## + + media_store_path: "/synapse/data/media" + uploads_path: "/synapse/data/uploads" + + ## Registration ## + + enable_registration: false + + ## Metrics ### + + enable_metrics: true + + ## Signing Keys ## + + signing_key_path: "/synapse/keys/signing.key" + macaroon_secret_key: "@@MACAROON_SECRET_KEY@@" + + # The trusted servers to download signing keys from. + trusted_key_servers: + - server_name: matrix.org + + ## Workers ## + + ## Extra config ## + + allow_guest_access: true + allow_public_rooms_without_auth: true + auto_join_rooms: + - "#othrys:live.bstein.dev" + autocreate_auto_join_rooms: true + default_room_version: "11" + experimental_features: + msc3266_enabled: true + msc4108_enabled: true + msc4143_enabled: true + msc4222_enabled: true + max_event_delay_duration: 24h + password_config: + enabled: false + turn_uris: + - "turn:turn.live.bstein.dev:3478?transport=udp" + - "turn:turn.live.bstein.dev:3478?transport=tcp" + - "turns:turn.live.bstein.dev:5349?transport=tcp" + turn_shared_secret: "@@TURN_SECRET@@" + turn_allow_guests: true + turn_user_lifetime: 86400000 + rc_login: + address: + burst_count: 20 + per_second: 5 + account: + burst_count: 20 + per_second: 5 + failed_attempts: + burst_count: 20 + per_second: 5 + rc_message: + per_second: 0.5 + burst_count: 30 + rc_delayed_event_mgmt: + per_second: 1 + burst_count: 20 + room_list_publication_rules: + - action: allow + well_known_client: + "m.homeserver": + "base_url": "https://matrix.live.bstein.dev" + "org.matrix.msc2965.authentication": + "issuer": "https://matrix.live.bstein.dev/" + "account": "https://matrix.live.bstein.dev/account/" + "org.matrix.msc4143.rtc_foci": + - type: "livekit" + livekit_service_url: "https://kit.live.bstein.dev/livekit/jwt" + + matrix_authentication_service: + enabled: true + endpoint: http://matrix-authentication-service:8080/ + secret: "@@MAS_SHARED_SECRET@@" +--- +# Source: matrix-synapse/templates/pvc.yaml +kind: PersistentVolumeClaim +apiVersion: v1 
+metadata: + name: othrys-synapse-matrix-synapse + labels: + helm.sh/chart: matrix-synapse-3.12.17 + app.kubernetes.io/name: matrix-synapse + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/version: "1.144.0" + app.kubernetes.io/managed-by: Helm +spec: + accessModes: + - "ReadWriteOnce" + resources: + requests: + storage: "50Gi" + storageClassName: "asteria" +--- +# Source: matrix-synapse/charts/redis/templates/headless-svc.yaml +apiVersion: v1 +kind: Service +metadata: + name: othrys-synapse-redis-headless + labels: + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: redis + helm.sh/chart: redis-17.17.1 + annotations: + +spec: + type: ClusterIP + clusterIP: None + ports: + - name: tcp-redis + port: 6379 + targetPort: redis + selector: + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/name: redis +--- +# Source: matrix-synapse/charts/redis/templates/master/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: othrys-synapse-redis-master + labels: + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: redis + helm.sh/chart: redis-17.17.1 + app.kubernetes.io/component: master +spec: + type: ClusterIP + internalTrafficPolicy: Cluster + sessionAffinity: None + ports: + - name: tcp-redis + port: 6379 + targetPort: redis + nodePort: null + selector: + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/name: redis + app.kubernetes.io/component: master +--- +# Source: matrix-synapse/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: othrys-synapse-matrix-synapse + labels: + helm.sh/chart: matrix-synapse-3.12.17 + app.kubernetes.io/name: matrix-synapse + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/version: "1.144.0" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 8008 + targetPort: http + protocol: TCP + name: http + selector: + 
app.kubernetes.io/component: synapse + app.kubernetes.io/name: matrix-synapse + app.kubernetes.io/instance: othrys-synapse +--- +# Source: matrix-synapse/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: othrys-synapse-replication + labels: + helm.sh/chart: matrix-synapse-3.12.17 + app.kubernetes.io/name: matrix-synapse + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/version: "1.144.0" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 9093 + targetPort: replication + protocol: TCP + name: replication + selector: + app.kubernetes.io/name: matrix-synapse + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/component: synapse +--- +# Source: matrix-synapse/charts/redis/templates/master/application.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: othrys-synapse-redis-master + labels: + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: redis + helm.sh/chart: redis-17.17.1 + app.kubernetes.io/component: master +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/name: redis + app.kubernetes.io/component: master + strategy: + type: RollingUpdate + template: + metadata: + labels: + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: redis + helm.sh/chart: redis-17.17.1 + app.kubernetes.io/component: master + annotations: + checksum/configmap: 86bcc953bb473748a3d3dc60b7c11f34e60c93519234d4c37f42e22ada559d47 + checksum/health: aff24913d801436ea469d8d374b2ddb3ec4c43ee7ab24663d5f8ff1a1b6991a9 + checksum/scripts: 560c33ff34d845009b51830c332aa05fa211444d1877d3526d3599be7543aaa5 + checksum/secret: 44136fa355b3678a1146ad16f7e8649e94fb4fc21fe77e8310c060f61caaff8a + spec: + + securityContext: + fsGroup: 1001 + serviceAccountName: othrys-synapse-redis + automountServiceAccountToken: true + affinity: + podAffinity: + + 
podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/name: redis + app.kubernetes.io/component: master + topologyKey: kubernetes.io/hostname + weight: 1 + nodeAffinity: + + enableServiceLinks: true + terminationGracePeriodSeconds: 30 + containers: + - name: redis + image: docker.io/bitnamilegacy/redis:7.0.12-debian-11-r34 + imagePullPolicy: "IfNotPresent" + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsGroup: 0 + runAsNonRoot: true + runAsUser: 1001 + seccompProfile: + type: RuntimeDefault + command: + - /bin/bash + args: + - -c + - /opt/bitnami/scripts/start-scripts/start-master.sh + env: + - name: BITNAMI_DEBUG + value: "false" + - name: REDIS_REPLICATION_MODE + value: master + - name: ALLOW_EMPTY_PASSWORD + value: "no" + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + name: synapse-redis + key: redis-password + - name: REDIS_TLS_ENABLED + value: "no" + - name: REDIS_PORT + value: "6379" + ports: + - name: redis + containerPort: 6379 + livenessProbe: + initialDelaySeconds: 20 + periodSeconds: 5 + # One second longer than command timeout should prevent generation of zombie processes. 
+ timeoutSeconds: 6 + successThreshold: 1 + failureThreshold: 5 + exec: + command: + - sh + - -c + - /health/ping_liveness_local.sh 5 + readinessProbe: + initialDelaySeconds: 20 + periodSeconds: 5 + timeoutSeconds: 2 + successThreshold: 1 + failureThreshold: 5 + exec: + command: + - sh + - -c + - /health/ping_readiness_local.sh 1 + resources: + limits: {} + requests: {} + volumeMounts: + - name: start-scripts + mountPath: /opt/bitnami/scripts/start-scripts + - name: health + mountPath: /health + - name: redis-data + mountPath: /data + - name: config + mountPath: /opt/bitnami/redis/mounted-etc + - name: redis-tmp-conf + mountPath: /opt/bitnami/redis/etc/ + - name: tmp + mountPath: /tmp + volumes: + - name: start-scripts + configMap: + name: othrys-synapse-redis-scripts + defaultMode: 0755 + - name: health + configMap: + name: othrys-synapse-redis-health + defaultMode: 0755 + - name: config + configMap: + name: othrys-synapse-redis-configuration + - name: redis-tmp-conf + emptyDir: {} + - name: tmp + emptyDir: {} + - name: redis-data + emptyDir: {} +--- +# Source: matrix-synapse/templates/deployment.yaml +# Server: live.bstein.dev +apiVersion: apps/v1 +kind: Deployment +metadata: + name: othrys-synapse-matrix-synapse + labels: + helm.sh/chart: matrix-synapse-3.12.17 + app.kubernetes.io/name: matrix-synapse + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/version: "1.144.0" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: synapse +spec: + replicas: 1 + strategy: + type: RollingUpdate + selector: + matchLabels: + app.kubernetes.io/name: matrix-synapse + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/component: synapse + template: + metadata: + annotations: + checksum/config: manual-rtc-enable-11 + checksum/secrets: ec9f3b254a562a0f0709461eb74a8cc91b8c1a2fb06be2594a131776c2541773 + labels: + app.kubernetes.io/name: matrix-synapse + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/component: synapse + spec: 
+ serviceAccountName: default + + securityContext: + fsGroup: 666 + runAsGroup: 666 + runAsUser: 666 + containers: + - name: synapse + command: + - sh + - -c + - | + export POSTGRES_PASSWORD=$(echo "${POSTGRES_PASSWORD:-}" | sed 's/\//\\\//g' | sed 's/\&/\\\&/g') && \ + export REDIS_PASSWORD=$(echo "${REDIS_PASSWORD:-}" | sed 's/\//\\\//g' | sed 's/\&/\\\&/g') && \ + export OIDC_CLIENT_SECRET_ESCAPED=$(echo "${OIDC_CLIENT_SECRET:-}" | sed 's/[\\/&]/\\&/g') && \ + export TURN_SECRET_ESCAPED=$(echo "${TURN_SECRET:-}" | sed 's/[\\/&]/\\&/g') && \ + export MAS_SHARED_SECRET_ESCAPED=$(echo "${MAS_SHARED_SECRET:-}" | sed 's/[\\/&]/\\&/g') && \ + export MACAROON_SECRET_KEY_ESCAPED=$(echo "${MACAROON_SECRET_KEY:-}" | sed 's/[\\/&]/\\&/g') && \ + cat /synapse/secrets/*.yaml | \ + sed -e "s/@@POSTGRES_PASSWORD@@/${POSTGRES_PASSWORD:-}/" \ + -e "s/@@REDIS_PASSWORD@@/${REDIS_PASSWORD:-}/" \ + > /synapse/config/conf.d/secrets.yaml + + cp /synapse/config/homeserver.yaml /synapse/runtime-config/homeserver.yaml && \ + if [ -n "${OIDC_CLIENT_SECRET_ESCAPED}" ]; then \ + sed -i "s/@@OIDC_CLIENT_SECRET@@/${OIDC_CLIENT_SECRET_ESCAPED}/g" /synapse/runtime-config/homeserver.yaml; \ + fi; \ + if [ -n "${TURN_SECRET_ESCAPED}" ]; then \ + sed -i "s/@@TURN_SECRET@@/${TURN_SECRET_ESCAPED}/g" /synapse/runtime-config/homeserver.yaml; \ + fi; \ + if [ -n "${MAS_SHARED_SECRET_ESCAPED}" ]; then \ + sed -i "s/@@MAS_SHARED_SECRET@@/${MAS_SHARED_SECRET_ESCAPED}/g" /synapse/runtime-config/homeserver.yaml; \ + fi; \ + if [ -n "${MACAROON_SECRET_KEY_ESCAPED}" ]; then \ + sed -i "s/@@MACAROON_SECRET_KEY@@/${MACAROON_SECRET_KEY_ESCAPED}/g" /synapse/runtime-config/homeserver.yaml; \ + fi + exec python -B -m synapse.app.homeserver \ + -c /synapse/runtime-config/homeserver.yaml \ + -c /synapse/config/conf.d/ + env: + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: synapse-db + key: POSTGRES_PASSWORD + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + name: synapse-redis + key: 
redis-password + - name: OIDC_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: synapse-oidc + key: client-secret + - name: TURN_SECRET + valueFrom: + secretKeyRef: + name: turn-shared-secret + key: TURN_STATIC_AUTH_SECRET + - name: MAS_SHARED_SECRET + valueFrom: + secretKeyRef: + name: mas-secrets-runtime + key: matrix_shared_secret + - name: MACAROON_SECRET_KEY + valueFrom: + secretKeyRef: + name: synapse-macaroon + key: macaroon_secret_key + image: "ghcr.io/element-hq/synapse:v1.144.0" + imagePullPolicy: IfNotPresent + securityContext: + {} + ports: + - name: http + containerPort: 8008 + protocol: TCP + - name: replication + containerPort: 9093 + protocol: TCP + - name: metrics + containerPort: 9090 + protocol: TCP + livenessProbe: + httpGet: + path: /health + port: http + readinessProbe: + httpGet: + path: /health + port: http + startupProbe: + failureThreshold: 12 + httpGet: + path: /health + port: http + volumeMounts: + - name: config + mountPath: /synapse/config + - name: runtime-config + mountPath: /synapse/runtime-config + - name: tmpconf + mountPath: /synapse/config/conf.d + - name: secrets + mountPath: /synapse/secrets + - name: signingkey + mountPath: /synapse/keys + - name: media + mountPath: /synapse/data + - name: tmpdir + mountPath: /tmp + resources: + limits: + cpu: "2" + memory: 3Gi + requests: + cpu: 500m + memory: 1Gi + volumes: + - name: config + configMap: + name: othrys-synapse-matrix-synapse + - name: secrets + secret: + secretName: othrys-synapse-matrix-synapse + - name: signingkey + secret: + secretName: "othrys-synapse-signingkey" + items: + - key: "signing.key" + path: signing.key + - name: tmpconf + emptyDir: {} + - name: tmpdir + emptyDir: {} + - name: runtime-config + emptyDir: {} + - name: media + persistentVolumeClaim: + claimName: othrys-synapse-matrix-synapse + nodeSelector: + hardware: rpi5 + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - preference: + matchExpressions: + - key: hardware + 
operator: In + values: + - rpi5 + - rpi4 + weight: 50 +--- +# Source: matrix-synapse/templates/signing-key-job.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: othrys-synapse-signingkey-job + labels: + helm.sh/chart: matrix-synapse-3.12.17 + app.kubernetes.io/name: matrix-synapse + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/version: "1.144.0" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: signingkey-job + annotations: + helm.sh/hook: pre-install + helm.sh/hook-delete-policy: hook-succeeded +--- +# Source: matrix-synapse/templates/signing-key-job.yaml +# Create secret if signing key job is enabled, or if we're running in ArgoCD and we don't have an existing secret +apiVersion: v1 +kind: Secret +metadata: + annotations: + helm.sh/hook: pre-install + helm.sh/hook-delete-policy: never + helm.sh/resource-policy: keep + # If for some reason we didn't detect ArgoCD, but are running in it, we want to make sure we don't delete the secret + argocd.argoproj.io/hook: Skip + name: othrys-synapse-signingkey + labels: + helm.sh/chart: matrix-synapse-3.12.17 + app.kubernetes.io/name: matrix-synapse + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/version: "1.144.0" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: signingkey-job +--- +# Source: matrix-synapse/templates/signing-key-job.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: othrys-synapse-signingkey-job + labels: + helm.sh/chart: matrix-synapse-3.12.17 + app.kubernetes.io/name: matrix-synapse + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/version: "1.144.0" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: signingkey-job + annotations: + helm.sh/hook: pre-install + helm.sh/hook-delete-policy: hook-succeeded +rules: + - apiGroups: + - "" + resources: + - secrets + resourceNames: + - othrys-synapse-signingkey + verbs: + - get + - update + - patch +--- +# Source: 
matrix-synapse/templates/signing-key-job.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: othrys-synapse-signingkey-job + labels: + helm.sh/chart: matrix-synapse-3.12.17 + app.kubernetes.io/name: matrix-synapse + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/version: "1.144.0" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: signingkey-job + annotations: + helm.sh/hook: pre-install + helm.sh/hook-delete-policy: hook-succeeded +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: othrys-synapse-signingkey-job +subjects: + - kind: ServiceAccount + name: othrys-synapse-signingkey-job + namespace: comms +--- +# Source: matrix-synapse/templates/tests/test-connection.yaml +apiVersion: v1 +kind: Pod +metadata: + name: "othrys-synapse-matrix-synapse-test-connection" + labels: + helm.sh/chart: matrix-synapse-3.12.17 + app.kubernetes.io/name: matrix-synapse + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/version: "1.144.0" + app.kubernetes.io/managed-by: Helm + annotations: + "helm.sh/hook": test-success +spec: + containers: + - name: wget + image: busybox + command: ['wget'] + args: ['othrys-synapse-matrix-synapse:8008/_matrix/client/versions'] + restartPolicy: Never +--- +# Source: matrix-synapse/templates/signing-key-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: othrys-synapse-signingkey-job + labels: + helm.sh/chart: matrix-synapse-3.12.17 + app.kubernetes.io/name: matrix-synapse + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/version: "1.144.0" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: signingkey-job + annotations: + helm.sh/hook: pre-install + helm.sh/hook-delete-policy: hook-succeeded +spec: + ttlSecondsAfterFinished: 0 + template: + metadata: + labels: + helm.sh/chart: matrix-synapse-3.12.17 + app.kubernetes.io/name: matrix-synapse + app.kubernetes.io/instance: othrys-synapse + app.kubernetes.io/version: 
"1.144.0" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: signingkey-job + spec: + containers: + - command: + - sh + - -c + - | + echo "Generating signing key..." + if which generate_signing_key.py >/dev/null; then + generate_signing_key.py -o /synapse/keys/signing.key + else + generate_signing_key -o /synapse/keys/signing.key + fi + image: "matrixdotorg/synapse:latest" + imagePullPolicy: IfNotPresent + name: signing-key-generate + resources: + {} + securityContext: + {} + volumeMounts: + - mountPath: /synapse/keys + name: matrix-synapse-keys + - command: + - sh + - -c + - | + printf "Checking rights to update secret... " + kubectl auth can-i update secret/${SECRET_NAME} + /scripts/signing-key.sh + env: + - name: SECRET_NAME + value: othrys-synapse-signingkey + image: "bitnami/kubectl:latest" + imagePullPolicy: IfNotPresent + name: signing-key-upload + resources: + {} + securityContext: + {} + volumeMounts: + - mountPath: /scripts + name: scripts + readOnly: true + - mountPath: /synapse/keys + name: matrix-synapse-keys + readOnly: true + securityContext: + {} + restartPolicy: Never + serviceAccount: othrys-synapse-signingkey-job + volumes: + - name: scripts + configMap: + name: othrys-synapse-matrix-synapse-scripts + defaultMode: 0755 + - name: matrix-synapse-keys + emptyDir: {} + parallelism: 1 + completions: 1 + backoffLimit: 1 diff --git a/services/comms/synapse-seeder-admin-ensure-job.yaml b/services/comms/synapse-seeder-admin-ensure-job.yaml new file mode 100644 index 0000000..0885722 --- /dev/null +++ b/services/comms/synapse-seeder-admin-ensure-job.yaml @@ -0,0 +1,36 @@ +# services/comms/synapse-seeder-admin-ensure-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: synapse-seeder-admin-ensure-2 + namespace: comms +spec: + backoffLimit: 2 + template: + spec: + restartPolicy: OnFailure + containers: + - name: psql + image: postgres:16-alpine + env: + - name: PGHOST + value: postgres-service.postgres.svc.cluster.local + - name: 
PGPORT + value: "5432" + - name: PGDATABASE + value: synapse + - name: PGUSER + value: synapse + - name: PGPASSWORD + valueFrom: + secretKeyRef: + name: synapse-db + key: POSTGRES_PASSWORD + command: + - /bin/sh + - -c + - | + set -euo pipefail + psql -v ON_ERROR_STOP=1 <<'SQL' + UPDATE users SET admin = 1 WHERE name = '@othrys-seeder:live.bstein.dev'; + SQL diff --git a/services/comms/synapse-signingkey-ensure-job.yaml b/services/comms/synapse-signingkey-ensure-job.yaml new file mode 100644 index 0000000..5ebaeda --- /dev/null +++ b/services/comms/synapse-signingkey-ensure-job.yaml @@ -0,0 +1,42 @@ +# services/comms/synapse-signingkey-ensure-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: othrys-synapse-signingkey-ensure-5 + namespace: comms +spec: + backoffLimit: 2 + template: + spec: + serviceAccountName: othrys-synapse-signingkey-job + restartPolicy: OnFailure + volumes: + - name: work + emptyDir: {} + initContainers: + - name: generate + image: ghcr.io/element-hq/synapse:v1.144.0 + command: ["/bin/sh", "-c"] + args: + - | + set -euo pipefail + generate_signing_key -o /work/signing.key + volumeMounts: + - name: work + mountPath: /work + containers: + - name: patch + image: bitnami/kubectl:latest + command: ["/bin/sh", "-c"] + args: + - | + set -euo pipefail + if kubectl -n comms get secret othrys-synapse-signingkey -o jsonpath='{.data.signing\.key}' 2>/dev/null | grep -q .; then + exit 0 + fi + kubectl -n comms create secret generic othrys-synapse-signingkey \ + --from-file=signing.key=/work/signing.key \ + --dry-run=client -o yaml | kubectl -n comms apply -f - >/dev/null + volumeMounts: + - name: work + mountPath: /work diff --git a/services/comms/synapse-user-seed-job.yaml b/services/comms/synapse-user-seed-job.yaml new file mode 100644 index 0000000..083f72e --- /dev/null +++ b/services/comms/synapse-user-seed-job.yaml @@ -0,0 +1,120 @@ +# services/comms/synapse-user-seed-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: 
synapse-user-seed-2 + namespace: comms +spec: + backoffLimit: 1 + ttlSecondsAfterFinished: 3600 + template: + spec: + restartPolicy: Never + containers: + - name: seed + image: python:3.11-slim + env: + - name: PGHOST + value: postgres-service.postgres.svc.cluster.local + - name: PGPORT + value: "5432" + - name: PGDATABASE + value: synapse + - name: PGUSER + value: synapse + - name: PGPASSWORD + valueFrom: + secretKeyRef: + name: synapse-db + key: POSTGRES_PASSWORD + - name: SEEDER_USER + value: othrys-seeder + - name: SEEDER_PASS + valueFrom: + secretKeyRef: + name: atlasbot-credentials-runtime + key: seeder-password + - name: BOT_USER + value: atlasbot + - name: BOT_PASS + valueFrom: + secretKeyRef: + name: atlasbot-credentials-runtime + key: bot-password + command: + - /bin/sh + - -c + - | + set -euo pipefail + pip install --no-cache-dir psycopg2-binary bcrypt >/dev/null + python - <<'PY' + import os + import time + import bcrypt + import psycopg2 + + def get_cols(cur): + cur.execute( + """ + SELECT column_name, is_nullable, column_default, data_type + FROM information_schema.columns + WHERE table_schema = 'public' AND table_name = 'users' + """ + ) + cols = {} + for name, is_nullable, default, data_type in cur.fetchall(): + cols[name] = { + "nullable": is_nullable == "YES", + "default": default, + "type": data_type, + } + return cols + + def upsert_user(cur, cols, user_id, password, admin): + now_ms = int(time.time() * 1000) + values = { + "name": user_id, + "password_hash": bcrypt.hashpw(password.encode(), bcrypt.gensalt()).decode(), + "creation_ts": now_ms, + } + def add_flag(name, flag): + if name not in cols: + return + if cols[name]["type"] in ("smallint", "integer"): + values[name] = int(flag) + else: + values[name] = bool(flag) + + add_flag("admin", admin) + add_flag("deactivated", False) + add_flag("shadow_banned", False) + add_flag("is_guest", False) + + columns = list(values.keys()) + placeholders = ", ".join(["%s"] * len(columns)) + updates = ", 
".join([f"{col}=EXCLUDED.{col}" for col in columns if col != "name"]) + query = f"INSERT INTO users ({', '.join(columns)}) VALUES ({placeholders}) ON CONFLICT (name) DO UPDATE SET {updates};" + cur.execute(query, [values[c] for c in columns]) + + seeder_user = os.environ["SEEDER_USER"] + bot_user = os.environ["BOT_USER"] + server = "live.bstein.dev" + seeder_id = f"@{seeder_user}:{server}" + bot_id = f"@{bot_user}:{server}" + + conn = psycopg2.connect( + host=os.environ["PGHOST"], + port=int(os.environ["PGPORT"]), + dbname=os.environ["PGDATABASE"], + user=os.environ["PGUSER"], + password=os.environ["PGPASSWORD"], + ) + try: + with conn: + with conn.cursor() as cur: + cols = get_cols(cur) + upsert_user(cur, cols, seeder_id, os.environ["SEEDER_PASS"], True) + upsert_user(cur, cols, bot_id, os.environ["BOT_PASS"], False) + finally: + conn.close() + PY diff --git a/services/comms/values-element.yaml b/services/comms/values-element.yaml new file mode 100644 index 0000000..b8c7d87 --- /dev/null +++ b/services/comms/values-element.yaml @@ -0,0 +1,59 @@ +# services/comms/values-element.yaml +replicaCount: 1 + +defaultServer: + url: https://matrix.live.bstein.dev + name: live.bstein.dev + +config: + default_theme: dark + brand: Othrys + disable_custom_urls: true + disable_login_language_selector: true + disable_guests: false + show_labs_settings: true + features: + feature_group_calls: true + feature_video_rooms: true + feature_element_call_video_rooms: true + room_directory: + servers: + - live.bstein.dev + jitsi: {} + element_call: + url: https://call.live.bstein.dev + participant_limit: 16 + brand: Othrys Call + +ingress: + enabled: true + className: traefik + annotations: + cert-manager.io/cluster-issuer: letsencrypt + traefik.ingress.kubernetes.io/router.entrypoints: websecure + hosts: + - live.bstein.dev + tls: + - secretName: live-othrys-tls + hosts: [live.bstein.dev] + +resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + 
+nodeSelector: + hardware: rpi5 + +affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 50 + preference: + matchExpressions: + - key: hardware + operator: In + values: ["rpi5","rpi4"] diff --git a/services/comms/values-synapse.yaml b/services/comms/values-synapse.yaml new file mode 100644 index 0000000..650d0e8 --- /dev/null +++ b/services/comms/values-synapse.yaml @@ -0,0 +1,132 @@ +# services/comms/values-synapse.yaml +serverName: live.bstein.dev +publicServerName: matrix.live.bstein.dev + +config: + publicBaseurl: https://matrix.live.bstein.dev + +externalPostgresql: + host: postgres-service.postgres.svc.cluster.local + port: 5432 + username: synapse + existingSecret: synapse-db + existingSecretPasswordKey: POSTGRES_PASSWORD + database: synapse + +redis: + enabled: true + auth: + enabled: true + existingSecret: synapse-redis + existingSecretPasswordKey: redis-password + +postgresql: + enabled: false + +persistence: + enabled: true + storageClass: asteria + accessMode: ReadWriteOnce + size: 50Gi + +synapse: + podSecurityContext: + fsGroup: 666 + runAsUser: 666 + runAsGroup: 666 + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: "2" + memory: 3Gi + nodeSelector: + hardware: rpi5 + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 50 + preference: + matchExpressions: + - key: hardware + operator: In + values: ["rpi5","rpi4"] + +ingress: + enabled: true + className: traefik + annotations: + cert-manager.io/cluster-issuer: letsencrypt + traefik.ingress.kubernetes.io/router.entrypoints: websecure + csHosts: + - matrix.live.bstein.dev + hosts: + - matrix.live.bstein.dev + wkHosts: + - live.bstein.dev + - bstein.dev + tls: + - secretName: matrix-live-tls + hosts: + - matrix.live.bstein.dev + - live.bstein.dev + +extraConfig: + allow_guest_access: true + allow_public_rooms_without_auth: true + auto_join_rooms: + - "#othrys:live.bstein.dev" + autocreate_auto_join_rooms: true + 
default_room_version: "11" + experimental_features: + msc3266_enabled: true + msc4143_enabled: true + msc4222_enabled: true + max_event_delay_duration: 24h + password_config: + enabled: true + oidc_enabled: true + oidc_providers: + - idp_id: keycloak + idp_name: Keycloak + issuer: https://sso.bstein.dev/realms/atlas + client_id: synapse + client_secret: "@@OIDC_CLIENT_SECRET@@" + client_auth_method: client_secret_post + scopes: ["openid", "profile", "email"] + authorization_endpoint: https://sso.bstein.dev/realms/atlas/protocol/openid-connect/auth + token_endpoint: https://sso.bstein.dev/realms/atlas/protocol/openid-connect/token + userinfo_endpoint: https://sso.bstein.dev/realms/atlas/protocol/openid-connect/userinfo + user_mapping_provider: + config: + localpart_template: "{{ user.preferred_username }}" + display_name_template: "{{ user.name }}" + allow_existing_users: true + rc_message: + per_second: 0.5 + burst_count: 30 + rc_delayed_event_mgmt: + per_second: 1 + burst_count: 20 + rc_login: + address: + burst_count: 20 + per_second: 5 + account: + burst_count: 20 + per_second: 5 + failed_attempts: + burst_count: 20 + per_second: 5 + room_list_publication_rules: + - action: allow + well_known_client: + "m.homeserver": + "base_url": "https://matrix.live.bstein.dev" + "org.matrix.msc4143.rtc_foci": + - type: "livekit" + livekit_service_url: "https://kit.live.bstein.dev/livekit/jwt" + +worker: + enabled: false diff --git a/services/comms/wellknown.yaml b/services/comms/wellknown.yaml new file mode 100644 index 0000000..601bafa --- /dev/null +++ b/services/comms/wellknown.yaml @@ -0,0 +1,206 @@ +# services/comms/wellknown.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: matrix-wellknown +data: + client.json: | + { + "m.homeserver": { + "base_url": "https://matrix.live.bstein.dev" + }, + "org.matrix.msc2965.authentication": { + "issuer": "https://matrix.live.bstein.dev/", + "account": "https://matrix.live.bstein.dev/account/" + }, + 
"org.matrix.msc4143.rtc_foci": [ + { + "type": "livekit", + "livekit_service_url": "https://kit.live.bstein.dev/livekit/jwt" + } + ] + } + server.json: | + { + "m.server": "live.bstein.dev:443" + } +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: matrix-wellknown-nginx +data: + default.conf: | + server { + listen 80; + server_name _; + + root /usr/share/nginx/html; + + # Some clients request a trailing slash; serve both. + location ~ ^/\.well-known/matrix/client/?$ { + default_type application/json; + add_header Access-Control-Allow-Origin "*" always; + try_files /.well-known/matrix/client =404; + } + + location ~ ^/\.well-known/matrix/server/?$ { + default_type application/json; + add_header Access-Control-Allow-Origin "*" always; + try_files /.well-known/matrix/server =404; + } + } +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: matrix-wellknown + labels: + app: matrix-wellknown +spec: + replicas: 1 + selector: + matchLabels: + app: matrix-wellknown + template: + metadata: + labels: + app: matrix-wellknown + spec: + containers: + - name: nginx + image: nginx:1.27-alpine + ports: + - containerPort: 80 + volumeMounts: + - name: wellknown + mountPath: /usr/share/nginx/html/.well-known/matrix + readOnly: true + - name: nginx-config + mountPath: /etc/nginx/conf.d + readOnly: true + volumes: + - name: wellknown + configMap: + name: matrix-wellknown + items: + - key: client.json + path: client + - key: server.json + path: server + - name: nginx-config + configMap: + name: matrix-wellknown-nginx + items: + - key: default.conf + path: default.conf +--- +apiVersion: v1 +kind: Service +metadata: + name: matrix-wellknown +spec: + selector: + app: matrix-wellknown + ports: + - name: http + port: 80 + targetPort: 80 +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: matrix-wellknown + annotations: + kubernetes.io/ingress.class: traefik + traefik.ingress.kubernetes.io/router.entrypoints: websecure + 
traefik.ingress.kubernetes.io/router.tls: "true" + cert-manager.io/cluster-issuer: letsencrypt +spec: + tls: + - hosts: + - live.bstein.dev + secretName: live-othrys-tls + rules: + - host: live.bstein.dev + http: + paths: + - path: /.well-known/matrix/client + pathType: Prefix + backend: + service: + name: matrix-wellknown + port: + number: 80 + - path: /.well-known/matrix/server + pathType: Prefix + backend: + service: + name: matrix-wellknown + port: + number: 80 +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: matrix-wellknown-matrix-live + annotations: + kubernetes.io/ingress.class: traefik + traefik.ingress.kubernetes.io/router.entrypoints: websecure + traefik.ingress.kubernetes.io/router.tls: "true" +spec: + tls: + - hosts: + - matrix.live.bstein.dev + secretName: matrix-live-tls + rules: + - host: matrix.live.bstein.dev + http: + paths: + - path: /.well-known/matrix/client + pathType: Prefix + backend: + service: + name: matrix-wellknown + port: + number: 80 + - path: /.well-known/matrix/server + pathType: Prefix + backend: + service: + name: matrix-wellknown + port: + number: 80 +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: matrix-wellknown-bstein-dev + annotations: + kubernetes.io/ingress.class: traefik + traefik.ingress.kubernetes.io/router.entrypoints: websecure + traefik.ingress.kubernetes.io/router.tls: "true" + cert-manager.io/cluster-issuer: letsencrypt +spec: + tls: + - hosts: + - bstein.dev + secretName: bstein-dev-home-tls + rules: + - host: bstein.dev + http: + paths: + - path: /.well-known/matrix/client + pathType: Prefix + backend: + service: + name: matrix-wellknown + port: + number: 80 + - path: /.well-known/matrix/server + pathType: Prefix + backend: + service: + name: matrix-wellknown + port: + number: 80 diff --git a/services/crypto/xmr-miner/configmap-sources.yaml b/services/crypto/xmr-miner/configmap-sources.yaml index b7c7bbc..5335001 100644 --- 
a/services/crypto/xmr-miner/configmap-sources.yaml +++ b/services/crypto/xmr-miner/configmap-sources.yaml @@ -6,7 +6,7 @@ metadata: namespace: crypto data: # REQUIRED: set to the official p2pool ARM64 tarball URL - P2POOL_URL: "https://downloads.sourceforge.net/project/p2pool-xmr/Release/p2pool-v4.8.1-linux-aarch64.tar.gz" + P2POOL_URL: "https://github.com/SChernykh/p2pool/releases/download/v4.9/p2pool-v4.9-linux-aarch64.tar.gz" # OPTIONAL: p2pool SHA256 (exact 64-hex chars). Leave blank to skip verification. P2POOL_SHA256: "" diff --git a/services/crypto/xmr-miner/deployment.yaml b/services/crypto/xmr-miner/deployment.yaml index dc24828..efc00ca 100644 --- a/services/crypto/xmr-miner/deployment.yaml +++ b/services/crypto/xmr-miner/deployment.yaml @@ -30,11 +30,38 @@ spec: - key: hardware operator: In values: ["rpi4"] + initContainers: + - name: fetch-p2pool + image: alpine:3.20 + command: ["/bin/sh","-c"] + args: + - | + set -euxo pipefail + apk add --no-cache curl tar openssl >/dev/null + test -n "${P2POOL_URL}" + echo "Downloading ${P2POOL_URL}" + curl -fsSL "${P2POOL_URL}" -o /tmp/p2pool.tgz + if [ -n "${P2POOL_SHA256}" ]; then + echo "${P2POOL_SHA256} /tmp/p2pool.tgz" | sha256sum -c - + fi + mkdir -p /opt/p2pool + tar -xzf /tmp/p2pool.tgz -C /opt/p2pool + ls -l /opt/p2pool + BIN="$(find /opt/p2pool -maxdepth 2 -type f -name 'p2pool*' | head -n1)" + test -n "${BIN}" + install -m0755 "${BIN}" /opt/p2pool/p2pool + env: + - name: P2POOL_URL + valueFrom: { configMapKeyRef: { name: xmr-miner-sources, key: P2POOL_URL } } + - name: P2POOL_SHA256 + valueFrom: { configMapKeyRef: { name: xmr-miner-sources, key: P2POOL_SHA256, optional: true } } + volumeMounts: + - { name: p2pool-bin, mountPath: /opt/p2pool } containers: - name: monero-p2pool - image: registry.bstein.dev/crypto/monero-p2pool:4.9 - imagePullPolicy: Always - command: ["p2pool"] + image: debian:bookworm-slim + imagePullPolicy: IfNotPresent + command: ["/opt/p2pool/p2pool"] args: - "--host" - 
"monerod.crypto.svc.cluster.local" @@ -61,3 +88,8 @@ spec: tcpSocket: { port: 3333 } initialDelaySeconds: 10 periodSeconds: 10 + volumeMounts: + - { name: p2pool-bin, mountPath: /opt/p2pool } + volumes: + - name: p2pool-bin + emptyDir: {} diff --git a/services/gitea/deployment.yaml b/services/gitea/deployment.yaml index d17a007..ed2cd63 100644 --- a/services/gitea/deployment.yaml +++ b/services/gitea/deployment.yaml @@ -125,6 +125,8 @@ spec: value: "true" - name: GITEA__oauth2_client__ENABLE_AUTO_REGISTRATION value: "true" + - name: GITEA__oauth2_client__ACCOUNT_LINKING + value: "auto" - name: GITEA__service__ALLOW_ONLY_EXTERNAL_REGISTRATION value: "true" - name: GITEA__service__DISABLE_REGISTRATION diff --git a/services/jellyfin/deployment.yaml b/services/jellyfin/deployment.yaml index fec0c78..1177a06 100644 --- a/services/jellyfin/deployment.yaml +++ b/services/jellyfin/deployment.yaml @@ -21,8 +21,65 @@ spec: labels: app: jellyfin spec: - nodeSelector: - jellyfin: "true" + # Clean up any lingering OIDC artifacts and strip the injected script tag + initContainers: + - name: strip-oidc + image: docker.io/jellyfin/jellyfin:10.11.5 + securityContext: + runAsUser: 0 + runAsGroup: 0 + command: + - /bin/sh + - -c + - | + set -euxo pipefail + cp -a /jellyfin/jellyfin-web/. /web-root + # remove injected OIDC script tags everywhere just in case + for f in $(find /web-root -type f -name 'index.html'); do + sed -i '/oidc\/inject/d' "$f" + printf '%s\n' "$f" + done + # clean any lingering OIDC plugin artifacts on the config volume + rm -rf "/config/plugins/OIDC Authentication_"* /config/plugins/configurations/JellyfinOIDCPlugin.v2.xml || true + volumeMounts: + - name: web-root + mountPath: /web-root + - name: config + mountPath: /config + # Force all users to authenticate via the LDAP plugin provider by updating the DB on start. + # This keeps Flux enforcement for auth provider drift (e.g., after UI edits). 
+ - name: set-ldap-auth-provider + image: docker.io/library/alpine:3.20 + securityContext: + runAsUser: 0 + runAsGroup: 0 + command: + - /bin/sh + - -c + - | + set -euxo pipefail + apk add --no-cache sqlite + db="/config/data/jellyfin.db" + if [ -f "$db" ]; then + sqlite3 "$db" "UPDATE Users SET AuthenticationProviderId='Jellyfin.Plugin.LDAP_Auth.LdapAuthenticationProviderPlugin', Password=NULL, EnableLocalPassword=0 WHERE AuthenticationProviderId!='Jellyfin.Plugin.LDAP_Auth.LdapAuthenticationProviderPlugin';" + else + echo "db not found at $db, skipping" + fi + volumeMounts: + - name: config + mountPath: /config + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - titan-20 + - titan-21 + - titan-22 + - titan-24 securityContext: runAsUser: 1000 fsGroup: 65532 @@ -31,7 +88,7 @@ spec: runtimeClassName: nvidia containers: - name: jellyfin - image: docker.io/jellyfin/jellyfin:10.10.7 + image: docker.io/jellyfin/jellyfin:10.11.5 imagePullPolicy: IfNotPresent ports: - name: http @@ -49,24 +106,45 @@ spec: value: "002" resources: limits: - nvidia.com/gpu: 1 + nvidia.com/gpu.shared: 1 # cpu: "4" # memory: 8Gi requests: - nvidia.com/gpu: 1 + nvidia.com/gpu.shared: 1 cpu: "500m" memory: 1Gi volumeMounts: - name: config mountPath: /config + # Override LDAP plugin configuration from a secret to avoid embedding credentials in the PVC. 
+ - name: ldap-config + mountPath: /config/plugins/configurations/LDAP-Auth.xml + subPath: ldap-config.xml - name: cache mountPath: /cache - name: media mountPath: /media + - name: web-root + mountPath: /jellyfin/jellyfin-web + lifecycle: + postStart: + exec: + command: + - /bin/sh + - -c + - | + set -eux + for f in $(find /jellyfin/jellyfin-web -type f -name 'index.html'); do + sed -i '/oidc\/inject/d' "$f" || true + done securityContext: + runAsUser: 0 + runAsGroup: 0 allowPrivilegeEscalation: false readOnlyRootFilesystem: false volumes: + - name: web-root + emptyDir: {} - name: config persistentVolumeClaim: claimName: jellyfin-config-astreae @@ -76,3 +154,9 @@ spec: - name: media persistentVolumeClaim: claimName: jellyfin-media-asteria-new + - name: ldap-config + secret: + secretName: jellyfin-ldap-config + items: + - key: ldap-config.xml + path: ldap-config.xml diff --git a/services/jellyfin/oidc/Jenkinsfile b/services/jellyfin/oidc/Jenkinsfile new file mode 100644 index 0000000..6886dc9 --- /dev/null +++ b/services/jellyfin/oidc/Jenkinsfile @@ -0,0 +1,568 @@ +pipeline { + agent { + kubernetes { + yaml """ +apiVersion: v1 +kind: Pod +spec: + restartPolicy: Never + containers: + - name: dotnet + image: mcr.microsoft.com/dotnet/sdk:9.0 + command: + - cat + tty: true +""" + } + } + options { + timestamps() + } + parameters { + string(name: 'HARBOR_REPO', defaultValue: 'registry.bstein.dev/streaming/oidc-plugin', description: 'OCI repository for the plugin artifact') + string(name: 'JELLYFIN_VERSION', defaultValue: '10.11.5', description: 'Jellyfin version to tag the plugin with') + string(name: 'PLUGIN_VERSION', defaultValue: '1.0.2.0', description: 'Plugin version') + } + environment { + ORAS_VERSION = "1.2.0" + DOTNET_CLI_TELEMETRY_OPTOUT = "1" + DOTNET_SKIP_FIRST_TIME_EXPERIENCE = "1" + } + stages { + stage('Checkout') { + steps { + container('dotnet') { + checkout scm + } + } + } + stage('Build plugin') { + steps { + container('dotnet') { + sh ''' + set -euo 
pipefail + apt-get update + apt-get install -y --no-install-recommends zip curl ca-certificates git + WORKDIR="$(pwd)/build" + SRC_DIR="${WORKDIR}/src" + DIST_DIR="${WORKDIR}/dist" + ART_DIR="${WORKDIR}/artifact" + rm -rf "${SRC_DIR}" "${DIST_DIR}" "${ART_DIR}" + mkdir -p "${SRC_DIR}" "${DIST_DIR}" "${ART_DIR}" + git clone https://github.com/lolerskatez/JellyfinOIDCPlugin.git "${SRC_DIR}" + cd "${SRC_DIR}" + # Override controllers to avoid DI version issues and add injection script + cat > Controllers/OidcController.cs <<'EOF' +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using IdentityModel.OidcClient; +using MediaBrowser.Controller.Library; +using Microsoft.AspNetCore.Mvc; +using Microsoft.Extensions.DependencyInjection; + +namespace JellyfinOIDCPlugin.Controllers; + +#nullable enable + +[ApiController] +[Route("api/oidc")] +public class OidcController : ControllerBase +{ + private IUserManager UserManager => HttpContext.RequestServices.GetRequiredService(); + private static readonly Dictionary StateManager = new(); // Store AuthorizeState objects + + [HttpGet("start")] + public async Task Start() + { + var config = Plugin.Instance?.Configuration; + if (config == null) + { + return BadRequest("Plugin not initialized"); + } + + var options = new OidcClientOptions + { + Authority = config.OidEndpoint?.Trim(), + ClientId = config.OidClientId?.Trim(), + ClientSecret = config.OidSecret?.Trim(), + RedirectUri = GetRedirectUri(), + Scope = string.Join(" ", config.OidScopes) + }; + + try + { + var client = new OidcClient(options); + var result = await client.PrepareLoginAsync().ConfigureAwait(false); + + // Store the authorize state for the callback + var stateString = (string)result.GetType().GetProperty("State")?.GetValue(result); + if (!string.IsNullOrEmpty(stateString)) + { + StateManager[stateString] = result; + } + + var startUrl = (string)result.GetType().GetProperty("StartUrl")?.GetValue(result); + if 
(string.IsNullOrEmpty(startUrl)) + { + Console.WriteLine("OIDC: Could not get StartUrl from OIDC result"); + return BadRequest("OIDC initialization failed"); + } + + return Redirect(startUrl); + } + catch (Exception ex) + { + Console.WriteLine($"OIDC start error: {ex}"); + return BadRequest("OIDC error: " + ex.Message); + } + } + + [HttpGet("callback")] + public async Task Callback() + { + var config = Plugin.Instance?.Configuration; + if (config == null) + { + return BadRequest("Plugin not initialized"); + } + + try + { + var stateParam = Request.Query["state"].ToString(); + if (string.IsNullOrEmpty(stateParam) || !StateManager.TryGetValue(stateParam, out var storedState)) + { + Console.WriteLine($"OIDC: Invalid state {stateParam}"); + return BadRequest("Invalid state"); + } + + var options = new OidcClientOptions + { + Authority = config.OidEndpoint?.Trim(), + ClientId = config.OidClientId?.Trim(), + ClientSecret = config.OidSecret?.Trim(), + RedirectUri = GetRedirectUri(), + Scope = string.Join(" ", config.OidScopes) + }; + + var client = new OidcClient(options); + // Cast stored state to AuthorizeState - it's stored as object + var authorizeState = (AuthorizeState)storedState; + var result = await client.ProcessResponseAsync(Request.QueryString.Value, authorizeState).ConfigureAwait(false); + + if (result.IsError) + { + Console.WriteLine($"OIDC callback failed: {result.Error} - {result.ErrorDescription}"); + return BadRequest("OIDC authentication failed"); + } + + // Get email from claims + var email = result.User?.FindFirst("email")?.Value ?? + result.User?.FindFirst("preferred_username")?.Value ?? 
+ result.User?.FindFirst("sub")?.Value; + + if (string.IsNullOrEmpty(email)) + { + Console.WriteLine("OIDC: No email/username found in OIDC response"); + return BadRequest("No email/username found in OIDC response"); + } + + // Get or create user + var user = UserManager.GetUserByName(email); + if (user == null) + { + Console.WriteLine($"OIDC: Creating new user {email}"); + user = await UserManager.CreateUserAsync(email).ConfigureAwait(false); + } + + // Set authentication provider + user.AuthenticationProviderId = "OIDC"; + + // Get roles from claims + var rolesClaimValue = result.User?.FindFirst(config.RoleClaim)?.Value; + var roles = string.IsNullOrEmpty(rolesClaimValue) + ? Array.Empty() + : rolesClaimValue.Split(new[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries); + + // Set permissions based on groups + var isAdmin = roles.Any(r => r.Equals("admin", StringComparison.OrdinalIgnoreCase)); + var isPowerUser = roles.Any(r => r.Equals("Power User", StringComparison.OrdinalIgnoreCase)) && !isAdmin; + + Console.WriteLine($"OIDC: User {email} authenticated. 
Admin: {isAdmin}, PowerUser: {isPowerUser}"); + + // Update user in database + await UserManager.UpdateUserAsync(user).ConfigureAwait(false); + + StateManager.Remove(stateParam); + + // Redirect to Jellyfin main page + return Redirect("/"); + } + catch (Exception ex) + { + Console.WriteLine($"OIDC callback error: {ex}"); + return BadRequest("OIDC error: " + ex.Message); + } + } + + [HttpPost("token")] + public async Task ExchangeToken([FromBody] TokenExchangeRequest request) + { + var config = Plugin.Instance?.Configuration; + if (config == null) + { + Console.WriteLine("OIDC: Plugin not initialized"); + return BadRequest("Plugin not initialized"); + } + + if (string.IsNullOrEmpty(request?.AccessToken)) + { + Console.WriteLine("OIDC: No access token provided"); + return BadRequest("Access token is required"); + } + + try + { + Console.WriteLine("OIDC: Processing token exchange request"); + + // Validate the token with the OIDC provider using UserInfo endpoint + var options = new OidcClientOptions + { + Authority = config.OidEndpoint?.Trim(), + ClientId = config.OidClientId?.Trim(), + ClientSecret = config.OidSecret?.Trim(), + Scope = string.Join(" ", config.OidScopes) + }; + + var client = new OidcClient(options); + + // Use the access token to get user info + var userInfoResult = await client.GetUserInfoAsync(request.AccessToken).ConfigureAwait(false); + + if (userInfoResult.IsError) + { + Console.WriteLine($"OIDC: Failed to get user info: {userInfoResult.Error}"); + return Unauthorized("Invalid access token"); + } + + // Extract email/username from user info + var email = userInfoResult.Claims.FirstOrDefault(c => c.Type == "email")?.Value ?? + userInfoResult.Claims.FirstOrDefault(c => c.Type == "preferred_username")?.Value ?? 
+ userInfoResult.Claims.FirstOrDefault(c => c.Type == "sub")?.Value; + + if (string.IsNullOrEmpty(email)) + { + Console.WriteLine("OIDC: No email/username found in token"); + return BadRequest("No email/username found in token"); + } + + // Get or create user + var user = UserManager.GetUserByName(email); + if (user == null) + { + if (!config.AutoCreateUser) + { + Console.WriteLine($"OIDC: User {email} not found and auto-create disabled"); + return Unauthorized("User does not exist and auto-creation is disabled"); + } + + Console.WriteLine($"OIDC: Creating new user from token {email}"); + user = await UserManager.CreateUserAsync(email).ConfigureAwait(false); + } + + // Update user authentication provider + user.AuthenticationProviderId = "OIDC"; + + // Get roles from claims + var rolesClaimName = config.RoleClaim ?? "groups"; + var rolesClaimValue = userInfoResult.Claims.FirstOrDefault(c => c.Type == rolesClaimName)?.Value; + var roles = string.IsNullOrEmpty(rolesClaimValue) + ? Array.Empty() + : rolesClaimValue.Split(new[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries); + + // Set permissions based on groups + var isAdmin = roles.Any(r => r.Equals("admin", StringComparison.OrdinalIgnoreCase)); + var isPowerUser = roles.Any(r => r.Equals("Power User", StringComparison.OrdinalIgnoreCase)) && !isAdmin; + + Console.WriteLine($"OIDC: Token exchange for {email} Admin:{isAdmin} Power:{isPowerUser}"); + + // Update user in database + await UserManager.UpdateUserAsync(user).ConfigureAwait(false); + + // Return success with user info + return Ok(new TokenExchangeResponse + { + Success = true, + UserId = user.Id.ToString(), + Username = user.Username, + Email = email, + IsAdmin = isAdmin, + Message = "User authenticated successfully" + }); + } + catch (Exception ex) + { + Console.WriteLine($"OIDC token exchange error: {ex}"); + return StatusCode(500, $"Token exchange failed: {ex.Message}"); + } + } + + private string GetRedirectUri() + { + var configured = 
Plugin.Instance?.Configuration?.RedirectUri; + if (!string.IsNullOrWhiteSpace(configured)) + { + return configured!; + } + + return $"{Request.Scheme}://{Request.Host}/api/oidc/callback"; + } +} + +public class TokenExchangeRequest +{ + public string? AccessToken { get; set; } + public string? IdToken { get; set; } +} + +public class TokenExchangeResponse +{ + public bool Success { get; set; } + public string? UserId { get; set; } + public string? Username { get; set; } + public string? Email { get; set; } + public bool IsAdmin { get; set; } + public string? Message { get; set; } +} +EOF + + cat > Controllers/OidcStaticController.cs <<'EOF' +using System; +using System.IO; +using System.Reflection; +using MediaBrowser.Common.Plugins; +using Microsoft.AspNetCore.Mvc; + +namespace JellyfinOIDCPlugin.Controllers; + +[ApiController] +[Route("api/oidc")] +public class OidcStaticController : ControllerBase +{ + [HttpGet("login.js")] + public IActionResult GetLoginScript() + { + try + { + var assembly = Assembly.GetExecutingAssembly(); + using var stream = assembly.GetManifestResourceStream("JellyfinOIDCPlugin.web.oidc-login.js"); + if (stream == null) + { + Console.WriteLine("OIDC: Login script resource not found"); + return NotFound(); + } + + using var reader = new StreamReader(stream); + var content = reader.ReadToEnd(); + + return Content(content, "application/javascript"); + } + catch (Exception ex) + { + Console.WriteLine($"OIDC: Error serving login script {ex}"); + return StatusCode(500, "Error loading login script"); + } + } + + [HttpGet("loader.js")] + public IActionResult GetLoader() + { + try + { + var assembly = Assembly.GetExecutingAssembly(); + using var stream = assembly.GetManifestResourceStream("JellyfinOIDCPlugin.web.oidc-loader.js"); + if (stream == null) + { + Console.WriteLine("OIDC: Loader script resource not found"); + return NotFound(); + } + + using var reader = new StreamReader(stream); + var content = reader.ReadToEnd(); + + return 
Content(content, "application/javascript"); + } + catch (Exception ex) + { + Console.WriteLine($"OIDC: Error serving loader script {ex}"); + return StatusCode(500, "Error loading loader script"); + } + } + + [HttpGet("inject")] + public IActionResult GetInject() + { + try + { + var script = @" +(function() { + console.log('[OIDC Plugin] Bootstrap inject started'); + + // Load oidc-loader.js dynamically + const loaderScript = document.createElement('script'); + loaderScript.src = '/api/oidc/loader.js'; + loaderScript.type = 'application/javascript'; + loaderScript.onerror = function() { + console.error('[OIDC Plugin] Failed to load loader.js'); + }; + loaderScript.onload = function() { + console.log('[OIDC Plugin] Loader.js loaded successfully'); + }; + + // Append to head or body + const target = document.head || document.documentElement; + target.appendChild(loaderScript); + + console.log('[OIDC Plugin] Bootstrap script appended to page'); +})(); +"; + return Content(script, "application/javascript"); + } + catch (Exception ex) + { + Console.WriteLine($"OIDC: Error serving inject script {ex}"); + return StatusCode(500, "Error loading inject script"); + } + } + + [HttpGet("global.js")] + public IActionResult GetGlobalInjector() + { + try + { + var assembly = Assembly.GetExecutingAssembly(); + using var stream = assembly.GetManifestResourceStream("JellyfinOIDCPlugin.web.oidc-global-injector.js"); + if (stream == null) + { + Console.WriteLine("OIDC: Global injector resource not found"); + return NotFound(); + } + + using var reader = new StreamReader(stream); + var content = reader.ReadToEnd(); + + return Content(content, "application/javascript"); + } + catch (Exception ex) + { + Console.WriteLine($"OIDC: Error serving global injector {ex}"); + return StatusCode(500, "Error loading global injector"); + } + } + + [HttpGet("config")] + public IActionResult GetConfigurationPage() + { + try + { + var assembly = Assembly.GetExecutingAssembly(); + using var stream = 
assembly.GetManifestResourceStream("JellyfinOIDCPlugin.web.configurationpage.html"); + if (stream == null) + { + Console.WriteLine("OIDC: Configuration page resource not found"); + return NotFound("Configuration page resource not found"); + } + + using var reader = new StreamReader(stream); + var content = reader.ReadToEnd(); + + return Content(content, "text/html"); + } + catch (Exception ex) + { + Console.WriteLine($"OIDC: Error serving configuration page {ex}"); + return StatusCode(500, $"Error loading configuration page: {ex.Message}"); + } + } +} +EOF + cat > JellyfinOIDCPlugin.csproj <<'EOF' + + + net9.0 + JellyfinOIDCPlugin.v2 + JellyfinOIDCPlugin + latest + enable + enable + 1.0.2.0 + 1.0.2.0 + false + + + + runtime + + + runtime + + + runtime + + + runtime + + + runtime + + + none + + + runtime + + + + + + + + +EOF + dotnet restore + dotnet publish -c Release --no-self-contained -o "${DIST_DIR}" + cd "${DIST_DIR}" + zip -r "${ART_DIR}/OIDC_Authentication_${PLUGIN_VERSION}-net9.zip" . 
+ ''' + } + } + } + stage('Push to Harbor') { + steps { + container('dotnet') { + withCredentials([usernamePassword(credentialsId: 'harbor-robot', usernameVariable: 'HARBOR_USERNAME', passwordVariable: 'HARBOR_PASSWORD')]) { + sh ''' + set -euo pipefail + WORKDIR="$(pwd)/build" + ORAS_BIN="/usr/local/bin/oras" + curl -sSL "https://github.com/oras-project/oras/releases/download/v${ORAS_VERSION}/oras_${ORAS_VERSION}_linux_amd64.tar.gz" | tar -xz -C /usr/local/bin oras + ref_host="$(echo "${HARBOR_REPO}" | cut -d/ -f1)" + "${ORAS_BIN}" login "${ref_host}" -u "${HARBOR_USERNAME}" -p "${HARBOR_PASSWORD}" + artifact="${WORKDIR}/artifact/OIDC_Authentication_${PLUGIN_VERSION}-net9.zip" + "${ORAS_BIN}" push "${HARBOR_REPO}:${JELLYFIN_VERSION}" "${artifact}:application/zip" --artifact-type application/zip + "${ORAS_BIN}" push "${HARBOR_REPO}:latest" "${artifact}:application/zip" --artifact-type application/zip + ''' + } + } + } + } + } + post { + always { + container('dotnet') { + archiveArtifacts artifacts: 'build/artifact/*.zip', allowEmptyArchive: true + } + } + } +} diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml new file mode 100644 index 0000000..2c188db --- /dev/null +++ b/services/jenkins/configmap-jcasc.yaml @@ -0,0 +1,200 @@ +# services/jenkins/configmap-jcasc.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: jenkins-jcasc + namespace: jenkins +data: + securityrealm.yaml: | + jenkins: + securityRealm: + oic: + clientId: "${OIDC_CLIENT_ID}" + clientSecret: "${OIDC_CLIENT_SECRET}" + serverConfiguration: + wellKnown: + wellKnownOpenIDConfigurationUrl: "${OIDC_ISSUER}/.well-known/openid-configuration" + scopesOverride: "openid profile email" + logoutFromOpenIdProvider: true + postLogoutRedirectUrl: "https://ci.bstein.dev" + sendScopesInTokenRequest: true + rootURLFromRequest: true + userNameField: "preferred_username" + fullNameFieldName: "name" + emailFieldName: "email" + groupsFieldName: "groups" + authorization.yaml: 
| + jenkins: + authorizationStrategy: + loggedInUsersCanDoAnything: + allowAnonymousRead: false + creds.yaml: | + credentials: + system: + domainCredentials: + - credentials: + - usernamePassword: + scope: GLOBAL + id: gitea-pat + username: "${GITEA_PAT_USERNAME}" + password: "${GITEA_PAT_TOKEN}" + description: "Gitea PAT for pipelines" + - usernamePassword: + scope: GLOBAL + id: harbor-robot + username: "${HARBOR_ROBOT_USERNAME}" + password: "${HARBOR_ROBOT_PASSWORD}" + description: "Harbor robot for pipelines" + jobs.yaml: | + jobs: + - script: | + pipelineJob('harbor-arm-build') { + triggers { + scm('H/5 * * * *') + } + definition { + cpsScm { + scm { + git { + remote { + url('https://scm.bstein.dev/bstein/harbor-arm-build.git') + credentials('gitea-pat') + } + branches('*/master') + } + } + } + } + } + pipelineJob('jellyfin-oidc-plugin') { + definition { + cpsScm { + scm { + git { + remote { + url('https://scm.bstein.dev/bstein/titan-iac.git') + credentials('gitea-pat') + } + branches('*/main') + } + } + scriptPath('services/jellyfin/oidc/Jenkinsfile') + } + } + } + pipelineJob('ci-demo') { + triggers { + scm('H/1 * * * *') + } + definition { + cpsScm { + scm { + git { + remote { + url('https://scm.bstein.dev/bstein/ci-demo.git') + credentials('gitea-pat') + } + branches('*/master') + } + } + scriptPath('Jenkinsfile') + } + } + } + pipelineJob('bstein-dev-home') { + triggers { + scm('H/2 * * * *') + } + definition { + cpsScm { + scm { + git { + remote { + url('https://scm.bstein.dev/bstein/bstein-dev-home.git') + credentials('gitea-pat') + } + branches('*/master') + } + } + scriptPath('Jenkinsfile') + } + } + } + pipelineJob('data-prepper') { + triggers { + scm('H/5 * * * *') + } + definition { + cpsScm { + scm { + git { + remote { + url('https://scm.bstein.dev/bstein/titan-iac.git') + credentials('gitea-pat') + } + branches('*/feature/sso-hardening') + } + } + scriptPath('services/logging/Jenkinsfile.data-prepper') + } + } + } + base.yaml: | + jenkins: + 
disableRememberMe: false + mode: NORMAL + numExecutors: 0 + labelString: "" + projectNamingStrategy: "standard" + markupFormatter: + plainText + clouds: + - kubernetes: + containerCapStr: "10" + connectTimeout: "5" + readTimeout: "15" + jenkinsUrl: "http://jenkins.jenkins.svc.cluster.local:8080" + jenkinsTunnel: "jenkins.jenkins.svc.cluster.local:50000" + skipTlsVerify: false + maxRequestsPerHostStr: "32" + retentionTimeout: "5" + waitForPodSec: "600" + name: "kubernetes" + namespace: "jenkins" + restrictedPssSecurityContext: false + serverUrl: "https://kubernetes.default" + credentialsId: "" + podLabels: + - key: "jenkins/jenkins-jenkins-agent" + value: "true" + templates: + - name: "default" + namespace: "jenkins" + containers: + - name: "jnlp" + args: "^${computer.jnlpmac} ^${computer.name}" + envVars: + - envVar: + key: "JENKINS_URL" + value: "http://jenkins.jenkins.svc.cluster.local:8080/" + image: "jenkins/inbound-agent:3355.v388858a_47b_33-3" + privileged: "false" + resourceLimitCpu: 512m + resourceLimitMemory: 512Mi + resourceRequestCpu: 512m + resourceRequestMemory: 512Mi + ttyEnabled: false + workingDir: /home/jenkins/agent + idleMinutes: 0 + instanceCap: 2147483647 + label: "jenkins-jenkins-agent " + nodeUsageMode: "NORMAL" + podRetention: Never + serviceAccount: "jenkins" + slaveConnectTimeoutStr: "100" + yamlMergeStrategy: override + inheritYamlMergeStrategy: false + slaveAgentPort: 50000 + crumbIssuer: + standard: + excludeClientIPFromCrumb: true diff --git a/services/jenkins/configmap-plugins.yaml b/services/jenkins/configmap-plugins.yaml new file mode 100644 index 0000000..eabea13 --- /dev/null +++ b/services/jenkins/configmap-plugins.yaml @@ -0,0 +1,17 @@ +# services/jenkins/configmap-plugins.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: jenkins-plugins + namespace: jenkins +data: + plugins.txt: | + kubernetes + workflow-aggregator + git + pipeline-utility-steps + configuration-as-code + configuration-as-code-support + oic-auth + job-dsl 
+ simple-theme-plugin diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml new file mode 100644 index 0000000..ec749e8 --- /dev/null +++ b/services/jenkins/deployment.yaml @@ -0,0 +1,195 @@ +# services/jenkins/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: jenkins + namespace: jenkins + labels: + app: jenkins +spec: + replicas: 1 + selector: + matchLabels: + app: jenkins + strategy: + type: Recreate + template: + metadata: + labels: + app: jenkins + spec: + serviceAccountName: jenkins + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 90 + preference: + matchExpressions: + - key: hardware + operator: In + values: ["rpi5"] + - weight: 50 + preference: + matchExpressions: + - key: hardware + operator: In + values: ["rpi4"] + hostAliases: + - ip: 38.28.125.112 + hostnames: + - sso.bstein.dev + securityContext: + fsGroup: 1000 + initContainers: + - name: install-plugins + image: jenkins/jenkins:2.528.3-jdk21 + imagePullPolicy: IfNotPresent + command: + - sh + - -c + - | + set -euo pipefail + jenkins-plugin-cli --plugin-file /plugins/plugins.txt + volumeMounts: + - name: plugins + mountPath: /plugins/plugins.txt + subPath: plugins.txt + - name: plugin-dir + mountPath: /usr/share/jenkins/ref/plugins + containers: + - name: jenkins + image: jenkins/jenkins:2.528.3-jdk21 + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 8080 + - name: agent-listener + containerPort: 50000 + env: + - name: JAVA_OPTS + value: "-Xms512m -Xmx2048m" + - name: JENKINS_OPTS + value: "--webroot=/var/jenkins_cache/war" + - name: JENKINS_SLAVE_AGENT_PORT + value: "50000" + - name: CASC_JENKINS_CONFIG + value: /config/jcasc + - name: ENABLE_OIDC + value: "true" + - name: OIDC_ISSUER + value: "https://sso.bstein.dev/realms/atlas" + - name: OIDC_CLIENT_ID + valueFrom: + secretKeyRef: + name: 
jenkins-oidc + key: clientId + - name: OIDC_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: jenkins-oidc + key: clientSecret + - name: OIDC_AUTH_URL + valueFrom: + secretKeyRef: + name: jenkins-oidc + key: authorizationUrl + - name: OIDC_TOKEN_URL + valueFrom: + secretKeyRef: + name: jenkins-oidc + key: tokenUrl + - name: OIDC_USERINFO_URL + valueFrom: + secretKeyRef: + name: jenkins-oidc + key: userInfoUrl + - name: OIDC_LOGOUT_URL + valueFrom: + secretKeyRef: + name: jenkins-oidc + key: logoutUrl + - name: HARBOR_ROBOT_USERNAME + valueFrom: + secretKeyRef: + name: harbor-robot-creds + key: username + - name: HARBOR_ROBOT_PASSWORD + valueFrom: + secretKeyRef: + name: harbor-robot-creds + key: password + - name: GITEA_PAT_USERNAME + valueFrom: + secretKeyRef: + name: gitea-pat + key: username + - name: GITEA_PAT_TOKEN + valueFrom: + secretKeyRef: + name: gitea-pat + key: token + resources: + requests: + cpu: 750m + memory: 1536Mi + limits: + cpu: 1500m + memory: 3Gi + livenessProbe: + httpGet: + path: /login + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 5 + readinessProbe: + httpGet: + path: /login + port: http + initialDelaySeconds: 20 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + startupProbe: + httpGet: + path: /login + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + failureThreshold: 20 + volumeMounts: + - name: jenkins-home + mountPath: /var/jenkins_home + - name: jenkins-cache + mountPath: /var/jenkins_cache + - name: jcasc + mountPath: /config/jcasc + - name: init-scripts + mountPath: /usr/share/jenkins/ref/init.groovy.d + - name: plugin-dir + mountPath: /usr/share/jenkins/ref/plugins + - name: tmp + mountPath: /tmp + volumes: + - name: jenkins-home + persistentVolumeClaim: + claimName: jenkins + - name: jenkins-cache + emptyDir: {} + - name: plugin-dir + emptyDir: {} + - name: plugins + configMap: + name: jenkins-plugins + - name: jcasc + configMap: + name: jenkins-jcasc + 
- name: init-scripts + configMap: + name: jenkins-init-scripts + - name: tmp + emptyDir: {} diff --git a/services/jenkins/helmrelease.yaml b/services/jenkins/helmrelease.yaml deleted file mode 100644 index 4cdede0..0000000 --- a/services/jenkins/helmrelease.yaml +++ /dev/null @@ -1,373 +0,0 @@ -# services/jenkins/helmrelease.yaml -apiVersion: helm.toolkit.fluxcd.io/v2 -kind: HelmRelease -metadata: - name: jenkins - namespace: jenkins -spec: - interval: 30m - chart: - spec: - chart: jenkins - version: 5.8.114 - sourceRef: - kind: HelmRepository - name: jenkins - namespace: flux-system - install: - timeout: 15m - remediation: - retries: 3 - upgrade: - timeout: 15m - remediation: - retries: 3 - remediateLastFailure: true - cleanupOnFail: true - rollback: - timeout: 15m - values: - controller: - nodeSelector: - kubernetes.io/arch: arm64 - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: kubernetes.io/arch - operator: In - values: ["arm64"] - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 90 - preference: - matchExpressions: - - key: hardware - operator: In - values: ["rpi5"] - - weight: 50 - preference: - matchExpressions: - - key: hardware - operator: In - values: ["rpi4"] - resources: - requests: - cpu: 750m - memory: 1.5Gi - limits: - cpu: 1500m - memory: 3Gi - javaOpts: "-Xms512m -Xmx2048m" - startupProbe: - httpGet: - path: /login - port: http - initialDelaySeconds: 30 - periodSeconds: 10 - timeoutSeconds: 5 - failureThreshold: 20 - jenkinsUrl: https://ci.bstein.dev - ingress: - enabled: true - hostName: ci.bstein.dev - ingressClassName: traefik - annotations: - cert-manager.io/cluster-issuer: letsencrypt - traefik.ingress.kubernetes.io/router.entrypoints: websecure - tls: - - secretName: jenkins-tls - hosts: - - ci.bstein.dev - hostAliases: - - ip: 38.28.125.112 - hostnames: - - sso.bstein.dev - installPlugins: - - kubernetes - - workflow-aggregator - - git - - 
pipeline-utility-steps - - configuration-as-code - - oic-auth - - job-dsl - - configuration-as-code-support - - simple-theme-plugin - containerEnv: - - name: ENABLE_OIDC - value: "true" - - name: OIDC_ISSUER - value: "https://sso.bstein.dev/realms/atlas" - - name: OIDC_CLIENT_ID - valueFrom: - secretKeyRef: - name: jenkins-oidc - key: clientId - - name: OIDC_CLIENT_SECRET - valueFrom: - secretKeyRef: - name: jenkins-oidc - key: clientSecret - - name: OIDC_AUTH_URL - valueFrom: - secretKeyRef: - name: jenkins-oidc - key: authorizationUrl - - name: OIDC_TOKEN_URL - valueFrom: - secretKeyRef: - name: jenkins-oidc - key: tokenUrl - - name: OIDC_USERINFO_URL - valueFrom: - secretKeyRef: - name: jenkins-oidc - key: userInfoUrl - - name: OIDC_LOGOUT_URL - valueFrom: - secretKeyRef: - name: jenkins-oidc - key: logoutUrl - - name: GITEA_PAT_USERNAME - valueFrom: - secretKeyRef: - name: gitea-pat - key: username - - name: GITEA_PAT_TOKEN - valueFrom: - secretKeyRef: - name: gitea-pat - key: token - customInitContainers: - - name: clean-jcasc-stale - image: alpine:3.20 - imagePullPolicy: IfNotPresent - command: - - sh - - -c - - | - set -euo pipefail - rm -f /var/jenkins_home/casc_configs/* || true - securityContext: - runAsNonRoot: true - runAsUser: 1000 - runAsGroup: 1000 - volumeMounts: - - name: jenkins-home - mountPath: /var/jenkins_home - initScripts: - oidc.groovy: | - import hudson.util.Secret - import jenkins.model.IdStrategy - import jenkins.model.Jenkins - import org.jenkinsci.plugins.oic.OicSecurityRealm - import org.jenkinsci.plugins.oic.OicServerWellKnownConfiguration - import hudson.security.FullControlOnceLoggedInAuthorizationStrategy - def env = System.getenv() - if (!(env['ENABLE_OIDC'] ?: 'false').toBoolean()) { - println("OIDC disabled (ENABLE_OIDC=false); keeping default security realm") - return - } - def required = ['OIDC_CLIENT_ID','OIDC_CLIENT_SECRET','OIDC_ISSUER'] - if (!required.every { env[it] }) { - throw new IllegalStateException("OIDC enabled 
but missing vars: ${required.findAll { !env[it] }}") - } - try { - def wellKnown = "${env['OIDC_ISSUER']}/.well-known/openid-configuration" - def serverCfg = new OicServerWellKnownConfiguration(wellKnown) - serverCfg.setScopesOverride('openid profile email') - def realm = new OicSecurityRealm( - env['OIDC_CLIENT_ID'], - Secret.fromString(env['OIDC_CLIENT_SECRET']), - serverCfg, - false, - IdStrategy.CASE_INSENSITIVE, - IdStrategy.CASE_INSENSITIVE - ) - realm.createProxyAwareResourceRetriver() - realm.setLogoutFromOpenidProvider(true) - realm.setPostLogoutRedirectUrl('https://ci.bstein.dev') - realm.setUserNameField('preferred_username') - realm.setFullNameFieldName('name') - realm.setEmailFieldName('email') - realm.setGroupsFieldName('groups') - realm.setRootURLFromRequest(true) - realm.setSendScopesInTokenRequest(true) - def j = Jenkins.get() - j.setSecurityRealm(realm) - def auth = new FullControlOnceLoggedInAuthorizationStrategy() - auth.setAllowAnonymousRead(false) - j.setAuthorizationStrategy(auth) - j.save() - println("Configured OIDC realm from init script (well-known)") - } catch (Exception e) { - println("Failed to configure OIDC realm: ${e}") - throw e - } - theme.groovy: | - import jenkins.model.Jenkins - import org.codefirst.SimpleThemeDecorator - - def instance = Jenkins.get() - def decorators = instance.getExtensionList(SimpleThemeDecorator.class) - - if (decorators?.size() > 0) { - def theme = decorators[0] - theme.setCssUrl("https://jenkins-contrib-themes.github.io/jenkins-material-theme/dist/material-ocean.css") - theme.setJsUrl("") - theme.setTheme("") - instance.save() - println("Applied simple-theme-plugin dark theme") - } else { - println("simple-theme-plugin not installed; skipping theme configuration") - } - JCasC: - defaultConfig: false - securityRealm: "" - authorizationStrategy: "" - configScripts: - base.yaml: | - jenkins: - disableRememberMe: false - mode: NORMAL - numExecutors: 0 - labelString: "" - projectNamingStrategy: "standard" - 
markupFormatter: - plainText - clouds: - - kubernetes: - containerCapStr: "10" - defaultsProviderTemplate: "" - connectTimeout: "5" - readTimeout: "15" - jenkinsUrl: "http://jenkins.jenkins.svc.cluster.local:8080" - jenkinsTunnel: "jenkins-agent.jenkins.svc.cluster.local:50000" - skipTlsVerify: false - usageRestricted: false - maxRequestsPerHostStr: "32" - retentionTimeout: "5" - waitForPodSec: "600" - name: "kubernetes" - namespace: "jenkins" - restrictedPssSecurityContext: false - serverUrl: "https://kubernetes.default" - credentialsId: "" - podLabels: - - key: "jenkins/jenkins-jenkins-agent" - value: "true" - templates: - - name: "default" - namespace: "jenkins" - id: a23c9bbcd21e360a77d51b426f05bd7b8032d8fdedd6ffb97c436883ce6c5ffa - containers: - - name: "jnlp" - alwaysPullImage: false - args: "^${computer.jnlpmac} ^${computer.name}" - envVars: - - envVar: - key: "JENKINS_URL" - value: "http://jenkins.jenkins.svc.cluster.local:8080/" - image: "jenkins/inbound-agent:3355.v388858a_47b_33-3" - privileged: "false" - resourceLimitCpu: 512m - resourceLimitMemory: 512Mi - resourceRequestCpu: 512m - resourceRequestMemory: 512Mi - ttyEnabled: false - workingDir: /home/jenkins/agent - idleMinutes: 0 - instanceCap: 2147483647 - label: "jenkins-jenkins-agent " - nodeUsageMode: "NORMAL" - podRetention: Never - showRawYaml: true - serviceAccount: "default" - slaveConnectTimeoutStr: "100" - yamlMergeStrategy: override - inheritYamlMergeStrategy: false - slaveAgentPort: 50000 - crumbIssuer: - standard: - excludeClientIPFromCrumb: true - security: - apiToken: - creationOfLegacyTokenEnabled: false - tokenGenerationOnCreationEnabled: false - usageStatisticsEnabled: true - creds.yaml: | - credentials: - system: - domainCredentials: - - credentials: - - usernamePassword: - scope: GLOBAL - id: gitea-pat - username: "${GITEA_PAT_USERNAME}" - password: "${GITEA_PAT_TOKEN}" - description: "Gitea PAT for pipelines" - jobs.yaml: | - jobs: - - script: | - pipelineJob('harbor-arm-build') { 
- triggers { - scm('H/5 * * * *') - } - definition { - cpsScm { - scm { - git { - remote { - url('https://scm.bstein.dev/bstein/harbor-arm-build.git') - credentials('gitea-pat') - } - branches('*/master') - } - } - } - } - } - pipelineJob('ci-demo') { - triggers { - scm('H/1 * * * *') - } - definition { - cpsScm { - scm { - git { - remote { - url('https://scm.bstein.dev/bstein/ci-demo.git') - credentials('gitea-pat') - } - branches('*/master') - } - } - scriptPath('Jenkinsfile') - } - } - } - pipelineJob('bstein-dev-home') { - triggers { - scm('H/2 * * * *') - } - definition { - cpsScm { - scm { - git { - remote { - url('https://scm.bstein.dev/bstein/bstein-dev-home.git') - credentials('gitea-pat') - } - branches('*/master') - } - } - scriptPath('Jenkinsfile') - } - } - } - persistence: - enabled: true - storageClass: astreae - size: 50Gi - serviceAccount: - create: true diff --git a/services/jenkins/ingress.yaml b/services/jenkins/ingress.yaml new file mode 100644 index 0000000..e702c8c --- /dev/null +++ b/services/jenkins/ingress.yaml @@ -0,0 +1,26 @@ +# services/jenkins/ingress.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: jenkins + namespace: jenkins + annotations: + cert-manager.io/cluster-issuer: letsencrypt + traefik.ingress.kubernetes.io/router.entrypoints: websecure +spec: + ingressClassName: traefik + tls: + - secretName: jenkins-tls + hosts: + - ci.bstein.dev + rules: + - host: ci.bstein.dev + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: jenkins + port: + name: http diff --git a/services/jenkins/kustomization.yaml b/services/jenkins/kustomization.yaml index b20b1d3..acb6fb4 100644 --- a/services/jenkins/kustomization.yaml +++ b/services/jenkins/kustomization.yaml @@ -4,4 +4,18 @@ kind: Kustomization namespace: jenkins resources: - namespace.yaml - - helmrelease.yaml + - serviceaccount.yaml + - pvc.yaml + - configmap-jcasc.yaml + - configmap-plugins.yaml + - deployment.yaml + - service.yaml + - 
ingress.yaml + +configMapGenerator: + - name: jenkins-init-scripts + namespace: jenkins + files: + - theme.groovy=scripts/theme.groovy + options: + disableNameSuffixHash: true diff --git a/services/jenkins/pvc.yaml b/services/jenkins/pvc.yaml new file mode 100644 index 0000000..be4a55b --- /dev/null +++ b/services/jenkins/pvc.yaml @@ -0,0 +1,14 @@ +# services/jenkins/pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: jenkins + namespace: jenkins +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + storageClassName: astreae + volumeName: pvc-3e12e869-5b33-4035-93d2-411ebc02ff31 diff --git a/services/jenkins/scripts/theme.groovy b/services/jenkins/scripts/theme.groovy new file mode 100644 index 0000000..cf171f7 --- /dev/null +++ b/services/jenkins/scripts/theme.groovy @@ -0,0 +1,16 @@ +import jenkins.model.Jenkins +import org.codefirst.SimpleThemeDecorator + +def instance = Jenkins.get() +def decorators = instance.getExtensionList(SimpleThemeDecorator.class) + +if (decorators?.size() > 0) { + def theme = decorators[0] + theme.setCssUrl("https://jenkins-contrib-themes.github.io/jenkins-material-theme/dist/material-ocean.css") + theme.setJsUrl("") + theme.setTheme("") + instance.save() + println("Applied simple-theme-plugin dark theme") +} else { + println("simple-theme-plugin not installed; skipping theme configuration") +} diff --git a/services/jenkins/service.yaml b/services/jenkins/service.yaml new file mode 100644 index 0000000..5fba878 --- /dev/null +++ b/services/jenkins/service.yaml @@ -0,0 +1,18 @@ +# services/jenkins/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: jenkins + namespace: jenkins + labels: + app: jenkins +spec: + ports: + - name: http + port: 8080 + targetPort: 8080 + - name: agent-listener + port: 50000 + targetPort: 50000 + selector: + app: jenkins diff --git a/services/jenkins/serviceaccount.yaml b/services/jenkins/serviceaccount.yaml new file mode 100644 index 0000000..27caeed 
--- /dev/null +++ b/services/jenkins/serviceaccount.yaml @@ -0,0 +1,41 @@ +# services/jenkins/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: jenkins + namespace: jenkins + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: jenkins-agent + namespace: jenkins +rules: + - apiGroups: [""] + resources: + - pods + - pods/exec + - pods/log + - pods/portforward + - services + - endpoints + - persistentvolumeclaims + - configmaps + - secrets + verbs: ["get", "list", "watch", "create", "delete"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: jenkins-agent + namespace: jenkins +subjects: + - kind: ServiceAccount + name: jenkins + namespace: jenkins +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: jenkins-agent diff --git a/services/jitsi/deployment.yaml b/services/jitsi/deployment.yaml deleted file mode 100644 index ff81b33..0000000 --- a/services/jitsi/deployment.yaml +++ /dev/null @@ -1,171 +0,0 @@ -# services/jitsi/deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: jitsi-prosody - namespace: jitsi -spec: - replicas: 0 - selector: - matchLabels: { app: jitsi-prosody } - template: - metadata: - labels: { app: jitsi-prosody } - spec: - nodeSelector: - kubernetes.io/hostname: titan-22 - kubernetes.io/arch: amd64 - containers: - - name: prosody - image: jitsi/prosody:stable - ports: - - { name: c2s, containerPort: 5222, protocol: TCP } - - { name: http, containerPort: 5280, protocol: TCP } - - { name: comp, containerPort: 5347, protocol: TCP } - env: - - { name: XMPP_DOMAIN, value: "meet.jitsi" } - - { name: XMPP_AUTH_DOMAIN, value: "auth.meet.jitsi" } - - { name: XMPP_MUC_DOMAIN, value: "muc.meet.jitsi" } - - { name: XMPP_INTERNAL_MUC_DOMAIN, value: "internal-muc.meet.jitsi" } - - { name: ENABLE_AUTH, value: "0" } # open instance, no auth (fastest path) - - { name: ENABLE_GUESTS, value: "1" } - - { name: JICOFO_AUTH_USER, value: "focus" } 
- - { name: JVB_AUTH_USER, value: "jvb" } - - name: JICOFO_AUTH_PASSWORD - valueFrom: { secretKeyRef: { name: jitsi-internal-secrets, key: JICOFO_AUTH_PASSWORD } } - - name: JICOFO_COMPONENT_SECRET - valueFrom: { secretKeyRef: { name: jitsi-internal-secrets, key: JICOFO_COMPONENT_SECRET } } - - name: JVB_AUTH_PASSWORD - valueFrom: { secretKeyRef: { name: jitsi-internal-secrets, key: JVB_AUTH_PASSWORD } } - volumeMounts: - - { name: cfg, mountPath: /config } - volumes: - - name: cfg - persistentVolumeClaim: { claimName: jitsi-prosody-config } - ---- - -apiVersion: apps/v1 -kind: Deployment -metadata: - name: jitsi-jicofo - namespace: jitsi -spec: - replicas: 0 - selector: - matchLabels: { app: jitsi-jicofo } - template: - metadata: - labels: { app: jitsi-jicofo } - spec: - nodeSelector: - kubernetes.io/hostname: titan-22 - kubernetes.io/arch: amd64 - containers: - - name: jicofo - image: jitsi/jicofo:stable - env: - - { name: XMPP_DOMAIN, value: "meet.jitsi" } - - { name: XMPP_AUTH_DOMAIN, value: "auth.meet.jitsi" } - - { name: XMPP_MUC_DOMAIN, value: "muc.meet.jitsi" } - - { name: XMPP_INTERNAL_MUC_DOMAIN, value: "internal-muc.meet.jitsi" } - - { name: XMPP_SERVER, value: "jitsi-prosody.jitsi.svc.cluster.local" } - - { name: JICOFO_AUTH_USER, value: "focus" } - - name: JICOFO_AUTH_PASSWORD - valueFrom: { secretKeyRef: { name: jitsi-internal-secrets, key: JICOFO_AUTH_PASSWORD } } - - name: JICOFO_COMPONENT_SECRET - valueFrom: { secretKeyRef: { name: jitsi-internal-secrets, key: JICOFO_COMPONENT_SECRET } } - - { name: JVB_BREWERY_MUC, value: "jvbbrewery" } - volumeMounts: - - { name: cfg, mountPath: /config } - volumes: - - name: cfg - persistentVolumeClaim: { claimName: jitsi-jicofo-config } - ---- - -apiVersion: apps/v1 -kind: Deployment -metadata: - name: jitsi-jvb - namespace: jitsi -spec: - replicas: 0 - selector: - matchLabels: { app: jitsi-jvb } - template: - metadata: - labels: { app: jitsi-jvb } - spec: - nodeSelector: - kubernetes.io/hostname: titan-22 - 
kubernetes.io/arch: amd64 - containers: - - name: jvb - image: jitsi/jvb:stable - ports: - - { name: colibri-ws, containerPort: 9090, protocol: TCP } # WebSocket control channel - - { name: rtp-udp, containerPort: 10000, hostPort: 10000, protocol: UDP } # media - - { name: rtp-tcp, containerPort: 4443, hostPort: 4443, protocol: TCP } - env: - - { name: XMPP_DOMAIN, value: "meet.jitsi" } - - { name: XMPP_AUTH_DOMAIN, value: "auth.meet.jitsi" } - - { name: XMPP_MUC_DOMAIN, value: "muc.meet.jitsi" } - - { name: XMPP_INTERNAL_MUC_DOMAIN, value: "internal-muc.meet.jitsi" } - - { name: XMPP_SERVER, value: "jitsi-prosody.jitsi.svc.cluster.local" } - - { name: JVB_AUTH_USER, value: "jvb" } - - name: JVB_AUTH_PASSWORD - valueFrom: { secretKeyRef: { name: jitsi-internal-secrets, key: JVB_AUTH_PASSWORD } } - - { name: JVB_BREWERY_MUC, value: "jvbbrewery" } - - { name: JVB_PORT, value: "10000" } # matches hostPort above - - { name: ENABLE_COLIBRI_WEBSOCKET, value: "1" } # enables /colibri-ws - # - { name: JVB_STUN_SERVERS, value: "stun.l.google.com:19302,stun1.l.google.com:19302,meet-jit-si-turnrelay.jitsi.net:443" } - - { name: JVB_ENABLE_APIS, value: "rest,colibri" } - - { name: JVB_WS_DOMAIN, value: "meet.bstein.dev:443" } - - { name: JVB_WS_TLS, value: "true" } - - { name: JVB_ADVERTISE_IPS, value: "38.28.125.112" } - - { name: JVB_TCP_HARVESTER_DISABLED, value: "false" } - - { name: JVB_TCP_PORT, value: "4443" } - volumeMounts: - - { name: cfg, mountPath: /config } - volumes: - - name: cfg - persistentVolumeClaim: { claimName: jitsi-jvb-config } - ---- - -apiVersion: apps/v1 -kind: Deployment -metadata: - name: jitsi-web - namespace: jitsi -spec: - replicas: 0 - selector: - matchLabels: { app: jitsi-web } - template: - metadata: - labels: { app: jitsi-web } - spec: - nodeSelector: - kubernetes.io/hostname: titan-22 - kubernetes.io/arch: amd64 - containers: - - name: web - image: jitsi/web:stable - ports: - - { name: http, containerPort: 80, protocol: TCP } - env: - - { 
name: PUBLIC_URL, value: "https://meet.bstein.dev" } - - { name: XMPP_DOMAIN, value: "meet.jitsi" } - - { name: XMPP_AUTH_DOMAIN, value: "auth.meet.jitsi" } - - { name: XMPP_MUC_DOMAIN, value: "muc.meet.jitsi" } - - { name: XMPP_INTERNAL_MUC_DOMAIN, value: "internal-muc.meet.jitsi" } - - { name: XMPP_BOSH_URL_BASE, value: "https://meet.bstein.dev" } - - { name: ENABLE_XMPP_WEBSOCKET, value: "1" } - - { name: ENABLE_COLIBRI_WEBSOCKET, value: "1" } - volumeMounts: - - { name: cfg, mountPath: /config } - volumes: - - name: cfg - persistentVolumeClaim: { claimName: jitsi-web-config } diff --git a/services/jitsi/ingress.yaml b/services/jitsi/ingress.yaml deleted file mode 100644 index 3336c37..0000000 --- a/services/jitsi/ingress.yaml +++ /dev/null @@ -1,41 +0,0 @@ -# services/jitsi/ingress.yaml -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: jitsi - namespace: jitsi - annotations: - cert-manager.io/cluster-issuer: letsencrypt -spec: - ingressClassName: traefik - tls: - - hosts: [ "meet.bstein.dev" ] - secretName: jitsi-meet-tls - rules: - - host: meet.bstein.dev - http: - paths: - - path: /colibri-ws - pathType: Prefix - backend: - service: - name: jitsi-jvb - port: { number: 9090 } - - path: /xmpp-websocket - pathType: Prefix - backend: - service: - name: jitsi-prosody - port: { number: 5280 } - - path: /http-bind - pathType: Prefix - backend: - service: - name: jitsi-prosody - port: { number: 5280 } - - path: / - pathType: Prefix - backend: - service: - name: jitsi-web - port: { number: 80 } diff --git a/services/jitsi/namespace.yaml b/services/jitsi/namespace.yaml deleted file mode 100644 index 6ba93f2..0000000 --- a/services/jitsi/namespace.yaml +++ /dev/null @@ -1,5 +0,0 @@ -# services/jitsi/namespace.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: jitsi diff --git a/services/jitsi/pvc.yaml b/services/jitsi/pvc.yaml deleted file mode 100644 index 3a2c14e..0000000 --- a/services/jitsi/pvc.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# 
services/jitsi/pvc.yaml -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: jitsi-web-config - namespace: jitsi -spec: - accessModes: ["ReadWriteOnce"] - resources: { requests: { storage: 10Gi } } - ---- - -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: jitsi-prosody-config - namespace: jitsi -spec: - accessModes: ["ReadWriteOnce"] - resources: { requests: { storage: 10Gi } } - ---- - -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: jitsi-jicofo-config - namespace: jitsi -spec: - accessModes: ["ReadWriteOnce"] - resources: { requests: { storage: 10Gi } } - ---- - -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: jitsi-jvb-config - namespace: jitsi -spec: - accessModes: ["ReadWriteOnce"] - resources: { requests: { storage: 10Gi } } diff --git a/services/jitsi/secret.yaml b/services/jitsi/secret.yaml deleted file mode 100644 index f851bac..0000000 --- a/services/jitsi/secret.yaml +++ /dev/null @@ -1,11 +0,0 @@ -# services/jitsi/secret.yaml -apiVersion: v1 -kind: Secret -metadata: - name: jitsi-internal-secrets - namespace: jitsi -type: Opaque -data: - JICOFO_COMPONENT_SECRET: bEg5Y09hZFJBem5PUFliQlp4RHkwRTRP - JICOFO_AUTH_PASSWORD: VVkyUmczaVRDWUZ0MzdQdmN3UDN1SFc5 - JVB_AUTH_PASSWORD: d0M5aWJ4dWlPTnhFak9lRHJqSHdYa0g5 \ No newline at end of file diff --git a/services/jitsi/service.yaml b/services/jitsi/service.yaml deleted file mode 100644 index 7b44b5c..0000000 --- a/services/jitsi/service.yaml +++ /dev/null @@ -1,36 +0,0 @@ -# services/jitsi/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: jitsi-prosody - namespace: jitsi -spec: - selector: { app: jitsi-prosody } - ports: - - { name: c2s, port: 5222, targetPort: 5222, protocol: TCP } - - { name: http, port: 5280, targetPort: 5280, protocol: TCP } - - { name: comp, port: 5347, targetPort: 5347, protocol: TCP } - ---- - -apiVersion: v1 -kind: Service -metadata: - name: jitsi-jvb - namespace: jitsi -spec: - selector: { app: jitsi-jvb } - ports: - - 
{ name: colibri-ws, port: 9090, targetPort: 9090, protocol: TCP } - ---- - -apiVersion: v1 -kind: Service -metadata: - name: jitsi-web - namespace: jitsi -spec: - selector: { app: jitsi-web } - ports: - - { name: http, port: 80, targetPort: 80, protocol: TCP } diff --git a/services/keycloak/deployment.yaml b/services/keycloak/deployment.yaml index 9336bd9..48cf5e0 100644 --- a/services/keycloak/deployment.yaml +++ b/services/keycloak/deployment.yaml @@ -8,6 +8,11 @@ metadata: app: keycloak spec: replicas: 1 + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 0 + maxUnavailable: 1 selector: matchLabels: app: keycloak @@ -75,17 +80,17 @@ spec: valueFrom: secretKeyRef: name: keycloak-db - key: database + key: POSTGRES_DATABASE - name: KC_DB_USERNAME valueFrom: secretKeyRef: name: keycloak-db - key: username + key: POSTGRES_USER - name: KC_DB_PASSWORD valueFrom: secretKeyRef: name: keycloak-db - key: password + key: POSTGRES_PASSWORD - name: KC_DB_SCHEMA value: public - name: KC_HOSTNAME @@ -98,10 +103,14 @@ spec: value: xforwarded - name: KC_HTTP_ENABLED value: "true" + - name: KC_FEATURES + value: token-exchange,admin-fine-grained-authz - name: KC_HTTP_MANAGEMENT_PORT value: "9000" - name: KC_HTTP_MANAGEMENT_BIND_ADDRESS value: 0.0.0.0 + - name: KC_LOG_LEVEL + value: DEBUG - name: KC_HEALTH_ENABLED value: "true" - name: KC_METRICS_ENABLED diff --git a/services/keycloak/kustomization.yaml b/services/keycloak/kustomization.yaml index a65715c..ddb4ab2 100644 --- a/services/keycloak/kustomization.yaml +++ b/services/keycloak/kustomization.yaml @@ -6,5 +6,28 @@ resources: - namespace.yaml - pvc.yaml - deployment.yaml + - realm-settings-job.yaml + - portal-e2e-client-job.yaml + - portal-e2e-client-secret-sync-rbac.yaml + - portal-e2e-client-secret-sync-cronjob.yaml + - portal-e2e-target-client-job.yaml + - portal-e2e-token-exchange-permissions-job.yaml + - portal-e2e-token-exchange-test-job.yaml + - portal-e2e-execute-actions-email-test-job.yaml + - 
ldap-federation-job.yaml + - user-overrides-job.yaml + - mas-secrets-ensure-job.yaml + - synapse-oidc-secret-ensure-job.yaml + - logs-oidc-secret-ensure-job.yaml - service.yaml - ingress.yaml +generatorOptions: + disableNameSuffixHash: true +configMapGenerator: + - name: portal-e2e-tests + files: + - test_portal_token_exchange.py=scripts/tests/test_portal_token_exchange.py + - test_keycloak_execute_actions_email.py=scripts/tests/test_keycloak_execute_actions_email.py + - name: portal-e2e-client-secret-sync-script + files: + - sso_portal_e2e_client_secret_sync.sh=scripts/sso_portal_e2e_client_secret_sync.sh diff --git a/services/keycloak/ldap-federation-job.yaml b/services/keycloak/ldap-federation-job.yaml new file mode 100644 index 0000000..9650468 --- /dev/null +++ b/services/keycloak/ldap-federation-job.yaml @@ -0,0 +1,362 @@ +# services/keycloak/ldap-federation-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: keycloak-ldap-federation-5 + namespace: sso +spec: + backoffLimit: 2 + template: + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: hardware + operator: In + values: ["rpi5","rpi4"] + - key: node-role.kubernetes.io/worker + operator: Exists + restartPolicy: OnFailure + containers: + - name: configure + image: python:3.11-alpine + imagePullPolicy: IfNotPresent + env: + - name: KEYCLOAK_SERVER + value: http://keycloak.sso.svc.cluster.local + - name: KEYCLOAK_REALM + value: atlas + - name: KEYCLOAK_ADMIN_USER + valueFrom: + secretKeyRef: + name: keycloak-admin + key: username + - name: KEYCLOAK_ADMIN_PASSWORD + valueFrom: + secretKeyRef: + name: keycloak-admin + key: password + - name: LDAP_URL + value: ldap://openldap.sso.svc.cluster.local:389 + - name: LDAP_BIND_DN + value: cn=admin,dc=bstein,dc=dev + - name: LDAP_BIND_PASSWORD + valueFrom: + secretKeyRef: + name: openldap-admin + key: LDAP_ADMIN_PASSWORD + - name: LDAP_USERS_DN + value: 
ou=users,dc=bstein,dc=dev + - name: LDAP_GROUPS_DN + value: ou=groups,dc=bstein,dc=dev + command: ["/bin/sh", "-c"] + args: + - | + set -euo pipefail + python - <<'PY' + import json + import os + import time + import urllib.parse + import urllib.error + import urllib.request + + base_url = os.environ["KEYCLOAK_SERVER"].rstrip("/") + realm = os.environ["KEYCLOAK_REALM"] + admin_user = os.environ["KEYCLOAK_ADMIN_USER"] + admin_password = os.environ["KEYCLOAK_ADMIN_PASSWORD"] + + ldap_url = os.environ["LDAP_URL"] + ldap_bind_dn = os.environ["LDAP_BIND_DN"] + ldap_bind_password = os.environ["LDAP_BIND_PASSWORD"] + ldap_users_dn = os.environ["LDAP_USERS_DN"] + ldap_groups_dn = os.environ["LDAP_GROUPS_DN"] + + def http_json(method: str, url: str, token: str, payload=None): + data = None + headers = {"Authorization": f"Bearer {token}"} + if payload is not None: + data = json.dumps(payload).encode() + headers["Content-Type"] = "application/json" + req = urllib.request.Request(url, data=data, headers=headers, method=method) + with urllib.request.urlopen(req, timeout=30) as resp: + body = resp.read() + if not body: + return resp.status, None, dict(resp.headers) + return resp.status, json.loads(body.decode()), dict(resp.headers) + + def get_token(): + token_data = urllib.parse.urlencode( + { + "grant_type": "password", + "client_id": "admin-cli", + "username": admin_user, + "password": admin_password, + } + ).encode() + token_req = urllib.request.Request( + f"{base_url}/realms/master/protocol/openid-connect/token", + data=token_data, + headers={"Content-Type": "application/x-www-form-urlencoded"}, + method="POST", + ) + with urllib.request.urlopen(token_req, timeout=30) as resp: + token_body = json.loads(resp.read().decode()) + return token_body["access_token"] + + def wait_for_keycloak(): + for _ in range(60): + try: + token = get_token() + if token: + return token + except Exception: + time.sleep(2) + raise SystemExit("Keycloak not ready") + + token = wait_for_keycloak() + 
+ # Keycloak component "parentId" must be the realm UUID, not the realm name. + status, realm_rep, _ = http_json( + "GET", + f"{base_url}/admin/realms/{realm}", + token, + ) + if status != 200 or not realm_rep or not realm_rep.get("id"): + raise SystemExit(f"Unable to resolve realm id for {realm} (status={status})") + realm_id = realm_rep["id"] + + # Some historical LDAP federation components were created with parentId=. + # That makes realm resolution null in Keycloak internals and breaks authentication. + status, all_components, _ = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/components", + token, + ) + if status != 200: + raise SystemExit(f"Unexpected components response: {status}") + all_components = all_components or [] + + for c in all_components: + if c.get("providerId") != "ldap": + continue + if c.get("providerType") != "org.keycloak.storage.UserStorageProvider": + continue + if c.get("parentId") == realm_id: + continue + cid = c.get("id") + if not cid: + continue + print(f"Fixing LDAP federation parentId for {cid} (was {c.get('parentId')})") + status, comp, _ = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/components/{cid}", + token, + ) + if status != 200 or not comp: + raise SystemExit(f"Unable to fetch component {cid} (status={status})") + comp["parentId"] = realm_id + status, _, _ = http_json( + "PUT", + f"{base_url}/admin/realms/{realm}/components/{cid}", + token, + comp, + ) + if status not in (200, 204): + raise SystemExit(f"Unexpected parentId repair status for {cid}: {status}") + + # Find existing LDAP user federation provider (if any) + status, components, _ = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/components?type=org.keycloak.storage.UserStorageProvider", + token, + ) + if status != 200: + raise SystemExit(f"Unexpected components response: {status}") + components = components or [] + + ldap_components = [c for c in components if c.get("providerId") == "ldap" and c.get("id")] + + # Select a canonical LDAP 
federation provider deterministically. + # Duplicate LDAP providers can cause Keycloak admin/user queries to fail if any one of them is misconfigured. + candidates = [] + for c in ldap_components: + if c.get("name") not in ("openldap", "ldap"): + continue + cfg = c.get("config") or {} + if (cfg.get("connectionUrl") or [None])[0] == ldap_url: + candidates.append(c) + if not candidates: + candidates = [c for c in ldap_components if c.get("name") in ("openldap", "ldap")] + candidates.sort(key=lambda x: x.get("id", "")) + ldap_component = candidates[0] if candidates else None + ldap_component_id = ldap_component["id"] if ldap_component else None + + desired = { + "name": "openldap", + "providerId": "ldap", + "providerType": "org.keycloak.storage.UserStorageProvider", + "parentId": realm_id, + "config": { + "enabled": ["true"], + "priority": ["0"], + "importEnabled": ["true"], + "editMode": ["WRITABLE"], + "syncRegistrations": ["true"], + "vendor": ["other"], + "connectionUrl": [ldap_url], + "bindDn": [ldap_bind_dn], + "bindCredential": [ldap_bind_password], + "authType": ["simple"], + "usersDn": [ldap_users_dn], + "searchScope": ["1"], + "pagination": ["true"], + "usernameLDAPAttribute": ["uid"], + "rdnLDAPAttribute": ["uid"], + "uuidLDAPAttribute": ["entryUUID"], + "userObjectClasses": ["inetOrgPerson, organizationalPerson, person, top"], + "trustEmail": ["true"], + "useTruststoreSpi": ["never"], + "connectionPooling": ["true"], + "cachePolicy": ["DEFAULT"], + "useKerberosForPasswordAuthentication": ["false"], + "allowKerberosAuthentication": ["false"], + }, + } + + if ldap_component: + desired["id"] = ldap_component["id"] + print(f"Updating LDAP federation provider: {desired['id']}") + status, _, _ = http_json( + "PUT", + f"{base_url}/admin/realms/{realm}/components/{desired['id']}", + token, + desired, + ) + if status not in (200, 204): + raise SystemExit(f"Unexpected update status: {status}") + else: + print("Creating LDAP federation provider") + status, _, headers 
= http_json( + "POST", + f"{base_url}/admin/realms/{realm}/components", + token, + desired, + ) + if status not in (201, 204): + raise SystemExit(f"Unexpected create status: {status}") + location = headers.get("Location", "") + if location: + ldap_component_id = location.rstrip("/").split("/")[-1] + + # Ensure a basic LDAP group mapper exists (optional but harmless). + if not ldap_component_id: + print("WARNING: unable to determine LDAP component id; skipping group mapper") + raise SystemExit(0) + + status, components, _ = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/components?type=org.keycloak.storage.ldap.mappers.LDAPStorageMapper", + token, + ) + components = components or [] + group_mapper = None + for c in components: + if c.get("name") == "openldap-groups" and c.get("parentId") == ldap_component_id: + group_mapper = c + break + + mapper_payload = { + "name": "openldap-groups", + "providerId": "group-ldap-mapper", + "providerType": "org.keycloak.storage.ldap.mappers.LDAPStorageMapper", + "parentId": ldap_component_id, + "config": { + "groups.dn": [ldap_groups_dn], + "group.name.ldap.attribute": ["cn"], + "group.object.classes": ["groupOfNames"], + "membership.ldap.attribute": ["member"], + "membership.attribute.type": ["DN"], + "mode": ["LDAP_ONLY"], + "user.roles.retrieve.strategy": ["LOAD_GROUPS_BY_MEMBER_ATTRIBUTE"], + "preserve.group.inheritance": ["true"], + }, + } + + if group_mapper: + mapper_payload["id"] = group_mapper["id"] + mapper_payload["parentId"] = group_mapper.get("parentId", mapper_payload["parentId"]) + print(f"Updating LDAP group mapper: {mapper_payload['id']}") + status, _, _ = http_json( + "PUT", + f"{base_url}/admin/realms/{realm}/components/{mapper_payload['id']}", + token, + mapper_payload, + ) + if status not in (200, 204): + raise SystemExit(f"Unexpected group mapper update status: {status}") + else: + print("Creating LDAP group mapper") + status, _, _ = http_json( + "POST", + 
f"{base_url}/admin/realms/{realm}/components", + token, + mapper_payload, + ) + if status not in (201, 204): + raise SystemExit(f"Unexpected group mapper create status: {status}") + + # Cleanup duplicate LDAP federation providers and their child components (mappers, etc). + # Keep only the canonical provider we updated/created above. + try: + status, fresh_components, _ = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/components", + token, + ) + if status != 200: + raise Exception(f"unexpected components status {status}") + fresh_components = fresh_components or [] + + dup_provider_ids = [] + for c in fresh_components: + if c.get("providerId") != "ldap": + continue + if c.get("providerType") != "org.keycloak.storage.UserStorageProvider": + continue + cid = c.get("id") + if not cid or cid == ldap_component_id: + continue + dup_provider_ids.append(cid) + + if dup_provider_ids: + for pid in dup_provider_ids: + # Delete child components first. + for child in fresh_components: + if child.get("parentId") != pid: + continue + child_id = child.get("id") + if not child_id: + continue + try: + http_json( + "DELETE", + f"{base_url}/admin/realms/{realm}/components/{child_id}", + token, + ) + except urllib.error.HTTPError as e: + print(f"WARNING: failed to delete LDAP child component {child_id} (status={e.code})") + try: + http_json( + "DELETE", + f"{base_url}/admin/realms/{realm}/components/{pid}", + token, + ) + except urllib.error.HTTPError as e: + print(f"WARNING: failed to delete duplicate LDAP provider {pid} (status={e.code})") + print(f"Cleaned up {len(dup_provider_ids)} duplicate LDAP federation providers") + except Exception as e: + print(f"WARNING: LDAP cleanup failed (continuing): {e}") + PY diff --git a/services/keycloak/logs-oidc-secret-ensure-job.yaml b/services/keycloak/logs-oidc-secret-ensure-job.yaml new file mode 100644 index 0000000..11d48f9 --- /dev/null +++ b/services/keycloak/logs-oidc-secret-ensure-job.yaml @@ -0,0 +1,103 @@ +# 
services/keycloak/logs-oidc-secret-ensure-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: logs-oidc-secret-ensure-2 + namespace: sso +spec: + backoffLimit: 0 + ttlSecondsAfterFinished: 3600 + template: + spec: + serviceAccountName: mas-secrets-ensure + restartPolicy: Never + containers: + - name: apply + image: alpine:3.20 + command: ["/bin/sh", "-c"] + args: + - | + set -euo pipefail + apk add --no-cache curl jq kubectl openssl >/dev/null + + KC_URL="http://keycloak.sso.svc.cluster.local" + ACCESS_TOKEN="" + for attempt in 1 2 3 4 5; do + TOKEN_JSON="$(curl -sS -X POST "$KC_URL/realms/master/protocol/openid-connect/token" \ + -H 'Content-Type: application/x-www-form-urlencoded' \ + -d "grant_type=password" \ + -d "client_id=admin-cli" \ + -d "username=${KEYCLOAK_ADMIN}" \ + -d "password=${KEYCLOAK_ADMIN_PASSWORD}" || true)" + ACCESS_TOKEN="$(echo "$TOKEN_JSON" | jq -r '.access_token' 2>/dev/null || true)" + if [ -n "$ACCESS_TOKEN" ] && [ "$ACCESS_TOKEN" != "null" ]; then + break + fi + echo "Keycloak token request failed (attempt ${attempt})" >&2 + sleep $((attempt * 2)) + done + if [ -z "$ACCESS_TOKEN" ] || [ "$ACCESS_TOKEN" = "null" ]; then + echo "Failed to fetch Keycloak admin token" >&2 + exit 1 + fi + + CLIENT_QUERY="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + "$KC_URL/admin/realms/atlas/clients?clientId=logs" || true)" + CLIENT_ID="$(echo "$CLIENT_QUERY" | jq -r '.[0].id' 2>/dev/null || true)" + + if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then + create_payload='{"clientId":"logs","enabled":true,"protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":false,"serviceAccountsEnabled":false,"redirectUris":["https://logs.bstein.dev/oauth2/callback"],"webOrigins":["https://logs.bstein.dev"],"rootUrl":"https://logs.bstein.dev","baseUrl":"/"}' + status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \ + -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + 
-H 'Content-Type: application/json' \ + -d "${create_payload}" \ + "$KC_URL/admin/realms/atlas/clients")" + if [ "$status" != "201" ] && [ "$status" != "204" ]; then + echo "Keycloak client create failed (status ${status})" >&2 + exit 1 + fi + CLIENT_QUERY="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + "$KC_URL/admin/realms/atlas/clients?clientId=logs" || true)" + CLIENT_ID="$(echo "$CLIENT_QUERY" | jq -r '.[0].id' 2>/dev/null || true)" + fi + + if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then + echo "Keycloak client logs not found" >&2 + exit 1 + fi + + CLIENT_SECRET="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/client-secret" | jq -r '.value' 2>/dev/null || true)" + if [ -z "$CLIENT_SECRET" ] || [ "$CLIENT_SECRET" = "null" ]; then + echo "Keycloak client secret not found" >&2 + exit 1 + fi + + if kubectl -n logging get secret oauth2-proxy-logs-oidc >/dev/null 2>&1; then + current_cookie="$(kubectl -n logging get secret oauth2-proxy-logs-oidc -o jsonpath='{.data.cookie_secret}' 2>/dev/null || true)" + if [ -n "${current_cookie}" ]; then + decoded="$(printf '%s' "${current_cookie}" | base64 -d 2>/dev/null || true)" + length="$(printf '%s' "${decoded}" | wc -c | tr -d ' ')" + if [ "${length}" = "16" ] || [ "${length}" = "24" ] || [ "${length}" = "32" ]; then + exit 0 + fi + fi + fi + + COOKIE_SECRET="$(openssl rand -hex 16 | tr -d '\n')" + kubectl -n logging create secret generic oauth2-proxy-logs-oidc \ + --from-literal=client_id="logs" \ + --from-literal=client_secret="${CLIENT_SECRET}" \ + --from-literal=cookie_secret="${COOKIE_SECRET}" \ + --dry-run=client -o yaml | kubectl -n logging apply -f - >/dev/null + env: + - name: KEYCLOAK_ADMIN + valueFrom: + secretKeyRef: + name: keycloak-admin + key: username + - name: KEYCLOAK_ADMIN_PASSWORD + valueFrom: + secretKeyRef: + name: keycloak-admin + key: password diff --git a/services/keycloak/mas-secrets-ensure-job.yaml 
b/services/keycloak/mas-secrets-ensure-job.yaml new file mode 100644 index 0000000..b0951cf --- /dev/null +++ b/services/keycloak/mas-secrets-ensure-job.yaml @@ -0,0 +1,107 @@ +# services/keycloak/mas-secrets-ensure-job.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: mas-secrets-ensure + namespace: sso +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: mas-secrets-ensure-13 + namespace: sso +spec: + backoffLimit: 0 + ttlSecondsAfterFinished: 3600 + template: + spec: + serviceAccountName: mas-secrets-ensure + restartPolicy: Never + volumes: + - name: work + emptyDir: {} + initContainers: + - name: generate + image: alpine:3.20 + command: ["/bin/sh", "-c"] + args: + - | + set -euo pipefail + umask 077 + apk add --no-cache curl openssl jq >/dev/null + + KC_URL="http://keycloak.sso.svc.cluster.local" + ACCESS_TOKEN="" + for attempt in 1 2 3 4 5; do + TOKEN_JSON="$(curl -sS -X POST "$KC_URL/realms/master/protocol/openid-connect/token" \ + -H 'Content-Type: application/x-www-form-urlencoded' \ + -d "grant_type=password" \ + -d "client_id=admin-cli" \ + -d "username=${KEYCLOAK_ADMIN}" \ + -d "password=${KEYCLOAK_ADMIN_PASSWORD}" || true)" + ACCESS_TOKEN="$(echo "$TOKEN_JSON" | jq -r '.access_token' 2>/dev/null || true)" + if [ -n "$ACCESS_TOKEN" ] && [ "$ACCESS_TOKEN" != "null" ]; then + break + fi + echo "Keycloak token request failed (attempt ${attempt})" >&2 + sleep $((attempt * 2)) + done + if [ -z "$ACCESS_TOKEN" ] || [ "$ACCESS_TOKEN" = "null" ]; then + echo "Failed to fetch Keycloak admin token" >&2 + exit 1 + fi + CLIENT_ID="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + "$KC_URL/admin/realms/atlas/clients?clientId=othrys-mas" | jq -r '.[0].id' 2>/dev/null || true)" + if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then + echo "Keycloak client othrys-mas not found" >&2 + exit 1 + fi + CLIENT_SECRET="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/client-secret" | jq -r 
'.value' 2>/dev/null || true)" + if [ -z "$CLIENT_SECRET" ] || [ "$CLIENT_SECRET" = "null" ]; then + echo "Keycloak client secret not found" >&2 + exit 1 + fi + + printf '%s' "$CLIENT_SECRET" > /work/keycloak_client_secret + openssl rand -hex 32 | tr -d '\n' > /work/encryption + openssl rand -hex 32 | tr -d '\n' > /work/matrix_shared_secret + openssl genpkey -algorithm RSA -pkeyopt rsa_keygen_bits:4096 -out /work/rsa_key >/dev/null 2>&1 + chmod 0644 /work/* + env: + - name: KEYCLOAK_ADMIN + valueFrom: + secretKeyRef: + name: keycloak-admin + key: username + - name: KEYCLOAK_ADMIN_PASSWORD + valueFrom: + secretKeyRef: + name: keycloak-admin + key: password + volumeMounts: + - name: work + mountPath: /work + containers: + - name: apply + image: bitnami/kubectl:latest + command: ["/bin/sh", "-c"] + args: + - | + set -euo pipefail + if kubectl -n comms get secret mas-secrets-runtime >/dev/null 2>&1; then + kubectl -n comms get secret mas-secrets-runtime -o jsonpath='{.data.encryption}' | base64 -d 2>/dev/null > /tmp/encryption.current || true + current_len="$(wc -c < /tmp/encryption.current | tr -d ' ')" + if [ "${current_len}" = "64" ] && grep -Eq '^[0-9a-fA-F]{64}$' /tmp/encryption.current; then + exit 0 + fi + fi + kubectl -n comms create secret generic mas-secrets-runtime \ + --from-file=encryption=/work/encryption \ + --from-file=matrix_shared_secret=/work/matrix_shared_secret \ + --from-file=keycloak_client_secret=/work/keycloak_client_secret \ + --from-file=rsa_key=/work/rsa_key \ + --dry-run=client -o yaml | kubectl -n comms apply -f - >/dev/null + volumeMounts: + - name: work + mountPath: /work diff --git a/services/keycloak/portal-e2e-client-job.yaml b/services/keycloak/portal-e2e-client-job.yaml new file mode 100644 index 0000000..7f6c5dd --- /dev/null +++ b/services/keycloak/portal-e2e-client-job.yaml @@ -0,0 +1,247 @@ +# services/keycloak/portal-e2e-client-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: keycloak-portal-e2e-client-2 + 
namespace: sso +spec: + backoffLimit: 0 + template: + spec: + restartPolicy: Never + containers: + - name: configure + image: python:3.11-alpine + env: + - name: KEYCLOAK_SERVER + value: http://keycloak.sso.svc.cluster.local + - name: KEYCLOAK_REALM + value: atlas + - name: KEYCLOAK_ADMIN_USER + valueFrom: + secretKeyRef: + name: keycloak-admin + key: username + - name: KEYCLOAK_ADMIN_PASSWORD + valueFrom: + secretKeyRef: + name: keycloak-admin + key: password + - name: PORTAL_E2E_CLIENT_ID + valueFrom: + secretKeyRef: + name: portal-e2e-client + key: client_id + - name: PORTAL_E2E_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: portal-e2e-client + key: client_secret + command: ["/bin/sh", "-c"] + args: + - | + set -euo pipefail + python - <<'PY' + import json + import os + import urllib.parse + import urllib.error + import urllib.request + + base_url = os.environ["KEYCLOAK_SERVER"].rstrip("/") + realm = os.environ["KEYCLOAK_REALM"] + admin_user = os.environ["KEYCLOAK_ADMIN_USER"] + admin_password = os.environ["KEYCLOAK_ADMIN_PASSWORD"] + e2e_client_id = os.environ["PORTAL_E2E_CLIENT_ID"] + e2e_client_secret = os.environ["PORTAL_E2E_CLIENT_SECRET"] + + def http_json(method: str, url: str, token: str, payload=None): + data = None + headers = {"Authorization": f"Bearer {token}"} + if payload is not None: + data = json.dumps(payload).encode() + headers["Content-Type"] = "application/json" + req = urllib.request.Request(url, data=data, headers=headers, method=method) + try: + with urllib.request.urlopen(req, timeout=30) as resp: + body = resp.read() + if not body: + return resp.status, None + return resp.status, json.loads(body.decode()) + except urllib.error.HTTPError as exc: + raw = exc.read() + if not raw: + return exc.code, None + try: + return exc.code, json.loads(raw.decode()) + except Exception: + return exc.code, {"raw": raw.decode(errors="replace")} + + def get_admin_token() -> str: + token_data = urllib.parse.urlencode( + { + "grant_type": "password", + 
"client_id": "admin-cli", + "username": admin_user, + "password": admin_password, + } + ).encode() + req = urllib.request.Request( + f"{base_url}/realms/master/protocol/openid-connect/token", + data=token_data, + headers={"Content-Type": "application/x-www-form-urlencoded"}, + method="POST", + ) + try: + with urllib.request.urlopen(req, timeout=15) as resp: + body = json.loads(resp.read().decode()) + except urllib.error.HTTPError as exc: + raw = exc.read().decode(errors="replace") + raise SystemExit(f"Token request failed: status={exc.code} body={raw}") + return body["access_token"] + + token = get_admin_token() + + # Ensure the confidential client for E2E token exchange exists with service accounts enabled. + status, clients = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/clients?clientId={urllib.parse.quote(e2e_client_id)}", + token, + ) + if status != 200 or not isinstance(clients, list): + raise SystemExit(f"Unexpected clients lookup response: {status}") + + client_uuid = None + if clients: + for item in clients: + if isinstance(item, dict) and item.get("clientId") == e2e_client_id: + client_uuid = item.get("id") + break + + desired_rep = { + "clientId": e2e_client_id, + "enabled": True, + "protocol": "openid-connect", + "publicClient": False, + "serviceAccountsEnabled": True, + "standardFlowEnabled": False, + "directAccessGrantsEnabled": False, + "implicitFlowEnabled": False, + "secret": e2e_client_secret, + "attributes": { + "oauth2.device.authorization.grant.enabled": "false", + "oauth2.token.exchange.grant.enabled": "true", + }, + } + + if not client_uuid: + status, resp = http_json( + "POST", + f"{base_url}/admin/realms/{realm}/clients", + token, + desired_rep, + ) + if status not in (201, 204): + raise SystemExit(f"Client create failed (status={status}) resp={resp}") + status, clients = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/clients?clientId={urllib.parse.quote(e2e_client_id)}", + token, + ) + if status != 200 or not 
isinstance(clients, list) or not clients: + raise SystemExit("Unable to refetch client after creation") + client_uuid = clients[0].get("id") + + # Update existing client with desired settings (idempotent). + status, client_rep = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/clients/{client_uuid}", + token, + ) + if status != 200 or not isinstance(client_rep, dict): + raise SystemExit(f"Unable to fetch client representation (status={status})") + + updated = False + for key in ("enabled", "serviceAccountsEnabled", "standardFlowEnabled", "directAccessGrantsEnabled", "implicitFlowEnabled"): + if client_rep.get(key) != desired_rep.get(key): + client_rep[key] = desired_rep.get(key) + updated = True + if client_rep.get("publicClient") is not False: + client_rep["publicClient"] = False + updated = True + if client_rep.get("secret") != desired_rep.get("secret"): + client_rep["secret"] = desired_rep.get("secret") + updated = True + + attrs = client_rep.get("attributes") or {} + for k, v in desired_rep["attributes"].items(): + if attrs.get(k) != v: + attrs[k] = v + updated = True + client_rep["attributes"] = attrs + + if updated: + status, resp = http_json( + "PUT", + f"{base_url}/admin/realms/{realm}/clients/{client_uuid}", + token, + client_rep, + ) + if status not in (200, 204): + raise SystemExit(f"Client update failed (status={status}) resp={resp}") + + # Give the service account user minimal realm-management roles for impersonation + user lookup. 
+ status, svc_user = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/clients/{client_uuid}/service-account-user", + token, + ) + if status != 200 or not isinstance(svc_user, dict) or not svc_user.get("id"): + raise SystemExit(f"Unable to fetch service account user (status={status})") + svc_user_id = svc_user["id"] + + status, rm_clients = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/clients?clientId=realm-management", + token, + ) + if status != 200 or not isinstance(rm_clients, list) or not rm_clients: + raise SystemExit("Unable to find realm-management client") + rm_uuid = rm_clients[0].get("id") + if not rm_uuid: + raise SystemExit("realm-management client has no id") + + wanted_roles = ("query-users", "view-users", "manage-users", "impersonation") + role_reps = [] + for role_name in wanted_roles: + status, role = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/clients/{rm_uuid}/roles/{urllib.parse.quote(role_name)}", + token, + ) + if status != 200 or not isinstance(role, dict): + raise SystemExit(f"Unable to fetch role {role_name} (status={status})") + role_reps.append({"id": role.get("id"), "name": role.get("name")}) + + status, assigned = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/users/{svc_user_id}/role-mappings/clients/{rm_uuid}", + token, + ) + assigned_names = set() + if status == 200 and isinstance(assigned, list): + for r in assigned: + if isinstance(r, dict) and r.get("name"): + assigned_names.add(r["name"]) + + missing = [r for r in role_reps if r.get("name") and r["name"] not in assigned_names] + if missing: + status, resp = http_json( + "POST", + f"{base_url}/admin/realms/{realm}/users/{svc_user_id}/role-mappings/clients/{rm_uuid}", + token, + missing, + ) + if status not in (200, 204): + raise SystemExit(f"Role mapping update failed (status={status}) resp={resp}") + PY diff --git a/services/keycloak/portal-e2e-client-secret-sync-cronjob.yaml b/services/keycloak/portal-e2e-client-secret-sync-cronjob.yaml new 
file mode 100644 index 0000000..8bb7e55 --- /dev/null +++ b/services/keycloak/portal-e2e-client-secret-sync-cronjob.yaml @@ -0,0 +1,32 @@ +# services/keycloak/portal-e2e-client-secret-sync-cronjob.yaml +apiVersion: batch/v1 +kind: CronJob +metadata: + name: portal-e2e-client-secret-sync + namespace: sso +spec: + schedule: "*/10 * * * *" + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 1 + failedJobsHistoryLimit: 3 + jobTemplate: + spec: + backoffLimit: 1 + template: + spec: + serviceAccountName: portal-e2e-client-secret-sync + restartPolicy: Never + containers: + - name: sync + image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 + command: ["/usr/bin/env", "bash"] + args: ["/scripts/sso_portal_e2e_client_secret_sync.sh"] + volumeMounts: + - name: script + mountPath: /scripts + readOnly: true + volumes: + - name: script + configMap: + name: portal-e2e-client-secret-sync-script + defaultMode: 0555 diff --git a/services/keycloak/portal-e2e-client-secret-sync-rbac.yaml b/services/keycloak/portal-e2e-client-secret-sync-rbac.yaml new file mode 100644 index 0000000..e2d39bb --- /dev/null +++ b/services/keycloak/portal-e2e-client-secret-sync-rbac.yaml @@ -0,0 +1,31 @@ +# services/keycloak/portal-e2e-client-secret-sync-rbac.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: portal-e2e-client-secret-sync + namespace: sso +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: portal-e2e-client-secret-sync-source + namespace: sso +rules: + - apiGroups: [""] + resources: ["secrets"] + resourceNames: ["portal-e2e-client"] + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: portal-e2e-client-secret-sync-source + namespace: sso +subjects: + - kind: ServiceAccount + name: portal-e2e-client-secret-sync + namespace: sso +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: portal-e2e-client-secret-sync-source diff --git 
a/services/keycloak/portal-e2e-execute-actions-email-test-job.yaml b/services/keycloak/portal-e2e-execute-actions-email-test-job.yaml new file mode 100644 index 0000000..877dd55 --- /dev/null +++ b/services/keycloak/portal-e2e-execute-actions-email-test-job.yaml @@ -0,0 +1,51 @@ +# services/keycloak/portal-e2e-execute-actions-email-test-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: keycloak-portal-e2e-execute-actions-email-5 + namespace: sso +spec: + backoffLimit: 3 + template: + spec: + restartPolicy: Never + containers: + - name: test + image: python:3.11-alpine + env: + - name: KEYCLOAK_SERVER + value: http://keycloak.sso.svc.cluster.local + - name: KEYCLOAK_REALM + value: atlas + - name: PORTAL_E2E_CLIENT_ID + valueFrom: + secretKeyRef: + name: portal-e2e-client + key: client_id + - name: PORTAL_E2E_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: portal-e2e-client + key: client_secret + - name: E2E_PROBE_USERNAME + value: e2e-smtp-probe + - name: E2E_PROBE_EMAIL + value: robot@bstein.dev + - name: EXECUTE_ACTIONS_CLIENT_ID + value: bstein-dev-home + - name: EXECUTE_ACTIONS_REDIRECT_URI + value: https://bstein.dev/ + command: ["/bin/sh", "-c"] + args: + - | + set -euo pipefail + python /scripts/test_keycloak_execute_actions_email.py + volumeMounts: + - name: tests + mountPath: /scripts + readOnly: true + volumes: + - name: tests + configMap: + name: portal-e2e-tests + defaultMode: 0555 diff --git a/services/keycloak/portal-e2e-target-client-job.yaml b/services/keycloak/portal-e2e-target-client-job.yaml new file mode 100644 index 0000000..45b3980 --- /dev/null +++ b/services/keycloak/portal-e2e-target-client-job.yaml @@ -0,0 +1,138 @@ +# services/keycloak/portal-e2e-target-client-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: keycloak-portal-e2e-target-1 + namespace: sso +spec: + backoffLimit: 0 + template: + spec: + restartPolicy: Never + containers: + - name: configure + image: python:3.11-alpine + env: + - name: KEYCLOAK_SERVER 
+ value: http://keycloak.sso.svc.cluster.local + - name: KEYCLOAK_REALM + value: atlas + - name: KEYCLOAK_ADMIN_USER + valueFrom: + secretKeyRef: + name: keycloak-admin + key: username + - name: KEYCLOAK_ADMIN_PASSWORD + valueFrom: + secretKeyRef: + name: keycloak-admin + key: password + - name: TARGET_CLIENT_ID + value: bstein-dev-home + command: ["/bin/sh", "-c"] + args: + - | + set -euo pipefail + python - <<'PY' + import json + import os + import urllib.parse + import urllib.error + import urllib.request + + base_url = os.environ["KEYCLOAK_SERVER"].rstrip("/") + realm = os.environ["KEYCLOAK_REALM"] + admin_user = os.environ["KEYCLOAK_ADMIN_USER"] + admin_password = os.environ["KEYCLOAK_ADMIN_PASSWORD"] + target_client_id = os.environ["TARGET_CLIENT_ID"] + + def http_json(method: str, url: str, token: str, payload=None): + data = None + headers = {"Authorization": f"Bearer {token}"} + if payload is not None: + data = json.dumps(payload).encode() + headers["Content-Type"] = "application/json" + req = urllib.request.Request(url, data=data, headers=headers, method=method) + try: + with urllib.request.urlopen(req, timeout=30) as resp: + body = resp.read() + if not body: + return resp.status, None + return resp.status, json.loads(body.decode()) + except urllib.error.HTTPError as exc: + raw = exc.read() + if not raw: + return exc.code, None + try: + return exc.code, json.loads(raw.decode()) + except Exception: + return exc.code, {"raw": raw.decode(errors="replace")} + + def get_admin_token() -> str: + token_data = urllib.parse.urlencode( + { + "grant_type": "password", + "client_id": "admin-cli", + "username": admin_user, + "password": admin_password, + } + ).encode() + req = urllib.request.Request( + f"{base_url}/realms/master/protocol/openid-connect/token", + data=token_data, + headers={"Content-Type": "application/x-www-form-urlencoded"}, + method="POST", + ) + try: + with urllib.request.urlopen(req, timeout=15) as resp: + body = json.loads(resp.read().decode()) + 
except urllib.error.HTTPError as exc: + raw = exc.read().decode(errors="replace") + raise SystemExit(f"Token request failed: status={exc.code} body={raw}") + return body["access_token"] + + token = get_admin_token() + + status, clients = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/clients?clientId={urllib.parse.quote(target_client_id)}", + token, + ) + if status != 200 or not isinstance(clients, list) or not clients: + raise SystemExit(f"Unable to find target client {target_client_id!r} (status={status})") + + client_uuid = None + for item in clients: + if isinstance(item, dict) and item.get("clientId") == target_client_id: + client_uuid = item.get("id") + break + if not client_uuid: + raise SystemExit(f"Target client {target_client_id!r} has no id") + + status, client_rep = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/clients/{client_uuid}", + token, + ) + if status != 200 or not isinstance(client_rep, dict): + raise SystemExit(f"Unable to fetch client representation (status={status})") + + attrs = client_rep.get("attributes") or {} + updated = False + if attrs.get("oauth2.token.exchange.grant.enabled") != "true": + attrs["oauth2.token.exchange.grant.enabled"] = "true" + updated = True + client_rep["attributes"] = attrs + + if updated: + status, resp = http_json( + "PUT", + f"{base_url}/admin/realms/{realm}/clients/{client_uuid}", + token, + client_rep, + ) + if status not in (200, 204): + raise SystemExit(f"Client update failed (status={status}) resp={resp}") + + print(f"OK: ensured token exchange enabled on client {target_client_id}") + PY diff --git a/services/keycloak/portal-e2e-token-exchange-permissions-job.yaml b/services/keycloak/portal-e2e-token-exchange-permissions-job.yaml new file mode 100644 index 0000000..104d6f0 --- /dev/null +++ b/services/keycloak/portal-e2e-token-exchange-permissions-job.yaml @@ -0,0 +1,271 @@ +# services/keycloak/portal-e2e-token-exchange-permissions-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + 
name: keycloak-portal-e2e-token-exchange-permissions-5 + namespace: sso +spec: + backoffLimit: 6 + template: + spec: + restartPolicy: Never + containers: + - name: configure + image: python:3.11-alpine + env: + - name: KEYCLOAK_SERVER + value: http://keycloak.sso.svc.cluster.local + - name: KEYCLOAK_REALM + value: atlas + - name: KEYCLOAK_ADMIN_USER + valueFrom: + secretKeyRef: + name: keycloak-admin + key: username + - name: KEYCLOAK_ADMIN_PASSWORD + valueFrom: + secretKeyRef: + name: keycloak-admin + key: password + - name: PORTAL_E2E_CLIENT_ID + value: test-portal-e2e + - name: TARGET_CLIENT_ID + value: bstein-dev-home + command: ["/bin/sh", "-c"] + args: + - | + set -euo pipefail + python - <<'PY' + import json + import os + import re + import time + import urllib.parse + import urllib.error + import urllib.request + from typing import Any + + base_url = os.environ["KEYCLOAK_SERVER"].rstrip("/") + realm = os.environ["KEYCLOAK_REALM"] + admin_user = os.environ["KEYCLOAK_ADMIN_USER"] + admin_password = os.environ["KEYCLOAK_ADMIN_PASSWORD"] + e2e_client_id = os.environ["PORTAL_E2E_CLIENT_ID"] + target_client_id = os.environ["TARGET_CLIENT_ID"] + + uuid_re = re.compile(r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", re.IGNORECASE) + + def is_uuid(value: str) -> bool: + return bool(uuid_re.match(value)) + + def http_json(method: str, url: str, token: str, payload: Any | None = None): + data = None + headers = {"Authorization": f"Bearer {token}"} + if payload is not None: + data = json.dumps(payload).encode() + headers["Content-Type"] = "application/json" + req = urllib.request.Request(url, data=data, headers=headers, method=method) + try: + with urllib.request.urlopen(req, timeout=30) as resp: + body = resp.read() + if not body: + return resp.status, None + return resp.status, json.loads(body.decode()) + except urllib.error.HTTPError as exc: + raw = exc.read() + if not raw: + return exc.code, None + try: + return exc.code, json.loads(raw.decode()) 
+ except Exception: + return exc.code, {"raw": raw.decode(errors="replace")} + + def get_admin_token() -> str: + last_error: str | None = None + token_data = urllib.parse.urlencode( + { + "grant_type": "password", + "client_id": "admin-cli", + "username": admin_user, + "password": admin_password, + } + ).encode() + req = urllib.request.Request( + f"{base_url}/realms/master/protocol/openid-connect/token", + data=token_data, + headers={"Content-Type": "application/x-www-form-urlencoded"}, + method="POST", + ) + for attempt in range(1, 61): + try: + with urllib.request.urlopen(req, timeout=15) as resp: + body = json.loads(resp.read().decode()) + token = body.get("access_token") + if isinstance(token, str) and token: + return token + last_error = "missing access_token" + except urllib.error.HTTPError as exc: + # Treat transient startup errors as retryable. + if exc.code in (404, 429, 500, 502, 503, 504): + last_error = f"http {exc.code}" + else: + raise SystemExit(f"Token request failed: status={exc.code}") + except urllib.error.URLError as exc: + last_error = str(exc.reason) + time.sleep(2) + raise SystemExit(f"Token request failed after retries: {last_error}") + + def find_client_uuid(token: str, client_id: str) -> str: + status, clients = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/clients?clientId={urllib.parse.quote(client_id)}", + token, + ) + if status != 200 or not isinstance(clients, list) or not clients: + raise SystemExit(f"Unable to find client {client_id!r} (status={status})") + for item in clients: + if isinstance(item, dict) and item.get("clientId") == client_id and item.get("id"): + return item["id"] + raise SystemExit(f"Client {client_id!r} has no id") + + token = get_admin_token() + + rm_uuid = find_client_uuid(token, "realm-management") + e2e_uuid = find_client_uuid(token, e2e_client_id) + target_uuid = find_client_uuid(token, target_client_id) + + def enable_and_get_permissions(url: str) -> dict[str, Any]: + status, resp = 
http_json("PUT", url, token, {"enabled": True}) + if status not in (200, 204): + raise SystemExit(f"Failed enabling permissions at {url} (status={status}) resp={resp}") + status, perms = http_json("GET", url, token) + if status != 200 or not isinstance(perms, dict): + raise SystemExit(f"Failed reading permissions at {url} (status={status}) resp={perms}") + return perms + + users_perms = enable_and_get_permissions(f"{base_url}/admin/realms/{realm}/users-management-permissions") + users_scope_perms = users_perms.get("scopePermissions") or {} + if not isinstance(users_scope_perms, dict): + raise SystemExit("Users management permissions missing scopePermissions") + impersonate_perm_id = users_scope_perms.get("impersonate") or users_scope_perms.get("impersonation") + if not impersonate_perm_id: + keys = sorted(k for k in users_scope_perms.keys()) + raise SystemExit(f"Users permissions missing impersonate scope (have: {keys})") + + target_perms = enable_and_get_permissions( + f"{base_url}/admin/realms/{realm}/clients/{target_uuid}/management/permissions" + ) + target_scope_perms = target_perms.get("scopePermissions") or {} + if not isinstance(target_scope_perms, dict): + raise SystemExit("Target client permissions missing scopePermissions") + token_exchange_perm_id = target_scope_perms.get("token-exchange") + if not token_exchange_perm_id: + keys = sorted(k for k in target_scope_perms.keys()) + raise SystemExit(f"Target client permissions missing token-exchange scope (have: {keys})") + + policy_name = "test-portal-e2e-token-exchange" + policy_base_url = f"{base_url}/admin/realms/{realm}/clients/{rm_uuid}/authz/resource-server/policy" + + def find_policy_by_name(name: str): + urls = [ + f"{policy_base_url}/search?name={urllib.parse.quote(name)}&fields=id,name,type,config", + f"{policy_base_url}/search?name={urllib.parse.quote(name)}", + policy_base_url, + ] + for url in urls: + st, body = http_json("GET", url, token) + if st != 200: + continue + items = None + if 
isinstance(body, list): + items = body + elif isinstance(body, dict): + for key in ("policies", "items", "data"): + value = body.get(key) + if isinstance(value, list): + items = value + break + if not isinstance(items, list): + continue + for item in items: + if isinstance(item, dict) and item.get("name") == name and item.get("id"): + return item + return None + + policy = find_policy_by_name(policy_name) + + if policy is None: + create_rep: dict[str, Any] = { + "name": policy_name, + "type": "client", + "logic": "POSITIVE", + "decisionStrategy": "UNANIMOUS", + "config": {"clients": json.dumps([e2e_uuid])}, + } + status, created = http_json( + "POST", + policy_base_url, + token, + create_rep, + ) + if status == 201 and isinstance(created, dict) and created.get("id"): + policy = created + elif status == 409: + policy = find_policy_by_name(policy_name) + if policy is None: + raise SystemExit(f"Policy {policy_name!r} exists but could not be retrieved") + else: + raise SystemExit(f"Failed creating policy {policy_name!r} (status={status}) resp={created}") + + policy_id = policy.get("id") + if not isinstance(policy_id, str) or not policy_id: + raise SystemExit(f"Policy {policy_name!r} missing id") + + def patch_permission(permission_id: str): + candidates = [ + f"{base_url}/admin/realms/{realm}/clients/{rm_uuid}/authz/resource-server/permission/scope/{permission_id}", + f"{base_url}/admin/realms/{realm}/clients/{rm_uuid}/authz/resource-server/permission/resource/{permission_id}", + f"{base_url}/admin/realms/{realm}/clients/{rm_uuid}/authz/resource-server/permission/{permission_id}", + ] + perm = None + url_used = None + for url in candidates: + st, body = http_json("GET", url, token) + if st == 200 and isinstance(body, dict): + perm = body + url_used = url + break + if perm is None or url_used is None: + raise SystemExit(f"Unable to fetch permission {permission_id} via expected endpoints") + + policies_field = perm.get("policies") + if isinstance(policies_field, list): + 
policies_list = [p for p in policies_field if isinstance(p, str)] + else: + policies_list = [] + + use_ids = any(is_uuid(p) for p in policies_list) + entry = policy_id if use_ids else policy_name + if entry in policies_list: + return + + policies_list.append(entry) + perm["policies"] = policies_list + st, body = http_json("PUT", url_used, token, perm) + if st in (200, 201, 204): + return + + # Retry once with the other identifier form. + alt_entry = policy_name if entry == policy_id else policy_id + if alt_entry not in policies_list: + perm["policies"] = [p for p in policies_list if p != entry] + [alt_entry] + st2, body2 = http_json("PUT", url_used, token, perm) + if st2 in (200, 201, 204): + return + raise SystemExit(f"Failed updating permission {permission_id} (status={st2}) resp={body2}") + raise SystemExit(f"Failed updating permission {permission_id} (status={st}) resp={body}") + + patch_permission(str(impersonate_perm_id)) + patch_permission(str(token_exchange_perm_id)) + + print("OK: configured token exchange permissions for portal E2E client") + PY diff --git a/services/keycloak/portal-e2e-token-exchange-test-job.yaml b/services/keycloak/portal-e2e-token-exchange-test-job.yaml new file mode 100644 index 0000000..ab43303 --- /dev/null +++ b/services/keycloak/portal-e2e-token-exchange-test-job.yaml @@ -0,0 +1,52 @@ +# services/keycloak/portal-e2e-token-exchange-test-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: keycloak-portal-e2e-token-exchange-test-1 + namespace: sso +spec: + backoffLimit: 6 + ttlSecondsAfterFinished: 3600 + template: + spec: + restartPolicy: Never + containers: + - name: test + image: python:3.11-alpine + env: + - name: KEYCLOAK_SERVER + value: http://keycloak.sso.svc.cluster.local + - name: KEYCLOAK_REALM + value: atlas + - name: TARGET_CLIENT_ID + value: bstein-dev-home + - name: IMPERSONATE_USERNAME + value: robotuser + - name: RETRY_DEADLINE_SECONDS + value: "300" + - name: RETRY_INTERVAL_SECONDS + value: "5" + - name: 
PORTAL_E2E_CLIENT_ID + valueFrom: + secretKeyRef: + name: portal-e2e-client + key: client_id + - name: PORTAL_E2E_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: portal-e2e-client + key: client_secret + command: ["/bin/sh", "-c"] + args: + - | + set -euo pipefail + python /scripts/test_portal_token_exchange.py + volumeMounts: + - name: tests + mountPath: /scripts + readOnly: true + volumes: + - name: tests + configMap: + name: portal-e2e-tests + defaultMode: 0555 diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/realm-settings-job.yaml new file mode 100644 index 0000000..bdc816d --- /dev/null +++ b/services/keycloak/realm-settings-job.yaml @@ -0,0 +1,446 @@ +# services/keycloak/realm-settings-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: keycloak-realm-settings-16 + namespace: sso +spec: + backoffLimit: 0 + template: + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: hardware + operator: In + values: ["rpi5","rpi4"] + - key: node-role.kubernetes.io/worker + operator: Exists + restartPolicy: Never + containers: + - name: configure + image: python:3.11-alpine + env: + - name: KEYCLOAK_SERVER + value: http://keycloak.sso.svc.cluster.local + - name: KEYCLOAK_REALM + value: atlas + - name: KEYCLOAK_ADMIN_USER + valueFrom: + secretKeyRef: + name: keycloak-admin + key: username + - name: KEYCLOAK_ADMIN_PASSWORD + valueFrom: + secretKeyRef: + name: keycloak-admin + key: password + - name: KEYCLOAK_SMTP_HOST + value: mailu-front.mailu-mailserver.svc.cluster.local + - name: KEYCLOAK_SMTP_PORT + value: "25" + - name: KEYCLOAK_SMTP_FROM + value: no-reply@bstein.dev + - name: KEYCLOAK_SMTP_FROM_NAME + value: Atlas SSO + - name: KEYCLOAK_SMTP_REPLY_TO + value: no-reply@bstein.dev + - name: KEYCLOAK_SMTP_REPLY_TO_NAME + value: Atlas SSO + command: ["/bin/sh", "-c"] + args: + - | + set -euo pipefail + python - <<'PY' + import json + import os + import 
urllib.parse + import urllib.error + import urllib.request + + base_url = os.environ["KEYCLOAK_SERVER"].rstrip("/") + realm = os.environ["KEYCLOAK_REALM"] + admin_user = os.environ["KEYCLOAK_ADMIN_USER"] + admin_password = os.environ["KEYCLOAK_ADMIN_PASSWORD"] + + def http_json(method: str, url: str, token: str, payload=None): + data = None + headers = {"Authorization": f"Bearer {token}"} + if payload is not None: + data = json.dumps(payload).encode() + headers["Content-Type"] = "application/json" + req = urllib.request.Request(url, data=data, headers=headers, method=method) + try: + with urllib.request.urlopen(req, timeout=30) as resp: + body = resp.read() + if not body: + return resp.status, None + return resp.status, json.loads(body.decode()) + except urllib.error.HTTPError as exc: + raw = exc.read() + if not raw: + return exc.code, None + try: + return exc.code, json.loads(raw.decode()) + except Exception: + return exc.code, {"raw": raw.decode(errors="replace")} + + token_data = urllib.parse.urlencode( + { + "grant_type": "password", + "client_id": "admin-cli", + "username": admin_user, + "password": admin_password, + } + ).encode() + token_req = urllib.request.Request( + f"{base_url}/realms/master/protocol/openid-connect/token", + data=token_data, + headers={"Content-Type": "application/x-www-form-urlencoded"}, + method="POST", + ) + try: + with urllib.request.urlopen(token_req, timeout=10) as resp: + token_body = json.loads(resp.read().decode()) + except urllib.error.HTTPError as exc: + body = exc.read().decode(errors="replace") + raise SystemExit(f"Token request failed: status={exc.code} body={body}") + access_token = token_body["access_token"] + + # Update realm settings safely by fetching the full realm representation first. 
+ realm_url = f"{base_url}/admin/realms/{realm}" + status, realm_rep = http_json("GET", realm_url, access_token) + if status != 200 or not realm_rep: + raise SystemExit(f"Unable to fetch realm {realm} (status={status})") + + realm_rep["resetPasswordAllowed"] = True + + smtp = realm_rep.get("smtpServer") or {} + smtp.update( + { + "host": os.environ["KEYCLOAK_SMTP_HOST"], + "port": os.environ["KEYCLOAK_SMTP_PORT"], + "from": os.environ["KEYCLOAK_SMTP_FROM"], + "fromDisplayName": os.environ["KEYCLOAK_SMTP_FROM_NAME"], + "replyTo": os.environ["KEYCLOAK_SMTP_REPLY_TO"], + "replyToDisplayName": os.environ["KEYCLOAK_SMTP_REPLY_TO_NAME"], + "auth": "false", + "starttls": "false", + "ssl": "false", + } + ) + realm_rep["smtpServer"] = smtp + + status, _ = http_json("PUT", realm_url, access_token, realm_rep) + if status not in (200, 204): + raise SystemExit(f"Unexpected realm update response: {status}") + + # Ensure required custom user-profile attributes exist. + profile_url = f"{base_url}/admin/realms/{realm}/users/profile" + status, profile = http_json("GET", profile_url, access_token) + if status == 200 and isinstance(profile, dict): + attrs = profile.get("attributes") + if not isinstance(attrs, list): + attrs = [] + + required_attrs = [ + { + "name": "vaultwarden_email", + "displayName": "Vaultwarden Email", + "multivalued": False, + "annotations": {"group": "user-metadata"}, + "permissions": {"view": ["admin"], "edit": ["admin"]}, + "validations": {"email": {}, "length": {"max": 255}}, + }, + { + "name": "vaultwarden_status", + "displayName": "Vaultwarden Status", + "multivalued": False, + "annotations": {"group": "user-metadata"}, + "permissions": {"view": ["admin"], "edit": ["admin"]}, + "validations": {"length": {"max": 64}}, + }, + { + "name": "vaultwarden_synced_at", + "displayName": "Vaultwarden Last Synced", + "multivalued": False, + "annotations": {"group": "user-metadata"}, + "permissions": {"view": ["admin"], "edit": ["admin"]}, + "validations": {"length": 
{"max": 64}}, + }, + { + "name": "mailu_email", + "displayName": "Atlas Mailbox", + "multivalued": False, + "annotations": {"group": "user-metadata"}, + "permissions": {"view": ["admin"], "edit": ["admin"]}, + "validations": {"email": {}, "length": {"max": 255}}, + }, + { + "name": "mailu_app_password", + "displayName": "Atlas Mail App Password", + "multivalued": False, + "annotations": {"group": "user-metadata"}, + "permissions": {"view": ["admin"], "edit": ["admin"]}, + "validations": {"length": {"max": 255}}, + }, + { + "name": "nextcloud_mail_primary_email", + "displayName": "Nextcloud Mail Primary Email", + "multivalued": False, + "annotations": {"group": "user-metadata"}, + "permissions": {"view": ["admin"], "edit": ["admin"]}, + "validations": {"email": {}, "length": {"max": 255}}, + }, + { + "name": "nextcloud_mail_account_count", + "displayName": "Nextcloud Mail Account Count", + "multivalued": False, + "annotations": {"group": "user-metadata"}, + "permissions": {"view": ["admin"], "edit": ["admin"]}, + "validations": {"length": {"max": 32}}, + }, + { + "name": "nextcloud_mail_synced_at", + "displayName": "Nextcloud Mail Last Synced", + "multivalued": False, + "annotations": {"group": "user-metadata"}, + "permissions": {"view": ["admin"], "edit": ["admin"]}, + "validations": {"length": {"max": 64}}, + }, + ] + + def has_attr(name: str) -> bool: + return any(isinstance(item, dict) and item.get("name") == name for item in attrs) + + updated = False + for attr in required_attrs: + if not has_attr(attr.get("name", "")): + attrs.append(attr) + updated = True + + if updated: + profile["attributes"] = attrs + status, _ = http_json("PUT", profile_url, access_token, profile) + if status not in (200, 204): + raise SystemExit(f"Unexpected user-profile update response: {status}") + + def find_group(group_name: str): + status, groups = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/groups?search={urllib.parse.quote(group_name)}", + access_token, + ) + if 
status != 200 or not isinstance(groups, list): + return None + for item in groups: + if isinstance(item, dict) and item.get("name") == group_name: + return item + return None + + def ensure_group(group_name: str): + group = find_group(group_name) + if group: + return group + status, _ = http_json( + "POST", + f"{base_url}/admin/realms/{realm}/groups", + access_token, + {"name": group_name}, + ) + if status not in (201, 204): + raise SystemExit(f"Unexpected group create response for {group_name}: {status}") + return find_group(group_name) + + # Ensure basic realm groups exist for provisioning. + ensure_group("dev") + ensure_group("admin") + planka_group = ensure_group("planka-users") + + if planka_group and planka_group.get("id"): + group_id = planka_group["id"] + status, default_groups = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/default-groups", + access_token, + ) + default_ids = set() + if status == 200 and isinstance(default_groups, list): + for item in default_groups: + if isinstance(item, dict) and item.get("id"): + default_ids.add(item["id"]) + + if group_id not in default_ids: + status, _ = http_json( + "PUT", + f"{base_url}/admin/realms/{realm}/default-groups/{group_id}", + access_token, + ) + if status not in (200, 201, 204): + status, _ = http_json( + "POST", + f"{base_url}/admin/realms/{realm}/default-groups/{group_id}", + access_token, + ) + if status not in (200, 201, 204): + raise SystemExit( + f"Unexpected default-group update response for planka-users: {status}" + ) + + # Ensure all existing users are in the planka-users group. 
+ first = 0 + page_size = 100 + while True: + status, users = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/users?first={first}&max={page_size}", + access_token, + ) + if status != 200 or not isinstance(users, list) or not users: + break + for user in users: + user_id = user.get("id") if isinstance(user, dict) else None + if not user_id: + continue + status, groups = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/users/{user_id}/groups", + access_token, + ) + if status == 200 and isinstance(groups, list): + already = any(isinstance(g, dict) and g.get("id") == group_id for g in groups) + if already: + continue + status, _ = http_json( + "PUT", + f"{base_url}/admin/realms/{realm}/users/{user_id}/groups/{group_id}", + access_token, + ) + if status not in (200, 201, 204): + status, _ = http_json( + "POST", + f"{base_url}/admin/realms/{realm}/users/{user_id}/groups/{group_id}", + access_token, + ) + if status not in (200, 201, 204): + raise SystemExit( + f"Unexpected group membership update for user {user_id}: {status}" + ) + if len(users) < page_size: + break + first += page_size + + # Ensure Planka client exposes groups in userinfo for role mapping. 
+ status, clients = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/clients?clientId=planka", + access_token, + ) + planka_client = None + if status == 200 and isinstance(clients, list): + for item in clients: + if isinstance(item, dict) and item.get("clientId") == "planka": + planka_client = item + break + + if planka_client: + client_id = planka_client.get("id") + mapper_payload = { + "name": "groups", + "protocol": "openid-connect", + "protocolMapper": "oidc-group-membership-mapper", + "consentRequired": False, + "config": { + "full.path": "false", + "id.token.claim": "true", + "access.token.claim": "true", + "userinfo.token.claim": "true", + "claim.name": "groups", + "jsonType.label": "String", + }, + } + status, mappers = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models", + access_token, + ) + existing = None + if status == 200 and isinstance(mappers, list): + for item in mappers: + if isinstance(item, dict) and item.get("name") == mapper_payload["name"]: + existing = item + break + + if existing and existing.get("id"): + mapper_payload["id"] = existing["id"] + status, _ = http_json( + "PUT", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models/{existing['id']}", + access_token, + mapper_payload, + ) + if status not in (200, 204): + raise SystemExit(f"Unexpected protocol mapper update response: {status}") + else: + status, _ = http_json( + "POST", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models", + access_token, + mapper_payload, + ) + if status not in (201, 204): + raise SystemExit(f"Unexpected protocol mapper create response: {status}") + + # Ensure MFA is on by default for newly-created users. 
+ status, required_actions = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/authentication/required-actions", + access_token, + ) + if status == 200 and isinstance(required_actions, list): + for action in required_actions: + if not isinstance(action, dict): + continue + if action.get("alias") != "CONFIGURE_TOTP": + continue + if action.get("enabled") is True and action.get("defaultAction") is True: + break + action["enabled"] = True + action["defaultAction"] = True + status, _ = http_json( + "PUT", + f"{base_url}/admin/realms/{realm}/authentication/required-actions/CONFIGURE_TOTP", + access_token, + action, + ) + if status not in (200, 204): + raise SystemExit( + f"Unexpected required-action update response for CONFIGURE_TOTP: {status}" + ) + + # Disable Identity Provider Redirector in the browser flow for this realm. + status, executions = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/authentication/flows/browser/executions", + access_token, + ) + if status == 200 and executions: + for ex in executions: + if ex.get("providerId") != "identity-provider-redirector": + continue + if ex.get("requirement") == "DISABLED": + continue + ex["requirement"] = "DISABLED" + status, _ = http_json( + "PUT", + f"{base_url}/admin/realms/{realm}/authentication/flows/browser/executions", + access_token, + ex, + ) + if status not in (200, 204): + raise SystemExit( + f"Unexpected execution update response for identity-provider-redirector: {status}" + ) + PY diff --git a/services/keycloak/scripts/sso_portal_e2e_client_secret_sync.sh b/services/keycloak/scripts/sso_portal_e2e_client_secret_sync.sh new file mode 100755 index 0000000..bf944ca --- /dev/null +++ b/services/keycloak/scripts/sso_portal_e2e_client_secret_sync.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +set -euo pipefail + +SOURCE_NAMESPACE="${SOURCE_NAMESPACE:-sso}" +DEST_NAMESPACE="${DEST_NAMESPACE:-bstein-dev-home}" +SECRET_NAME="${SECRET_NAME:-portal-e2e-client}" + +client_id="$(kubectl -n 
"${SOURCE_NAMESPACE}" get secret "${SECRET_NAME}" -o jsonpath='{.data.client_id}')" +client_secret="$(kubectl -n "${SOURCE_NAMESPACE}" get secret "${SECRET_NAME}" -o jsonpath='{.data.client_secret}')" + +cat </dev/null +apiVersion: v1 +kind: Secret +metadata: + name: ${SECRET_NAME} +type: Opaque +data: + client_id: ${client_id} + client_secret: ${client_secret} +EOF diff --git a/services/keycloak/scripts/tests/test_keycloak_execute_actions_email.py b/services/keycloak/scripts/tests/test_keycloak_execute_actions_email.py new file mode 100644 index 0000000..7d89a2e --- /dev/null +++ b/services/keycloak/scripts/tests/test_keycloak_execute_actions_email.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python3 +import json +import os +import sys +import time +import urllib.error +import urllib.parse +import urllib.request + + +def _require_env(name: str) -> str: + value = os.environ.get(name) + if not value: + raise SystemExit(f"missing required env var: {name}") + return value + + +def _post_form(url: str, data: dict[str, str], timeout_s: int = 30) -> dict: + body = urllib.parse.urlencode(data).encode() + req = urllib.request.Request( + url, + data=body, + headers={"Content-Type": "application/x-www-form-urlencoded"}, + method="POST", + ) + try: + with urllib.request.urlopen(req, timeout=timeout_s) as resp: + payload = resp.read().decode() + return json.loads(payload) if payload else {} + except urllib.error.HTTPError as exc: + raw = exc.read().decode(errors="replace") + raise SystemExit(f"HTTP {exc.code} from {url}: {raw}") + + +def _request_json(method: str, url: str, token: str, payload: object | None = None, timeout_s: int = 30) -> tuple[int, object | None]: + data = None + headers = {"Authorization": f"Bearer {token}"} + if payload is not None: + data = json.dumps(payload).encode() + headers["Content-Type"] = "application/json" + req = urllib.request.Request(url, data=data, headers=headers, method=method) + try: + with urllib.request.urlopen(req, timeout=timeout_s) as resp: + 
def main() -> int:
    """Keycloak SMTP end-to-end probe.

    Ensures a dedicated probe user exists (creating or repairing it as
    needed) and then asks Keycloak to send an execute-actions email, which
    exercises the realm's SMTP configuration end to end.

    Returns 0 on success; raises SystemExit with a diagnostic otherwise.
    """
    keycloak_base = _require_env("KEYCLOAK_SERVER").rstrip("/")
    realm = os.environ.get("KEYCLOAK_REALM", "atlas")
    client_id = _require_env("PORTAL_E2E_CLIENT_ID")
    client_secret = _require_env("PORTAL_E2E_CLIENT_SECRET")

    probe_username = os.environ.get("E2E_PROBE_USERNAME", "e2e-smtp-probe")
    probe_email = os.environ.get("E2E_PROBE_EMAIL", "e2e-smtp-probe@bstein.dev")

    execute_client_id = os.environ.get("EXECUTE_ACTIONS_CLIENT_ID", "bstein-dev-home")
    execute_redirect_uri = os.environ.get("EXECUTE_ACTIONS_REDIRECT_URI", "https://bstein.dev/")

    token_url = f"{keycloak_base}/realms/{realm}/protocol/openid-connect/token"
    admin_users_url = f"{keycloak_base}/admin/realms/{realm}/users"

    def get_access_token() -> str:
        # Client-credentials grant for the e2e service account.
        token_payload = _post_form(
            token_url,
            {"grant_type": "client_credentials", "client_id": client_id, "client_secret": client_secret},
            timeout_s=30,
        )
        access_token = token_payload.get("access_token")
        if not isinstance(access_token, str) or not access_token:
            raise SystemExit("client credentials token missing access_token")
        return access_token

    access_token = get_access_token()

    def admin_call(method: str, url: str, payload=None, attempts: int = 5):
        """Issue one admin API call, retrying on 403 with a fresh token.

        403s can be transient right after realm/role provisioning, so sleep,
        refresh the token, and retry up to `attempts` times; any other status
        (or a final 403) is returned to the caller for validation.
        """
        nonlocal access_token
        status, body = None, None
        for attempt in range(1, attempts + 1):
            status, body = _request_json(method, url, access_token, payload, timeout_s=30)
            if status == 403 and attempt < attempts:
                time.sleep(3)
                access_token = get_access_token()
                continue
            break
        return status, body

    search_url = f"{admin_users_url}?{urllib.parse.urlencode({'username': probe_username, 'exact': 'true'})}"
    status, body = admin_call("GET", search_url, attempts=10)
    if status != 200 or not isinstance(body, list):
        raise SystemExit(f"unexpected admin API response when searching for probe user (status={status} body={body})")
    users = body

    if not users:
        create_payload = {
            "username": probe_username,
            "enabled": True,
            "email": probe_email,
            "emailVerified": True,
        }
        status, body = admin_call("POST", admin_users_url, create_payload, attempts=5)
        if status not in (201, 204):
            raise SystemExit(f"unexpected status creating probe user: {status} body={body}")

        # Refetch so we learn the server-assigned user id.
        status, body = admin_call("GET", search_url, attempts=10)
        if status != 200 or not isinstance(body, list) or not body:
            raise SystemExit(f"failed to refetch probe user after creation (status={status} body={body})")
        users = body

    user_id = users[0].get("id")
    if not isinstance(user_id, str) or not user_id:
        raise SystemExit("probe user missing id")

    # execute-actions-email requires the user to be enabled and have an email configured.
    user_url = f"{admin_users_url}/{urllib.parse.quote(user_id)}"
    status, user = admin_call("GET", user_url, attempts=5)
    if status != 200 or not isinstance(user, dict):
        raise SystemExit(f"unexpected status fetching probe user: {status} body={user}")

    needs_update = False
    if user.get("enabled") is False:
        user["enabled"] = True
        needs_update = True
    if user.get("email") != probe_email:
        user["email"] = probe_email
        needs_update = True
    if user.get("emailVerified") is not True:
        user["emailVerified"] = True
        needs_update = True

    if needs_update:
        status, body = admin_call("PUT", user_url, user, attempts=5)
        if status not in (200, 204):
            raise SystemExit(f"unexpected status updating probe user: {status} body={body}")

    # Trigger an email to validate Keycloak SMTP integration.
    query = urllib.parse.urlencode(
        {
            "client_id": execute_client_id,
            "redirect_uri": execute_redirect_uri,
            "lifespan": "600",
        }
    )
    status, body = admin_call("PUT", f"{user_url}/execute-actions-email?{query}", ["UPDATE_PASSWORD"], attempts=5)
    if status != 204:
        raise SystemExit(f"unexpected status from execute-actions-email: {status} body={body}")

    print("PASS: Keycloak execute-actions-email succeeded")
    return 0


if __name__ == "__main__":
    sys.exit(main())
-> object: + req = urllib.request.Request(url, headers={"Authorization": f"Bearer {token}"}, method="GET") + try: + with urllib.request.urlopen(req, timeout=timeout_s) as resp: + payload = resp.read().decode() + return json.loads(payload) if payload else None + except urllib.error.HTTPError as exc: + raw = exc.read().decode(errors="replace") + raise SystemExit(f"HTTP {exc.code} from {url}: {raw}") + + +def _decode_jwt_without_verification(jwt: str) -> dict: + parts = jwt.split(".") + if len(parts) < 2: + return {} + padded = parts[1] + "=" * (-len(parts[1]) % 4) + try: + return json.loads(base64.urlsafe_b64decode(padded.encode()).decode()) + except Exception: + return {} + +def _is_retryable_failure(message: str) -> bool: + retryable_markers = ( + "HTTP 401 ", + "HTTP 403 ", + "HTTP 404 ", + "HTTP 409 ", + "HTTP 429 ", + "HTTP 500 ", + "HTTP 502 ", + "HTTP 503 ", + "HTTP 504 ", + "timed out", + "Temporary failure", + "Connection refused", + ) + return any(marker in message for marker in retryable_markers) + + +def main() -> int: + keycloak_base = _require_env("KEYCLOAK_SERVER").rstrip("/") + realm = os.environ.get("KEYCLOAK_REALM", "atlas") + client_id = _require_env("PORTAL_E2E_CLIENT_ID") + client_secret = _require_env("PORTAL_E2E_CLIENT_SECRET") + target_client_id = os.environ.get("TARGET_CLIENT_ID", "bstein-dev-home") + impersonate_username = os.environ.get("IMPERSONATE_USERNAME", "robotuser") + + token_url = f"{keycloak_base}/realms/{realm}/protocol/openid-connect/token" + admin_users_url = f"{keycloak_base}/admin/realms/{realm}/users" + + def run_once() -> None: + token_payload = _post_form( + token_url, + {"grant_type": "client_credentials", "client_id": client_id, "client_secret": client_secret}, + ) + access_token = token_payload.get("access_token") + if not isinstance(access_token, str) or not access_token: + raise SystemExit("client credentials token missing access_token") + + users = _get_json( + f"{admin_users_url}?{urllib.parse.urlencode({'username': 
impersonate_username, 'exact': 'true'})}", + access_token, + ) + if not isinstance(users, list) or not users: + raise SystemExit(f"unable to locate user {impersonate_username!r} via admin API") + user_id = users[0].get("id") + if not isinstance(user_id, str) or not user_id: + raise SystemExit(f"user {impersonate_username!r} missing id") + + exchange_payload = _post_form( + token_url, + { + "grant_type": "urn:ietf:params:oauth:grant-type:token-exchange", + "client_id": client_id, + "client_secret": client_secret, + "subject_token": access_token, + "requested_subject": user_id, + "audience": target_client_id, + }, + ) + exchanged = exchange_payload.get("access_token") + if not isinstance(exchanged, str) or not exchanged: + raise SystemExit("token exchange response missing access_token") + + claims = _decode_jwt_without_verification(exchanged) + aud = claims.get("aud") + if aud is None: + raise SystemExit("token exchange access_token missing aud claim") + if isinstance(aud, str): + aud_ok = aud == target_client_id + elif isinstance(aud, list): + aud_ok = target_client_id in aud + else: + aud_ok = False + if not aud_ok: + raise SystemExit(f"token exchange aud mismatch (expected {target_client_id!r})") + + deadline_seconds = int(os.environ.get("RETRY_DEADLINE_SECONDS", "300")) + retry_interval_seconds = int(os.environ.get("RETRY_INTERVAL_SECONDS", "5")) + deadline_at = time.monotonic() + deadline_seconds + last_error: str | None = None + + while True: + try: + run_once() + print("PASS: token exchange works") + return 0 + except SystemExit as exc: + message = str(exc) + last_error = message or last_error + if time.monotonic() >= deadline_at: + raise + if not _is_retryable_failure(message): + raise + time.sleep(retry_interval_seconds) + except Exception as exc: + last_error = str(exc) or last_error + if time.monotonic() >= deadline_at: + raise SystemExit(str(exc)) + time.sleep(retry_interval_seconds) + + +if __name__ == "__main__": + sys.exit(main()) diff --git 
# services/keycloak/synapse-oidc-secret-ensure-job.yaml
# Copies the `synapse` client secret from Keycloak into the comms namespace
# as Secret synapse-oidc, only if it does not already exist.
apiVersion: batch/v1
kind: Job
metadata:
  name: synapse-oidc-secret-ensure-4
  namespace: sso
spec:
  backoffLimit: 0
  ttlSecondsAfterFinished: 3600
  template:
    spec:
      serviceAccountName: mas-secrets-ensure
      restartPolicy: Never
      containers:
        - name: apply
          image: alpine:3.20
          command: ["/bin/sh", "-c"]
          args:
            - |
              set -euo pipefail
              apk add --no-cache curl jq kubectl >/dev/null

              # Idempotency guard first: if the secret already exists we are
              # done and do not depend on Keycloak being reachable at all.
              existing="$(kubectl -n comms get secret synapse-oidc -o jsonpath='{.data.client-secret}' 2>/dev/null || true)"
              if [ -n "${existing}" ]; then
                exit 0
              fi

              KC_URL="http://keycloak.sso.svc.cluster.local"
              ACCESS_TOKEN=""
              # Password grant against admin-cli with linear backoff; Keycloak
              # may still be warming up when this Job starts.
              for attempt in 1 2 3 4 5; do
                TOKEN_JSON="$(curl -sS -X POST "$KC_URL/realms/master/protocol/openid-connect/token" \
                  -H 'Content-Type: application/x-www-form-urlencoded' \
                  -d "grant_type=password" \
                  -d "client_id=admin-cli" \
                  -d "username=${KEYCLOAK_ADMIN}" \
                  -d "password=${KEYCLOAK_ADMIN_PASSWORD}" || true)"
                ACCESS_TOKEN="$(echo "$TOKEN_JSON" | jq -r '.access_token' 2>/dev/null || true)"
                if [ -n "$ACCESS_TOKEN" ] && [ "$ACCESS_TOKEN" != "null" ]; then
                  break
                fi
                echo "Keycloak token request failed (attempt ${attempt})" >&2
                sleep $((attempt * 2))
              done
              if [ -z "$ACCESS_TOKEN" ] || [ "$ACCESS_TOKEN" = "null" ]; then
                echo "Failed to fetch Keycloak admin token" >&2
                exit 1
              fi

              # Resolve the internal client UUID, then read its secret.
              CLIENT_ID="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
                "$KC_URL/admin/realms/atlas/clients?clientId=synapse" | jq -r '.[0].id' 2>/dev/null || true)"
              if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then
                echo "Keycloak client synapse not found" >&2
                exit 1
              fi
              CLIENT_SECRET="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
                "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/client-secret" | jq -r '.value' 2>/dev/null || true)"
              if [ -z "$CLIENT_SECRET" ] || [ "$CLIENT_SECRET" = "null" ]; then
                echo "Keycloak client secret not found" >&2
                exit 1
              fi

              kubectl -n comms create secret generic synapse-oidc \
                --from-literal=client-secret="${CLIENT_SECRET}" \
                --dry-run=client -o yaml | kubectl -n comms apply -f - >/dev/null
          env:
            - name: KEYCLOAK_ADMIN
              valueFrom:
                secretKeyRef:
                  name: keycloak-admin
                  key: username
            - name: KEYCLOAK_ADMIN_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: keycloak-admin
                  key: password
---
# services/keycloak/user-overrides-job.yaml
# Sets the mailu_email attribute on a single Keycloak user; exits 0 when the
# override is already in place.
apiVersion: batch/v1
kind: Job
metadata:
  name: keycloak-user-overrides-1
  namespace: sso
spec:
  backoffLimit: 0
  template:
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: hardware
                    operator: In
                    values: ["rpi5", "rpi4"]
                  - key: node-role.kubernetes.io/worker
                    operator: Exists
      restartPolicy: Never
      containers:
        - name: configure
          image: python:3.11-alpine
          env:
            - name: KEYCLOAK_SERVER
              value: http://keycloak.sso.svc.cluster.local
            - name: KEYCLOAK_REALM
              value: atlas
            - name: KEYCLOAK_ADMIN_USER
              valueFrom:
                secretKeyRef:
                  name: keycloak-admin
                  key: username
            - name: KEYCLOAK_ADMIN_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: keycloak-admin
                  key: password
            - name: OVERRIDE_USERNAME
              value: bstein
            - name: OVERRIDE_MAILU_EMAIL
              value: brad@bstein.dev
          command: ["/bin/sh", "-c"]
          args:
            - |
              set -euo pipefail
              python - <<'PY'
              import json
              import os
              import urllib.parse
              import urllib.error
              import urllib.request

              base_url = os.environ["KEYCLOAK_SERVER"].rstrip("/")
              realm = os.environ["KEYCLOAK_REALM"]
              admin_user = os.environ["KEYCLOAK_ADMIN_USER"]
              admin_password = os.environ["KEYCLOAK_ADMIN_PASSWORD"]

              override_username = os.environ["OVERRIDE_USERNAME"].strip()
              override_mailu_email = os.environ["OVERRIDE_MAILU_EMAIL"].strip()
              if not override_username or not override_mailu_email:
                  raise SystemExit("Missing override inputs")

              def http_json(method: str, url: str, token: str, payload=None):
                  # Minimal JSON-over-HTTP helper; returns (status, body-or-None).
                  data = None
                  headers = {"Authorization": f"Bearer {token}"}
                  if payload is not None:
                      data = json.dumps(payload).encode()
                      headers["Content-Type"] = "application/json"
                  req = urllib.request.Request(url, data=data, headers=headers, method=method)
                  try:
                      with urllib.request.urlopen(req, timeout=30) as resp:
                          body = resp.read()
                          if not body:
                              return resp.status, None
                          return resp.status, json.loads(body.decode())
                  except urllib.error.HTTPError as exc:
                      raw = exc.read()
                      if not raw:
                          return exc.code, None
                      try:
                          return exc.code, json.loads(raw.decode())
                      except Exception:
                          return exc.code, {"raw": raw.decode(errors="replace")}

              # Admin token via password grant against the master realm.
              token_data = urllib.parse.urlencode(
                  {
                      "grant_type": "password",
                      "client_id": "admin-cli",
                      "username": admin_user,
                      "password": admin_password,
                  }
              ).encode()
              token_req = urllib.request.Request(
                  f"{base_url}/realms/master/protocol/openid-connect/token",
                  data=token_data,
                  headers={"Content-Type": "application/x-www-form-urlencoded"},
                  method="POST",
              )
              try:
                  with urllib.request.urlopen(token_req, timeout=10) as resp:
                      token_body = json.loads(resp.read().decode())
              except urllib.error.HTTPError as exc:
                  body = exc.read().decode(errors="replace")
                  raise SystemExit(f"Token request failed: status={exc.code} body={body}")
              access_token = token_body["access_token"]

              # Find target user id.
              status, users = http_json(
                  "GET",
                  f"{base_url}/admin/realms/{realm}/users?username={urllib.parse.quote(override_username)}&exact=true&max=1",
                  access_token,
              )
              if status != 200 or not isinstance(users, list) or not users:
                  raise SystemExit(f"User not found: {override_username}")
              user = users[0] if isinstance(users[0], dict) else None
              user_id = (user or {}).get("id") or ""
              if not user_id:
                  raise SystemExit("User id missing")

              # Fetch the full representation so the update can send it back intact.
              status, full = http_json("GET", f"{base_url}/admin/realms/{realm}/users/{user_id}", access_token)
              if status != 200 or not isinstance(full, dict):
                  raise SystemExit("Unable to fetch user")

              attrs = full.get("attributes") or {}
              if not isinstance(attrs, dict):
                  attrs = {}
              existing = attrs.get("mailu_email")
              # SystemExit(0) == success: override already in place.
              if isinstance(existing, list) and existing and existing[0] == override_mailu_email:
                  raise SystemExit(0)
              if isinstance(existing, str) and existing == override_mailu_email:
                  raise SystemExit(0)

              attrs["mailu_email"] = [override_mailu_email]
              # Send the complete fetched representation rather than a partial
              # {"attributes": ...} body: partial UserRepresentation updates can
              # drop fields on some Keycloak versions.
              full["attributes"] = attrs
              status, _ = http_json(
                  "PUT",
                  f"{base_url}/admin/realms/{realm}/users/{user_id}",
                  access_token,
                  full,
              )
              if status not in (200, 204):
                  raise SystemExit(f"Unexpected user update response: {status}")
              PY
"1500m" + memory: "2Gi" +""" + } + } + parameters { + string(name: 'HARBOR_REPO', defaultValue: 'registry.bstein.dev/streaming/data-prepper', description: 'Docker repository for Data Prepper') + string(name: 'IMAGE_TAG', defaultValue: '2.8.0', description: 'Image tag to publish') + booleanParam(name: 'PUSH_LATEST', defaultValue: true, description: 'Also push the latest tag') + } + stages { + stage('Checkout') { + steps { + container('git') { + checkout scm + } + } + } + stage('Build & Push') { + steps { + container('kaniko') { + withCredentials([usernamePassword(credentialsId: 'harbor-robot', usernameVariable: 'HARBOR_USERNAME', passwordVariable: 'HARBOR_PASSWORD')]) { + sh ''' + set -euo pipefail + if [ -z "${HARBOR_REPO:-}" ] || [ "${HARBOR_REPO}" = "registry.bstein.dev/monitoring/data-prepper" ]; then + HARBOR_REPO="registry.bstein.dev/streaming/data-prepper" + fi + mkdir -p /kaniko/.docker + ref_host="$(echo "${HARBOR_REPO}" | cut -d/ -f1)" + auth="$(printf "%s:%s" "${HARBOR_USERNAME}" "${HARBOR_PASSWORD}" | base64 | tr -d '\\n')" + cat > /kaniko/.docker/config.json </dev/null + + OSD_URL="http://opensearch-dashboards.logging.svc.cluster.local:5601" + for attempt in $(seq 1 60); do + code="$(curl -s -o /dev/null -w "%{http_code}" "${OSD_URL}/api/status" || true)" + if [ "${code}" = "200" ]; then + break + fi + sleep 5 + done + + if ! curl -s -o /dev/null -w "%{http_code}" "${OSD_URL}/api/status" | grep -q "200"; then + echo "OpenSearch Dashboards did not become ready in time" >&2 + exit 1 + fi + + if [ ! 
# services/logging/opensearch-helmrelease.yaml
# Single-node OpenSearch for cluster log storage, pinned to an rpi5 worker.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: opensearch
  namespace: logging
spec:
  interval: 15m
  chart:
    spec:
      chart: opensearch
      version: "~2.36.0"
      sourceRef:
        kind: HelmRepository
        name: opensearch
        namespace: flux-system
  values:
    fullnameOverride: opensearch
    clusterName: opensearch
    nodeGroup: master
    masterService: opensearch-master
    # Single-node cluster: one replica, no quorum to maintain.
    singleNode: true
    replicas: 1
    minimumMasterNodes: 1
    # Heap sized at half the 2Gi container memory limit.
    opensearchJavaOpts: "-Xms1g -Xmx1g"
    resources:
      requests:
        cpu: "500m"
        memory: "2Gi"
      limits:
        memory: "2Gi"
    persistence:
      enabled: true
      storageClass: asteria
      size: 500Gi
    config:
      opensearch.yml: |
        cluster.name: opensearch
        network.host: 0.0.0.0
        discovery.type: single-node
        # NOTE(review): the security plugin is disabled, so this endpoint must
        # only ever be reachable from inside the cluster network.
        plugins.security.disabled: true
        node.store.allow_mmap: false
    nodeSelector:
      node-role.kubernetes.io/worker: "true"
      hardware: rpi5
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: hardware
                  operator: In
                  values:
                    - rpi5
    sysctlInit:
      enabled: true
---
# services/logging/opensearch-ism-job.yaml
# Installs ISM retention policies and index templates for log indices.
apiVersion: batch/v1
kind: Job
metadata:
  name: opensearch-ism-setup-5
  namespace: logging
spec:
  backoffLimit: 3
  ttlSecondsAfterFinished: 3600
  template:
    spec:
      restartPolicy: OnFailure
      nodeSelector:
        node-role.kubernetes.io/worker: "true"
        hardware: rpi5
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: hardware
                    operator: In
                    values:
                      - rpi5
      containers:
        - name: apply
          image: alpine:3.20
          command: ["/bin/sh", "-c"]
          args:
            - |
              set -euo pipefail
              apk add --no-cache curl >/dev/null

              OS_URL="http://opensearch-master.logging.svc.cluster.local:9200"
              ready=""
              for attempt in $(seq 1 60); do
                if curl -s -o /dev/null -w "%{http_code}" "${OS_URL}" | grep -q "200"; then
                  ready=yes
                  break
                fi
                sleep 5
              done
              if [ -z "${ready}" ]; then
                echo "OpenSearch did not become ready in time" >&2
                exit 1
              fi

              # PUT a JSON document and fail loudly on unexpected HTTP status.
              # (Previously errors were discarded with >/dev/null, so the Job
              # reported success even when policy creation failed.)
              # $1=url $2=payload $3=extra tolerated status, e.g. 409 when an
              # ISM policy already exists and cannot be re-PUT without seq_no.
              put_json() {
                code="$(curl -sS -o /tmp/resp.json -w "%{http_code}" -X PUT "$1" \
                  -H 'Content-Type: application/json' -d "$2")"
                case "${code}" in
                  2??) return 0 ;;
                esac
                if [ -n "${3:-}" ] && [ "${code}" = "${3}" ]; then
                  return 0
                fi
                echo "PUT $1 failed with status ${code}:" >&2
                cat /tmp/resp.json >&2
                exit 1
              }

              policy='{"policy":{"description":"Delete logs after 180 days","schema_version":1,"default_state":"hot","states":[{"name":"hot","actions":[],"transitions":[{"state_name":"delete","conditions":{"min_index_age":"180d"}}]},{"name":"delete","actions":[{"delete":{}}],"transitions":[]}]}}'
              put_json "${OS_URL}/_plugins/_ism/policies/logging-180d" "${policy}" 409

              trace_policy='{"policy":{"description":"Delete trace analytics after 30 days","schema_version":1,"default_state":"hot","states":[{"name":"hot","actions":[],"transitions":[{"state_name":"delete","conditions":{"min_index_age":"30d"}}]},{"name":"delete","actions":[{"delete":{}}],"transitions":[]}]}}'
              put_json "${OS_URL}/_plugins/_ism/policies/trace-analytics-30d" "${trace_policy}" 409

              kube_template='{"index_patterns":["kube-*"],"priority":200,"template":{"settings":{"index.number_of_shards":1,"index.number_of_replicas":0,"index.refresh_interval":"30s","plugins.index_state_management.policy_id":"logging-180d"},"mappings":{"properties":{"@timestamp":{"type":"date"}}}}}'
              put_json "${OS_URL}/_index_template/kube-logs" "${kube_template}"

              journal_template='{"index_patterns":["journald-*"],"priority":200,"template":{"settings":{"index.number_of_shards":1,"index.number_of_replicas":0,"index.refresh_interval":"30s","plugins.index_state_management.policy_id":"logging-180d"},"mappings":{"properties":{"@timestamp":{"type":"date"}}}}}'
              put_json "${OS_URL}/_index_template/journald-logs" "${journal_template}"

              trace_template='{"index_patterns":["trace-analytics-*"],"priority":200,"template":{"settings":{"index.number_of_shards":1,"index.number_of_replicas":0,"index.refresh_interval":"30s","plugins.index_state_management.policy_id":"trace-analytics-30d"}}}'
              put_json "${OS_URL}/_index_template/trace-analytics" "${trace_template}"

              # Tolerate 404: no indices exist yet on a fresh cluster.
              put_json "${OS_URL}/_all/_settings" '{"index":{"number_of_replicas":0}}' 404
"baseQuery": "source = kube-* | where kubernetes.namespace_name = 'mailu-mailserver'", + "servicesEntities": [], + "traceGroups": [ + "mailu" + ] + }, + { + "name": "nextcloud", + "description": "", + "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'nextcloud'", + "servicesEntities": [], + "traceGroups": [ + "nextcloud" + ] + }, + { + "name": "gitea", + "description": "", + "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'gitea'", + "servicesEntities": [], + "traceGroups": [ + "gitea" + ] + }, + { + "name": "jenkins", + "description": "", + "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'jenkins'", + "servicesEntities": [], + "traceGroups": [ + "jenkins" + ] + }, + { + "name": "harbor", + "description": "", + "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'harbor'", + "servicesEntities": [], + "traceGroups": [ + "harbor" + ] + }, + { + "name": "vault", + "description": "", + "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'vault'", + "servicesEntities": [], + "traceGroups": [ + "vault" + ] + }, + { + "name": "keycloak", + "description": "", + "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'sso'", + "servicesEntities": [], + "traceGroups": [ + "keycloak" + ] + }, + { + "name": "flux-system", + "description": "", + "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'flux-system'", + "servicesEntities": [], + "traceGroups": [ + "flux-system" + ] + }, + { + "name": "comms", + "description": "", + "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'comms'", + "servicesEntities": [], + "traceGroups": [ + "comms" + ] + }, + { + "name": "element-web", + "description": "", + "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'comms' and kubernetes.container_name = 'element-web'", + "servicesEntities": [], + "traceGroups": [ + "element-web" + ] + }, + { + "name": "element-call", + "description": "", + "baseQuery": "source 
= kube-* | where kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'element-call'", + "servicesEntities": [], + "traceGroups": [ + "element-call" + ] + }, + { + "name": "matrix-synapse", + "description": "", + "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'comms' and kubernetes.container_name = 'synapse'", + "servicesEntities": [], + "traceGroups": [ + "matrix-synapse" + ] + }, + { + "name": "livekit", + "description": "", + "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'livekit'", + "servicesEntities": [], + "traceGroups": [ + "livekit" + ] + }, + { + "name": "coturn", + "description": "", + "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'coturn'", + "servicesEntities": [], + "traceGroups": [ + "coturn" + ] + }, + { + "name": "lesavka", + "description": "", + "baseQuery": "source = journald-* | where _HOSTNAME = 'titan-jh'", + "servicesEntities": [], + "traceGroups": [ + "lesavka" + ] + } + ] + saved_queries.json: | + [ + { + "name": "kube logs", + "description": "", + "query": "source = kube-*", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "kube errors", + "description": "", + "query": "source = kube-* | where match(log, 'error|exception|fail') or match(message, 'error|exception|fail')", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "journald logs", + "description": "", + "query": "source = journald-*", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": 
"timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "journald errors", + "description": "", + "query": "source = journald-* | where match(MESSAGE, 'error|exception|fail')", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "bstein-dev-home logs", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'bstein-dev-home'", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "bstein-dev-home errors", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'bstein-dev-home' | where match(log, 'error|exception|fail') or match(message, 'error|exception|fail')", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "pegasus logs", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'jellyfin' and kubernetes.labels.app = 'pegasus'", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "pegasus errors", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'jellyfin' and kubernetes.labels.app = 'pegasus' | where match(log, 'error|exception|fail') or match(message, 'error|exception|fail')", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + 
"name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "jellyfin logs", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'jellyfin' and kubernetes.labels.app = 'jellyfin'", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "jellyfin errors", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'jellyfin' and kubernetes.labels.app = 'jellyfin' | where match(log, 'error|exception|fail') or match(message, 'error|exception|fail')", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "vaultwarden logs", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'vaultwarden'", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "vaultwarden errors", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'vaultwarden' | where match(log, 'error|exception|fail') or match(message, 'error|exception|fail')", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "mailu logs", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'mailu-mailserver'", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + 
"selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "mailu errors", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'mailu-mailserver' | where match(log, 'error|exception|fail') or match(message, 'error|exception|fail')", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "nextcloud logs", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'nextcloud'", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "nextcloud errors", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'nextcloud' | where match(log, 'error|exception|fail') or match(message, 'error|exception|fail')", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "gitea logs", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'gitea'", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "gitea errors", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'gitea' | where match(log, 'error|exception|fail') or match(message, 'error|exception|fail')", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": 
"" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "jenkins logs", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'jenkins'", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "jenkins errors", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'jenkins' | where match(log, 'error|exception|fail') or match(message, 'error|exception|fail')", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "harbor logs", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'harbor'", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "harbor errors", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'harbor' | where match(log, 'error|exception|fail') or match(message, 'error|exception|fail')", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "vault logs", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'vault'", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + 
"selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "vault errors", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'vault' | where match(log, 'error|exception|fail') or match(message, 'error|exception|fail')", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "keycloak logs", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'sso'", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "keycloak errors", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'sso' | where match(log, 'error|exception|fail') or match(message, 'error|exception|fail')", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "flux-system logs", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'flux-system'", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "flux-system errors", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'flux-system' | where match(log, 'error|exception|fail') or match(message, 'error|exception|fail')", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" 
+ }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "comms logs", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'comms'", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "comms errors", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'comms' | where match(log, 'error|exception|fail') or match(message, 'error|exception|fail')", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "element-web logs", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'comms' and kubernetes.container_name = 'element-web'", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "element-web errors", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'comms' and kubernetes.container_name = 'element-web' | where match(log, 'error|exception|fail') or match(message, 'error|exception|fail')", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "element-call logs", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'element-call'", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": 
{ + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "element-call errors", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'element-call' | where match(log, 'error|exception|fail') or match(message, 'error|exception|fail')", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "matrix-synapse logs", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'comms' and kubernetes.container_name = 'synapse'", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "matrix-synapse errors", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'comms' and kubernetes.container_name = 'synapse' | where match(log, 'error|exception|fail') or match(message, 'error|exception|fail')", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "livekit logs", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'livekit'", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "livekit errors", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'comms' and 
kubernetes.labels.app = 'livekit' | where match(log, 'error|exception|fail') or match(message, 'error|exception|fail')", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "coturn logs", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'coturn'", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "coturn errors", + "description": "", + "query": "source = kube-* | where kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'coturn' | where match(log, 'error|exception|fail') or match(message, 'error|exception|fail')", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "lesavka logs", + "description": "", + "query": "source = journald-* | where _HOSTNAME = 'titan-jh'", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "lesavka errors", + "description": "", + "query": "source = journald-* | where _HOSTNAME = 'titan-jh' | where match(MESSAGE, 'error|exception|fail')", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + } + ] + saved_visualizations.json: | + [ + { + "name": "[Kube] Logs per hour", + 
"description": "", + "query": "source = kube-* | stats count() as log_count by span(`@timestamp`, 1h)", + "type": "line", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "[Kube] Errors per hour", + "description": "", + "query": "source = kube-* | where match(log, 'error|exception|fail') or match(message, 'error|exception|fail') | stats count() as error_count by span(`@timestamp`, 1h)", + "type": "line", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "[Kube] Top namespaces", + "description": "", + "query": "source = kube-* | stats count() as log_count by kubernetes.namespace_name | sort - log_count", + "type": "bar", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "[Kube] Top error namespaces", + "description": "", + "query": "source = kube-* | where match(log, 'error|exception|fail') or match(message, 'error|exception|fail') | stats count() as error_count by kubernetes.namespace_name | sort - error_count", + "type": "bar", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "[Kube] Top pods", + "description": "", + "query": "source = kube-* | stats count() as log_count by kubernetes.pod_name | sort - log_count", + "type": "bar", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": 
{ + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "[Kube] Top error pods", + "description": "", + "query": "source = kube-* | where match(log, 'error|exception|fail') or match(message, 'error|exception|fail') | stats count() as error_count by kubernetes.pod_name | sort - error_count", + "type": "bar", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "[Kube] Top nodes", + "description": "", + "query": "source = kube-* | stats count() as log_count by kubernetes.node_name | sort - log_count", + "type": "bar", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "[Journald] Top units", + "description": "", + "query": "source = journald-* | stats count() as log_count by _SYSTEMD_UNIT | sort - log_count", + "type": "bar", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + }, + { + "name": "[Journald] Top error units", + "description": "", + "query": "source = journald-* | where match(MESSAGE, 'error|exception|fail') | stats count() as error_count by _SYSTEMD_UNIT | sort - error_count", + "type": "bar", + "selected_date_range": { + "start": "now-24h", + "end": "now", + "text": "" + }, + "selected_timestamp": { + "name": "@timestamp", + "type": "timestamp" + }, + "selected_fields": { + "text": "", + "tokens": [] + } + } + ] diff --git a/services/logging/opensearch-observability-setup-job.yaml b/services/logging/opensearch-observability-setup-job.yaml new file mode 100644 
index 0000000..e4590fb --- /dev/null +++ b/services/logging/opensearch-observability-setup-job.yaml @@ -0,0 +1,45 @@ +# services/logging/opensearch-observability-setup-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: opensearch-observability-setup-2 + namespace: logging +spec: + backoffLimit: 3 + ttlSecondsAfterFinished: 3600 + template: + spec: + restartPolicy: OnFailure + nodeSelector: + node-role.kubernetes.io/worker: "true" + hardware: rpi5 + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: hardware + operator: In + values: + - rpi5 + containers: + - name: setup + image: python:3.11-alpine + command: ["python", "/scripts/seed.py"] + env: + - name: OSD_URL + value: http://opensearch-dashboards.logging.svc.cluster.local:5601 + volumeMounts: + - name: scripts + mountPath: /scripts + readOnly: true + - name: objects + mountPath: /config + readOnly: true + volumes: + - name: scripts + configMap: + name: opensearch-observability-script + - name: objects + configMap: + name: opensearch-observability-objects diff --git a/services/logging/opensearch-prune-cronjob.yaml b/services/logging/opensearch-prune-cronjob.yaml new file mode 100644 index 0000000..75e72db --- /dev/null +++ b/services/logging/opensearch-prune-cronjob.yaml @@ -0,0 +1,47 @@ +# services/logging/opensearch-prune-cronjob.yaml +apiVersion: batch/v1 +kind: CronJob +metadata: + name: opensearch-prune + namespace: logging +spec: + schedule: "23 3 * * *" + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 1 + failedJobsHistoryLimit: 3 + jobTemplate: + spec: + backoffLimit: 2 + template: + spec: + restartPolicy: OnFailure + nodeSelector: + node-role.kubernetes.io/worker: "true" + hardware: rpi5 + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: hardware + operator: In + values: + - rpi5 + containers: + - name: prune + image: 
python:3.11-alpine + command: ["python", "/scripts/prune.py"] + env: + - name: OPENSEARCH_URL + value: http://opensearch-master.logging.svc.cluster.local:9200 + - name: LOG_LIMIT_BYTES + value: "1099511627776" + - name: LOG_INDEX_PATTERNS + value: "kube-*,journald-*,trace-analytics-*" + volumeMounts: + - name: scripts + mountPath: /scripts + volumes: + - name: scripts + configMap: + name: opensearch-prune-script diff --git a/services/logging/opensearch-pvc.yaml b/services/logging/opensearch-pvc.yaml new file mode 100644 index 0000000..f537b99 --- /dev/null +++ b/services/logging/opensearch-pvc.yaml @@ -0,0 +1,14 @@ +# services/logging/opensearch-pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: opensearch-opensearch-0 + namespace: logging +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1024Gi + storageClassName: asteria + volumeMode: Filesystem diff --git a/services/logging/otel-collector-helmrelease.yaml b/services/logging/otel-collector-helmrelease.yaml new file mode 100644 index 0000000..c24682f --- /dev/null +++ b/services/logging/otel-collector-helmrelease.yaml @@ -0,0 +1,90 @@ +# services/logging/otel-collector-helmrelease.yaml +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: otel-collector + namespace: logging +spec: + interval: 15m + timeout: 10m + chart: + spec: + chart: opentelemetry-collector + version: "~0.143.0" + sourceRef: + kind: HelmRepository + name: opentelemetry + namespace: flux-system + values: + fullnameOverride: otel-collector + image: + repository: otel/opentelemetry-collector + tag: "0.143.0" + mode: deployment + replicaCount: 1 + ports: + otlp: + enabled: true + containerPort: 4317 + servicePort: 4317 + protocol: TCP + otlp-http: + enabled: true + containerPort: 4318 + servicePort: 4318 + protocol: TCP + jaeger-compact: + enabled: false + jaeger-thrift: + enabled: false + jaeger-grpc: + enabled: false + zipkin: + enabled: false + metrics: + enabled: false + 
config: + receivers: + otlp: + protocols: + grpc: + endpoint: ${env:MY_POD_IP}:4317 + http: + endpoint: ${env:MY_POD_IP}:4318 + processors: + memory_limiter: + check_interval: 5s + limit_percentage: 80 + spike_limit_percentage: 25 + batch: {} + exporters: + otlp/data-prepper: + endpoint: data-prepper.logging.svc.cluster.local:21890 + tls: + insecure: true + service: + extensions: + - health_check + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [otlp/data-prepper] + resources: + requests: + cpu: "100m" + memory: "256Mi" + limits: + memory: "512Mi" + nodeSelector: + node-role.kubernetes.io/worker: "true" + hardware: rpi5 + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: hardware + operator: In + values: + - rpi5 diff --git a/services/logging/scripts/node_image_gc_rpi4.sh b/services/logging/scripts/node_image_gc_rpi4.sh new file mode 100644 index 0000000..81f27b1 --- /dev/null +++ b/services/logging/scripts/node_image_gc_rpi4.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +set -euo pipefail + +changed=0 +k3s_changed=0 +k3s_agent_changed=0 + +k3s_dropin="/host/etc/systemd/system/k3s.service.d/98-image-gc.conf" +k3s_agent_dropin="/host/etc/systemd/system/k3s-agent.service.d/98-image-gc.conf" + +if [ -f "/host/etc/systemd/system/k3s.service" ] && [ ! -f "${k3s_dropin}" ]; then + mkdir -p "$(dirname "${k3s_dropin}")" + printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=image-gc-high-threshold=70\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-low-threshold=60\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-minimum-available=5Gi\"\n" > "${k3s_dropin}" + changed=1 + k3s_changed=1 +fi + +if [ -f "/host/etc/systemd/system/k3s-agent.service" ] && [ ! 
-f "${k3s_agent_dropin}" ]; then + mkdir -p "$(dirname "${k3s_agent_dropin}")" + printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=image-gc-high-threshold=70\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-low-threshold=60\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-minimum-available=5Gi\"\n" > "${k3s_agent_dropin}" + changed=1 + k3s_agent_changed=1 +fi + +if [ "${changed}" -eq 1 ]; then + sleep "$(( (RANDOM % 300) + 10 ))" + chroot /host /bin/systemctl daemon-reload + if [ "${k3s_changed}" -eq 1 ]; then + chroot /host /bin/systemctl restart k3s + fi + if [ "${k3s_agent_changed}" -eq 1 ]; then + chroot /host /bin/systemctl restart k3s-agent + fi +fi + +sleep infinity diff --git a/services/logging/scripts/node_image_prune_rpi5.sh b/services/logging/scripts/node_image_prune_rpi5.sh new file mode 100644 index 0000000..eb54b77 --- /dev/null +++ b/services/logging/scripts/node_image_prune_rpi5.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +set -euo pipefail + +threshold=70 + +sleep "$(( (RANDOM % 300) + 10 ))" + +while true; do + usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') + if [ -z "${usage}" ]; then + sleep 1800 + continue + fi + + if [ "${usage}" -ge "${threshold}" ]; then + chroot /host /bin/sh -c ' + if command -v crictl >/dev/null 2>&1; then + crictl --runtime-endpoint=unix:///run/k3s/containerd/containerd.sock rmi --prune || true + elif [ -x /usr/local/bin/crictl ]; then + /usr/local/bin/crictl --runtime-endpoint=unix:///run/k3s/containerd/containerd.sock rmi --prune || true + fi + ' + fi + + sleep 21600 +done diff --git a/services/logging/scripts/node_log_rotation.sh b/services/logging/scripts/node_log_rotation.sh new file mode 100644 index 0000000..534806f --- /dev/null +++ b/services/logging/scripts/node_log_rotation.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash +set -euo pipefail + +changed=0 +journald_changed=0 +k3s_changed=0 +k3s_agent_changed=0 + +journald_dropin="/host/etc/systemd/journald.conf.d/99-logging.conf" 
+k3s_dropin="/host/etc/systemd/system/k3s.service.d/99-logging.conf" +k3s_agent_dropin="/host/etc/systemd/system/k3s-agent.service.d/99-logging.conf" +k3s_image_gc_dropin="/host/etc/systemd/system/k3s.service.d/98-image-gc.conf" +k3s_agent_image_gc_dropin="/host/etc/systemd/system/k3s-agent.service.d/98-image-gc.conf" + +if [ ! -f "${journald_dropin}" ]; then + mkdir -p "$(dirname "${journald_dropin}")" + printf "[Journal]\nStorage=volatile\nRuntimeMaxUse=200M\nRuntimeKeepFree=512M\nMaxFileSec=1h\n" > "${journald_dropin}" + changed=1 + journald_changed=1 +fi + +if [ -f "/host/etc/systemd/system/k3s.service" ] && [ ! -f "${k3s_dropin}" ]; then + mkdir -p "$(dirname "${k3s_dropin}")" + printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-files=2\"\n" > "${k3s_dropin}" + changed=1 + k3s_changed=1 +fi + +if [ -f "/host/etc/systemd/system/k3s.service" ] && [ ! -f "${k3s_image_gc_dropin}" ]; then + mkdir -p "$(dirname "${k3s_image_gc_dropin}")" + printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=image-gc-high-threshold=70\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-low-threshold=60\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-minimum-available=5Gi\"\n" > "${k3s_image_gc_dropin}" + changed=1 + k3s_changed=1 +fi + +if [ -f "/host/etc/systemd/system/k3s-agent.service" ] && [ ! -f "${k3s_agent_dropin}" ]; then + mkdir -p "$(dirname "${k3s_agent_dropin}")" + printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-files=2\"\n" > "${k3s_agent_dropin}" + changed=1 + k3s_agent_changed=1 +fi + +if [ -f "/host/etc/systemd/system/k3s-agent.service" ] && [ ! 
-f "${k3s_agent_image_gc_dropin}" ]; then + mkdir -p "$(dirname "${k3s_agent_image_gc_dropin}")" + printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=image-gc-high-threshold=70\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-low-threshold=60\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-minimum-available=5Gi\"\n" > "${k3s_agent_image_gc_dropin}" + changed=1 + k3s_agent_changed=1 +fi + +if [ "${changed}" -eq 1 ]; then + sleep "$(( (RANDOM % 300) + 10 ))" + chroot /host /bin/systemctl daemon-reload + if [ "${journald_changed}" -eq 1 ]; then + chroot /host /bin/systemctl restart systemd-journald + fi + if [ "${k3s_changed}" -eq 1 ]; then + chroot /host /bin/systemctl restart k3s + fi + if [ "${k3s_agent_changed}" -eq 1 ]; then + chroot /host /bin/systemctl restart k3s-agent + fi +fi + +sleep infinity diff --git a/services/logging/scripts/opensearch_observability_seed.py b/services/logging/scripts/opensearch_observability_seed.py new file mode 100644 index 0000000..d7bf808 --- /dev/null +++ b/services/logging/scripts/opensearch_observability_seed.py @@ -0,0 +1,140 @@ +import json +import os +import time +import urllib.error +import urllib.request + +OSD_URL = os.environ.get( + "OSD_URL", + "http://opensearch-dashboards.logging.svc.cluster.local:5601", +).rstrip("/") +OBJECT_DIR = "/config" + +def request_json(method, path, payload=None): + url = f"{OSD_URL}{path}" + data = None + headers = {"osd-xsrf": "true"} + if payload is not None: + data = json.dumps(payload).encode("utf-8") + headers["Content-Type"] = "application/json" + + req = urllib.request.Request(url, data=data, method=method) + for key, value in headers.items(): + req.add_header(key, value) + + try: + with urllib.request.urlopen(req, timeout=30) as response: + body = response.read().decode("utf-8") + except urllib.error.HTTPError as exc: + detail = exc.read().decode("utf-8") + raise SystemExit(f"{method} {path} failed: {exc.code} {detail}") + + if not body: + return {} + return json.loads(body) + + +def wait_ready(): 
+ for _ in range(60): + try: + request_json("GET", "/api/status") + return + except Exception: + time.sleep(5) + raise SystemExit("OpenSearch Dashboards did not become ready in time") + + +def load_payload(name): + path = os.path.join(OBJECT_DIR, name) + with open(path, "r", encoding="utf-8") as handle: + return json.load(handle) + + +def index_by_name(items, key): + lookup = {} + for item in items: + obj = item.get(key, {}) + name = obj.get("name") + if not name: + continue + lookup.setdefault(name, item) + return lookup + + +def ensure_applications(apps): + existing = request_json("GET", "/api/observability/application/").get("data", []) + existing_by_name = {app.get("name"): app for app in existing if app.get("name")} + + for app in apps: + name = app.get("name") + if not name: + continue + current = existing_by_name.get(name) + if not current: + request_json("POST", "/api/observability/application/", app) + print(f"created application: {name}") + continue + + if app.get("baseQuery") != current.get("baseQuery"): + print(f"baseQuery differs for {name}; skipping update") + + update_body = {} + for key in ("description", "servicesEntities", "traceGroups"): + if app.get(key, "") != current.get(key, ""): + update_body[key] = app.get(key, "") + + if update_body: + request_json( + "PUT", + "/api/observability/application/", + {"appId": current["id"], "updateBody": update_body}, + ) + print(f"updated application: {name}") + + +def ensure_saved_objects(objects, object_type, endpoint): + existing = request_json( + "GET", + f"/api/observability/event_analytics/saved_objects?objectType={object_type}", + ).get("observabilityObjectList", []) + key = "savedQuery" if object_type == "savedQuery" else "savedVisualization" + existing_by_name = index_by_name(existing, key) + + for obj in objects: + name = obj.get("name") + if not name: + continue + current = existing_by_name.get(name) + if not current: + request_json("POST", endpoint, {"object": obj}) + print(f"created 
{object_type}: {name}") + continue + + current_body = current.get(key, {}) + if current_body != obj: + request_json( + "PUT", + endpoint, + {"object_id": current["objectId"], "object": obj}, + ) + print(f"updated {object_type}: {name}") + + +def main(): + wait_ready() + + applications = load_payload("applications.json") + queries = load_payload("saved_queries.json") + visualizations = load_payload("saved_visualizations.json") + + ensure_applications(applications) + ensure_saved_objects(queries, "savedQuery", "/api/observability/event_analytics/saved_objects/query") + ensure_saved_objects( + visualizations, + "savedVisualization", + "/api/observability/event_analytics/saved_objects/vis", + ) + + +if __name__ == "__main__": + main() diff --git a/services/logging/scripts/opensearch_prune.py b/services/logging/scripts/opensearch_prune.py new file mode 100644 index 0000000..ad84d5b --- /dev/null +++ b/services/logging/scripts/opensearch_prune.py @@ -0,0 +1,77 @@ +import json +import os +import re +import sys +import urllib.error +import urllib.request + +os_url = os.environ.get("OPENSEARCH_URL", "http://opensearch-master.logging.svc.cluster.local:9200").rstrip("/") +limit_bytes = int(os.environ.get("LOG_LIMIT_BYTES", str(1024**4))) +patterns = [p.strip() for p in os.environ.get("LOG_INDEX_PATTERNS", "kube-*,journald-*").split(",") if p.strip()] + +UNITS = { + "b": 1, + "kb": 1024, + "mb": 1024**2, + "gb": 1024**3, + "tb": 1024**4, +} + +def parse_size(value: str) -> int: + if not value: + return 0 + text = value.strip().lower() + if text in ("-", "0"): + return 0 + match = re.match(r"^([0-9.]+)([a-z]+)$", text) + if not match: + return 0 + number = float(match.group(1)) + unit = match.group(2) + if unit not in UNITS: + return 0 + return int(number * UNITS[unit]) + +def request_json(path: str): + url = f"{os_url}{path}" + with urllib.request.urlopen(url, timeout=30) as response: + payload = response.read().decode("utf-8") + return json.loads(payload) + +def 
delete_index(index: str) -> None: + url = f"{os_url}/{index}" + req = urllib.request.Request(url, method="DELETE") + with urllib.request.urlopen(req, timeout=30) as response: + _ = response.read() + print(f"deleted {index}") + +indices = [] +for pattern in patterns: + try: + data = request_json(f"/_cat/indices/{pattern}?format=json&h=index,store.size,creation.date") + except urllib.error.HTTPError as exc: + if exc.code == 404: + continue + raise + for item in data: + index = item.get("index") + if not index or index.startswith("."): + continue + size = parse_size(item.get("store.size", "")) + created = int(item.get("creation.date", "0") or 0) + indices.append({"index": index, "size": size, "created": created}) + +total = sum(item["size"] for item in indices) +print(f"total_log_bytes={total}") +if total <= limit_bytes: + print("within limit") + sys.exit(0) + +indices.sort(key=lambda item: item["created"]) +for item in indices: + if total <= limit_bytes: + break + delete_index(item["index"]) + total -= item["size"] + +print(f"remaining_log_bytes={total}") diff --git a/services/mailu/front-lb.yaml b/services/mailu/front-lb.yaml new file mode 100644 index 0000000..ada16b0 --- /dev/null +++ b/services/mailu/front-lb.yaml @@ -0,0 +1,42 @@ +# services/mailu/front-lb.yaml +apiVersion: v1 +kind: Service +metadata: + name: mailu-front-lb + namespace: mailu-mailserver + annotations: + metallb.universe.tf/address-pool: communication-pool +spec: + type: LoadBalancer + loadBalancerClass: metallb + loadBalancerIP: 192.168.22.4 + externalTrafficPolicy: Cluster + selector: + app.kubernetes.io/component: front + app.kubernetes.io/instance: mailu + app.kubernetes.io/name: mailu + ports: + - name: smtp + port: 25 + targetPort: 25 + protocol: TCP + - name: smtps + port: 465 + targetPort: 465 + protocol: TCP + - name: submission + port: 587 + targetPort: 587 + protocol: TCP + - name: imaps + port: 993 + targetPort: 993 + protocol: TCP + - name: pop3s + port: 995 + targetPort: 995 + 
protocol: TCP + - name: sieve + port: 4190 + targetPort: 4190 + protocol: TCP diff --git a/services/mailu/helmrelease.yaml b/services/mailu/helmrelease.yaml index c8b0975..e675961 100644 --- a/services/mailu/helmrelease.yaml +++ b/services/mailu/helmrelease.yaml @@ -33,9 +33,9 @@ spec: dkim: enabled: true externalRelay: - host: "[email-smtp.us-east-2.amazonaws.com]:587" - existingSecret: mailu-ses-relay - usernameKey: relay-username + host: "[smtp.postmarkapp.com]:587" + existingSecret: mailu-postmark-relay + usernameKey: relay-password passwordKey: relay-password timezone: Etc/UTC subnet: 10.42.0.0/16 @@ -217,9 +217,29 @@ spec: nodeSelector: hardware: rpi4 overrides: - smtp_use_tls: "yes" - smtp_tls_security_level: "encrypt" - smtp_sasl_security_options: "noanonymous" + postfix.cf: | + mynetworks = 127.0.0.0/8 [::1]/128 10.42.0.0/16 10.43.0.0/16 192.168.22.0/24 + smtpd_delay_reject = yes + smtpd_helo_required = yes + smtpd_helo_restrictions = reject_invalid_helo_hostname, reject_non_fqdn_helo_hostname, reject_unknown_helo_hostname + smtpd_sasl_auth_enable = yes + smtpd_sasl_type = dovecot + smtpd_sasl_path = private/auth + smtpd_sasl_security_options = noanonymous + smtpd_sasl_tls_security_options = noanonymous + smtpd_client_restrictions = permit_mynetworks, permit_sasl_authenticated, reject_unauth_pipelining, reject_unknown_client_hostname + smtpd_recipient_restrictions = permit_mynetworks, permit_sasl_authenticated, reject_unauth_destination, reject_non_fqdn_recipient, reject_unknown_recipient_domain + smtpd_relay_restrictions = permit_sasl_authenticated, reject_unauth_destination + smtpd_sender_restrictions = reject_non_fqdn_sender, reject_unknown_sender_domain, reject_sender_login_mismatch, reject_authenticated_sender_login_mismatch + smtpd_tls_auth_only = yes + smtpd_forbid_unauth_pipelining = yes + smtpd_client_connection_count_limit = 20 + smtpd_client_connection_rate_limit = 30 + smtpd_client_message_rate_limit = 100 + smtpd_client_recipient_rate_limit = 
200 + smtpd_recipient_limit = 100 + podAnnotations: + bstein.dev/restarted-at: "2026-01-06T00:00:00Z" redis: enabled: true architecture: standalone diff --git a/services/mailu/kustomization.yaml b/services/mailu/kustomization.yaml index 2df7440..af4b2b1 100644 --- a/services/mailu/kustomization.yaml +++ b/services/mailu/kustomization.yaml @@ -13,11 +13,16 @@ resources: - mailu-sync-job.yaml - mailu-sync-cronjob.yaml - mailu-sync-listener.yaml + - front-lb.yaml configMapGenerator: - name: mailu-sync-script namespace: mailu-mailserver files: - - sync.py=../../scripts/mailu_sync.py + - sync.py=scripts/mailu_sync.py options: disableNameSuffixHash: true + - name: mailu-sync-listener + namespace: mailu-mailserver + files: + - listener.py=scripts/mailu_sync_listener.py diff --git a/services/mailu/mailu-sync-listener.yaml b/services/mailu/mailu-sync-listener.yaml index 04e8070..2127313 100644 --- a/services/mailu/mailu-sync-listener.yaml +++ b/services/mailu/mailu-sync-listener.yaml @@ -100,55 +100,3 @@ spec: configMap: name: mailu-sync-listener defaultMode: 0444 ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: mailu-sync-listener - namespace: mailu-mailserver -data: - listener.py: | - import http.server - import json - import os - import subprocess - import threading - - from time import time - - # Simple debounce to avoid hammering on bursts - MIN_INTERVAL_SECONDS = 10 - last_run = 0.0 - lock = threading.Lock() - - def trigger_sync(): - global last_run - with lock: - now = time() - if now - last_run < MIN_INTERVAL_SECONDS: - return - last_run = now - # Fire and forget; output to stdout - subprocess.Popen(["python", "/app/sync.py"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - - class Handler(http.server.BaseHTTPRequestHandler): - def do_POST(self): - length = int(self.headers.get("Content-Length", 0)) - body = self.rfile.read(length) if length else b"" - try: - json.loads(body or b"{}") - except json.JSONDecodeError: - self.send_response(400) - 
self.end_headers() - return - trigger_sync() - self.send_response(202) - self.end_headers() - - def log_message(self, fmt, *args): - # Quiet logging - return - - if __name__ == "__main__": - server = http.server.ThreadingHTTPServer(("", 8080), Handler) - server.serve_forever() diff --git a/scripts/mailu_sync.py b/services/mailu/scripts/mailu_sync.py similarity index 82% rename from scripts/mailu_sync.py rename to services/mailu/scripts/mailu_sync.py index ee8aa18..74b170a 100644 --- a/scripts/mailu_sync.py +++ b/services/mailu/scripts/mailu_sync.py @@ -110,13 +110,33 @@ def random_password(): alphabet = string.ascii_letters + string.digits return "".join(secrets.choice(alphabet) for _ in range(24)) +def get_attribute_value(attributes, key): + raw = (attributes or {}).get(key) + if isinstance(raw, list): + return raw[0] if raw else None + if isinstance(raw, str): + return raw + return None + + +def resolve_mailu_email(user, attributes): + explicit = get_attribute_value(attributes, "mailu_email") + if explicit: + return explicit + + email = user.get("email") or "" + if "@" in email and email.lower().endswith(f"@{MAILU_DOMAIN.lower()}"): + return email + + return f"{user['username']}@{MAILU_DOMAIN}" + def ensure_mailu_user(cursor, email, password, display_name): localpart, domain = email.split("@", 1) if domain.lower() != MAILU_DOMAIN.lower(): return hashed = bcrypt_sha256.hash(password) - now = datetime.datetime.utcnow() + now = datetime.datetime.now(datetime.timezone.utc) cursor.execute( """ INSERT INTO "user" ( @@ -167,30 +187,29 @@ def main(): for user in users: attrs = user.get("attributes", {}) or {} - app_pw_value = attrs.get("mailu_app_password") - if isinstance(app_pw_value, list): - app_pw = app_pw_value[0] if app_pw_value else None - elif isinstance(app_pw_value, str): - app_pw = app_pw_value - else: - app_pw = None + app_pw = get_attribute_value(attrs, "mailu_app_password") + mailu_email = resolve_mailu_email(user, attrs) - email = user.get("email") - if 
not email: - email = f"{user['username']}@{MAILU_DOMAIN}" + needs_update = False + if not get_attribute_value(attrs, "mailu_email"): + attrs["mailu_email"] = [mailu_email] + needs_update = True if not app_pw: app_pw = random_password() - attrs["mailu_app_password"] = app_pw + attrs["mailu_app_password"] = [app_pw] + needs_update = True + + if needs_update: kc_update_attributes(token, user, attrs) - log(f"Set mailu_app_password for {email}") + log(f"Updated Mailu attributes for {mailu_email}") display_name = " ".join( part for part in [user.get("firstName"), user.get("lastName")] if part ).strip() - ensure_mailu_user(cursor, email, app_pw, display_name) - log(f"Synced mailbox for {email}") + ensure_mailu_user(cursor, mailu_email, app_pw, display_name) + log(f"Synced mailbox for {mailu_email}") cursor.close() conn.close() diff --git a/services/mailu/scripts/mailu_sync_listener.py b/services/mailu/scripts/mailu_sync_listener.py new file mode 100644 index 0000000..27070c0 --- /dev/null +++ b/services/mailu/scripts/mailu_sync_listener.py @@ -0,0 +1,93 @@ +import http.server +import json +import subprocess +import threading + +from time import time + +# Simple debounce to avoid hammering on bursts +MIN_INTERVAL_SECONDS = 10 +last_run = 0.0 +lock = threading.Lock() +sync_done = threading.Event() +sync_done.set() +sync_running = False + + +def _run_sync_blocking() -> int: + global last_run, sync_running + with lock: + if sync_running: + return 0 + sync_running = True + sync_done.clear() + + try: + print("mailu-sync-listener: starting sync", flush=True) + proc = subprocess.run(["python", "/app/sync.py"], check=False) + rc = int(proc.returncode) + print(f"mailu-sync-listener: sync completed rc={rc}", flush=True) + return rc + finally: + with lock: + sync_running = False + last_run = time() + sync_done.set() + + +def _trigger_sync_async() -> bool: + with lock: + now = time() + if sync_running: + return False + if now - last_run < MIN_INTERVAL_SECONDS: + return False + + 
thread = threading.Thread(target=_run_sync_blocking, daemon=True) + thread.start() + return True + + +class Handler(http.server.BaseHTTPRequestHandler): + def do_POST(self): + length = int(self.headers.get("Content-Length", 0)) + body = self.rfile.read(length) if length else b"" + try: + payload = json.loads(body or b"{}") + except json.JSONDecodeError: + self.send_response(400) + self.end_headers() + return + + wait = False + if isinstance(payload, dict): + wait = bool(payload.get("wait")) + + if wait: + with lock: + already_running = sync_running + if already_running: + sync_done.wait(timeout=120) + with lock: + still_running = sync_running + self.send_response(200 if not still_running else 503) + self.end_headers() + return + + rc = _run_sync_blocking() + self.send_response(200 if rc == 0 else 500) + self.end_headers() + return + + _trigger_sync_async() + self.send_response(202) + self.end_headers() + + def log_message(self, fmt, *args): + # Quiet logging + return + + +if __name__ == "__main__": + server = http.server.ThreadingHTTPServer(("", 8080), Handler) + server.serve_forever() diff --git a/services/maintenance/image-sweeper-cronjob.yaml b/services/maintenance/image-sweeper-cronjob.yaml new file mode 100644 index 0000000..08127bc --- /dev/null +++ b/services/maintenance/image-sweeper-cronjob.yaml @@ -0,0 +1,50 @@ +# services/maintenance/image-sweeper-cronjob.yaml +apiVersion: batch/v1 +kind: CronJob +metadata: + name: image-sweeper + namespace: maintenance +spec: + schedule: "30 4 * * 0" + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 2 + failedJobsHistoryLimit: 2 + jobTemplate: + spec: + template: + spec: + serviceAccountName: node-image-sweeper + restartPolicy: OnFailure + nodeSelector: + kubernetes.io/os: linux + tolerations: + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule + - key: node-role.kubernetes.io/master + operator: Exists + effect: NoSchedule + containers: + - name: image-sweeper + image: 
python:3.12.9-alpine3.20 + command: ["/bin/sh", "/scripts/node_image_sweeper.sh"] + env: + - name: ONE_SHOT + value: "true" + securityContext: + privileged: true + runAsUser: 0 + volumeMounts: + - name: host-root + mountPath: /host + - name: script + mountPath: /scripts + readOnly: true + volumes: + - name: host-root + hostPath: + path: / + - name: script + configMap: + name: node-image-sweeper-script + defaultMode: 0555 diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml new file mode 100644 index 0000000..ce34afb --- /dev/null +++ b/services/maintenance/kustomization.yaml @@ -0,0 +1,32 @@ +# services/maintenance/kustomization.yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - namespace.yaml + - node-nofile-serviceaccount.yaml + - pod-cleaner-rbac.yaml + - node-nofile-daemonset.yaml + - pod-cleaner-cronjob.yaml + - node-image-sweeper-serviceaccount.yaml + - node-image-sweeper-daemonset.yaml + - image-sweeper-cronjob.yaml + +configMapGenerator: + - name: node-nofile-script + namespace: maintenance + files: + - node_nofile.sh=scripts/node_nofile.sh + options: + disableNameSuffixHash: true + - name: pod-cleaner-script + namespace: maintenance + files: + - pod_cleaner.sh=scripts/pod_cleaner.sh + options: + disableNameSuffixHash: true + - name: node-image-sweeper-script + namespace: maintenance + files: + - node_image_sweeper.sh=scripts/node_image_sweeper.sh + options: + disableNameSuffixHash: true diff --git a/services/maintenance/namespace.yaml b/services/maintenance/namespace.yaml new file mode 100644 index 0000000..dce28b0 --- /dev/null +++ b/services/maintenance/namespace.yaml @@ -0,0 +1,5 @@ +# services/maintenance/namespace.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: maintenance diff --git a/services/maintenance/node-image-sweeper-daemonset.yaml b/services/maintenance/node-image-sweeper-daemonset.yaml new file mode 100644 index 0000000..c3cb24d --- /dev/null +++ 
b/services/maintenance/node-image-sweeper-daemonset.yaml @@ -0,0 +1,48 @@ +# services/maintenance/node-image-sweeper-daemonset.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: node-image-sweeper + namespace: maintenance +spec: + selector: + matchLabels: + app: node-image-sweeper + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + app: node-image-sweeper + spec: + serviceAccountName: node-image-sweeper + tolerations: + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule + - key: node-role.kubernetes.io/master + operator: Exists + effect: NoSchedule + nodeSelector: + kubernetes.io/os: linux + containers: + - name: node-image-sweeper + image: python:3.12.9-alpine3.20 + command: ["/bin/sh", "/scripts/node_image_sweeper.sh"] + securityContext: + privileged: true + runAsUser: 0 + volumeMounts: + - name: host-root + mountPath: /host + - name: script + mountPath: /scripts + readOnly: true + volumes: + - name: host-root + hostPath: + path: / + - name: script + configMap: + name: node-image-sweeper-script + defaultMode: 0555 diff --git a/services/maintenance/node-image-sweeper-serviceaccount.yaml b/services/maintenance/node-image-sweeper-serviceaccount.yaml new file mode 100644 index 0000000..854f041 --- /dev/null +++ b/services/maintenance/node-image-sweeper-serviceaccount.yaml @@ -0,0 +1,6 @@ +# services/maintenance/node-image-sweeper-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: node-image-sweeper + namespace: maintenance diff --git a/services/maintenance/node-nofile-daemonset.yaml b/services/maintenance/node-nofile-daemonset.yaml new file mode 100644 index 0000000..392753d --- /dev/null +++ b/services/maintenance/node-nofile-daemonset.yaml @@ -0,0 +1,47 @@ +# services/maintenance/node-nofile-daemonset.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: node-nofile + namespace: maintenance +spec: + selector: + matchLabels: + app: node-nofile + updateStrategy: + 
type: RollingUpdate + template: + metadata: + labels: + app: node-nofile + spec: + serviceAccountName: node-nofile + tolerations: + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule + - key: node-role.kubernetes.io/master + operator: Exists + effect: NoSchedule + containers: + - name: node-nofile + image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 + command: ["/usr/bin/env", "bash"] + args: ["/scripts/node_nofile.sh"] + securityContext: + privileged: true + runAsUser: 0 + volumeMounts: + - name: host-root + mountPath: /host + - name: script + mountPath: /scripts + readOnly: true + volumes: + - name: host-root + hostPath: + path: / + - name: script + configMap: + name: node-nofile-script + defaultMode: 0555 diff --git a/services/maintenance/node-nofile-serviceaccount.yaml b/services/maintenance/node-nofile-serviceaccount.yaml new file mode 100644 index 0000000..1cc0499 --- /dev/null +++ b/services/maintenance/node-nofile-serviceaccount.yaml @@ -0,0 +1,6 @@ +# services/maintenance/node-nofile-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: node-nofile + namespace: maintenance diff --git a/services/maintenance/pod-cleaner-cronjob.yaml b/services/maintenance/pod-cleaner-cronjob.yaml new file mode 100644 index 0000000..ffca7dd --- /dev/null +++ b/services/maintenance/pod-cleaner-cronjob.yaml @@ -0,0 +1,32 @@ +# services/maintenance/pod-cleaner-cronjob.yaml +apiVersion: batch/v1 +kind: CronJob +metadata: + name: pod-cleaner + namespace: maintenance +spec: + schedule: "0 * * * *" + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 1 + failedJobsHistoryLimit: 3 + jobTemplate: + spec: + backoffLimit: 1 + template: + spec: + serviceAccountName: pod-cleaner + restartPolicy: Never + containers: + - name: cleaner + image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 + command: ["/usr/bin/env", "bash"] + args: 
["/scripts/pod_cleaner.sh"] + volumeMounts: + - name: script + mountPath: /scripts + readOnly: true + volumes: + - name: script + configMap: + name: pod-cleaner-script + defaultMode: 0555 diff --git a/services/maintenance/pod-cleaner-rbac.yaml b/services/maintenance/pod-cleaner-rbac.yaml new file mode 100644 index 0000000..26bb035 --- /dev/null +++ b/services/maintenance/pod-cleaner-rbac.yaml @@ -0,0 +1,32 @@ +# services/maintenance/pod-cleaner-rbac.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: pod-cleaner + namespace: maintenance + +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: pod-cleaner +rules: + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "delete"] + +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: pod-cleaner +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: pod-cleaner +subjects: + - kind: ServiceAccount + name: pod-cleaner + namespace: maintenance diff --git a/services/maintenance/scripts/node_image_sweeper.sh b/services/maintenance/scripts/node_image_sweeper.sh new file mode 100644 index 0000000..2ad7b47 --- /dev/null +++ b/services/maintenance/scripts/node_image_sweeper.sh @@ -0,0 +1,92 @@ +#!/bin/sh +set -eu + +ONE_SHOT=${ONE_SHOT:-false} +THRESHOLD_DAYS=14 + +usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage="" +if [ -n "${usage}" ] && [ "${usage}" -ge 70 ]; then + THRESHOLD_DAYS=3 +fi + +cutoff=$(python3 - <<'PY' +import time, os +print(int(time.time()) - int(os.environ.get("THRESHOLD_DAYS", "14")) * 86400) +PY +) + +RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ') +IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}') + +SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause" + +prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" 
python3 - <<'PY' +import json, os, sys, time + +try: + data = json.load(sys.stdin) +except Exception: + print("", end="") + sys.exit(0) + +cutoff = int(os.environ.get("CUTOFF", "0")) +running = set(os.environ.get("RUNNING", "").split()) +skip = os.environ.get("SKIP", "").split() +now = int(time.time()) +prune = [] + + +def is_skip(tags): + if not tags: + return False + for t in tags: + for prefix in skip: + if prefix and t.startswith(prefix): + return True + return False + + +for img in data.get("images", []): + image_id = img.get("id", "") + if not image_id: + continue + if image_id in running: + continue + tags = img.get("repoTags") or [] + if is_skip(tags): + continue + created = img.get("createdAt") or 0 + try: + created = int(str(created)) // 1000000000 + except Exception: + created = 0 + if created and created > now: + created = now + if cutoff and created and created < cutoff: + prune.append(image_id) + +seen = set() +for p in prune: + if p in seen: + continue + seen.add(p) + print(p) +PY +) + +if [ -n "${prune_list}" ]; then + printf "%s" "${prune_list}" | while read -r image_id; do + if [ -n "${image_id}" ]; then + chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true + fi + done +fi + +find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true +find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true + +if [ "${ONE_SHOT}" = "true" ]; then + exit 0 +fi + +sleep infinity diff --git a/services/maintenance/scripts/node_nofile.sh b/services/maintenance/scripts/node_nofile.sh new file mode 100644 index 0000000..cf6c5d9 --- /dev/null +++ b/services/maintenance/scripts/node_nofile.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +set -euo pipefail + +limit_line="LimitNOFILE=1048576" +changed=0 + +for unit in k3s k3s-agent; do + unit_file="/host/etc/systemd/system/${unit}.service" + if [ -f "${unit_file}" ]; then + 
dropin_dir="/host/etc/systemd/system/${unit}.service.d" + dropin_file="${dropin_dir}/99-nofile.conf" + if [ ! -f "${dropin_file}" ] || ! grep -q "${limit_line}" "${dropin_file}"; then + mkdir -p "${dropin_dir}" + printf "[Service]\n%s\n" "${limit_line}" > "${dropin_file}" + changed=1 + fi + fi +done + +if [ "${changed}" -eq 1 ]; then + sleep "$(( (RANDOM % 300) + 10 ))" + chroot /host /bin/systemctl daemon-reload + for unit in k3s k3s-agent; do + if [ -f "/host/etc/systemd/system/${unit}.service" ]; then + chroot /host /bin/systemctl restart "${unit}" + fi + done +fi + +sleep infinity diff --git a/services/maintenance/scripts/pod_cleaner.sh b/services/maintenance/scripts/pod_cleaner.sh new file mode 100644 index 0000000..2ec043e --- /dev/null +++ b/services/maintenance/scripts/pod_cleaner.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +set -euo pipefail + +for phase in Succeeded Failed; do + kubectl get pods -A --field-selector="status.phase=${phase}" \ + -o jsonpath='{range .items[*]}{.metadata.namespace}{" "}{.metadata.name}{"\n"}{end}' \ + | while read -r namespace name; do + if [ -n "${namespace}" ] && [ -n "${name}" ]; then + kubectl delete pod -n "${namespace}" "${name}" --ignore-not-found --grace-period=0 --wait=false + fi + done +done diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index 572c2c6..fb1b216 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + 
(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -53,7 +53,25 @@ "fields": "", "values": false } - } + }, + "links": [ + { + "title": "Workload namespaces only", + "url": 
"?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + }, + { + "title": "All namespaces", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22.%2A%22&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + }, + { + "title": "Infrastructure namespaces only", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + } + ], + "description": "Values are normalized within the selected scope; use panel links to switch scope." 
}, { "id": 2, @@ -71,7 +89,7 @@ }, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)", + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -182,5 +200,111 @@ "tags": [ "atlas", "gpu" - ] + ], + "templating": { + "list": [ + { + "name": "namespace_scope_cpu", + "label": "CPU namespace filter", + "type": "custom", + "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "current": { + "text": "workload namespaces only", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": true + }, + "options": [ + { + "text": "workload namespaces only", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": true + }, + { + "text": "all namespaces", + "value": "namespace=~\".*\"", + "selected": false + }, + { + "text": "infrastructure namespaces only", + "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": false + } + ], + "hide": 2, + "multi": false, + "includeAll": false, + "refresh": 1, + "sort": 0, + "skipUrlSync": false + }, + { + "name": "namespace_scope_gpu", + "label": "GPU namespace filter", + "type": "custom", + "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\",all 
namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "current": { + "text": "workload namespaces only", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": true + }, + "options": [ + { + "text": "workload namespaces only", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": true + }, + { + "text": "all namespaces", + "value": "namespace=~\".*\"", + "selected": false + }, + { + "text": "infrastructure namespaces only", + "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": false + } + ], + "hide": 2, + "multi": false, + "includeAll": false, + "refresh": 1, + "sort": 0, + "skipUrlSync": false + }, + { + "name": "namespace_scope_ram", + "label": "RAM namespace filter", + "type": "custom", + "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "current": { + "text": "workload namespaces only", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": true + }, + "options": [ + { + "text": "workload namespaces only", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": 
true + }, + { + "text": "all namespaces", + "value": "namespace=~\".*\"", + "selected": false + }, + { + "text": "infrastructure namespaces only", + "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": false + } + ], + "hide": 2, + "multi": false, + "includeAll": false, + "refresh": 1, + "sort": 0, + "skipUrlSync": false + } + ] + } } diff --git a/services/monitoring/dashboards/atlas-mail.json b/services/monitoring/dashboards/atlas-mail.json new file mode 100644 index 0000000..67c1766 --- /dev/null +++ b/services/monitoring/dashboards/atlas-mail.json @@ -0,0 +1,709 @@ +{ + "uid": "atlas-mail", + "title": "Atlas Mail", + "folderUid": "atlas-internal", + "editable": true, + "panels": [ + { + "id": 1, + "type": "stat", + "title": "Sent (1d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "max(postmark_outbound_sent{window=\"1d\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 2, + "type": "stat", + "title": "Sent (7d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "targets": [ + { + "expr": "max(postmark_outbound_sent{window=\"7d\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + 
"mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 3, + "type": "stat", + "title": "Mail Bounces (1d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "refId": "A", + "legendFormat": "Rate" + }, + { + "expr": "max(postmark_outbound_bounced{window=\"1d\"})", + "refId": "B", + "legendFormat": "Count" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "displayMode": "auto" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 5 + }, + { + "color": "orange", + "value": 8 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Rate" + }, + "properties": [ + { + "id": "unit", + "value": "percent" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Count" + }, + "properties": [ + { + "id": "unit", + "value": "none" + } + ] + } + ] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "name_and_value" + } + }, + { + "id": 4, + "type": "stat", + "title": "Success Rate (1d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + 
"h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "targets": [ + { + "expr": "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 90 + }, + { + "color": "yellow", + "value": 95 + }, + { + "color": "green", + "value": 98 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "stat", + "title": "Limit Used (30d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 4 + }, + "targets": [ + { + "expr": "max(postmark_sending_limit_used_percent)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "orange", + "value": 85 + }, + { + "color": "red", + "value": 95 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 6, + "type": "stat", + "title": "Send Limit (30d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 4 + }, + "targets": [ + { + "expr": 
"max(postmark_sending_limit)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 7, + "type": "stat", + "title": "Last Success", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 4 + }, + "targets": [ + { + "expr": "max(postmark_last_success_timestamp_seconds)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "dateTimeAsIso", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 8, + "type": "stat", + "title": "Exporter Errors", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 4 + }, + "targets": [ + { + "expr": "sum(postmark_request_errors_total)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + 
{ + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 13, + "type": "timeseries", + "title": "Bounce Rate (1d vs 7d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 12 + }, + "targets": [ + { + "expr": "max by (window) (postmark_outbound_bounce_rate)", + "refId": "A", + "legendFormat": "{{window}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 14, + "type": "timeseries", + "title": "Bounced (1d vs 7d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 12 + }, + "targets": [ + { + "expr": "max by (window) (postmark_outbound_bounced)", + "refId": "A", + "legendFormat": "{{window}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 15, + "type": "timeseries", + "title": "Sent (1d vs 7d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 20 + }, + "targets": [ + { + "expr": "max by (window) (postmark_outbound_sent)", + "refId": "A", + "legendFormat": "{{window}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, 
+ { + "id": 16, + "type": "timeseries", + "title": "Exporter Errors", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 20 + }, + "targets": [ + { + "expr": "sum(postmark_request_errors_total)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + } + ], + "time": { + "from": "now-30d", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "mail" + ] +} diff --git a/services/monitoring/dashboards/atlas-nodes.json b/services/monitoring/dashboards/atlas-nodes.json index 495c622..0bfd639 100644 --- a/services/monitoring/dashboards/atlas-nodes.json +++ b/services/monitoring/dashboards/atlas-nodes.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], @@ -46,7 +46,7 @@ "unit": "none", "custom": { "displayMode": "auto", - "valueSuffix": "/18" + "valueSuffix": "/19" } }, "overrides": [] @@ -142,7 +142,7 @@ }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})", + "expr": 
"sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"})", "refId": "A" } ], @@ -489,7 +489,7 @@ }, "targets": [ { - "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -526,7 +526,7 @@ }, "targets": [ { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } diff 
--git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 70062e0..a113d22 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -76,7 +76,7 @@ }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"}) or on() vector(0)", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"}) or on() vector(0)", "refId": "A" } ], @@ -449,14 +449,14 @@ }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "min": 0, - "max": 18, + "max": 19, "thresholds": { "mode": "absolute", "steps": [ @@ -466,15 +466,15 @@ }, { "color": "orange", - "value": 16 - }, - { - "color": "yellow", "value": 17 }, { - "color": "green", + "color": "yellow", "value": 18 + }, + { + "color": "green", + "value": 19 } ] } @@ -786,6 +786,330 @@ } ] }, + { + "id": 30, + "type": "stat", + "title": "Mail Sent (1d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 0, + "y": 8 + }, + "targets": [ + { + "expr": "max(postmark_outbound_sent{window=\"1d\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-mail dashboard", + "url": "/d/atlas-mail", + "targetBlank": true + } + ] + }, + { + "id": 31, + "type": "stat", + "title": "Mail Bounces (1d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 12, + "y": 8 + }, + "targets": [ + { + "expr": "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "refId": "A", + "legendFormat": "Rate" + }, + { + "expr": "max(postmark_outbound_bounced{window=\"1d\"})", + "refId": "B", + "legendFormat": "Count" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "displayMode": "auto" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 5 + }, + { + "color": "orange", + "value": 8 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Rate" + }, + "properties": [ + { + "id": "unit", + "value": "percent" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Count" + }, + "properties": [ + { + "id": "unit", + "value": "none" + } + ] + } + ] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "name_and_value" + }, + "links": [ + { + "title": "Open atlas-mail dashboard", + 
"url": "/d/atlas-mail", + "targetBlank": true + } + ] + }, + { + "id": 32, + "type": "stat", + "title": "Mail Success Rate (1d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 6, + "y": 8 + }, + "targets": [ + { + "expr": "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 90 + }, + { + "color": "yellow", + "value": 95 + }, + { + "color": "green", + "value": 98 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-mail dashboard", + "url": "/d/atlas-mail", + "targetBlank": true + } + ] + }, + { + "id": 33, + "type": "stat", + "title": "Mail Limit Used (30d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 18, + "y": 8 + }, + "targets": [ + { + "expr": "max(postmark_sending_limit_used_percent)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "orange", + "value": 85 + }, + { + "color": "red", + "value": 95 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": 
[ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-mail dashboard", + "url": "/d/atlas-mail", + "targetBlank": true + } + ] + }, { "id": 23, "type": "stat", @@ -1086,7 +1410,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ) / clamp_min(sum( 
sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1119,7 +1443,25 @@ "fields": "", "values": false } - } + }, + "links": [ + { + "title": "Workload namespaces only", + "url": "?var-namespace_scope_cpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + }, + { + "title": "All namespaces", + "url": "?var-namespace_scope_cpu=namespace%3D~%22.%2A%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + }, + { + "title": "Infrastructure namespaces only", + "url": "?var-namespace_scope_cpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + } + ], + "description": "Values are normalized within the selected scope; use panel links to switch scope." 
}, { "id": 12, @@ -1137,7 +1479,7 @@ }, "targets": [ { - "expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or 
(label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1170,7 +1512,25 @@ "fields": "", "values": false } - } + }, + "links": [ + { + "title": "Workload namespaces only", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + }, + { + "title": "All namespaces", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22.%2A%22&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + }, + { + "title": "Infrastructure namespaces only", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + } + ], + "description": "Values are normalized within the selected scope; use panel links to switch scope." 
}, { "id": 13, @@ -1188,7 +1548,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1221,7 +1581,25 @@ "fields": "", "values": false } - } + }, + "links": [ + { + "title": "Workload namespaces only", + "url": 
"?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22", + "targetBlank": false + }, + { + "title": "All namespaces", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22.%2A%22", + "targetBlank": false + }, + { + "title": "Infrastructure namespaces only", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22", + "targetBlank": false + } + ], + "description": "Values are normalized within the selected scope; use panel links to switch scope." 
}, { "id": 14, @@ -1239,7 +1617,7 @@ }, "targets": [ { - "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1286,7 +1664,7 @@ }, "targets": [ { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) 
group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1333,7 +1711,7 @@ }, "targets": [ { - "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1370,7 +1748,7 @@ }, "targets": [ { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() 
label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1791,7 +2169,110 @@ "overview" ], "templating": { - "list": [] + "list": [ + { + "name": "namespace_scope_cpu", + "label": "CPU namespace filter", + "type": "custom", + "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "current": { + "text": "workload namespaces only", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": true + }, + "options": [ + { + "text": "workload namespaces only", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": true + }, + { + "text": "all namespaces", + "value": "namespace=~\".*\"", + "selected": false + }, + { + "text": "infrastructure namespaces only", + "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": false + } + ], + "hide": 2, + "multi": false, + "includeAll": false, + "refresh": 1, + "sort": 0, + "skipUrlSync": false + }, + { + "name": "namespace_scope_gpu", + "label": "GPU namespace filter", + "type": "custom", + "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : 
namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "current": { + "text": "workload namespaces only", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": true + }, + "options": [ + { + "text": "workload namespaces only", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": true + }, + { + "text": "all namespaces", + "value": "namespace=~\".*\"", + "selected": false + }, + { + "text": "infrastructure namespaces only", + "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": false + } + ], + "hide": 2, + "multi": false, + "includeAll": false, + "refresh": 1, + "sort": 0, + "skipUrlSync": false + }, + { + "name": "namespace_scope_ram", + "label": "RAM namespace filter", + "type": "custom", + "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "current": { + "text": "workload namespaces only", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": true + }, + "options": [ + { + "text": "workload namespaces only", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": true + }, + { + "text": "all namespaces", + "value": 
"namespace=~\".*\"", + "selected": false + }, + { + "text": "infrastructure namespaces only", + "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": false + } + ], + "hide": 2, + "multi": false, + "includeAll": false, + "refresh": 1, + "sort": 0, + "skipUrlSync": false + } + ] }, "time": { "from": "now-1h", diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json index 4b2a54a..ff2dbdd 100644 --- a/services/monitoring/dashboards/atlas-pods.json +++ b/services/monitoring/dashboards/atlas-pods.json @@ -200,7 +200,7 @@ }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"})", "refId": "A" } ], @@ -520,7 +520,7 @@ }, "targets": [ { - "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.007) or (sum by 
(node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.022)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.01) or (sum by (node) 
(kube_node_info{node=\"titan-10\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.022)))))", + "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) 
(kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 
0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)))))", "refId": "A", "instant": true, "format": "table" diff --git a/services/monitoring/dashboards/atlas-storage.json b/services/monitoring/dashboards/atlas-storage.json index 2e548b2..d93a941 100644 --- a/services/monitoring/dashboards/atlas-storage.json +++ b/services/monitoring/dashboards/atlas-storage.json @@ -409,6 +409,138 @@ } }, "timeFrom": "90d" + }, + { + "id": 30, + "type": "stat", + "title": "Maintenance Sweepers Ready", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 0, + "y": 44 + }, + "targets": [ + { + "expr": "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 75 + }, + { + "color": "red", + "value": 91.5 + } + ] + }, + "unit": "percent", + "custom": { + 
"displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 31, + "type": "stat", + "title": "Maintenance Cron Freshness (s)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 12, + "y": 44 + }, + "targets": [ + { + "expr": "time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=~\"image-sweeper|grafana-smtp-sync\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 3600 + }, + { + "color": "red", + "value": 10800 + } + ] + }, + "unit": "s", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } } ], "time": { diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml index cd37b7b..7627420 100644 --- a/services/monitoring/dcgm-exporter.yaml +++ b/services/monitoring/dcgm-exporter.yaml @@ -28,13 +28,14 @@ spec: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - - key: kubernetes.io/hostname + - key: kubernetes.io/arch operator: In values: - - titan-20 - - titan-21 - - titan-22 - - titan-24 + - amd64 + - key: jetson + operator: NotIn + values: + - "true" tolerations: - operator: Exists containers: diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml new file mode 100644 index 0000000..c679bff 
--- /dev/null +++ b/services/monitoring/grafana-alerting-config.yaml @@ -0,0 +1,384 @@ +# services/monitoring/grafana-alerting-config.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-alerting + namespace: monitoring + labels: + grafana_alerting: "1" +data: + alerting.yaml: | + apiVersion: 1 + contactPoints: + - orgId: 1 + name: email-admins + receivers: + - uid: email-admins + type: email + settings: + addresses: ${GRAFANA_ALERT_EMAILS} + singleEmail: true + policies: + - orgId: 1 + receiver: email-admins + group_by: + - alertname + rules.yaml: | + apiVersion: 1 + groups: + - orgId: 1 + name: atlas-disk + folder: Alerts + interval: 1m + rules: + - uid: disk-pressure-root + title: "Node rootfs high (>80%)" + condition: C + for: "10m" + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: atlas-vm + model: + intervalMs: 60000 + maxDataPoints: 43200 + expr: avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")) + legendFormat: '{{node}}' + datasource: + type: prometheus + uid: atlas-vm + - refId: B + datasourceUid: __expr__ + model: + expression: A + intervalMs: 60000 + maxDataPoints: 43200 + reducer: last + type: reduce + - refId: C + datasourceUid: __expr__ + model: + expression: B + intervalMs: 60000 + maxDataPoints: 43200 + type: threshold + conditions: + - evaluator: + params: [80] + type: gt + operator: + type: and + reducer: + type: last + type: query + noDataState: NoData + execErrState: Error + annotations: + summary: "{{ $labels.node }} rootfs >80% for 10m" + labels: + severity: warning + - uid: disk-growth-1h + title: "Node rootfs growing fast (>1Gi in 1h)" + condition: C + for: "10m" + data: + - refId: A + relativeTimeRange: + from: 3600 + to: 0 + datasourceUid: atlas-vm + 
model: + intervalMs: 60000 + maxDataPoints: 43200 + expr: increase((node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} - node_filesystem_free_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})[1h:]) / 1024 / 1024 / 1024 + legendFormat: '{{instance}}' + datasource: + type: prometheus + uid: atlas-vm + - refId: B + datasourceUid: __expr__ + model: + expression: A + intervalMs: 60000 + maxDataPoints: 43200 + reducer: last + type: reduce + - refId: C + datasourceUid: __expr__ + model: + expression: B + intervalMs: 60000 + maxDataPoints: 43200 + type: threshold + conditions: + - evaluator: + params: [1] + type: gt + operator: + type: and + reducer: + type: last + type: query + noDataState: NoData + execErrState: Error + annotations: + summary: "{{ $labels.instance }} rootfs grew >1Gi in the last hour" + labels: + severity: warning + - orgId: 1 + name: atlas-cpu + folder: Alerts + interval: 1m + rules: + - uid: cpu-high-10m + title: "Node CPU high (>90% for 10m)" + condition: C + for: 10m + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: atlas-vm + model: + intervalMs: 60000 + maxDataPoints: 43200 + expr: avg_over_time(((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100)[10m:1m]) + legendFormat: '{{instance}}' + datasource: + type: prometheus + uid: atlas-vm + - refId: B + datasourceUid: __expr__ + model: + expression: A + intervalMs: 60000 + maxDataPoints: 43200 + reducer: last + type: reduce + - refId: C + datasourceUid: __expr__ + model: + expression: B + intervalMs: 60000 + maxDataPoints: 43200 + type: threshold + conditions: + - evaluator: + params: [90] + type: gt + operator: + type: and + reducer: + type: last + type: query + noDataState: NoData + execErrState: Error + annotations: + summary: "{{ $labels.instance }} CPU >90% for 10m" + labels: + severity: warning + - orgId: 1 + name: maintenance + folder: Alerts + interval: 1m + rules: + - uid: maint-sweeper + title: "Maintenance sweeper not ready" 
+ condition: C + for: "5m" + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: atlas-vm + model: + intervalMs: 60000 + maxDataPoints: 43200 + expr: kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"} + legendFormat: '{{daemonset}}' + datasource: + type: prometheus + uid: atlas-vm + - refId: B + datasourceUid: __expr__ + model: + expression: A + intervalMs: 60000 + maxDataPoints: 43200 + reducer: last + type: reduce + - refId: C + datasourceUid: __expr__ + model: + expression: B + intervalMs: 60000 + maxDataPoints: 43200 + type: threshold + conditions: + - evaluator: + params: [1] + type: lt + operator: + type: and + reducer: + type: last + type: query + noDataState: NoData + execErrState: Error + annotations: + summary: "node-image-sweeper not fully ready" + labels: + severity: warning + - uid: maint-cron-stale + title: "Maintenance CronJobs stale (>3h since success)" + condition: C + for: "5m" + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: atlas-vm + model: + expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob=~"image-sweeper|grafana-smtp-sync"}) + intervalMs: 60000 + maxDataPoints: 43200 + legendFormat: '{{cronjob}}' + datasource: + type: prometheus + uid: atlas-vm + - refId: B + datasourceUid: __expr__ + model: + expression: A + intervalMs: 60000 + maxDataPoints: 43200 + reducer: last + type: reduce + - refId: C + datasourceUid: __expr__ + model: + expression: B + intervalMs: 60000 + maxDataPoints: 43200 + type: threshold + conditions: + - evaluator: + params: [10800] + type: gt + operator: + type: and + reducer: + type: last + type: query + noDataState: NoData + execErrState: Error + annotations: + summary: "Maintenance cronjob stale >3h since last success" + labels: + severity: 
warning + - orgId: 1 + name: postmark + folder: Alerts + interval: 1m + rules: + - uid: postmark-bounce + title: "Postmark bounce rate high" + condition: C + for: "10m" + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: atlas-vm + model: + intervalMs: 60000 + maxDataPoints: 43200 + expr: postmark_outbound_bounce_rate{window="1d"} + legendFormat: bounce 1d + datasource: + type: prometheus + uid: atlas-vm + - refId: B + datasourceUid: __expr__ + model: + expression: A + intervalMs: 60000 + maxDataPoints: 43200 + reducer: last + type: reduce + - refId: C + datasourceUid: __expr__ + model: + expression: B + intervalMs: 60000 + maxDataPoints: 43200 + type: threshold + conditions: + - evaluator: + params: [5] + type: gt + operator: + type: and + reducer: + type: last + type: query + noDataState: NoData + execErrState: Error + annotations: + summary: "Postmark 1d bounce rate >5%" + labels: + severity: warning + - uid: postmark-api-down + title: "Postmark exporter down" + condition: C + for: "5m" + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: atlas-vm + model: + intervalMs: 60000 + maxDataPoints: 43200 + expr: postmark_api_up + legendFormat: api up + datasource: + type: prometheus + uid: atlas-vm + - refId: B + datasourceUid: __expr__ + model: + expression: A + intervalMs: 60000 + maxDataPoints: 43200 + reducer: last + type: reduce + - refId: C + datasourceUid: __expr__ + model: + expression: B + intervalMs: 60000 + maxDataPoints: 43200 + type: threshold + conditions: + - evaluator: + params: [1] + type: lt + operator: + type: and + reducer: + type: last + type: query + noDataState: NoData + execErrState: Error + annotations: + summary: "Postmark exporter reports API down" + labels: + severity: critical diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index 48725de..49b5d39 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ 
b/services/monitoring/grafana-dashboard-gpu.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or 
on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -62,7 +62,25 @@ data: "fields": "", "values": false } - } + }, + "links": [ + { + "title": "Workload namespaces only", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + }, + { + "title": "All namespaces", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22.%2A%22&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + }, + { + "title": "Infrastructure namespaces only", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + } + ], + "description": "Values are normalized within the selected scope; use panel links to switch scope." 
}, { "id": 2, @@ -80,7 +98,7 @@ data: }, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)", + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -191,5 +209,111 @@ data: "tags": [ "atlas", "gpu" - ] + ], + "templating": { + "list": [ + { + "name": "namespace_scope_cpu", + "label": "CPU namespace filter", + "type": "custom", + "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "current": { + "text": "workload namespaces only", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": true + }, + "options": [ + { + "text": "workload namespaces only", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": true + }, + { + "text": "all namespaces", + "value": "namespace=~\".*\"", + "selected": false + }, + { + "text": "infrastructure namespaces only", + "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": false + } + ], + "hide": 2, + "multi": false, + "includeAll": false, + "refresh": 1, + "sort": 0, + "skipUrlSync": false + }, + { + "name": "namespace_scope_gpu", + "label": "GPU namespace filter", + "type": "custom", + "query": "workload namespaces only : 
namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "current": { + "text": "workload namespaces only", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": true + }, + "options": [ + { + "text": "workload namespaces only", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": true + }, + { + "text": "all namespaces", + "value": "namespace=~\".*\"", + "selected": false + }, + { + "text": "infrastructure namespaces only", + "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": false + } + ], + "hide": 2, + "multi": false, + "includeAll": false, + "refresh": 1, + "sort": 0, + "skipUrlSync": false + }, + { + "name": "namespace_scope_ram", + "label": "RAM namespace filter", + "type": "custom", + "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "current": { + "text": "workload namespaces only", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": true + }, + "options": [ + { + "text": "workload namespaces only", + "value": 
"namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": true + }, + { + "text": "all namespaces", + "value": "namespace=~\".*\"", + "selected": false + }, + { + "text": "infrastructure namespaces only", + "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": false + } + ], + "hide": 2, + "multi": false, + "includeAll": false, + "refresh": 1, + "sort": 0, + "skipUrlSync": false + } + ] + } } diff --git a/services/monitoring/grafana-dashboard-mail.yaml b/services/monitoring/grafana-dashboard-mail.yaml new file mode 100644 index 0000000..4c011a8 --- /dev/null +++ b/services/monitoring/grafana-dashboard-mail.yaml @@ -0,0 +1,718 @@ +# services/monitoring/grafana-dashboard-mail.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-mail + labels: + grafana_dashboard: "1" +data: + atlas-mail.json: | + { + "uid": "atlas-mail", + "title": "Atlas Mail", + "folderUid": "atlas-internal", + "editable": true, + "panels": [ + { + "id": 1, + "type": "stat", + "title": "Sent (1d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "max(postmark_outbound_sent{window=\"1d\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + 
} + }, + { + "id": 2, + "type": "stat", + "title": "Sent (7d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "targets": [ + { + "expr": "max(postmark_outbound_sent{window=\"7d\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 3, + "type": "stat", + "title": "Mail Bounces (1d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "refId": "A", + "legendFormat": "Rate" + }, + { + "expr": "max(postmark_outbound_bounced{window=\"1d\"})", + "refId": "B", + "legendFormat": "Count" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "displayMode": "auto" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 5 + }, + { + "color": "orange", + "value": 8 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Rate" + }, + "properties": [ + { + "id": "unit", + "value": "percent" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Count" + }, + "properties": [ + { + "id": "unit", + "value": "none" + } + ] + } + ] + }, + "options": { + "colorMode": 
"value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "name_and_value" + } + }, + { + "id": 4, + "type": "stat", + "title": "Success Rate (1d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "targets": [ + { + "expr": "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 90 + }, + { + "color": "yellow", + "value": 95 + }, + { + "color": "green", + "value": 98 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "stat", + "title": "Limit Used (30d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 4 + }, + "targets": [ + { + "expr": "max(postmark_sending_limit_used_percent)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "orange", + "value": 85 + }, + { + "color": "red", + "value": 95 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + 
"calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 6, + "type": "stat", + "title": "Send Limit (30d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 4 + }, + "targets": [ + { + "expr": "max(postmark_sending_limit)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 7, + "type": "stat", + "title": "Last Success", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 4 + }, + "targets": [ + { + "expr": "max(postmark_last_success_timestamp_seconds)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "dateTimeAsIso", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 8, + "type": "stat", + "title": "Exporter Errors", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + 
"x": 18, + "y": 4 + }, + "targets": [ + { + "expr": "sum(postmark_request_errors_total)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 13, + "type": "timeseries", + "title": "Bounce Rate (1d vs 7d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 12 + }, + "targets": [ + { + "expr": "max by (window) (postmark_outbound_bounce_rate)", + "refId": "A", + "legendFormat": "{{window}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 14, + "type": "timeseries", + "title": "Bounced (1d vs 7d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 12 + }, + "targets": [ + { + "expr": "max by (window) (postmark_outbound_bounced)", + "refId": "A", + "legendFormat": "{{window}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 15, + "type": "timeseries", + "title": "Sent (1d vs 7d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 20 + }, + "targets": 
[ + { + "expr": "max by (window) (postmark_outbound_sent)", + "refId": "A", + "legendFormat": "{{window}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 16, + "type": "timeseries", + "title": "Exporter Errors", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 20 + }, + "targets": [ + { + "expr": "sum(postmark_request_errors_total)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + } + ], + "time": { + "from": "now-30d", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "mail" + ] + } diff --git a/services/monitoring/grafana-dashboard-nodes.yaml b/services/monitoring/grafana-dashboard-nodes.yaml index 542daca..5e02c18 100644 --- a/services/monitoring/grafana-dashboard-nodes.yaml +++ b/services/monitoring/grafana-dashboard-nodes.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], @@ -55,7 +55,7 @@ data: "unit": "none", "custom": { "displayMode": "auto", - "valueSuffix": "/18" + "valueSuffix": "/19" } }, "overrides": [] @@ -151,7 +151,7 @@ data: 
}, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"})", "refId": "A" } ], @@ -498,7 +498,7 @@ data: }, "targets": [ { - "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -535,7 +535,7 @@ data: }, "targets": [ { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * 
on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index cfd2cd6..e627658 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -85,7 +85,7 @@ data: }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"}) or on() vector(0)", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"}) or on() vector(0)", "refId": "A" } ], @@ -458,14 +458,14 @@ data: }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "min": 0, - "max": 18, + "max": 19, "thresholds": { "mode": "absolute", "steps": [ @@ -475,15 +475,15 @@ data: }, { "color": "orange", - "value": 16 - }, - { - "color": "yellow", "value": 17 }, { - "color": "green", + "color": "yellow", "value": 18 + }, + { + "color": "green", + "value": 19 } ] } @@ -795,6 +795,330 @@ data: } ] }, + { + "id": 30, + "type": "stat", + "title": "Mail Sent (1d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" 
+ }, + "gridPos": { + "h": 2, + "w": 6, + "x": 0, + "y": 8 + }, + "targets": [ + { + "expr": "max(postmark_outbound_sent{window=\"1d\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-mail dashboard", + "url": "/d/atlas-mail", + "targetBlank": true + } + ] + }, + { + "id": 31, + "type": "stat", + "title": "Mail Bounces (1d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 12, + "y": 8 + }, + "targets": [ + { + "expr": "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "refId": "A", + "legendFormat": "Rate" + }, + { + "expr": "max(postmark_outbound_bounced{window=\"1d\"})", + "refId": "B", + "legendFormat": "Count" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "displayMode": "auto" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 5 + }, + { + "color": "orange", + "value": 8 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Rate" + }, + "properties": [ + { + "id": "unit", + "value": "percent" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Count" + }, + "properties": [ + { + "id": "unit", + "value": "none" + } + ] + } + ] + }, + "options": { + "colorMode": "value", + "graphMode": "area", 
+ "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "name_and_value" + }, + "links": [ + { + "title": "Open atlas-mail dashboard", + "url": "/d/atlas-mail", + "targetBlank": true + } + ] + }, + { + "id": 32, + "type": "stat", + "title": "Mail Success Rate (1d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 6, + "y": 8 + }, + "targets": [ + { + "expr": "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 90 + }, + { + "color": "yellow", + "value": 95 + }, + { + "color": "green", + "value": 98 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-mail dashboard", + "url": "/d/atlas-mail", + "targetBlank": true + } + ] + }, + { + "id": 33, + "type": "stat", + "title": "Mail Limit Used (30d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 18, + "y": 8 + }, + "targets": [ + { + "expr": "max(postmark_sending_limit_used_percent)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "orange", + "value": 85 + }, + { + "color": "red", + "value": 95 + } + ] + }, + "unit": 
"percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-mail dashboard", + "url": "/d/atlas-mail", + "targetBlank": true + } + ] + }, { "id": 23, "type": "stat", @@ -1095,7 +1419,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "100 * ( 
sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1128,7 +1452,25 @@ data: "fields": "", "values": false } - } + }, + "links": [ + { + "title": "Workload namespaces only", + "url": "?var-namespace_scope_cpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + }, + { + "title": "All namespaces", + "url": "?var-namespace_scope_cpu=namespace%3D~%22.%2A%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + }, + { + "title": "Infrastructure namespaces only", + "url": "?var-namespace_scope_cpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + } + ], + "description": "Values are normalized within the selected scope; use panel links to switch scope." 
}, { "id": 12, @@ -1146,7 +1488,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or 
(label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1179,7 +1521,25 @@ data: "fields": "", "values": false } - } + }, + "links": [ + { + "title": "Workload namespaces only", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + }, + { + "title": "All namespaces", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22.%2A%22&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + }, + { + "title": "Infrastructure namespaces only", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + } + ], + "description": "Values are normalized within the selected scope; use panel links to switch scope." 
}, { "id": 13, @@ -1197,7 +1557,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1230,7 +1590,25 @@ data: "fields": "", "values": false } - } + }, + "links": [ + { + "title": "Workload namespaces only", + "url": 
"?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22", + "targetBlank": false + }, + { + "title": "All namespaces", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22.%2A%22", + "targetBlank": false + }, + { + "title": "Infrastructure namespaces only", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22", + "targetBlank": false + } + ], + "description": "Values are normalized within the selected scope; use panel links to switch scope." 
}, { "id": 14, @@ -1248,7 +1626,7 @@ data: }, "targets": [ { - "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1295,7 +1673,7 @@ data: }, "targets": [ { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) 
group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1342,7 +1720,7 @@ data: }, "targets": [ { - "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1379,7 +1757,7 @@ data: }, "targets": [ { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() 
label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1800,7 +2178,110 @@ data: "overview" ], "templating": { - "list": [] + "list": [ + { + "name": "namespace_scope_cpu", + "label": "CPU namespace filter", + "type": "custom", + "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "current": { + "text": "workload namespaces only", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": true + }, + "options": [ + { + "text": "workload namespaces only", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": true + }, + { + "text": "all namespaces", + "value": "namespace=~\".*\"", + "selected": false + }, + { + "text": "infrastructure namespaces only", + "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": false + } + ], + "hide": 2, + "multi": false, + "includeAll": false, + "refresh": 1, + "sort": 0, + "skipUrlSync": false + }, + { + "name": "namespace_scope_gpu", + "label": "GPU namespace filter", + "type": "custom", + "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : 
namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "current": { + "text": "workload namespaces only", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": true + }, + "options": [ + { + "text": "workload namespaces only", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": true + }, + { + "text": "all namespaces", + "value": "namespace=~\".*\"", + "selected": false + }, + { + "text": "infrastructure namespaces only", + "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": false + } + ], + "hide": 2, + "multi": false, + "includeAll": false, + "refresh": 1, + "sort": 0, + "skipUrlSync": false + }, + { + "name": "namespace_scope_ram", + "label": "RAM namespace filter", + "type": "custom", + "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "current": { + "text": "workload namespaces only", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": true + }, + "options": [ + { + "text": "workload namespaces only", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": true + }, + { + "text": "all namespaces", + "value": 
"namespace=~\".*\"", + "selected": false + }, + { + "text": "infrastructure namespaces only", + "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "selected": false + } + ], + "hide": 2, + "multi": false, + "includeAll": false, + "refresh": 1, + "sort": 0, + "skipUrlSync": false + } + ] }, "time": { "from": "now-1h", diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml index b7c49d5..5ea8343 100644 --- a/services/monitoring/grafana-dashboard-pods.yaml +++ b/services/monitoring/grafana-dashboard-pods.yaml @@ -209,7 +209,7 @@ data: }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"})", "refId": "A" } ], @@ -529,7 +529,7 @@ data: }, "targets": [ { - "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.007) 
or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.022)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.01) or (sum by (node) 
(kube_node_info{node=\"titan-10\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.022)))))", + "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) 
(kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 
0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)))))", "refId": "A", "instant": true, "format": "table" diff --git a/services/monitoring/grafana-dashboard-storage.yaml b/services/monitoring/grafana-dashboard-storage.yaml index 8aef820..5ce4186 100644 --- a/services/monitoring/grafana-dashboard-storage.yaml +++ b/services/monitoring/grafana-dashboard-storage.yaml @@ -418,6 +418,138 @@ data: } }, "timeFrom": "90d" + }, + { + "id": 30, + "type": "stat", + "title": "Maintenance Sweepers Ready", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 0, + "y": 44 + }, + "targets": [ + { + "expr": "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 75 + }, + { + "color": "red", + "value": 91.5 + } + ] + }, + "unit": "percent", + 
"custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 31, + "type": "stat", + "title": "Maintenance Cron Freshness (s)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 12, + "y": 44 + }, + "targets": [ + { + "expr": "time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=~\"image-sweeper|grafana-smtp-sync\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 3600 + }, + { + "color": "red", + "value": 10800 + } + ] + }, + "unit": "s", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } } ], "time": { diff --git a/services/monitoring/grafana-org-bootstrap.yaml b/services/monitoring/grafana-org-bootstrap.yaml new file mode 100644 index 0000000..0872f4a --- /dev/null +++ b/services/monitoring/grafana-org-bootstrap.yaml @@ -0,0 +1,110 @@ +# services/monitoring/grafana-org-bootstrap.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: grafana-org-bootstrap-1 + namespace: monitoring +spec: + backoffLimit: 2 + template: + spec: + restartPolicy: OnFailure + containers: + - name: bootstrap + image: python:3.11-alpine + env: + - name: GRAFANA_URL + value: http://grafana + - name: OVERVIEW_ORG_NAME + value: Overview + - name: GRAFANA_USER + valueFrom: + secretKeyRef: + name: grafana-admin + key: admin-user 
+ - name: GRAFANA_PASSWORD + valueFrom: + secretKeyRef: + name: grafana-admin + key: admin-password + command: ["/bin/sh", "-c"] + args: + - | + set -euo pipefail + python - <<'PY' + import base64 + import json + import os + import time + import urllib.error + import urllib.request + + grafana_url = os.environ["GRAFANA_URL"].rstrip("/") + org_name = os.environ["OVERVIEW_ORG_NAME"] + user = os.environ["GRAFANA_USER"] + password = os.environ["GRAFANA_PASSWORD"] + + auth = base64.b64encode(f"{user}:{password}".encode()).decode() + base_headers = { + "Authorization": f"Basic {auth}", + "Content-Type": "application/json", + } + + def request(path, method="GET", data=None, org_id=None): + headers = dict(base_headers) + if org_id is not None: + headers["X-Grafana-Org-Id"] = str(org_id) + payload = None + if data is not None: + payload = json.dumps(data).encode() + req = urllib.request.Request( + f"{grafana_url}{path}", + data=payload, + headers=headers, + method=method, + ) + return urllib.request.urlopen(req, timeout=10) + + for _ in range(60): + try: + with request("/api/health") as resp: + if resp.status == 200: + break + except Exception: + time.sleep(2) + else: + raise SystemExit("Grafana API did not become ready in time") + + with request("/api/orgs") as resp: + orgs = json.load(resp) + org_id = next((org["id"] for org in orgs if org["name"] == org_name), None) + if org_id is None: + with request("/api/orgs", method="POST", data={"name": org_name}) as resp: + org_id = json.load(resp).get("orgId") + if org_id is None: + raise SystemExit(f"Unable to resolve org ID for {org_name}") + + datasource = { + "name": "VictoriaMetrics", + "type": "prometheus", + "access": "proxy", + "url": "http://victoria-metrics-single-server:8428", + "isDefault": True, + "uid": "atlas-vm", + "jsonData": {"timeInterval": "15s"}, + } + try: + with request("/api/datasources/uid/atlas-vm", org_id=org_id) as resp: + if resp.status != 200: + raise urllib.error.HTTPError(resp.url, resp.status, 
resp.reason, resp.headers, None) + except urllib.error.HTTPError as err: + if err.code != 404: + raise + with request("/api/datasources", method="POST", data=datasource, org_id=org_id): + pass + + with request("/api/admin/provisioning/datasources/reload", method="POST"): + pass + with request("/api/admin/provisioning/dashboards/reload", method="POST"): + pass + PY diff --git a/services/monitoring/grafana-smtp-sync-cronjob.yaml b/services/monitoring/grafana-smtp-sync-cronjob.yaml new file mode 100644 index 0000000..3b92d4c --- /dev/null +++ b/services/monitoring/grafana-smtp-sync-cronjob.yaml @@ -0,0 +1,44 @@ +# services/monitoring/grafana-smtp-sync-cronjob.yaml +apiVersion: batch/v1 +kind: CronJob +metadata: + name: grafana-smtp-sync + namespace: monitoring +spec: + schedule: "15 3 * * *" + concurrencyPolicy: Forbid + jobTemplate: + spec: + template: + spec: + serviceAccountName: grafana-smtp-sync + restartPolicy: OnFailure + containers: + - name: sync + image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 + command: ["/bin/sh", "-c"] + args: + - | + set -euo pipefail + if ! 
command -v jq >/dev/null 2>&1; then + apt-get update >/dev/null && apt-get install -y jq >/dev/null + fi + exec /scripts/sync.sh + env: + - name: SOURCE_NS + value: mailu-mailserver + - name: SOURCE_SECRET + value: mailu-postmark-relay + - name: TARGET_NS + value: monitoring + - name: TARGET_SECRET + value: grafana-smtp + volumeMounts: + - name: script + mountPath: /scripts + readOnly: true + volumes: + - name: script + configMap: + name: grafana-smtp-sync-script + defaultMode: 0555 diff --git a/services/monitoring/grafana-smtp-sync-rbac.yaml b/services/monitoring/grafana-smtp-sync-rbac.yaml new file mode 100644 index 0000000..532d622 --- /dev/null +++ b/services/monitoring/grafana-smtp-sync-rbac.yaml @@ -0,0 +1,49 @@ +# services/monitoring/grafana-smtp-sync-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: grafana-smtp-sync +rules: + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get"] + resourceNames: + - mailu-postmark-relay +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: grafana-smtp-sync +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: grafana-smtp-sync +subjects: + - kind: ServiceAccount + name: grafana-smtp-sync + namespace: monitoring + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: grafana-smtp-sync + namespace: monitoring +rules: + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "create", "update", "patch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: grafana-smtp-sync + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: grafana-smtp-sync +subjects: + - kind: ServiceAccount + name: grafana-smtp-sync + namespace: monitoring diff --git a/services/monitoring/grafana-smtp-sync-serviceaccount.yaml b/services/monitoring/grafana-smtp-sync-serviceaccount.yaml new file mode 100644 index 0000000..6ad0e18 --- /dev/null +++ 
b/services/monitoring/grafana-smtp-sync-serviceaccount.yaml @@ -0,0 +1,6 @@ +# services/monitoring/grafana-smtp-sync-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: grafana-smtp-sync + namespace: monitoring diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 3fd76db..704b91d 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -195,6 +195,15 @@ spec: target_label: instance replacement: titan-db + # --- titan-jh node_exporter (external control-plane host) --- + - job_name: "titan-jh" + static_configs: + - targets: ["192.168.22.8:9100"] + relabel_configs: + - source_labels: [__address__] + target_label: instance + replacement: titan-jh + # --- cert-manager (pods expose on 9402) --- - job_name: "cert-manager" kubernetes_sd_configs: [{ role: pod }] @@ -245,10 +254,21 @@ spec: enabled: true size: 20Gi storageClassName: astreae + deploymentStrategy: + type: Recreate service: type: ClusterIP env: - GF_AUTH_ANONYMOUS_ENABLED: "false" + GF_AUTH_GENERIC_OAUTH_CLIENT_ID: "grafana" + GF_AUTH_GENERIC_OAUTH_CLIENT_SECRET: "" + GF_AUTH_ANONYMOUS_ENABLED: "true" + GF_AUTH_ANONYMOUS_ORG_NAME: "Overview" + GF_AUTH_ANONYMOUS_ORG_ROLE: "Viewer" + GF_SMTP_ENABLED: "true" + GF_SMTP_HOST: "smtp.postmarkapp.com:587" + GF_SMTP_FROM: "alerts@bstein.dev" + GF_SMTP_FROM_NAME: "Atlas Alerts" + GRAFANA_ALERT_EMAILS: "alerts@bstein.dev" GF_SECURITY_ALLOW_EMBEDDING: "true" GF_AUTH_GENERIC_OAUTH_ENABLED: "true" GF_AUTH_GENERIC_OAUTH_NAME: "Keycloak" @@ -258,17 +278,9 @@ spec: GF_AUTH_GENERIC_OAUTH_TOKEN_URL: "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/token" GF_AUTH_GENERIC_OAUTH_API_URL: "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/userinfo" GF_AUTH_GENERIC_OAUTH_ROLE_ATTRIBUTE_PATH: "contains(groups, 'admin') && 'Admin' || 'Viewer'" + GF_AUTH_GENERIC_OAUTH_USE_PKCE: "true" GF_AUTH_GENERIC_OAUTH_TLS_SKIP_VERIFY_INSECURE: "false" 
GF_AUTH_SIGNOUT_REDIRECT_URL: "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/logout?redirect_uri=https://metrics.bstein.dev/" - envValueFrom: - GF_AUTH_GENERIC_OAUTH_CLIENT_ID: - secretKeyRef: - name: grafana-oidc - key: client_id - GF_AUTH_GENERIC_OAUTH_CLIENT_SECRET: - secretKeyRef: - name: grafana-oidc - key: client_secret grafana.ini: server: domain: metrics.bstein.dev @@ -279,6 +291,15 @@ spec: hide_version: true users: default_theme: dark + envValueFrom: + GF_SMTP_USER: + secretKeyRef: + name: grafana-smtp + key: username + GF_SMTP_PASSWORD: + secretKeyRef: + name: grafana-smtp + key: password ingress: enabled: true ingressClassName: traefik @@ -303,6 +324,16 @@ spec: jsonData: timeInterval: "15s" uid: atlas-vm + orgId: 1 + - name: VictoriaMetrics + type: prometheus + access: proxy + url: http://victoria-metrics-single-server:8428 + isDefault: true + jsonData: + timeInterval: "15s" + uid: atlas-vm + orgId: 2 dashboardProviders: dashboardproviders.yaml: apiVersion: 1 @@ -315,6 +346,14 @@ spec: editable: false options: path: /var/lib/grafana/dashboards/overview + - name: overview-public + orgId: 2 + folder: Overview + type: file + disableDeletion: false + editable: false + options: + path: /var/lib/grafana/dashboards/overview-public - name: pods orgId: 1 folder: Atlas Internal @@ -355,18 +394,32 @@ spec: editable: true options: path: /var/lib/grafana/dashboards/network + - name: mail + orgId: 1 + folder: Atlas Internal + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/mail dashboardsConfigMaps: overview: grafana-dashboard-overview + overview-public: grafana-dashboard-overview pods: grafana-dashboard-pods nodes: grafana-dashboard-nodes storage: grafana-dashboard-storage gpu: grafana-dashboard-gpu network: grafana-dashboard-network + mail: grafana-dashboard-mail extraConfigmapMounts: - name: grafana-folders mountPath: /etc/grafana/provisioning/folders configMap: grafana-folders readOnly: true + - 
name: grafana-alerting + mountPath: /etc/grafana/provisioning/alerting + configMap: grafana-alerting + readOnly: true --- diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml new file mode 100644 index 0000000..8788b20 --- /dev/null +++ b/services/monitoring/jetson-tegrastats-exporter.yaml @@ -0,0 +1,80 @@ +# services/monitoring/jetson-tegrastats-exporter.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: jetson-tegrastats-exporter + namespace: monitoring + labels: + app: jetson-tegrastats-exporter +spec: + selector: + matchLabels: + app: jetson-tegrastats-exporter + template: + metadata: + labels: + app: jetson-tegrastats-exporter + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9100" + spec: + serviceAccountName: default + hostPID: true + tolerations: + - operator: Exists + nodeSelector: + jetson: "true" + containers: + - name: exporter + # Exposes tegrastats output as Prometheus metrics for Jetson devices. 
+ image: python:3.10-slim + imagePullPolicy: IfNotPresent + securityContext: + privileged: true + ports: + - name: metrics + containerPort: 9100 + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 256Mi + env: + - name: JETSON_EXPORTER_PORT + value: "9100" + volumeMounts: + - name: script + mountPath: /etc/tegrastats-exporter + readOnly: true + - name: tegrastats-bin + mountPath: /host/usr/bin/tegrastats + readOnly: true + command: + - python + - /etc/tegrastats-exporter/exporter.py + volumes: + - name: script + configMap: + name: jetson-tegrastats-exporter-script + defaultMode: 0555 + - name: tegrastats-bin + hostPath: + path: /usr/bin/tegrastats + type: File +--- +apiVersion: v1 +kind: Service +metadata: + name: jetson-tegrastats-exporter + namespace: monitoring + labels: + app: jetson-tegrastats-exporter +spec: + selector: + app: jetson-tegrastats-exporter + ports: + - name: metrics + port: 9100 + targetPort: metrics diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index a50a1c1..0dafba7 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -11,6 +11,35 @@ resources: - grafana-dashboard-storage.yaml - grafana-dashboard-network.yaml - grafana-dashboard-gpu.yaml + - grafana-dashboard-mail.yaml - dcgm-exporter.yaml + - jetson-tegrastats-exporter.yaml + - postmark-exporter-service.yaml + - postmark-exporter-deployment.yaml + - grafana-alerting-config.yaml + - grafana-smtp-sync-serviceaccount.yaml + - grafana-smtp-sync-rbac.yaml + - grafana-smtp-sync-cronjob.yaml - grafana-folders.yaml - helmrelease.yaml + - grafana-org-bootstrap.yaml + +configMapGenerator: + - name: postmark-exporter-script + namespace: monitoring + files: + - monitoring_postmark_exporter.py=scripts/postmark_exporter.py + options: + disableNameSuffixHash: true + - name: grafana-smtp-sync-script + namespace: monitoring + files: + - sync.sh=scripts/grafana_smtp_sync.sh + options: + 
disableNameSuffixHash: true + - name: jetson-tegrastats-exporter-script + namespace: monitoring + files: + - exporter.py=scripts/jetson_tegrastats_exporter.py + options: + disableNameSuffixHash: true diff --git a/services/monitoring/postmark-exporter-deployment.yaml b/services/monitoring/postmark-exporter-deployment.yaml new file mode 100644 index 0000000..646c455 --- /dev/null +++ b/services/monitoring/postmark-exporter-deployment.yaml @@ -0,0 +1,71 @@ +# services/monitoring/postmark-exporter-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: postmark-exporter +spec: + replicas: 1 + selector: + matchLabels: + app: postmark-exporter + template: + metadata: + labels: + app: postmark-exporter + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8000" + prometheus.io/path: "/metrics" + bstein.dev/restarted-at: "2026-01-06T00:00:00Z" + spec: + containers: + - name: exporter + image: python:3.12-alpine + imagePullPolicy: IfNotPresent + command: ["/bin/sh", "-c"] + args: + - | + set -euo pipefail + pip install --no-cache-dir prometheus-client==0.22.1 requests==2.32.3 + exec python /app/monitoring_postmark_exporter.py + env: + - name: POSTMARK_SERVER_TOKEN + valueFrom: + secretKeyRef: + name: postmark-exporter + key: server-token + - name: POSTMARK_SERVER_TOKEN_FALLBACK + valueFrom: + secretKeyRef: + name: postmark-exporter + key: server-token-fallback + - name: POSTMARK_SENDING_LIMIT + valueFrom: + secretKeyRef: + name: postmark-exporter + key: sending-limit + optional: true + - name: POSTMARK_SENDING_LIMIT_WINDOW + value: "30d" + - name: POLL_INTERVAL_SECONDS + value: "60" + - name: LISTEN_PORT + value: "8000" + ports: + - name: http + containerPort: 8000 + volumeMounts: + - name: script + mountPath: /app + readOnly: true + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 250m + memory: 256Mi + volumes: + - name: script + configMap: + name: postmark-exporter-script diff --git 
a/services/monitoring/postmark-exporter-service.yaml b/services/monitoring/postmark-exporter-service.yaml new file mode 100644 index 0000000..957973a --- /dev/null +++ b/services/monitoring/postmark-exporter-service.yaml @@ -0,0 +1,18 @@ +# services/monitoring/postmark-exporter-service.yaml +apiVersion: v1 +kind: Service +metadata: + name: postmark-exporter + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8000" + prometheus.io/path: "/metrics" +spec: + type: ClusterIP + selector: + app: postmark-exporter + ports: + - name: http + port: 8000 + targetPort: http + diff --git a/services/monitoring/scripts/grafana_smtp_sync.sh b/services/monitoring/scripts/grafana_smtp_sync.sh new file mode 100644 index 0000000..c8207ad --- /dev/null +++ b/services/monitoring/scripts/grafana_smtp_sync.sh @@ -0,0 +1,31 @@ +#!/bin/sh +set -euo pipefail + +SOURCE_NS=${SOURCE_NS:-mailu-mailserver} +SOURCE_SECRET=${SOURCE_SECRET:-mailu-postmark-relay} +TARGET_NS=${TARGET_NS:-monitoring} +TARGET_SECRET=${TARGET_SECRET:-grafana-smtp} + +tmp=$(mktemp) +cleanup() { rm -f "$tmp"; } +trap cleanup EXIT + +kubectl -n "$SOURCE_NS" get secret "$SOURCE_SECRET" -o json > "$tmp" + +pass=$(jq -r '.data["relay-password"]' "$tmp") +user=$pass + +if [ -z "$user" ] || [ -z "$pass" ] || [ "$user" = "null" ] || [ "$pass" = "null" ]; then + echo "missing credentials from $SOURCE_NS/$SOURCE_SECRET" >&2 + exit 1 +fi + +cat < dict: + today = dt.date.today() + fromdate = today - dt.timedelta(days=window.days) + params = {"fromdate": fromdate.isoformat(), "todate": today.isoformat()} + headers = { + "Accept": "application/json", + "X-Postmark-Server-Token": token, + } + response = requests.get( + f"{API_BASE}/stats/outbound", + headers=headers, + params=params, + timeout=15, + ) + response.raise_for_status() + return response.json() + + +def update_metrics(token: str) -> None: + sent_by_window = {} + for window in WINDOWS: + data = fetch_outbound_stats(token, window) + sent = int(data.get("Sent", 
0) or 0) + bounced = int(data.get("Bounced", 0) or 0) + rate = (bounced / sent * 100.0) if sent else 0.0 + sent_by_window[window.label] = sent + POSTMARK_OUTBOUND_SENT.labels(window=window.label).set(sent) + POSTMARK_OUTBOUND_BOUNCED.labels(window=window.label).set(bounced) + POSTMARK_OUTBOUND_BOUNCE_RATE.labels(window=window.label).set(rate) + + POSTMARK_SENDING_LIMIT_GAUGE.set(SENDING_LIMIT) + limit_window_sent = sent_by_window.get(LIMIT_WINDOW, 0) + POSTMARK_SENDING_LIMIT_USED.set(limit_window_sent) + if SENDING_LIMIT: + POSTMARK_SENDING_LIMIT_USED_PERCENT.set(limit_window_sent / SENDING_LIMIT * 100.0) + else: + POSTMARK_SENDING_LIMIT_USED_PERCENT.set(0.0) + + +def main() -> None: + if not PRIMARY_TOKEN and not FALLBACK_TOKEN: + raise SystemExit("POSTMARK_SERVER_TOKEN or POSTMARK_SERVER_TOKEN_FALLBACK is required") + + start_http_server(LISTEN_PORT, addr=LISTEN_ADDRESS) + + tokens = [token for token in (PRIMARY_TOKEN, FALLBACK_TOKEN) if token] + token_index = 0 + + while True: + token = tokens[token_index % len(tokens)] + token_index += 1 + try: + update_metrics(token) + POSTMARK_API_UP.set(1) + POSTMARK_LAST_SUCCESS.set(time.time()) + except Exception as exc: # noqa: BLE001 + POSTMARK_API_UP.set(0) + POSTMARK_REQUEST_ERRORS.inc() + print(f"postmark_exporter: refresh failed: {exc}", flush=True) + time.sleep(POLL_INTERVAL_SECONDS) + + +if __name__ == "__main__": + main() diff --git a/services/nextcloud-mail-sync/cronjob.yaml b/services/nextcloud-mail-sync/cronjob.yaml new file mode 100644 index 0000000..9976d8e --- /dev/null +++ b/services/nextcloud-mail-sync/cronjob.yaml @@ -0,0 +1,96 @@ +# services/nextcloud-mail-sync/cronjob.yaml +apiVersion: batch/v1 +kind: CronJob +metadata: + name: nextcloud-mail-sync + namespace: nextcloud +spec: + schedule: "0 5 * * *" + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 1 + jobTemplate: + spec: + template: + spec: + restartPolicy: OnFailure + securityContext: + runAsUser: 0 + runAsGroup: 
0 + containers: + - name: mail-sync + image: nextcloud:29-apache + imagePullPolicy: IfNotPresent + command: + - /bin/bash + - /sync/sync.sh + env: + - name: KC_BASE + value: https://sso.bstein.dev + - name: KC_REALM + value: atlas + - name: KC_ADMIN_USER + valueFrom: + secretKeyRef: + name: nextcloud-keycloak-admin + key: username + - name: KC_ADMIN_PASS + valueFrom: + secretKeyRef: + name: nextcloud-keycloak-admin + key: password + - name: MAILU_DOMAIN + value: bstein.dev + - name: POSTGRES_HOST + value: postgres-service.postgres.svc.cluster.local + - name: POSTGRES_DB + valueFrom: + secretKeyRef: + name: nextcloud-db + key: database + - name: POSTGRES_USER + valueFrom: + secretKeyRef: + name: nextcloud-db + key: db-username + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: nextcloud-db + key: db-password + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + volumeMounts: + - name: nextcloud-web + mountPath: /var/www/html + - name: nextcloud-config-pvc + mountPath: /var/www/html/config + - name: nextcloud-custom-apps + mountPath: /var/www/html/custom_apps + - name: nextcloud-user-data + mountPath: /var/www/html/data + - name: sync-script + mountPath: /sync/sync.sh + subPath: sync.sh + volumes: + - name: nextcloud-config-pvc + persistentVolumeClaim: + claimName: nextcloud-config-v2 + - name: nextcloud-custom-apps + persistentVolumeClaim: + claimName: nextcloud-custom-apps-v2 + - name: nextcloud-user-data + persistentVolumeClaim: + claimName: nextcloud-user-data-v2 + - name: nextcloud-web + persistentVolumeClaim: + claimName: nextcloud-web-v2 + - name: sync-script + configMap: + name: nextcloud-mail-sync-script + defaultMode: 0755 diff --git a/services/nextcloud-mail-sync/kustomization.yaml b/services/nextcloud-mail-sync/kustomization.yaml new file mode 100644 index 0000000..fb2a077 --- /dev/null +++ b/services/nextcloud-mail-sync/kustomization.yaml @@ -0,0 +1,13 @@ +# 
services/nextcloud-mail-sync/kustomization.yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: nextcloud +resources: + - cronjob.yaml + - portal-rbac.yaml +configMapGenerator: + - name: nextcloud-mail-sync-script + files: + - sync.sh=scripts/nextcloud-mail-sync.sh + options: + disableNameSuffixHash: true diff --git a/services/nextcloud-mail-sync/portal-rbac.yaml b/services/nextcloud-mail-sync/portal-rbac.yaml new file mode 100644 index 0000000..dc9a4e4 --- /dev/null +++ b/services/nextcloud-mail-sync/portal-rbac.yaml @@ -0,0 +1,29 @@ +# services/nextcloud-mail-sync/portal-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: bstein-dev-home-nextcloud-mail-sync +rules: + - apiGroups: ["batch"] + resources: ["cronjobs"] + verbs: ["get"] + resourceNames: ["nextcloud-mail-sync"] + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["create", "get", "list", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: bstein-dev-home-nextcloud-mail-sync +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: bstein-dev-home-nextcloud-mail-sync +subjects: + - kind: ServiceAccount + name: bstein-dev-home + namespace: bstein-dev-home diff --git a/services/nextcloud-mail-sync/scripts/nextcloud-mail-sync.sh b/services/nextcloud-mail-sync/scripts/nextcloud-mail-sync.sh new file mode 100755 index 0000000..6c883fc --- /dev/null +++ b/services/nextcloud-mail-sync/scripts/nextcloud-mail-sync.sh @@ -0,0 +1,244 @@ +#!/bin/bash +set -euo pipefail + +KC_BASE="${KC_BASE:?}" +KC_REALM="${KC_REALM:?}" +KC_ADMIN_USER="${KC_ADMIN_USER:?}" +KC_ADMIN_PASS="${KC_ADMIN_PASS:?}" +MAILU_DOMAIN="${MAILU_DOMAIN:?}" +ONLY_USERNAME="${ONLY_USERNAME:-}" +POSTGRES_HOST="${POSTGRES_HOST:-}" +POSTGRES_DB="${POSTGRES_DB:-}" +POSTGRES_USER="${POSTGRES_USER:-}" +POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-}" + +if ! 
command -v jq >/dev/null 2>&1; then + apt-get update && apt-get install -y jq curl >/dev/null +fi + +ensure_psql() { + if command -v psql >/dev/null 2>&1; then + return 0 + fi + apt-get update && apt-get install -y postgresql-client >/dev/null +} + +set_editor_mode_richtext() { + local ids=("$@") + + if [[ ${#ids[@]} -eq 0 ]]; then + return 0 + fi + + if [[ -z "${POSTGRES_HOST}" || -z "${POSTGRES_DB}" || -z "${POSTGRES_USER}" || -z "${POSTGRES_PASSWORD}" ]]; then + echo "WARN: missing postgres env; cannot update mail editor_mode" >&2 + return 0 + fi + + ensure_psql + + local ids_csv + ids_csv=$(IFS=,; echo "${ids[*]}") + + PGPASSWORD="${POSTGRES_PASSWORD}" psql \ + -h "${POSTGRES_HOST}" \ + -U "${POSTGRES_USER}" \ + -d "${POSTGRES_DB}" \ + -v ON_ERROR_STOP=1 \ + -c "UPDATE oc_mail_accounts SET editor_mode='richtext' WHERE id IN (${ids_csv}) AND editor_mode <> 'richtext';" \ + >/dev/null +} + +list_mail_accounts() { + local user_id="${1}" + local export_out + + # Nextcloud Mail does not provide a list command; export is safe (does not print passwords). + # Some occ commands emit to stderr; capture both streams so we don't mis-detect "no accounts". + if ! export_out=$(/usr/sbin/runuser -u www-data -- php occ mail:account:export "${user_id}" 2>&1); then + echo "WARN: unable to export mail accounts for ${user_id}; skipping sync for safety" >&2 + return 1 + fi + + # The export output is human-readable and includes blocks like: + # Account 10: + # - E-Mail: user@example.com + # Extract "account-id email" pairs. + awk ' + /^Account[[:space:]]+[0-9]+:/ { + id=$2; + sub(/:$/, "", id); + next; + } + id != "" && /@/ { + # Keep the regex simple (mawk does not support interval expressions like {2,}). 
+ if (match($0, /[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+/)) { + printf("%s\t%s\n", id, substr($0, RSTART, RLENGTH)); + id=""; + } + } + ' <<<"${export_out}" | sort -u +} + +token=$( + curl -s -d "grant_type=password" \ + -d "client_id=admin-cli" \ + -d "username=${KC_ADMIN_USER}" \ + -d "password=${KC_ADMIN_PASS}" \ + "${KC_BASE}/realms/master/protocol/openid-connect/token" | jq -r '.access_token' +) + +if [[ -z "${token}" || "${token}" == "null" ]]; then + echo "Failed to obtain admin token" + exit 1 +fi + +cd /var/www/html + +kc_users_url="${KC_BASE}/admin/realms/${KC_REALM}/users?max=2000" +if [[ -n "${ONLY_USERNAME}" ]]; then + username_q=$(jq -nr --arg v "${ONLY_USERNAME}" '$v|@uri') + kc_users_url="${KC_BASE}/admin/realms/${KC_REALM}/users?username=${username_q}&exact=true&max=1" +fi + +users=$(curl -s -H "Authorization: Bearer ${token}" "${kc_users_url}") + +kc_set_user_mail_meta() { + local user_id="${1}" + local primary_email="${2}" + local mailu_account_count="${3}" + local synced_at="${4}" + + # Fetch the full user representation so we don't accidentally clobber attributes. + local user_json updated_json + if ! 
user_json=$(curl -fsS -H "Authorization: Bearer ${token}" \ + "${KC_BASE}/admin/realms/${KC_REALM}/users/${user_id}"); then + echo "WARN: unable to fetch Keycloak user ${user_id} for metadata writeback" >&2 + return 1 + fi + + updated_json=$( + jq -c \ + --arg primary_email "${primary_email}" \ + --arg mailu_account_count "${mailu_account_count}" \ + --arg synced_at "${synced_at}" \ + ' + .attributes = (.attributes // {}) | + .attributes.nextcloud_mail_primary_email = [$primary_email] | + .attributes.nextcloud_mail_account_count = [$mailu_account_count] | + .attributes.nextcloud_mail_synced_at = [$synced_at] | + del(.access) + ' <<<"${user_json}" + ) + + curl -fsS -X PUT \ + -H "Authorization: Bearer ${token}" \ + -H "Content-Type: application/json" \ + -d "${updated_json}" \ + "${KC_BASE}/admin/realms/${KC_REALM}/users/${user_id}" >/dev/null +} + +while read -r user; do + user_id=$(jq -r '.id' <<<"${user}") + username=$(jq -r '.username' <<<"${user}") + keycloak_email=$(echo "${user}" | jq -r '.email // empty') + mailu_email=$(echo "${user}" | jq -r '(.attributes.mailu_email[0] // .attributes.mailu_email // empty)') + app_pw=$(echo "${user}" | jq -r '(.attributes.mailu_app_password[0] // .attributes.mailu_app_password // empty)') + + if [[ -z "${mailu_email}" ]]; then + if [[ -n "${keycloak_email}" && "${keycloak_email,,}" == *"@${MAILU_DOMAIN,,}" ]]; then + mailu_email="${keycloak_email}" + else + mailu_email="${username}@${MAILU_DOMAIN}" + fi + fi + + [[ -z "${mailu_email}" || -z "${app_pw}" ]] && continue + + if ! accounts=$(list_mail_accounts "${username}"); then + continue + fi + + # Manage only internal Mailu-domain accounts; leave any external accounts untouched. 
+ mailu_accounts=$(awk -v d="${MAILU_DOMAIN,,}" 'tolower($2) ~ ("@" d "$") {print}' <<<"${accounts}" || true) + + desired_email="${mailu_email}" + primary_id="" + primary_email="" + + if [[ -n "${mailu_accounts}" ]]; then + while IFS=$'\t' read -r account_id account_email; do + if [[ -z "${primary_id}" ]]; then + primary_id="${account_id}" + primary_email="${account_email}" + fi + if [[ "${account_email,,}" == "${desired_email,,}" ]]; then + primary_id="${account_id}" + primary_email="${account_email}" + break + fi + done <<<"${mailu_accounts}" + + echo "Updating ${username} mail account ${primary_id} (${primary_email})" + /usr/sbin/runuser -u www-data -- php occ mail:account:update -q "${primary_id}" \ + --name "${username}" \ + --email "${desired_email}" \ + --imap-host mail.bstein.dev \ + --imap-port 993 \ + --imap-ssl-mode ssl \ + --imap-user "${desired_email}" \ + --imap-password "${app_pw}" \ + --smtp-host mail.bstein.dev \ + --smtp-port 587 \ + --smtp-ssl-mode tls \ + --smtp-user "${desired_email}" \ + --smtp-password "${app_pw}" \ + --auth-method password >/dev/null 2>&1 || true + + # Remove any extra Mailu-domain accounts for this user to prevent duplicates. + while IFS=$'\t' read -r account_id account_email; do + if [[ "${account_id}" == "${primary_id}" ]]; then + continue + fi + echo "Deleting extra mail account ${account_id} (${account_email})" + /usr/sbin/runuser -u www-data -- php occ mail:account:delete -q "${account_id}" >/dev/null 2>&1 || true + done <<<"${mailu_accounts}" + else + echo "Creating mail account for ${username} (${desired_email})" + /usr/sbin/runuser -u www-data -- php occ mail:account:create -q \ + "${username}" "${username}" "${desired_email}" \ + mail.bstein.dev 993 ssl "${desired_email}" "${app_pw}" \ + mail.bstein.dev 587 tls "${desired_email}" "${app_pw}" password >/dev/null 2>&1 || true + fi + + # Write non-secret metadata back to Keycloak for UI introspection and onboarding gating. 
+ synced_at=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + if accounts_after=$(list_mail_accounts "${username}"); then + mailu_accounts_after=$(awk -v d="${MAILU_DOMAIN,,}" 'tolower($2) ~ ("@" d "$") {print}' <<<"${accounts_after}" || true) + if [[ -n "${mailu_accounts_after}" ]]; then + mailu_account_count=$(printf '%s\n' "${mailu_accounts_after}" | wc -l | tr -d ' ') + else + mailu_account_count="0" + fi + primary_email_after="" + editor_mode_ids=() + if [[ -n "${mailu_accounts_after}" ]]; then + while IFS=$'\t' read -r _account_id account_email; do + editor_mode_ids+=("${_account_id}") + if [[ "${account_email,,}" == "${desired_email,,}" ]]; then + primary_email_after="${account_email}" + break + fi + if [[ -z "${primary_email_after}" ]]; then + primary_email_after="${account_email}" + fi + done <<<"${mailu_accounts_after}" + fi + set_editor_mode_richtext "${editor_mode_ids[@]}" + else + mailu_account_count="0" + primary_email_after="" + fi + + kc_set_user_mail_meta "${user_id}" "${primary_email_after}" "${mailu_account_count}" "${synced_at}" || true +done < <(jq -c '.[]' <<<"${users}") diff --git a/services/nextcloud/collabora.yaml b/services/nextcloud/collabora.yaml new file mode 100644 index 0000000..1cda2ea --- /dev/null +++ b/services/nextcloud/collabora.yaml @@ -0,0 +1,79 @@ +# services/nextcloud/collabora.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: collabora + namespace: nextcloud + labels: + app: collabora +spec: + replicas: 1 + selector: + matchLabels: + app: collabora + template: + metadata: + labels: + app: collabora + spec: + nodeSelector: + hardware: rpi5 + containers: + - name: collabora + image: collabora/code:latest + imagePullPolicy: IfNotPresent + env: + - name: domain + value: "cloud\\.bstein\\.dev" + - name: DONT_GEN_SSL_CERT + value: "true" + - name: extra_params + value: --o:ssl.enable=false --o:ssl.termination=true + ports: + - containerPort: 9980 + name: http + resources: + requests: + cpu: 250m + memory: 512Mi + limits: + cpu: 1 + 
memory: 2Gi +--- +apiVersion: v1 +kind: Service +metadata: + name: collabora + namespace: nextcloud +spec: + selector: + app: collabora + ports: + - name: http + port: 9980 + targetPort: http +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: collabora + namespace: nextcloud + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + traefik.ingress.kubernetes.io/router.entrypoints: websecure +spec: + tls: + - hosts: + - office.bstein.dev + secretName: collabora-tls + rules: + - host: office.bstein.dev + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: collabora + port: + number: 9980 diff --git a/services/nextcloud/configmap.yaml b/services/nextcloud/configmap.yaml index a6e917c..21098a2 100644 --- a/services/nextcloud/configmap.yaml +++ b/services/nextcloud/configmap.yaml @@ -7,7 +7,7 @@ metadata: data: extra.config.php: | array ( 0 => 'cloud.bstein.dev', @@ -25,13 +25,29 @@ data: 'mail_smtpauthtype' => 'LOGIN', 'mail_domain' => 'bstein.dev', 'mail_from_address' => 'no-reply', + 'datadirectory' => '/var/www/html/data', + 'apps_paths' => + array ( + 0 => + array ( + 'path' => '/var/www/html/apps', + 'url' => '/apps', + 'writable' => false, + ), + 1 => + array ( + 'path' => '/var/www/html/custom_apps', + 'url' => '/custom_apps', + 'writable' => true, + ), + ), 'oidc_login_provider_url' => 'https://sso.bstein.dev/realms/atlas', 'oidc_login_client_id' => getenv('OIDC_CLIENT_ID'), 'oidc_login_client_secret' => getenv('OIDC_CLIENT_SECRET'), - 'oidc_login_auto_redirect' => false, + 'oidc_login_auto_redirect' => true, 'oidc_login_end_session_redirect' => true, 'oidc_login_button_text' => 'Login with Keycloak', - 'oidc_login_hide_password_form' => false, + 'oidc_login_hide_password_form' => true, 'oidc_login_attributes' => array ( 'id' => 'preferred_username', @@ -45,4 +61,4 @@ data: 'oidc_login_create_groups' => false, # External storage for user data should be configured to Asteria via the External Storage app 
(admin UI), # keeping the astreae PVC for app internals only. - ); + )); diff --git a/services/nextcloud/cronjob.yaml b/services/nextcloud/cronjob.yaml index 86c55e1..cc0091b 100644 --- a/services/nextcloud/cronjob.yaml +++ b/services/nextcloud/cronjob.yaml @@ -24,9 +24,24 @@ spec: args: - "cd /var/www/html && php -f cron.php" volumeMounts: - - name: nextcloud-data + - name: nextcloud-web mountPath: /var/www/html + - name: nextcloud-config-pvc + mountPath: /var/www/html/config + - name: nextcloud-custom-apps + mountPath: /var/www/html/custom_apps + - name: nextcloud-user-data + mountPath: /var/www/html/data volumes: - - name: nextcloud-data + - name: nextcloud-config-pvc persistentVolumeClaim: - claimName: nextcloud-data + claimName: nextcloud-config-v2 + - name: nextcloud-custom-apps + persistentVolumeClaim: + claimName: nextcloud-custom-apps-v2 + - name: nextcloud-user-data + persistentVolumeClaim: + claimName: nextcloud-user-data-v2 + - name: nextcloud-web + persistentVolumeClaim: + claimName: nextcloud-web-v2 diff --git a/services/nextcloud/deployment.yaml b/services/nextcloud/deployment.yaml index b2c590f..295435e 100644 --- a/services/nextcloud/deployment.yaml +++ b/services/nextcloud/deployment.yaml @@ -23,20 +23,178 @@ spec: runAsUser: 33 runAsGroup: 33 initContainers: + - name: seed-nextcloud-web + image: nextcloud:29-apache + command: ["/bin/sh", "-c"] + args: + - | + if [ ! -f /var/www/html/version.php ]; then + echo "Seeding Nextcloud webroot..." + tar -C /usr/src/nextcloud -cf - \ + --exclude=./config \ + --exclude=./data \ + --exclude=./custom_apps \ + . | tar -C /var/www/html -xf - + chown -R 33:33 /var/www/html || true + fi + securityContext: + runAsUser: 0 + runAsGroup: 0 + volumeMounts: + - name: nextcloud-web + mountPath: /var/www/html - name: fix-perms image: alpine:3.20 command: ["/bin/sh", "-c"] args: - | - chown -R 33:33 /var/www/html/config || true - chown -R 33:33 /var/www/html/data || true + if [ ! 
-s /var/www/html/config/config.php ]; then + rm -f /var/www/html/config/config.php || true + fi + mkdir -p /var/www/html/config /var/www/html/data /var/www/html/custom_apps || true + if [ ! -s /var/www/html/config/config.php ]; then + rm -f /var/www/html/data/.ocdata || true + fi + if [ -s /var/www/html/config/config.php ] && [ ! -f /var/www/html/data/.ocdata ]; then + touch /var/www/html/data/.ocdata + fi + chown -R 33:33 /var/www/html/config /var/www/html/data /var/www/html/custom_apps || true securityContext: runAsUser: 0 runAsGroup: 0 volumeMounts: - - name: nextcloud-data + - name: nextcloud-config-pvc + mountPath: /var/www/html/config + - name: nextcloud-custom-apps + mountPath: /var/www/html/custom_apps + - name: nextcloud-user-data + mountPath: /var/www/html/data + - name: nextcloud-config-extra + mountPath: /var/www/html/config/extra.config.php + subPath: extra.config.php + - name: install-nextcloud + image: nextcloud:29-apache + securityContext: + runAsUser: 0 + runAsGroup: 0 + command: ["/bin/sh", "-c"] + args: + - | + installed="$(su -s /bin/sh www-data -c "php /var/www/html/occ status" 2>/dev/null | awk '/installed:/{print $3}' || true)" + if [ ! 
-s /var/www/html/config/config.php ]; then + su -s /bin/sh www-data -c "php /var/www/html/occ maintenance:install --database pgsql --database-host \"${POSTGRES_HOST}\" --database-name \"${POSTGRES_DB}\" --database-user \"${POSTGRES_USER}\" --database-pass \"${POSTGRES_PASSWORD}\" --admin-user \"${NEXTCLOUD_ADMIN_USER}\" --admin-pass \"${NEXTCLOUD_ADMIN_PASSWORD}\" --data-dir /var/www/html/data" + chown 33:33 /var/www/html/config/config.php || true + chown -R 33:33 /var/www/html/data || true + fi + installed="$(su -s /bin/sh www-data -c "php /var/www/html/occ status" 2>/dev/null | awk '/installed:/{print $3}' || true)" + if [ "${installed}" = "true" ]; then + configure_oidc() { + su -s /bin/sh www-data -c "php /var/www/html/occ config:system:set oidc_login_provider_url --value='https://sso.bstein.dev/realms/atlas'" + su -s /bin/sh www-data -c "php /var/www/html/occ config:system:set oidc_login_client_id --value='${OIDC_CLIENT_ID}'" + su -s /bin/sh www-data -c "php /var/www/html/occ config:system:set oidc_login_client_secret --value='${OIDC_CLIENT_SECRET}'" + su -s /bin/sh www-data -c "php /var/www/html/occ config:system:set oidc_login_auto_redirect --type=boolean --value=true" + su -s /bin/sh www-data -c "php /var/www/html/occ config:system:set oidc_login_hide_password_form --type=boolean --value=true" + su -s /bin/sh www-data -c "php /var/www/html/occ config:system:set oidc_login_disable_registration --type=boolean --value=false" + } + configure_office() { + su -s /bin/sh www-data -c "php /var/www/html/occ config:app:set richdocuments wopi_url --value='https://office.bstein.dev'" + su -s /bin/sh www-data -c "php /var/www/html/occ config:app:set richdocuments public_wopi_url --value='https://office.bstein.dev'" + } + ensure_mime_defaults() { + cfg_dir="/var/www/html/resources/config" + mkdir -p "${cfg_dir}" + if [ ! 
-s "${cfg_dir}/mimetypemapping.dist.json" ]; then + curl -fsSL https://raw.githubusercontent.com/nextcloud/server/v29.0.16/resources/config/mimetypemapping.dist.json -o "${cfg_dir}/mimetypemapping.dist.json" || true + fi + if [ ! -s "${cfg_dir}/mimetypealiases.dist.json" ]; then + curl -fsSL https://raw.githubusercontent.com/nextcloud/server/v29.0.16/resources/config/mimetypealiases.dist.json -o "${cfg_dir}/mimetypealiases.dist.json" || true + fi + chown -R 33:33 "${cfg_dir}" || true + } + install_app() { + app="$1" + url="$2" + target="/var/www/html/custom_apps/${app}" + rm -rf "${target}" + mkdir -p /tmp/nextcloud-apps + curl -fsSL "${url}" -o "/tmp/nextcloud-apps/${app}.tar.gz" + tar -xzf "/tmp/nextcloud-apps/${app}.tar.gz" -C /var/www/html/custom_apps + rm -f "/tmp/nextcloud-apps/${app}.tar.gz" + chown -R 33:33 "${target}" + su -s /bin/sh www-data -c "php /var/www/html/occ app:enable --force ${app}" || true + } + reset_external_config() { + su -s /bin/sh www-data -c "php /var/www/html/occ app:remove external" || true + su -s /bin/sh www-data -c "php /var/www/html/occ config:app:delete external jwt_token_privkey_es256" || true + su -s /bin/sh www-data -c "php /var/www/html/occ config:app:delete external jwt_token_pubkey_es256" || true + su -s /bin/sh www-data -c "php /var/www/html/occ config:app:delete external jwt_token_privkey_ed25519" || true + su -s /bin/sh www-data -c "php /var/www/html/occ config:app:delete external jwt_token_pubkey_ed25519" || true + } + ensure_app() { + app="$1" + target="/var/www/html/custom_apps/${app}" + rm -rf "${target}" + su -s /bin/sh www-data -c "php /var/www/html/occ app:remove ${app}" || true + su -s /bin/sh www-data -c "php /var/www/html/occ app:install --force ${app}" || true + su -s /bin/sh www-data -c "php /var/www/html/occ app:enable --force ${app}" || true + } + ensure_mime_defaults + reset_external_config + install_app external https://github.com/nextcloud-releases/external/releases/download/v5.4.1/external-v5.4.1.tar.gz 
+ install_app mail https://github.com/nextcloud-releases/mail/releases/download/v3.7.24/mail-stable3.7.tar.gz + install_app oidc_login https://github.com/pulsejet/nextcloud-oidc-login/releases/download/v3.2.2/oidc_login.tar.gz + ensure_app richdocuments + configure_office + configure_oidc + fi + env: + - name: POSTGRES_HOST + value: postgres-service.postgres.svc.cluster.local + - name: POSTGRES_DB + valueFrom: + secretKeyRef: + name: nextcloud-db + key: database + - name: POSTGRES_USER + valueFrom: + secretKeyRef: + name: nextcloud-db + key: db-username + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: nextcloud-db + key: db-password + - name: NEXTCLOUD_ADMIN_USER + valueFrom: + secretKeyRef: + name: nextcloud-admin + key: admin-user + - name: NEXTCLOUD_ADMIN_PASSWORD + valueFrom: + secretKeyRef: + name: nextcloud-admin + key: admin-password + - name: OIDC_CLIENT_ID + valueFrom: + secretKeyRef: + name: nextcloud-oidc + key: client-id + - name: OIDC_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: nextcloud-oidc + key: client-secret + volumeMounts: + - name: nextcloud-web mountPath: /var/www/html - - name: nextcloud-config + - name: nextcloud-config-pvc + mountPath: /var/www/html/config + - name: nextcloud-custom-apps + mountPath: /var/www/html/custom_apps + - name: nextcloud-user-data + mountPath: /var/www/html/data + - name: nextcloud-config-extra mountPath: /var/www/html/config/extra.config.php subPath: extra.config.php containers: @@ -121,9 +279,15 @@ spec: - containerPort: 80 name: http volumeMounts: - - name: nextcloud-data + - name: nextcloud-web mountPath: /var/www/html - - name: nextcloud-config + - name: nextcloud-config-pvc + mountPath: /var/www/html/config + - name: nextcloud-custom-apps + mountPath: /var/www/html/custom_apps + - name: nextcloud-user-data + mountPath: /var/www/html/data + - name: nextcloud-config-extra mountPath: /var/www/html/config/extra.config.php subPath: extra.config.php resources: @@ -134,10 +298,19 @@ spec: cpu: 1 
memory: 3Gi volumes: - - name: nextcloud-data + - name: nextcloud-web persistentVolumeClaim: - claimName: nextcloud-data - - name: nextcloud-config + claimName: nextcloud-web-v2 + - name: nextcloud-config-pvc + persistentVolumeClaim: + claimName: nextcloud-config-v2 + - name: nextcloud-custom-apps + persistentVolumeClaim: + claimName: nextcloud-custom-apps-v2 + - name: nextcloud-user-data + persistentVolumeClaim: + claimName: nextcloud-user-data-v2 + - name: nextcloud-config-extra configMap: name: nextcloud-config defaultMode: 0444 diff --git a/services/nextcloud/kustomization.yaml b/services/nextcloud/kustomization.yaml index 5e3b414..14e0ec1 100644 --- a/services/nextcloud/kustomization.yaml +++ b/services/nextcloud/kustomization.yaml @@ -7,19 +7,14 @@ resources: - configmap.yaml - pvc.yaml - deployment.yaml + - collabora.yaml + - cronjob.yaml + - maintenance-cronjob.yaml - service.yaml - ingress.yaml - - cronjob.yaml - - mail-sync-cronjob.yaml - - maintenance-cronjob.yaml configMapGenerator: - name: nextcloud-maintenance-script files: - - maintenance.sh=../../scripts/nextcloud-maintenance.sh - options: - disableNameSuffixHash: true - - name: nextcloud-mail-sync-script - files: - - sync.sh=../../scripts/nextcloud-mail-sync.sh + - maintenance.sh=scripts/nextcloud-maintenance.sh options: disableNameSuffixHash: true diff --git a/services/nextcloud/mail-sync-cronjob.yaml b/services/nextcloud/mail-sync-cronjob.yaml deleted file mode 100644 index 52dc3ea..0000000 --- a/services/nextcloud/mail-sync-cronjob.yaml +++ /dev/null @@ -1,58 +0,0 @@ -# services/nextcloud/mail-sync-cronjob.yaml -apiVersion: batch/v1 -kind: CronJob -metadata: - name: nextcloud-mail-sync - namespace: nextcloud -spec: - schedule: "0 5 * * *" - concurrencyPolicy: Forbid - jobTemplate: - spec: - template: - spec: - restartPolicy: OnFailure - securityContext: - runAsUser: 0 - runAsGroup: 0 - containers: - - name: mail-sync - image: nextcloud:29-apache - imagePullPolicy: IfNotPresent - command: 
["/bin/bash", "/sync/sync.sh"] - env: - - name: KC_BASE - value: https://sso.bstein.dev - - name: KC_REALM - value: atlas - - name: KC_ADMIN_USER - valueFrom: - secretKeyRef: - name: nextcloud-keycloak-admin - key: username - - name: KC_ADMIN_PASS - valueFrom: - secretKeyRef: - name: nextcloud-keycloak-admin - key: password - volumeMounts: - - name: nextcloud-data - mountPath: /var/www/html - - name: sync-script - mountPath: /sync/sync.sh - subPath: sync.sh - resources: - requests: - cpu: 100m - memory: 256Mi - limits: - cpu: 500m - memory: 512Mi - volumes: - - name: nextcloud-data - persistentVolumeClaim: - claimName: nextcloud-data - - name: sync-script - configMap: - name: nextcloud-mail-sync-script - defaultMode: 0755 diff --git a/services/nextcloud/maintenance-cronjob.yaml b/services/nextcloud/maintenance-cronjob.yaml index 55fcbd1..618f548 100644 --- a/services/nextcloud/maintenance-cronjob.yaml +++ b/services/nextcloud/maintenance-cronjob.yaml @@ -34,8 +34,14 @@ spec: name: nextcloud-admin key: admin-password volumeMounts: - - name: nextcloud-data + - name: nextcloud-web mountPath: /var/www/html + - name: nextcloud-config-pvc + mountPath: /var/www/html/config + - name: nextcloud-custom-apps + mountPath: /var/www/html/custom_apps + - name: nextcloud-user-data + mountPath: /var/www/html/data - name: maintenance-script mountPath: /maintenance/maintenance.sh subPath: maintenance.sh @@ -47,9 +53,18 @@ spec: cpu: 500m memory: 512Mi volumes: - - name: nextcloud-data + - name: nextcloud-config-pvc persistentVolumeClaim: - claimName: nextcloud-data + claimName: nextcloud-config-v2 + - name: nextcloud-custom-apps + persistentVolumeClaim: + claimName: nextcloud-custom-apps-v2 + - name: nextcloud-user-data + persistentVolumeClaim: + claimName: nextcloud-user-data-v2 + - name: nextcloud-web + persistentVolumeClaim: + claimName: nextcloud-web-v2 - name: maintenance-script configMap: name: nextcloud-maintenance-script diff --git a/services/nextcloud/pvc.yaml 
b/services/nextcloud/pvc.yaml index dd929b6..c0779ef 100644 --- a/services/nextcloud/pvc.yaml +++ b/services/nextcloud/pvc.yaml @@ -2,12 +2,51 @@ apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: nextcloud-data + name: nextcloud-config-v2 namespace: nextcloud spec: accessModes: - ReadWriteMany resources: requests: - storage: 200Gi + storage: 5Gi storageClassName: astreae +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: nextcloud-custom-apps-v2 + namespace: nextcloud +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 5Gi + storageClassName: astreae +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: nextcloud-web-v2 + namespace: nextcloud +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 5Gi + storageClassName: astreae +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: nextcloud-user-data-v2 + namespace: nextcloud +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 2Ti + storageClassName: asteria diff --git a/services/nextcloud/scripts/nextcloud-maintenance.sh b/services/nextcloud/scripts/nextcloud-maintenance.sh new file mode 100755 index 0000000..ab38616 --- /dev/null +++ b/services/nextcloud/scripts/nextcloud-maintenance.sh @@ -0,0 +1,108 @@ +#!/bin/bash +set -euo pipefail + +NC_URL="${NC_URL:-https://cloud.bstein.dev}" +ADMIN_USER="${ADMIN_USER:?}" +ADMIN_PASS="${ADMIN_PASS:?}" + +export DEBIAN_FRONTEND=noninteractive +apt-get update -qq +apt-get install -y -qq curl jq >/dev/null + +run_occ() { + runuser -u www-data -- php /var/www/html/occ "$@" +} + +log() { echo "[$(date -Is)] $*"; } + +log "Ensuring Nextcloud app files are present" +if [[ ! 
-d /var/www/html/lib && -d /usr/src/nextcloud/lib ]]; then + rsync -a --delete \ + --exclude config \ + --exclude data \ + /usr/src/nextcloud/ /var/www/html/ +fi + +log "Ensuring Nextcloud permissions" +mkdir -p /var/www/html/data +chown 33:33 /var/www/html || true +chmod 775 /var/www/html || true +chown -R 33:33 /var/www/html/apps /var/www/html/custom_apps /var/www/html/data /var/www/html/config 2>/dev/null || true + +log "Applying Atlas theming" +run_occ config:app:set theming name --value "Atlas Cloud" +run_occ config:app:set theming slogan --value "Unified access to Atlas services" +run_occ config:app:set theming url --value "https://cloud.bstein.dev" +run_occ config:app:set theming color --value "#0f172a" +run_occ config:app:set theming disable-user-theming --value "yes" + +log "Applying Atlas Mail styling defaults" +run_occ app:install customcss >/dev/null 2>&1 || true +run_occ app:enable customcss >/dev/null 2>&1 || true +MAIL_CSS=$(cat <<'CSS' +.mail-message-body, .mail-message-body pre, .mail-message-body code, .mail-message-body table { + font-family: "Inter", "Source Sans 3", "Helvetica Neue", Arial, sans-serif; + font-size: 14px; + line-height: 1.6; + color: var(--color-main-text); +} +.mail-message-body pre { + background: rgba(15, 23, 42, 0.06); + padding: 12px; + border-radius: 8px; +} +.mail-message-body blockquote { + border-left: 3px solid var(--color-border); + padding-left: 12px; + margin: 8px 0; + color: var(--color-text-lighter); +} +.mail-message-body img { + max-width: 100%; + border-radius: 6px; +} +CSS +) +run_occ config:app:set customcss css --value "${MAIL_CSS}" >/dev/null + +log "Setting default quota to 250 GB" +run_occ config:app:set files default_quota --value "250 GB" + +API_BASE="${NC_URL}/ocs/v2.php/apps/external/api/v1" +AUTH=(-u "${ADMIN_USER}:${ADMIN_PASS}" -H "OCS-APIRequest: true") + +log "Removing existing external links" +existing=$(curl -sf "${AUTH[@]}" "${API_BASE}?format=json" | jq -r '.ocs.data[].id // empty') +for id 
in ${existing}; do + curl -sf "${AUTH[@]}" -X DELETE "${API_BASE}/sites/${id}?format=json" >/dev/null || true +done + +SITES=( + "Vaultwarden|https://vault.bstein.dev" + "Jellyfin|https://stream.bstein.dev" + "Gitea|https://scm.bstein.dev" + "Jenkins|https://ci.bstein.dev" + "Harbor|https://registry.bstein.dev" + "Vault|https://secret.bstein.dev" + "Jitsi|https://meet.bstein.dev" + "Grafana|https://metrics.bstein.dev" + "Chat LLM|https://chat.ai.bstein.dev" + "Vision|https://draw.ai.bstein.dev" + "STT/TTS|https://talk.ai.bstein.dev" +) + +log "Seeding external links" +for entry in "${SITES[@]}"; do + IFS="|" read -r name url <<<"${entry}" + curl -sf "${AUTH[@]}" -X POST "${API_BASE}/sites?format=json" \ + -d "name=${name}" \ + -d "url=${url}" \ + -d "lang=" \ + -d "type=link" \ + -d "device=" \ + -d "icon=" \ + -d "groups[]=" \ + -d "redirect=1" >/dev/null +done + +log "Maintenance run completed" diff --git a/services/openldap/kustomization.yaml b/services/openldap/kustomization.yaml new file mode 100644 index 0000000..798f7e8 --- /dev/null +++ b/services/openldap/kustomization.yaml @@ -0,0 +1,7 @@ +# services/openldap/kustomization.yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: sso +resources: + - service.yaml + - statefulset.yaml diff --git a/services/openldap/service.yaml b/services/openldap/service.yaml new file mode 100644 index 0000000..38c2176 --- /dev/null +++ b/services/openldap/service.yaml @@ -0,0 +1,19 @@ +# services/openldap/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: openldap + namespace: sso + labels: + app: openldap +spec: + clusterIP: None + selector: + app: openldap + ports: + - name: ldap + port: 389 + targetPort: ldap + - name: ldaps + port: 636 + targetPort: ldaps diff --git a/services/openldap/statefulset.yaml b/services/openldap/statefulset.yaml new file mode 100644 index 0000000..ee8c792 --- /dev/null +++ b/services/openldap/statefulset.yaml @@ -0,0 +1,80 @@ +# 
services/openldap/statefulset.yaml +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: openldap + namespace: sso + labels: + app: openldap +spec: + serviceName: openldap + replicas: 1 + selector: + matchLabels: + app: openldap + template: + metadata: + labels: + app: openldap + spec: + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" + containers: + - name: openldap + image: docker.io/osixia/openldap:1.5.0 + imagePullPolicy: IfNotPresent + ports: + - name: ldap + containerPort: 389 + - name: ldaps + containerPort: 636 + env: + - name: LDAP_ORGANISATION + value: Atlas + - name: LDAP_DOMAIN + value: bstein.dev + - name: LDAP_ADMIN_PASSWORD + valueFrom: + secretKeyRef: + name: openldap-admin + key: LDAP_ADMIN_PASSWORD + - name: LDAP_CONFIG_PASSWORD + valueFrom: + secretKeyRef: + name: openldap-admin + key: LDAP_CONFIG_PASSWORD + readinessProbe: + tcpSocket: + port: ldap + initialDelaySeconds: 10 + periodSeconds: 10 + livenessProbe: + tcpSocket: + port: ldap + initialDelaySeconds: 30 + periodSeconds: 20 + volumeMounts: + - name: ldap-data + mountPath: /var/lib/ldap + - name: slapd-config + mountPath: /etc/ldap/slapd.d + volumeClaimTemplates: + - metadata: + name: ldap-data + spec: + accessModes: + - ReadWriteOnce + storageClassName: astreae + resources: + requests: + storage: 1Gi + - metadata: + name: slapd-config + spec: + accessModes: + - ReadWriteOnce + storageClassName: astreae + resources: + requests: + storage: 1Gi diff --git a/services/outline/deployment.yaml b/services/outline/deployment.yaml new file mode 100644 index 0000000..9f8160e --- /dev/null +++ b/services/outline/deployment.yaml @@ -0,0 +1,107 @@ +# services/outline/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: outline + namespace: outline + labels: + app: outline +spec: + replicas: 1 + selector: + matchLabels: + app: outline + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 0 + maxUnavailable: 1 + template: + metadata: + 
labels: + app: outline + spec: + nodeSelector: + node-role.kubernetes.io/worker: "true" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: hardware + operator: In + values: ["rpi4", "rpi5"] + containers: + - name: outline + image: outlinewiki/outline:1.2.0 + ports: + - name: http + containerPort: 3000 + env: + - name: NODE_ENV + value: production + - name: URL + value: https://notes.bstein.dev + - name: PORT + value: "3000" + - name: REDIS_URL + value: redis://outline-redis:6379 + - name: PGSSLMODE + value: disable + - name: FILE_STORAGE + value: local + - name: FILE_STORAGE_LOCAL_ROOT_DIR + value: /var/lib/outline/data + - name: FORCE_HTTPS + value: "true" + - name: OIDC_ENFORCED + value: "true" + - name: OIDC_SCOPES + value: openid profile email + - name: OIDC_USERNAME_CLAIM + value: preferred_username + - name: OIDC_DISPLAY_NAME + value: Atlas SSO + - name: SMTP_SECURE + value: "false" + - name: SMTP_PORT + value: "25" + envFrom: + - secretRef: + name: outline-db + - secretRef: + name: outline-secrets + - secretRef: + name: outline-oidc + - secretRef: + name: outline-smtp + volumeMounts: + - name: user-data + mountPath: /var/lib/outline/data + readinessProbe: + httpGet: + path: /_health + port: http + initialDelaySeconds: 15 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 6 + livenessProbe: + httpGet: + path: /_health + port: http + initialDelaySeconds: 30 + periodSeconds: 20 + timeoutSeconds: 3 + failureThreshold: 6 + resources: + requests: + cpu: 200m + memory: 512Mi + limits: + cpu: "1" + memory: 2Gi + volumes: + - name: user-data + persistentVolumeClaim: + claimName: outline-user-data diff --git a/services/outline/ingress.yaml b/services/outline/ingress.yaml new file mode 100644 index 0000000..735baae --- /dev/null +++ b/services/outline/ingress.yaml @@ -0,0 +1,26 @@ +# services/outline/ingress.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: 
outline + namespace: outline + annotations: + kubernetes.io/ingress.class: traefik + traefik.ingress.kubernetes.io/router.entrypoints: websecure + traefik.ingress.kubernetes.io/router.tls: "true" + cert-manager.io/cluster-issuer: letsencrypt-prod +spec: + tls: + - hosts: ["notes.bstein.dev"] + secretName: outline-tls + rules: + - host: notes.bstein.dev + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: outline + port: + number: 80 diff --git a/services/outline/kustomization.yaml b/services/outline/kustomization.yaml new file mode 100644 index 0000000..33640f6 --- /dev/null +++ b/services/outline/kustomization.yaml @@ -0,0 +1,12 @@ +# services/outline/kustomization.yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: outline +resources: + - namespace.yaml + - user-pvc.yaml + - redis-deployment.yaml + - redis-service.yaml + - deployment.yaml + - service.yaml + - ingress.yaml diff --git a/services/outline/namespace.yaml b/services/outline/namespace.yaml new file mode 100644 index 0000000..4172c02 --- /dev/null +++ b/services/outline/namespace.yaml @@ -0,0 +1,5 @@ +# services/outline/namespace.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: outline diff --git a/services/outline/redis-deployment.yaml b/services/outline/redis-deployment.yaml new file mode 100644 index 0000000..5e08128 --- /dev/null +++ b/services/outline/redis-deployment.yaml @@ -0,0 +1,47 @@ +# services/outline/redis-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: outline-redis + namespace: outline + labels: + app: outline-redis +spec: + replicas: 1 + selector: + matchLabels: + app: outline-redis + template: + metadata: + labels: + app: outline-redis + spec: + nodeSelector: + node-role.kubernetes.io/worker: "true" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: hardware + operator: In + values: ["rpi4", "rpi5"] + containers: + - name: 
redis + image: redis:7.4.1-alpine + ports: + - name: redis + containerPort: 6379 + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + cpu: 250m + memory: 256Mi + volumeMounts: + - name: data + mountPath: /data + volumes: + - name: data + emptyDir: {} diff --git a/services/outline/redis-service.yaml b/services/outline/redis-service.yaml new file mode 100644 index 0000000..a80def2 --- /dev/null +++ b/services/outline/redis-service.yaml @@ -0,0 +1,15 @@ +# services/outline/redis-service.yaml +apiVersion: v1 +kind: Service +metadata: + name: outline-redis + namespace: outline + labels: + app: outline-redis +spec: + selector: + app: outline-redis + ports: + - name: redis + port: 6379 + targetPort: redis diff --git a/services/outline/service.yaml b/services/outline/service.yaml new file mode 100644 index 0000000..383df0e --- /dev/null +++ b/services/outline/service.yaml @@ -0,0 +1,15 @@ +# services/outline/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: outline + namespace: outline + labels: + app: outline +spec: + selector: + app: outline + ports: + - name: http + port: 80 + targetPort: http diff --git a/services/outline/user-pvc.yaml b/services/outline/user-pvc.yaml new file mode 100644 index 0000000..f31426d --- /dev/null +++ b/services/outline/user-pvc.yaml @@ -0,0 +1,12 @@ +# services/outline/user-pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: outline-user-data + namespace: outline +spec: + accessModes: ["ReadWriteOnce"] + storageClassName: asteria + resources: + requests: + storage: 5Gi diff --git a/services/planka/app-pvc.yaml b/services/planka/app-pvc.yaml new file mode 100644 index 0000000..7ef6a91 --- /dev/null +++ b/services/planka/app-pvc.yaml @@ -0,0 +1,12 @@ +# services/planka/app-pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: planka-app-data + namespace: planka +spec: + accessModes: ["ReadWriteOnce"] + storageClassName: astreae + resources: + requests: + storage: 2Gi diff --git 
a/services/planka/deployment.yaml b/services/planka/deployment.yaml new file mode 100644 index 0000000..9524245 --- /dev/null +++ b/services/planka/deployment.yaml @@ -0,0 +1,127 @@ +# services/planka/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: planka + namespace: planka + labels: + app: planka +spec: + replicas: 1 + selector: + matchLabels: + app: planka + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 0 + maxUnavailable: 1 + template: + metadata: + labels: + app: planka + spec: + nodeSelector: + node-role.kubernetes.io/worker: "true" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: hardware + operator: In + values: ["rpi4", "rpi5"] + securityContext: + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + fsGroupChangePolicy: OnRootMismatch + initContainers: + - name: init-user-data + image: docker.io/alpine:3.20 + securityContext: + runAsUser: 0 + runAsGroup: 0 + command: ["/bin/sh", "-c"] + args: + - | + set -e + mkdir -p /data/public/user-avatars \ + /data/public/background-images \ + /data/private/attachments + chown -R 1000:1000 /data /tmp-data + volumeMounts: + - name: user-data + mountPath: /data + - name: app-data + mountPath: /tmp-data + containers: + - name: planka + image: ghcr.io/plankanban/planka:2.0.0-rc.4 + ports: + - name: http + containerPort: 1337 + env: + - name: BASE_URL + value: https://tasks.bstein.dev + - name: TRUST_PROXY + value: "true" + - name: OIDC_IGNORE_ROLES + value: "false" + - name: OIDC_ADMIN_ROLES + value: admin + - name: OIDC_PROJECT_OWNER_ROLES + value: planka-users + - name: OIDC_ROLES_ATTRIBUTE + value: groups + envFrom: + - secretRef: + name: planka-db + - secretRef: + name: planka-secrets + - secretRef: + name: planka-oidc + - secretRef: + name: planka-smtp + volumeMounts: + - name: user-data + mountPath: /app/public/user-avatars + subPath: public/user-avatars + - name: user-data + mountPath: 
/app/public/background-images + subPath: public/background-images + - name: user-data + mountPath: /app/private/attachments + subPath: private/attachments + - name: app-data + mountPath: /app/.tmp + readinessProbe: + httpGet: + path: / + port: http + initialDelaySeconds: 10 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 6 + livenessProbe: + httpGet: + path: / + port: http + initialDelaySeconds: 30 + periodSeconds: 20 + timeoutSeconds: 3 + failureThreshold: 6 + resources: + requests: + cpu: 200m + memory: 512Mi + limits: + cpu: "1" + memory: 2Gi + volumes: + - name: user-data + persistentVolumeClaim: + claimName: planka-user-data + - name: app-data + persistentVolumeClaim: + claimName: planka-app-data diff --git a/services/planka/ingress.yaml b/services/planka/ingress.yaml new file mode 100644 index 0000000..7bd2912 --- /dev/null +++ b/services/planka/ingress.yaml @@ -0,0 +1,26 @@ +# services/planka/ingress.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: planka + namespace: planka + annotations: + kubernetes.io/ingress.class: traefik + traefik.ingress.kubernetes.io/router.entrypoints: websecure + traefik.ingress.kubernetes.io/router.tls: "true" + cert-manager.io/cluster-issuer: letsencrypt +spec: + tls: + - hosts: ["tasks.bstein.dev"] + secretName: planka-tls + rules: + - host: tasks.bstein.dev + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: planka + port: + number: 80 diff --git a/services/planka/kustomization.yaml b/services/planka/kustomization.yaml new file mode 100644 index 0000000..ab42954 --- /dev/null +++ b/services/planka/kustomization.yaml @@ -0,0 +1,11 @@ +# services/planka/kustomization.yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: planka +resources: + - namespace.yaml + - user-data-pvc.yaml + - app-pvc.yaml + - deployment.yaml + - service.yaml + - ingress.yaml diff --git a/services/planka/namespace.yaml b/services/planka/namespace.yaml new file mode 
100644 index 0000000..6a56e21 --- /dev/null +++ b/services/planka/namespace.yaml @@ -0,0 +1,5 @@ +# services/planka/namespace.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: planka diff --git a/services/ci-demo/service.yaml b/services/planka/service.yaml similarity index 52% rename from services/ci-demo/service.yaml rename to services/planka/service.yaml index c094387..6abf6cf 100644 --- a/services/ci-demo/service.yaml +++ b/services/planka/service.yaml @@ -1,14 +1,15 @@ -# services/ci-demo/service.yaml +# services/planka/service.yaml apiVersion: v1 kind: Service metadata: - name: ci-demo - namespace: ci-demo + name: planka + namespace: planka + labels: + app: planka spec: selector: - app.kubernetes.io/name: ci-demo + app: planka ports: - name: http port: 80 targetPort: http - diff --git a/services/planka/user-data-pvc.yaml b/services/planka/user-data-pvc.yaml new file mode 100644 index 0000000..760f33c --- /dev/null +++ b/services/planka/user-data-pvc.yaml @@ -0,0 +1,12 @@ +# services/planka/user-data-pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: planka-user-data + namespace: planka +spec: + accessModes: ["ReadWriteOnce"] + storageClassName: asteria + resources: + requests: + storage: 20Gi diff --git a/services/postgres/kustomization.yaml b/services/postgres/kustomization.yaml new file mode 100644 index 0000000..e9d2c98 --- /dev/null +++ b/services/postgres/kustomization.yaml @@ -0,0 +1,10 @@ +# services/postgres/kustomization.yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: postgres +resources: + - namespace.yaml + - serviceaccount.yaml + - secretproviderclass.yaml + - service.yaml + - statefulset.yaml diff --git a/services/postgres/namespace.yaml b/services/postgres/namespace.yaml new file mode 100644 index 0000000..c5503ce --- /dev/null +++ b/services/postgres/namespace.yaml @@ -0,0 +1,5 @@ +# services/postgres/namespace.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: postgres diff 
--git a/services/postgres/secretproviderclass.yaml b/services/postgres/secretproviderclass.yaml new file mode 100644 index 0000000..31d247e --- /dev/null +++ b/services/postgres/secretproviderclass.yaml @@ -0,0 +1,15 @@ +# services/postgres/secretproviderclass.yaml +apiVersion: secrets-store.csi.x-k8s.io/v1 +kind: SecretProviderClass +metadata: + name: postgres-vault + namespace: postgres +spec: + provider: vault + parameters: + vaultAddress: "http://vault.vault.svc.cluster.local:8200" + roleName: "postgres" + objects: | + - objectName: "postgres_password" + secretPath: "kv/data/postgres" + secretKey: "POSTGRES_PASSWORD" diff --git a/services/postgres/service.yaml b/services/postgres/service.yaml new file mode 100644 index 0000000..52c4656 --- /dev/null +++ b/services/postgres/service.yaml @@ -0,0 +1,15 @@ +# services/postgres/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: postgres-service + namespace: postgres +spec: + clusterIP: None + ports: + - name: postgres + port: 5432 + protocol: TCP + targetPort: 5432 + selector: + app: postgres diff --git a/services/postgres/serviceaccount.yaml b/services/postgres/serviceaccount.yaml new file mode 100644 index 0000000..0c3db0c --- /dev/null +++ b/services/postgres/serviceaccount.yaml @@ -0,0 +1,6 @@ +# services/postgres/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: postgres-vault + namespace: postgres diff --git a/services/postgres/statefulset.yaml b/services/postgres/statefulset.yaml new file mode 100644 index 0000000..aa96003 --- /dev/null +++ b/services/postgres/statefulset.yaml @@ -0,0 +1,76 @@ +# services/postgres/statefulset.yaml +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: postgres + namespace: postgres + labels: + app: postgres +spec: + serviceName: postgres-service + replicas: 1 + selector: + matchLabels: + app: postgres + persistentVolumeClaimRetentionPolicy: + whenDeleted: Retain + whenScaled: Retain + updateStrategy: + type: RollingUpdate + template: 
+ metadata: + labels: + app: postgres + spec: + serviceAccountName: postgres-vault + nodeSelector: + node-role.kubernetes.io/worker: "true" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/worker + operator: In + values: ["true"] + - key: hardware + operator: In + values: ["rpi4", "rpi5"] + containers: + - name: postgres + image: postgres:15 + ports: + - name: postgres + containerPort: 5432 + protocol: TCP + env: + - name: PGDATA + value: /var/lib/postgresql/data/pgdata + - name: POSTGRES_USER + value: postgres + - name: POSTGRES_PASSWORD_FILE + value: /mnt/vault/postgres_password + - name: POSTGRES_DB + value: postgres + volumeMounts: + - name: postgres-data + mountPath: /var/lib/postgresql/data + - name: vault-secrets + mountPath: /mnt/vault + readOnly: true + volumes: + - name: vault-secrets + csi: + driver: secrets-store.csi.k8s.io + readOnly: true + volumeAttributes: + secretProviderClass: postgres-vault + volumeClaimTemplates: + - metadata: + name: postgres-data + spec: + accessModes: ["ReadWriteOnce"] + storageClassName: astreae + resources: + requests: + storage: 100Gi diff --git a/services/vault/configmap.yaml b/services/vault/configmap.yaml new file mode 100644 index 0000000..d4ffdb5 --- /dev/null +++ b/services/vault/configmap.yaml @@ -0,0 +1,24 @@ +# services/vault/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: vault-config + namespace: vault +data: + local.hcl: | + ui = true + cluster_name = "vault-k8s" + disable_mlock = true + + listener "tcp" { + address = "0.0.0.0:8200" + cluster_address = "0.0.0.0:8201" + tls_disable = true + } + + storage "raft" { + path = "/vault/data" + } + + api_addr = "https://secret.bstein.dev" + cluster_addr = "https://vault-0.vault-internal:8201" diff --git a/services/vault/helmrelease.yaml b/services/vault/helmrelease.yaml deleted file mode 100644 index 604d31c..0000000 --- 
a/services/vault/helmrelease.yaml +++ /dev/null @@ -1,68 +0,0 @@ -# services/vault/helmrelease.yaml -apiVersion: helm.toolkit.fluxcd.io/v2 -kind: HelmRelease -metadata: - name: vault - namespace: vault -spec: - interval: 30m - chart: - spec: - chart: vault - version: 0.x.x - sourceRef: - kind: HelmRepository - name: hashicorp - namespace: flux-system - install: - remediation: { retries: 3 } - upgrade: - remediation: { retries: 3 } - values: - injector: - enabled: true - resources: - requests: { cpu: "50m", memory: "64Mi" } - csi: - enabled: false - server: - ha: - enabled: true - replicas: 1 - raft: - enabled: true - extraEnvironmentVars: - VAULT_API_ADDR: "https://secret.bstein.dev" - VAULT_REDIRECT_ADDR: "https://secret.bstein.dev" - dataStorage: - enabled: true - size: 10Gi - storageClass: astreae - resources: - requests: { cpu: "100m", memory: "256Mi" } - service: - type: ClusterIP - extraVolumes: - - type: secret - name: vault-server-tls - path: /vault/userconfig/tls - extraVolumeMounts: - - name: vault-server-tls - mountPath: /vault/userconfig/tls - readOnly: true - config: | - ui = true - cluster_name = "vault-k8s" - listener "tcp" { - address = "0.0.0.0:8200" - cluster_address = "0.0.0.0:8201" - tls_cert_file = "/vault/userconfig/tls/tls.crt" - tls_key_file = "/vault/userconfig/tls/tls.key" - } - storage "raft" { - path = "/vault/data" - } - api_addr = "https://secret.bstein.dev" - cluster_addr = "https://vault-0.vault-internal:8201" - ui: - enabled: true diff --git a/services/vault/ingress.yaml b/services/vault/ingress.yaml index 91d9ca4..1d9d523 100644 --- a/services/vault/ingress.yaml +++ b/services/vault/ingress.yaml @@ -1,4 +1,4 @@ -# services/vault/helmrelease.yaml +# services/vault/ingress.yaml apiVersion: networking.k8s.io/v1 kind: Ingress metadata: @@ -7,8 +7,6 @@ metadata: annotations: kubernetes.io/ingress.class: traefik traefik.ingress.kubernetes.io/router.entrypoints: websecure - traefik.ingress.kubernetes.io/service.serversscheme: https - 
traefik.ingress.kubernetes.io/service.serversTransport: vault-vault-to-https@kubernetescrd spec: ingressClassName: traefik tls: @@ -22,6 +20,6 @@ spec: pathType: Prefix backend: service: - name: vault-ui + name: vault port: number: 8200 diff --git a/services/vault/kustomization.yaml b/services/vault/kustomization.yaml index 1d7af87..b39fc48 100644 --- a/services/vault/kustomization.yaml +++ b/services/vault/kustomization.yaml @@ -4,7 +4,11 @@ kind: Kustomization namespace: vault resources: - namespace.yaml - - helmrelease.yaml - - certificate.yaml + - serviceaccount.yaml + - rbac.yaml + - configmap.yaml + - statefulset.yaml + - service.yaml - ingress.yaml + - certificate.yaml - serverstransport.yaml diff --git a/services/vault/rbac.yaml b/services/vault/rbac.yaml new file mode 100644 index 0000000..d1caa18 --- /dev/null +++ b/services/vault/rbac.yaml @@ -0,0 +1,13 @@ +# services/vault/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: vault-auth-delegator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:auth-delegator +subjects: + - kind: ServiceAccount + name: vault + namespace: vault diff --git a/services/vault/service.yaml b/services/vault/service.yaml new file mode 100644 index 0000000..0c1c451 --- /dev/null +++ b/services/vault/service.yaml @@ -0,0 +1,37 @@ +# services/vault/service.yaml +--- +apiVersion: v1 +kind: Service +metadata: + name: vault + namespace: vault +spec: + ports: + - name: api + port: 8200 + targetPort: 8200 + - name: cluster + port: 8201 + targetPort: 8201 + selector: + app: vault + +--- +apiVersion: v1 +kind: Service +metadata: + name: vault-internal + namespace: vault + labels: + app: vault +spec: + clusterIP: None + ports: + - name: api + port: 8200 + targetPort: 8200 + - name: cluster + port: 8201 + targetPort: 8201 + selector: + app: vault diff --git a/services/vault/serviceaccount.yaml b/services/vault/serviceaccount.yaml new file mode 100644 index 
0000000..56c4181 --- /dev/null +++ b/services/vault/serviceaccount.yaml @@ -0,0 +1,6 @@ +# services/vault/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: vault + namespace: vault diff --git a/services/vault/statefulset.yaml b/services/vault/statefulset.yaml new file mode 100644 index 0000000..bd15607 --- /dev/null +++ b/services/vault/statefulset.yaml @@ -0,0 +1,124 @@ +# services/vault/statefulset.yaml +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: vault + namespace: vault + labels: + app: vault +spec: + serviceName: vault-internal + replicas: 1 + selector: + matchLabels: + app: vault + template: + metadata: + labels: + app: vault + spec: + serviceAccountName: vault + nodeSelector: + node-role.kubernetes.io/worker: "true" + kubernetes.io/arch: arm64 + securityContext: + fsGroup: 1000 + initContainers: + - name: setup-config + image: alpine:3.20 + command: + - sh + - -c + - | + set -euo pipefail + cp /config-src/local.hcl /vault/config/local.hcl + chown 1000:1000 /vault/config/local.hcl + chmod 640 /vault/config/local.hcl + securityContext: + runAsUser: 0 + runAsGroup: 0 + allowPrivilegeEscalation: false + volumeMounts: + - name: config-template + mountPath: /config-src + - name: config + mountPath: /vault/config + containers: + - name: vault + image: hashicorp/vault:1.17.6 + imagePullPolicy: IfNotPresent + command: ["vault"] + args: ["server", "-config=/vault/config/local.hcl"] + ports: + - name: api + containerPort: 8200 + - name: cluster + containerPort: 8201 + env: + - name: VAULT_ADDR + value: "http://127.0.0.1:8200" + - name: VAULT_API_ADDR + value: "https://secret.bstein.dev" + - name: VAULT_CLUSTER_ADDR + value: "https://vault-0.vault-internal:8201" + - name: VAULT_REDIRECT_ADDR + value: "https://secret.bstein.dev" + - name: VAULT_LOG_LEVEL + value: "info" + - name: VAULT_DISABLE_MLOCK + value: "true" + - name: VAULT_DISABLE_PERM_MGMT + value: "true" + - name: SKIP_CHOWN + value: "true" + - name: SKIP_SETCAP + value: 
"true" + readinessProbe: + exec: + command: ["sh", "-c", "VAULT_ADDR=http://127.0.0.1:8200 vault status"] + initialDelaySeconds: 10 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + livenessProbe: + exec: + command: ["sh", "-c", "VAULT_ADDR=http://127.0.0.1:8200 vault status >/dev/null 2>&1; rc=$?; [ $rc -eq 0 ] || [ $rc -eq 2 ]"] + initialDelaySeconds: 60 + periodSeconds: 20 + timeoutSeconds: 5 + failureThreshold: 6 + securityContext: + runAsNonRoot: true + runAsUser: 1000 + runAsGroup: 1000 + allowPrivilegeEscalation: false + capabilities: + add: ["IPC_LOCK"] + drop: ["ALL"] + volumeMounts: + - name: config + mountPath: /vault/config + - name: data + mountPath: /vault/data + - name: tls + mountPath: /vault/userconfig/tls + readOnly: true + volumes: + - name: config-template + configMap: + name: vault-config + - name: config + emptyDir: {} + - name: tls + secret: + secretName: vault-server-tls + optional: false + volumeClaimTemplates: + - metadata: + name: data + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 10Gi + storageClassName: astreae diff --git a/services/vaultwarden/deployment.yaml b/services/vaultwarden/deployment.yaml new file mode 100644 index 0000000..9e65c22 --- /dev/null +++ b/services/vaultwarden/deployment.yaml @@ -0,0 +1,66 @@ +# services/vaultwarden/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vaultwarden + namespace: vaultwarden +spec: + replicas: 1 + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 0 + maxUnavailable: 1 + selector: + matchLabels: + app: vaultwarden + template: + metadata: + labels: + app: vaultwarden + spec: + containers: + - name: vaultwarden + image: vaultwarden/server:1.33.2 + env: + - name: SIGNUPS_ALLOWED + value: "false" + - name: INVITATIONS_ALLOWED + value: "true" + - name: DOMAIN + value: "https://vault.bstein.dev" + - name: SMTP_HOST + value: "mailu-front.mailu-mailserver.svc.cluster.local" + - name: SMTP_PORT + value: "25" + - name: SMTP_SECURITY + value:
"starttls" + - name: SMTP_ACCEPT_INVALID_HOSTNAMES + value: "true" + - name: SMTP_ACCEPT_INVALID_CERTS + value: "true" + - name: SMTP_FROM + value: "postmaster@bstein.dev" + - name: SMTP_FROM_NAME + value: "Atlas Vaultwarden" + - name: DATABASE_URL + valueFrom: + secretKeyRef: + name: vaultwarden-db-url + key: DATABASE_URL + - name: ADMIN_TOKEN + valueFrom: + secretKeyRef: + name: vaultwarden-admin + key: ADMIN_TOKEN + ports: + - name: http + containerPort: 80 + protocol: TCP + volumeMounts: + - name: vaultwarden-data + mountPath: /data + volumes: + - name: vaultwarden-data + persistentVolumeClaim: + claimName: vaultwarden-data diff --git a/services/vaultwarden/ingress.yaml b/services/vaultwarden/ingress.yaml new file mode 100644 index 0000000..2eaa991 --- /dev/null +++ b/services/vaultwarden/ingress.yaml @@ -0,0 +1,28 @@ +# services/vaultwarden/ingress.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: vaultwarden-ingress + namespace: vaultwarden + annotations: + traefik.ingress.kubernetes.io/router.entrypoints: websecure + traefik.ingress.kubernetes.io/router.tls: "true" + traefik.ingress.kubernetes.io/router.tls.certresolver: letsencrypt-prod + cert-manager.io/cluster-issuer: letsencrypt-prod +spec: + ingressClassName: traefik + rules: + - host: vault.bstein.dev + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: vaultwarden-service + port: + number: 80 + tls: + - hosts: + - vault.bstein.dev + secretName: vaultwarden-tls diff --git a/services/jitsi/kustomization.yaml b/services/vaultwarden/kustomization.yaml similarity index 71% rename from services/jitsi/kustomization.yaml rename to services/vaultwarden/kustomization.yaml index 8864598..f0d02fd 100644 --- a/services/jitsi/kustomization.yaml +++ b/services/vaultwarden/kustomization.yaml @@ -1,10 +1,10 @@ -# services/jitsi/kustomization.yaml +# services/vaultwarden/kustomization.yaml apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization +namespace: 
vaultwarden resources: - namespace.yaml + - pvc.yaml - deployment.yaml - service.yaml - - pvc.yaml - ingress.yaml - - secret.yaml diff --git a/services/vaultwarden/namespace.yaml b/services/vaultwarden/namespace.yaml new file mode 100644 index 0000000..2e97e87 --- /dev/null +++ b/services/vaultwarden/namespace.yaml @@ -0,0 +1,5 @@ +# services/vaultwarden/namespace.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: vaultwarden diff --git a/services/vaultwarden/pvc.yaml b/services/vaultwarden/pvc.yaml new file mode 100644 index 0000000..b4e0617 --- /dev/null +++ b/services/vaultwarden/pvc.yaml @@ -0,0 +1,12 @@ +# services/vaultwarden/pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: vaultwarden-data + namespace: vaultwarden +spec: + accessModes: ["ReadWriteOnce"] + storageClassName: astreae + resources: + requests: + storage: 100Gi diff --git a/services/vaultwarden/service.yaml b/services/vaultwarden/service.yaml new file mode 100644 index 0000000..7cc05a0 --- /dev/null +++ b/services/vaultwarden/service.yaml @@ -0,0 +1,15 @@ +# services/vaultwarden/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: vaultwarden-service + namespace: vaultwarden +spec: + type: ClusterIP + selector: + app: vaultwarden + ports: + - name: http + port: 80 + protocol: TCP + targetPort: http