Compare commits

...

190 Commits

Author SHA1 Message Date
55fa2cbce4 zot: restore main branch config 2025-12-11 17:26:15 -03:00
d5a526c5fa zot: revert to unauthenticated registry 2025-12-11 17:22:16 -03:00
efd258fc71 vault: drop traefik basicauth 2025-12-11 17:09:05 -03:00
3852ebc0f1 zot,vault: remove oauth2-proxy sso 2025-12-11 17:04:19 -03:00
88db462f8f longhorn/vault: gate via oauth2-proxy 2025-12-07 19:44:02 -03:00
e44def25f8 auth: remove error middleware to allow redirect 2025-12-07 13:19:45 -03:00
7ae8bf9705 oauth2-proxy: drop groups scope to avoid invalid_scope 2025-12-07 13:09:29 -03:00
088fed6720 auth: forward-auth via external auth host (svc traffic flaky) 2025-12-07 13:03:29 -03:00
84e4dc0616 oauth2-proxy: schedule on worker rpis 2025-12-07 12:49:38 -03:00
96a8d271a9 oauth2-proxy: ensure error middleware on auth ingress 2025-12-07 12:03:14 -03:00
84aa870cda auth: use internal oauth2-proxy svc for forward-auth 2025-12-07 11:25:29 -03:00
876ec19543 auth: add 401 redirect middleware to oauth2-proxy 2025-12-07 11:14:25 -03:00
ec1d33f1ca auth: point forward-auth to external auth host 2025-12-07 11:09:09 -03:00
1de9d94138 oauth2-proxy: temporarily drop group restriction 2025-12-07 10:42:13 -03:00
571bf759a2 auth: add namespace-local forward-auth middlewares 2025-12-07 10:25:44 -03:00
7525289a0c auth: wire oauth2-proxy and enable grafana oidc 2025-12-07 02:01:21 -03:00
c7b73555c4 add oauth2-proxy for SSO forward-auth 2025-12-06 14:42:24 -03:00
de727eee07 keycloak: restrict to worker rpis with titan-24 fallback 2025-12-06 01:44:23 -03:00
2122ce3e31 keycloak: require rpi nodes with titan-24 fallback 2025-12-06 01:40:24 -03:00
f2d496c6c0 keycloak: prefer rpi nodes, avoid titan-24 2025-12-06 01:36:33 -03:00
127d09755e keycloak: honor xforwarded headers and hostname url 2025-12-06 01:23:07 -03:00
9f5e61ebed keycloak: enable health/metrics management port 2025-12-06 00:51:47 -03:00
b1b39c4dcd keycloak: set fsGroup for data volume 2025-12-06 00:49:17 -03:00
65d8986279 keycloak: remove optimized flag for first start 2025-12-06 00:43:24 -03:00
b9202b6829 chore: drop AGENTS.md from repo 2025-12-06 00:43:17 -03:00
1e8de60198 notes: capture GPU share change and flux branch 2025-12-03 12:28:45 -03:00
2906e3e5d9 monitoring: show GPU share over dashboard range 2025-12-02 20:28:35 -03:00
7210c0784d flux: add keycloak kustomization 2025-12-02 18:10:20 -03:00
46b6d471eb flux: track feature/sso 2025-12-02 18:00:49 -03:00
7e46ffc075 keycloak: add raw manifests backed by shared postgres 2025-12-02 17:58:19 -03:00
d8f466e53e Merge pull request 'feature/atlas-monitoring' (#3) from feature/atlas-monitoring into main
Reviewed-on: #3
2025-12-02 20:52:35 +00:00
ffdb4ed010 notes: add postgres centralization guidance 2025-12-02 17:36:37 -03:00
5af23034de notes: add sso plan sketch 2025-12-02 17:14:45 -03:00
72a83a1af9 notes: update monitoring and next steps 2025-12-02 17:01:32 -03:00
42b3ac0139 monitoring: show top12 root disks 2025-12-02 15:21:02 -03:00
e53ca4dd91 monitoring: expand worker/control/root rows 2025-12-02 15:15:21 -03:00
134e39d9a4 monitoring: shrink hottest node row height 2025-12-02 15:12:16 -03:00
12fd5229dc monitoring: fix gpu share query and root bar labels 2025-12-02 14:56:36 -03:00
1963fadec1 monitoring: polish dashboards and folders 2025-12-02 14:41:39 -03:00
d23e2fe78c monitoring: regen dashboards with gpu details 2025-12-02 13:16:00 -03:00
e7d521f203 monitoring: mirror dcgm-exporter as multi-arch 2025-12-02 12:36:24 -03:00
54e4a1ed93 monitoring: run dcgm-exporter with nvidia runtime 2025-12-02 12:25:30 -03:00
9895695b36 monitoring: always pull dcgm-exporter tag 2025-12-02 12:19:16 -03:00
2fc73097ba monitoring: add registry pull secret for dcgm-exporter 2025-12-02 12:07:11 -03:00
7b1cc7061a monitoring: allow dcgm rollout with unavailable node 2025-12-02 11:59:55 -03:00
f44370c41f monitoring: use mirrored dcgm-exporter tag 2025-12-02 11:54:53 -03:00
3fbaa54f4f monitoring: reenable dcgm exporter 2025-11-20 13:11:13 -03:00
ea60425d42 traefik: use responding timeouts only 2025-11-18 20:01:16 -03:00
a8cb8c0287 traefik: extend upload timeouts 2025-11-18 19:43:19 -03:00
f7f124ad71 monitoring: control-plane stat and namespace share tweaks 2025-11-18 17:09:13 -03:00
d062c10675 monitoring: refine network metrics and control-plane allowance 2025-11-18 16:18:52 -03:00
97b7b479bc monitoring: adjust overview spacing and net panels 2025-11-18 15:55:24 -03:00
0b44f2d1d4 monitoring: disable dcgm exporter 2025-11-18 15:10:58 -03:00
bcda1b396d flux: disable wait for monitoring 2025-11-18 15:04:18 -03:00
a15ee26ae2 flux: scope monitoring health checks 2025-11-18 14:33:24 -03:00
1970b820e7 monitoring: fix dcgm image 2025-11-18 14:19:23 -03:00
e4f0eeca99 monitoring: refresh overview dashboards 2025-11-18 14:08:33 -03:00
00e9c90746 monitoring: rework gpu share + gauges 2025-11-18 12:11:47 -03:00
b1d84d646a monitoring: clean namespace gpu share and layout 2025-11-18 11:42:24 -03:00
7e4b2f8ba2 monitoring: resolve pie errors and network data 2025-11-18 11:30:33 -03:00
a028fde4f7 monitoring: fix namespace gpu share and network stats 2025-11-18 11:12:03 -03:00
703e1d4e3c monitoring: add gpu node fallback 2025-11-18 10:47:24 -03:00
16f8b5f30b monitoring: source gpu pie from limits and node nets 2025-11-18 01:01:10 -03:00
ebfeb78e87 monitoring: fix gpu pie data and network panels 2025-11-18 00:31:51 -03:00
d5e1003de8 monitoring: stabilize namespace pies and labels 2025-11-18 00:19:45 -03:00
a411694bda monitoring: add gpu pie and tidy net panels 2025-11-18 00:11:39 -03:00
1df06f18f6 Revert GPU pie chart additions 2025-11-17 23:42:55 -03:00
9bd7effdee monitoring: fix hottest stats and gpu share 2025-11-17 23:40:22 -03:00
991d6defc4 monitoring: reorder namespace pies and add gpu data 2025-11-17 23:18:53 -03:00
43b9265cdf monitoring: add namespace gpu share 2025-11-17 23:12:16 -03:00
9233ba60fc monitoring: express namespace share as cluster percent 2025-11-17 22:58:57 -03:00
ccca363fb4 monitoring: fix pie colors & thresholds 2025-11-17 22:39:50 -03:00
f22c19bc5d monitoring: color namespace pies 2025-11-17 22:36:50 -03:00
0e9b293e95 monitoring: fix namespace share percentages 2025-11-17 22:19:01 -03:00
5a2cafb5db monitoring: normalize namespace share 2025-11-17 22:06:06 -03:00
5ce1493b3b monitoring: unify namespace share panels 2025-11-17 21:57:40 -03:00
c85c6b1bc3 monitoring: worker/control-plane splits 2025-11-17 21:48:12 -03:00
64059a08f5 monitoring: restore top1 hottest stats 2025-11-17 21:20:19 -03:00
2073ffe944 monitoring: fix net/io legend labels 2025-11-17 20:19:20 -03:00
a99e1ba227 monitoring: attach nodes to net/io stats 2025-11-17 20:14:11 -03:00
8d42f501e5 monitoring: tidy hottest node labels 2025-11-17 20:04:50 -03:00
7358f9e618 monitoring: show hottest node labels 2025-11-17 20:00:40 -03:00
831d1fe707 monitoring: fix hottest node labels 2025-11-17 19:56:57 -03:00
8c263b36b9 monitoring: show hottest node names 2025-11-17 19:53:39 -03:00
bf31272339 monitoring: reorder overview stats 2025-11-17 19:49:50 -03:00
a34e58d319 monitoring: fix hottest stats and titan-db scrape 2025-11-17 19:38:40 -03:00
6a60e4284a monitoring: tighten overview stats 2025-11-17 19:24:03 -03:00
0f7d0b7bac monitoring: polish dashboards 2025-11-17 18:55:11 -03:00
665dfa2e52 monitoring: rebuild atlas dashboards 2025-11-17 16:27:38 -03:00
5858a80c72 monitoring: restructure grafana dashboards 2025-11-17 14:22:46 -03:00
d844e068ec monitoring: enrich dashboards 2025-11-16 12:58:08 -03:00
77c3e260a3 monitoring: refresh grafana dashboards 2025-11-15 21:03:11 -03:00
2e6b9a47c8 dashboards: improve public view and fix color 2025-11-15 11:59:48 -03:00
48f9c6d715 grafana: set datasource uid 2025-11-15 11:35:27 -03:00
da82ebd469 grafana: use atlas metrics hostname 2025-11-15 11:18:40 -03:00
37b93de3e7 victoria-metrics: revert storageclass change 2025-11-15 11:16:37 -03:00
89c0fbfd44 monitoring: fix domain 2025-11-14 19:13:40 -03:00
cb402d0bb9 monitoring: fix ingress and env formats 2025-11-14 08:51:09 -03:00
597556d1c0 grafana: use string host format 2025-11-14 08:37:46 -03:00
f886e2b873 grafana: fix dashboard provider list 2025-11-14 08:33:53 -03:00
94f0cd939d monitoring: fix grafana values 2025-11-14 08:29:59 -03:00
bc757265cf monitoring: add grafana and alertmanager 2025-11-14 00:02:59 -03:00
4d3a4cd2b4 flux-system: track main branch 2025-11-12 01:06:26 -03:00
ac7863802a monitoring: disable wait on node-exporter 2025-11-09 14:03:14 -03:00
afb926439f core: disable wait to unblock reconciliation 2025-11-09 13:46:56 -03:00
ebf5a8aef9 core: remove gpu health gate 2025-11-09 13:37:59 -03:00
dca749cc04 gpu: drop runtimeClass from minipc plugin 2025-11-09 13:28:40 -03:00
65b3e3fbb8 monitoring: disable kube-state annotations 2025-11-09 13:20:50 -03:00
45ad2a2b06 monitoring: clean helm values 2025-11-09 13:16:21 -03:00
396acb818a monitoring: disable chart prometheusScrape 2025-11-09 13:11:40 -03:00
aae55a14f8 monitoring: annotate kube-state svc manually 2025-11-09 13:07:39 -03:00
8ac040a7d8 monitoring: drop duplicate annotations 2025-11-09 13:03:40 -03:00
79a17412af monitoring: reference prometheus repo 2025-11-09 12:59:03 -03:00
1bdc0efdac core: point flux to infrastructure path 2025-11-09 12:49:54 -03:00
8b6ddcd44d platform: fix relative paths 2025-11-09 12:39:32 -03:00
ffbfee1ebd platform: include cert-manager clusterissuer 2025-11-09 12:38:20 -03:00
85aa07c0cc chore: fix vmagent relabel indentation 2025-11-09 12:33:11 -03:00
e2e2916139 fix: flux automation and monitoring config 2025-11-09 12:31:38 -03:00
077654fa2d refactor: restructure atlas flux layout 2025-11-09 11:48:45 -03:00
3c229baece pegasus on 2025-10-09 23:26:20 -05:00
48995cc6ed Merge pull request 'minor tweaks' (#2) from fea/titan24-gpu into main
Reviewed-on: #2
2025-10-10 02:23:01 +00:00
c94959a687 minor tweaks 2025-10-09 21:21:54 -05:00
d992be1061 Merge pull request 'gpu(titan-24): add RuntimeClass + NVIDIA device-plugin DS; enable containerd nvidia runtime' (#1) from fea/titan24-gpu into main
Reviewed-on: #1
2025-10-09 23:29:26 +00:00
79d71f471f gpu(titan-24): add RuntimeClass + NVIDIA device-plugin DS; enable containerd nvidia runtime 2025-10-09 18:28:20 -05:00
8f724e02be pegasus chill 2025-10-08 04:26:26 -05:00
d2ffd738ef storageclass update 2025-10-08 03:13:12 -05:00
16b2c15eda asteria corrections 2025-10-08 00:50:42 -05:00
761fdd29b2 jellyfin restart 2025-10-07 23:28:40 -05:00
4567b1685c monitoring add, jellyfin/pegasus update, and traefik tweaks 2025-10-07 23:26:27 -05:00
2182e98c05 jellyfin pvc size increase 2025-10-04 09:00:41 -05:00
503a95a8e8 fixed jellyfin pv issue 2025-10-04 08:50:56 -05:00
9dfe6bb700 jellyfin and pegasus in same group 2025-09-18 10:12:08 -05:00
358da0ea00 jellyfin and pegasus in same group 2025-09-18 09:55:00 -05:00
3b50199e1d jellyfin and pegasus in same group 2025-09-18 09:38:46 -05:00
5b97966395 jellyfin and pegasus in same group 2025-09-18 08:52:58 -05:00
9a34ee3d2e pegasus 1.2.32 2025-09-18 02:33:37 -05:00
53d3079bce gavilon to gavilan 2025-09-17 19:12:03 -05:00
259451e273 added gavilon to account for pegasus 2025-09-17 18:29:33 -05:00
518d7bb160 pegasus 1.2.31 2025-09-17 18:08:49 -05:00
632949c29c pegasus 1.2.31 2025-09-17 09:38:49 -05:00
6a77f7749f pegasus 1.2.30 2025-09-17 09:09:24 -05:00
16997fba10 pegasus 1.2.29 2025-09-17 09:00:52 -05:00
3637a99bfb pegasus 1.2.28 2025-09-17 08:52:11 -05:00
7e2baa343c pegasus 1.2.27 2025-09-17 08:21:51 -05:00
02bde10852 pegasus 1.2.26 2025-09-17 07:57:36 -05:00
e224215406 pegasus 1.2.25 2025-09-17 07:46:48 -05:00
03d43d097b pegasus 1.2.24 2025-09-17 07:24:10 -05:00
ca62df5508 pegasus 1.2.22 2025-09-17 01:33:11 -05:00
2f68bc664a pegasus 1.2.22 2025-09-17 01:02:33 -05:00
3878d39579 pegasus 1.2.21 2025-09-17 00:08:18 -05:00
19ae80e5e0 pegasus 1.2.20 2025-09-16 23:10:58 -05:00
46f02ee826 pegasus 1.2.17 2025-09-16 22:45:15 -05:00
e34744d144 pegasus 1.2.17 2025-09-16 20:08:50 -05:00
fdbd8ef048 pegasus 1.2.17 2025-09-16 18:02:55 -05:00
535c3de0bf pegasus 1.2.16 2025-09-16 17:18:42 -05:00
2be629a998 pegasus 1.2.15 2025-09-16 16:56:49 -05:00
0b5aed217d pegasus 1.2.14 2025-09-16 09:53:26 -05:00
eb6aeae2d2 pegasus 1.2.13 2025-09-16 09:12:41 -05:00
3276e4f196 pegasus 1.2.12 2025-09-16 08:54:32 -05:00
e31bf05cc1 pegasus 1.2.11 2025-09-16 08:29:47 -05:00
e0169b5bba pegasus 1.2.10 2025-09-16 07:19:54 -05:00
ba140fb638 pegasus 1.2.9 2025-09-16 05:33:36 -05:00
10b34c353b pegasus 1.2.8 2025-09-16 04:09:10 -05:00
26e15f7651 pegasus 1.2.7 - json fix 2025-09-16 03:35:12 -05:00
22683b0dc4 pegasus 1.2.6 - json fix 2025-09-16 03:05:50 -05:00
7468e62023 mapping to list 2025-09-16 02:36:43 -05:00
0d492eb622 pegasus updates 1.2.5 2025-09-16 01:55:36 -05:00
c8a91ebe4f pegasus updates 1.2.4 2025-09-16 01:01:23 -05:00
ee3b0f3f25 pegasus updates 2025-09-16 00:06:26 -05:00
ab02f4537e pegasus updates 2025-09-15 22:52:58 -05:00
f51c06efac pegasus updates 2025-09-15 22:40:00 -05:00
773637273d pegasus updates 2025-09-15 19:55:20 -05:00
8b1c083fe0 pegasus: pin image digest + command + probes + tls 2025-09-15 13:00:39 -05:00
128fad192c pegasus flux'd 2025-09-15 12:32:52 -05:00
eac7aaa91b pegasus flux'd 2025-09-15 12:28:56 -05:00
28903add8f pegasus fix 2025-09-15 12:09:24 -05:00
eea64c7eb1 pegasus on 2025-09-15 02:45:22 -05:00
c7a184eace zot fix 2025-09-15 02:15:27 -05:00
ba233fd909 zot fix 2025-09-15 01:03:32 -05:00
04cd5b0c62 zot middleware add 2025-09-09 11:27:42 -05:00
ec744e45bf zot middleware add 2025-09-09 01:43:13 -05:00
b16eda5894 zot simplification 2025-09-09 01:16:33 -05:00
1ba463001a zot simplification 2025-09-09 00:22:24 -05:00
2304c41ba8 zot configmap update 2025-09-08 23:08:32 -05:00
7ca10afce7 zot version pin 2025-09-08 22:52:41 -05:00
ead0c486a5 zot troubleshooting 2025-09-08 22:25:41 -05:00
1de7fcc287 zot middleware fix 2025-09-08 21:58:50 -05:00
7efc4a4dfb jitsi corrections 2025-09-07 14:31:53 -05:00
19bfa0878c pegasus corrections 2025-09-07 13:34:06 -05:00
fab2d944ff jitsi setup 2025-09-07 13:20:49 -05:00
145 changed files with 11557 additions and 106 deletions

0
-c
View File

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
AGENTS.md

View File

@ -0,0 +1,12 @@
# clusters/atlas/applications/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../../../services/crypto
- ../../../services/gitea
- ../../../services/jellyfin
- ../../../services/jitsi
- ../../../services/monitoring
- ../../../services/pegasus
- ../../../services/vault
- ../../../services/zot

View File

@ -1,4 +1,4 @@
# infrastructure/flux-system/kustomization-crypto.yaml
# clusters/atlas/flux-system/applications/crypto/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:

View File

@ -1,4 +1,4 @@
# infrastructure/flux-system/kustomization-gitea.yaml
# clusters/atlas/flux-system/applications/gitea/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1beta2
kind: Kustomization
metadata:

View File

@ -1,4 +1,4 @@
# infrastructure/flux-system/kustomization-jellyfin.yaml
# clusters/atlas/flux-system/applications/jellyfin/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:

View File

@ -0,0 +1,19 @@
# clusters/atlas/flux-system/applications/jitsi/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: jitsi
namespace: flux-system
spec:
interval: 10m
path: ./services/jitsi
targetNamespace: jitsi
prune: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
dependsOn:
- name: core
wait: true
timeout: 5m

View File

@ -0,0 +1,15 @@
# clusters/atlas/flux-system/applications/keycloak/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: keycloak
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: flux-system
path: ./services/keycloak
targetNamespace: sso
timeout: 2m

View File

@ -0,0 +1,17 @@
# clusters/atlas/flux-system/applications/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- zot/kustomization.yaml
- gitea/kustomization.yaml
- vault/kustomization.yaml
- jitsi/kustomization.yaml
- crypto/kustomization.yaml
- monerod/kustomization.yaml
- pegasus/kustomization.yaml
- pegasus/image-automation.yaml
- jellyfin/kustomization.yaml
- xmr-miner/kustomization.yaml
- sui-metrics/kustomization.yaml
- keycloak/kustomization.yaml
- oauth2-proxy/kustomization.yaml

View File

@ -1,4 +1,4 @@
# infrastructure/flux-system/kustomization-monerod.yaml
# clusters/atlas/flux-system/applications/monerod/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:

View File

@ -0,0 +1,15 @@
# clusters/atlas/flux-system/applications/oauth2-proxy/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: oauth2-proxy
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: flux-system
path: ./services/oauth2-proxy
targetNamespace: sso
timeout: 2m

View File

@ -0,0 +1,20 @@
# clusters/atlas/flux-system/applications/pegasus/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1beta1
kind: ImageUpdateAutomation
metadata:
name: pegasus
namespace: flux-system
spec:
interval: 1m0s
sourceRef:
kind: GitRepository
name: flux-system
git:
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(pegasus): update image to {{range .Updated.Images}}{{.}}{{end}}"
update:
strategy: Setters
path: ./services/pegasus

View File

@ -0,0 +1,19 @@
# clusters/atlas/flux-system/applications/pegasus/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: pegasus
namespace: flux-system
spec:
interval: 10m
path: ./services/pegasus
targetNamespace: jellyfin
prune: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
dependsOn:
- name: core
wait: true
timeout: 5m

View File

@ -0,0 +1,19 @@
# clusters/atlas/flux-system/applications/sui-metrics/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: sui-metrics
namespace: flux-system
spec:
interval: 10m
path: ./services/sui-metrics/overlays/atlas
prune: true
dependsOn:
- name: monitoring
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
wait: true
timeout: 5m
targetNamespace: sui-metrics

View File

@ -1,4 +1,4 @@
# infrastructure/flux-system/kustomization-vault.yaml
# clusters/atlas/flux-system/applications/vault/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:

View File

@ -1,4 +1,4 @@
# infrastructure/flux-system/kustomization-core.yaml
# clusters/atlas/flux-system/applications/xmr-miner/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:

View File

@ -1,4 +1,4 @@
# infrastructure/flux-system/kustomization-zot.yaml
# clusters/atlas/flux-system/applications/zot/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:

View File

@ -1,6 +1,6 @@
---
# This manifest was generated by flux. DO NOT EDIT.
# Flux Version: v2.5.1
# Flux Version: v2.5.1
# Components: source-controller,kustomize-controller,helm-controller,notification-controller
apiVersion: v1
kind: Namespace

View File

@ -8,7 +8,7 @@ metadata:
spec:
interval: 1m0s
ref:
branch: main
branch: feature/sso
secretRef:
name: flux-system-gitea
url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
@ -20,7 +20,7 @@ metadata:
namespace: flux-system
spec:
interval: 10m0s
path: ./
path: ./clusters/atlas/flux-system
prune: true
sourceRef:
kind: GitRepository

View File

@ -0,0 +1,8 @@
# clusters/atlas/flux-system/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- gotk-components.yaml
- gotk-sync.yaml
- platform
- applications

View File

@ -0,0 +1,15 @@
# clusters/atlas/flux-system/platform/core/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: core
namespace: flux-system
spec:
interval: 10m
path: ./infrastructure/core
prune: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
wait: false

View File

@ -1,4 +1,4 @@
# infrastructure/flux-system/kustomization-helm.yaml
# clusters/atlas/flux-system/platform/helm/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:

View File

@ -0,0 +1,9 @@
# clusters/atlas/flux-system/platform/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- core/kustomization.yaml
- helm/kustomization.yaml
- traefik/kustomization.yaml
- monitoring/kustomization.yaml
- longhorn-ui/kustomization.yaml

View File

@ -1,3 +1,4 @@
# clusters/atlas/flux-system/platform/longhorn-ui/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:

View File

@ -0,0 +1,14 @@
# clusters/atlas/flux-system/platform/monitoring/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: monitoring
namespace: flux-system
spec:
interval: 10m
path: ./services/monitoring
prune: true
sourceRef:
kind: GitRepository
name: flux-system
wait: false

View File

@ -0,0 +1,18 @@
# clusters/atlas/flux-system/platform/traefik/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: traefik
namespace: flux-system
spec:
interval: 10m
path: ./infrastructure/traefik
targetNamespace: traefik
prune: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
dependsOn:
- name: core
wait: true

View File

@ -0,0 +1,7 @@
# clusters/atlas/platform/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../../../infrastructure/modules/base
- ../../../infrastructure/modules/profiles/atlas-ha
- ../../../infrastructure/sources/cert-manager/letsencrypt.yaml

View File

@ -0,0 +1,5 @@
# Oceanus Cluster Scaffold
This directory prepares the Flux and Kustomize layout for a future Oceanus-managed cluster.
Populate `flux-system/` with `gotk-components.yaml` and related manifests after running `flux bootstrap`.
Define node-specific resources under `infrastructure/modules/profiles/oceanus-validator/` and reference workloads in `applications/` as they come online.

View File

@ -0,0 +1,4 @@
# clusters/oceanus/applications/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources: []

View File

@ -0,0 +1,9 @@
# clusters/oceanus/flux-system/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
# Populate when oceanus cluster is bootstrapped with Flux.
# - gotk-components.yaml
# - gotk-sync.yaml
- ../platform
- ../applications

View File

@ -0,0 +1,6 @@
# clusters/oceanus/platform/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../../../infrastructure/modules/base
- ../../../infrastructure/modules/profiles/oceanus-validator

16
docs/topology.md Normal file
View File

@ -0,0 +1,16 @@
# Titan Homelab Topology
| Hostname | Role / Function | Managed By | Notes |
|------------|--------------------------------|---------------------|-------|
| titan-0a | Kubernetes control-plane | Flux (atlas cluster)| HA leader, tainted for control only |
| titan-0b | Kubernetes control-plane | Flux (atlas cluster)| Standby control node |
| titan-0c | Kubernetes control-plane | Flux (atlas cluster)| Standby control node |
| titan-04-19| Raspberry Pi workers | Flux (atlas cluster)| Workload nodes, labelled per hardware |
| titan-22 | GPU mini-PC (Jellyfin) | Flux + Ansible | NVIDIA runtime managed via `infrastructure/modules/profiles/atlas-ha` |
| titan-24 | Tethys hybrid node | Flux + Ansible | Runs SUI metrics via K8s, validator via Ansible |
| titan-db | HA control plane database | Ansible | PostgreSQL / etcd backing services |
| titan-jh | Jumphost & bastion | Ansible | Entry point / future KVM services |
| oceanus | Dedicated SUI validator host | Ansible / Flux prep | Baremetal validator workloads, exposes metrics to atlas; Kustomize scaffold under `clusters/oceanus/` |
| styx | Air-gapped workstation | Manual / Scripts | Remains isolated, scripts tracked in `hosts/styx` |
Use the `clusters/` directory for cluster-scoped state and the `hosts/` directory for baremetal orchestration.

View File

@ -0,0 +1,2 @@
# hosts/group_vars/all.yaml
validator_version: latest

View File

@ -0,0 +1,2 @@
# hosts/host_vars/titan-24.yaml
validator_compose_path: /opt/sui-validator

28
hosts/inventory/lab.yaml Normal file
View File

@ -0,0 +1,28 @@
# hosts/inventory/lab.yaml
# Replace ansible_host and ansible_user values with real connectivity details.
all:
children:
atlas:
hosts:
titan-24:
ansible_host: REPLACE_ME
ansible_user: ubuntu
roleset: tethys_hybrid
titan-22:
ansible_host: REPLACE_ME
ansible_user: debian
roleset: minipc_gpu
baremetal:
hosts:
titan-db:
ansible_host: REPLACE_ME
ansible_user: postgres
roleset: database
titan-jh:
ansible_host: REPLACE_ME
ansible_user: jump
roleset: jumphost
oceanus:
ansible_host: REPLACE_ME
ansible_user: validator
roleset: validator

29
hosts/playbooks/site.yaml Normal file
View File

@ -0,0 +1,29 @@
# hosts/playbooks/site.yaml
---
- name: Configure titan-db
hosts: titan-db
gather_facts: true
roles:
- common
- titan_db
- name: Configure titan-jh
hosts: titan-jh
gather_facts: true
roles:
- common
- titan_jh
- name: Configure oceanus validator host
hosts: oceanus
gather_facts: true
roles:
- common
- oceanus_base
- name: Prepare hybrid tethys node
hosts: titan-24
gather_facts: true
roles:
- common
- tethys_canary

View File

@ -0,0 +1,9 @@
# hosts/roles/common/tasks/main.yaml
---
- name: Ensure base packages present
ansible.builtin.package:
name:
- curl
- vim
state: present
tags: ['common', 'packages']

View File

@ -0,0 +1,6 @@
# hosts/roles/oceanus_base/tasks/main.yaml
---
- name: Placeholder for oceanus base configuration
ansible.builtin.debug:
msg: "Install validator prerequisites and monitoring exporters here."
tags: ['oceanus']

View File

@ -0,0 +1,6 @@
# hosts/roles/tethys_canary/tasks/main.yaml
---
- name: Placeholder for SUI validator container runtime setup
ansible.builtin.debug:
msg: "Configure container runtime and validator compose stack here."
tags: ['tethys', 'validator']

View File

@ -0,0 +1,6 @@
# hosts/roles/titan_db/tasks/main.yaml
---
- name: Placeholder for titan-db provisioning
ansible.builtin.debug:
msg: "Install database packages, configure backups, and manage users here."
tags: ['titan_db']

View File

@ -0,0 +1,6 @@
# hosts/roles/titan_jh/tasks/main.yaml
---
- name: Placeholder for jumphost hardening
ansible.builtin.debug:
msg: "Harden SSH, manage bastion tooling, and configure audit logging here."
tags: ['jumphost']

2
hosts/styx/README.md Normal file
View File

@ -0,0 +1,2 @@
# hosts/styx/README.md
Styx is air-gapped; provisioning scripts live under `scripts/`.

View File

@ -1,5 +0,0 @@
# infrastructure/core/gpu/daemonsets/profiles/jetson-only/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../../device-plugin-jetson

View File

@ -1,6 +0,0 @@
# infrastructure/core/gpu/daemonsets/profiles/minipc-and-jetson/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../../device-plugin-minipc
- ../../device-plugin-jetson

View File

@ -1,5 +0,0 @@
# infrastructure/core/gpu/daemonsets/profiles/minipc-only/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../../device-plugin-minipc

View File

@ -2,7 +2,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- base
# - gpu/profiles/jetson-only
# - gpu/profiles/minipc-and-jetson
- gpu/profiles/minipc-only
- ../modules/base
- ../modules/profiles/atlas-ha
- ../sources/cert-manager/letsencrypt.yaml

View File

@ -1,22 +0,0 @@
# infrastructure/flux-system/kustomization-core.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: core
namespace: flux-system
spec:
interval: 10m
path: ./infrastructure/core
prune: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
wait: true
# Only wait for the NVIDIA device-plugin DaemonSet on titan-22
healthChecks:
- apiVersion: apps/v1
kind: DaemonSet
name: nvidia-device-plugin-minipc
namespace: kube-system

View File

@ -2,15 +2,4 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- gotk-components.yaml
- gotk-sync.yaml
- kustomization-zot.yaml
- kustomization-core.yaml
- kustomization-helm.yaml
- kustomization-gitea.yaml
- kustomization-vault.yaml
- kustomization-crypto.yaml
- kustomization-monerod.yaml
- kustomization-jellyfin.yaml
- kustomization-xmr-miner.yaml
- kustomization-longhorn-ui.yaml
- ../clusters/atlas/flux-system

View File

@ -7,7 +7,7 @@ metadata:
annotations:
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.tls: "true"
traefik.ingress.kubernetes.io/router.middlewares: longhorn-system-longhorn-basicauth@kubernetescrd,longhorn-system-longhorn-headers@kubernetescrd
traefik.ingress.kubernetes.io/router.middlewares: ""
spec:
ingressClassName: traefik
tls:
@ -21,6 +21,6 @@ spec:
pathType: Prefix
backend:
service:
name: longhorn-frontend
name: oauth2-proxy-longhorn
port:
number: 80

View File

@ -4,3 +4,4 @@ kind: Kustomization
resources:
- middleware.yaml
- ingress.yaml
- oauth2-proxy-longhorn.yaml

View File

@ -20,3 +20,20 @@ spec:
headers:
customRequestHeaders:
X-Forwarded-Proto: "https"
---
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
name: longhorn-forward-auth
namespace: longhorn-system
spec:
forwardAuth:
address: https://auth.bstein.dev/oauth2/auth
trustForwardHeader: true
authResponseHeaders:
- Authorization
- X-Auth-Request-Email
- X-Auth-Request-User
- X-Auth-Request-Groups

View File

@ -0,0 +1,102 @@
# infrastructure/longhorn/ui-ingress/oauth2-proxy-longhorn.yaml
apiVersion: v1
kind: Service
metadata:
name: oauth2-proxy-longhorn
namespace: longhorn-system
labels:
app: oauth2-proxy-longhorn
spec:
ports:
- name: http
port: 80
targetPort: 4180
selector:
app: oauth2-proxy-longhorn
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: oauth2-proxy-longhorn
namespace: longhorn-system
labels:
app: oauth2-proxy-longhorn
spec:
replicas: 2
selector:
matchLabels:
app: oauth2-proxy-longhorn
template:
metadata:
labels:
app: oauth2-proxy-longhorn
spec:
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5","rpi4"]
containers:
- name: oauth2-proxy
image: quay.io/oauth2-proxy/oauth2-proxy:v7.6.0
imagePullPolicy: IfNotPresent
args:
- --provider=oidc
- --redirect-url=https://longhorn.bstein.dev/oauth2/callback
- --oidc-issuer-url=https://sso.bstein.dev/realms/atlas
- --scope=openid profile email groups
- --email-domain=*
- --allowed-group=admin
- --set-xauthrequest=true
- --pass-access-token=true
- --set-authorization-header=true
- --cookie-secure=true
- --cookie-samesite=lax
- --cookie-refresh=20m
- --cookie-expire=168h
- --insecure-oidc-allow-unverified-email=true
- --upstream=http://longhorn-frontend.longhorn-system.svc.cluster.local
- --http-address=0.0.0.0:4180
- --skip-provider-button=true
- --skip-jwt-bearer-tokens=true
- --oidc-groups-claim=groups
- --cookie-domain=longhorn.bstein.dev
env:
- name: OAUTH2_PROXY_CLIENT_ID
valueFrom:
secretKeyRef:
name: oauth2-proxy-longhorn-oidc
key: client_id
- name: OAUTH2_PROXY_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: oauth2-proxy-longhorn-oidc
key: client_secret
- name: OAUTH2_PROXY_COOKIE_SECRET
valueFrom:
secretKeyRef:
name: oauth2-proxy-longhorn-oidc
key: cookie_secret
ports:
- containerPort: 4180
name: http
readinessProbe:
httpGet:
path: /ping
port: 4180
initialDelaySeconds: 5
periodSeconds: 10
livenessProbe:
httpGet:
path: /ping
port: 4180
initialDelaySeconds: 20
periodSeconds: 20

View File

@ -1,4 +1,4 @@
# infrastructure/core/base/kustomization.yaml
# infrastructure/modules/base/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:

View File

@ -1,4 +1,4 @@
# infrastructure/core/base/priorityclass/kustomization.yaml
# infrastructure/modules/base/priorityclass/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:

View File

@ -1,4 +1,4 @@
# infrastructure/core/base/priorityclass/scavenger.yaml
# infrastructure/modules/base/priorityclass/scavenger.yaml
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:

View File

@ -1,4 +1,4 @@
# infrastructure/core/base/storageclass/kustomization.yaml
# infrastructure/modules/base/runtimeclass/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:

View File

@ -1,4 +1,4 @@
# services/jellyfin/runtimeclass.yaml
# infrastructure/modules/base/runtimeclass/runtimeclass.yaml
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:

View File

@ -1,4 +1,4 @@
# infrastructure/core/base/storageclass/asteria.yaml
# infrastructure/modules/base/storageclass/asteria.yaml
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
@ -8,6 +8,9 @@ parameters:
fromBackup: ""
numberOfReplicas: "2"
staleReplicaTimeout: "30"
fsType: "ext4"
replicaAutoBalance: "least-effort"
dataLocality: "disabled"
provisioner: driver.longhorn.io
reclaimPolicy: Retain
allowVolumeExpansion: true

View File

@ -1,4 +1,4 @@
# infrastructure/core/base/storageclass/astreae.yaml
# infrastructure/modules/base/storageclass/astreae.yaml
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:

View File

@ -1,4 +1,4 @@
# infrastructure/core/base/storageclass/kustomization.yaml
# infrastructure/modules/base/storageclass/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:

View File

@ -0,0 +1,7 @@
# infrastructure/modules/profiles/atlas-ha/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../components/device-plugin-jetson
- ../components/device-plugin-minipc
- ../components/device-plugin-tethys

View File

@ -1,4 +1,4 @@
# infrastructure/core/gpu/daemonsets/device-plugin-jetson/daemonset.yaml
# infrastructure/modules/profiles/components/device-plugin-jetson/daemonset.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:

View File

@ -1,4 +1,4 @@
# infrastructure/core/gpu/daemonsets/device-plugin-jetson/kustomization.yaml
# infrastructure/modules/profiles/components/device-plugin-jetson/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:

View File

@ -1,4 +1,4 @@
# infrastructure/core/gpu/daemonsets/device-plugin-minipc/daemonset.yaml
# infrastructure/modules/profiles/components/device-plugin-minipc/daemonset.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
@ -24,7 +24,6 @@ spec:
tolerations:
- operator: Exists
priorityClassName: system-node-critical
runtimeClassName: nvidia
containers:
- name: nvidia-device-plugin-ctr
image: nvcr.io/nvidia/k8s-device-plugin:v0.16.2

View File

@ -1,4 +1,4 @@
# infrastructure/core/gpu/daemonsets/device-plugin-minipc/kustomization.yaml
# infrastructure/modules/profiles/components/device-plugin-minipc/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:

View File

@ -0,0 +1,49 @@
# infrastructure/modules/profiles/components/device-plugin-tethys/daemonset.yaml
# NVIDIA k8s-device-plugin DaemonSet pinned to a single node (hostname
# titan-24, amd64) via the nodeSelector below.
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: nvidia-device-plugin-tethys
namespace: kube-system
labels:
app.kubernetes.io/name: nvidia-device-plugin
app.kubernetes.io/instance: titan24
spec:
selector:
matchLabels:
app.kubernetes.io/name: nvidia-device-plugin
app.kubernetes.io/instance: titan24
template:
metadata:
labels:
app.kubernetes.io/name: nvidia-device-plugin
app.kubernetes.io/instance: titan24
spec:
nodeSelector:
kubernetes.io/hostname: titan-24
kubernetes.io/arch: amd64
# A bare `operator: Exists` toleration matches every taint.
tolerations:
- operator: Exists
priorityClassName: system-node-critical
# Requires a RuntimeClass named `nvidia` to exist in the cluster.
runtimeClassName: nvidia
containers:
- name: nvidia-device-plugin-ctr
image: nvcr.io/nvidia/k8s-device-plugin:v0.16.2
imagePullPolicy: IfNotPresent
args:
- "--fail-on-init-error=false"
- "--device-list-strategy=envvar"
- "--mig-strategy=none"
# NOTE(review): container runs privileged — confirm this is required for this node.
securityContext:
privileged: true
env:
- name: NVIDIA_VISIBLE_DEVICES
value: "all"
- name: NVIDIA_DRIVER_CAPABILITIES
value: "compute,video,utility"
# The kubelet device-plugin directory is shared with the host through the
# hostPath volume declared at the bottom.
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins

View File

@ -0,0 +1,5 @@
# infrastructure/modules/profiles/components/device-plugin-tethys/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- daemonset.yaml

View File

@ -0,0 +1,4 @@
# infrastructure/modules/profiles/oceanus-validator/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources: []

View File

@ -0,0 +1,5 @@
# infrastructure/modules/profiles/tethys-hybrid/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../components/device-plugin-tethys

View File

@ -4,7 +4,7 @@ metadata:
name: letsencrypt
spec:
acme:
email: you@bstein.dev
email: brad.stein@gmail.com
server: https://acme-v02.api.letsencrypt.org/directory
privateKeySecretRef:
name: letsencrypt-account-key

View File

@ -1,3 +1,4 @@
# infrastructure/sources/helm/grafana.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:

View File

@ -1,3 +1,4 @@
# infrastructure/sources/helm/hashicorp.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:

View File

@ -1,3 +1,4 @@
# infrastructure/sources/helm/jetstack.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:

View File

@ -1,3 +1,4 @@
# infrastructure/sources/helm/prometheus.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:

View File

@ -0,0 +1,9 @@
# infrastructure/sources/helm/victoria-metrics.yaml
# Flux HelmRepository source for the VictoriaMetrics chart index; the
# repository is re-fetched on the interval given below.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: victoria-metrics
namespace: flux-system
spec:
interval: 1h
url: https://victoriametrics.github.io/helm-charts/

View File

@ -35,6 +35,18 @@ items:
- --entrypoints.web.address=:80
- --entrypoints.websecure.address=:443
- --api.dashboard=true
- --metrics.prometheus=true
- --metrics.prometheus.addEntryPointsLabels=true
- --metrics.prometheus.addRoutersLabels=true
- --metrics.prometheus.addServicesLabels=true
- --entrypoints.web.transport.respondingTimeouts.readTimeout=0s
- --entrypoints.web.transport.respondingTimeouts.writeTimeout=0s
- --entrypoints.web.transport.respondingTimeouts.idleTimeout=0s
- --entrypoints.websecure.transport.respondingTimeouts.readTimeout=0s
- --entrypoints.websecure.transport.respondingTimeouts.writeTimeout=0s
- --entrypoints.websecure.transport.respondingTimeouts.idleTimeout=0s
- --entrypoints.metrics.address=:9100
- --metrics.prometheus.entryPoint=metrics
image: traefik:v3.3.3
imagePullPolicy: IfNotPresent
name: traefik
@ -48,6 +60,9 @@ items:
- containerPort: 8080
name: admin
protocol: TCP
- containerPort: 9100
name: metrics
protocol: TCP
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
dnsPolicy: ClusterFirst

View File

@ -9,3 +9,4 @@ resources:
- serviceaccount.yaml
- clusterrole.yaml
- clusterrolebinding.yaml
- service.yaml

View File

@ -0,0 +1,20 @@
# infrastructure/traefik/service.yaml
# ClusterIP Service exposing Traefik's metrics port (9100) to in-cluster scrapers.
apiVersion: v1
kind: Service
metadata:
name: traefik-metrics
namespace: traefik
labels:
app: traefik
# prometheus.io/* annotations advertise the scrape port and path.
# NOTE(review): these only take effect if the scraper is configured for
# annotation-based discovery — confirm.
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9100"
prometheus.io/path: "/metrics"
spec:
type: ClusterIP
selector:
app: traefik
ports:
- name: metrics
port: 9100
# Refers to a container port by name; the selected pods must declare a
# port called `metrics`.
targetPort: metrics

File diff suppressed because it is too large Load Diff

2
scripts/longhorn_volume_usage.fish Normal file → Executable file
View File

@ -1,3 +1,5 @@
#!/usr/bin/env fish
function pvc-usage --description "Show Longhorn PVC usage (human-readable) mapped to namespace/name"
begin
kubectl -n longhorn-system get volumes.longhorn.io -o json \

View File

@ -0,0 +1,218 @@
#!/usr/bin/env bash
set -euo pipefail
# Step 0: make sure the dedicated kiosk account exists (created with a home
# directory and bash as login shell when missing).
if id -u styx >/dev/null 2>&1; then
  :
else
  sudo useradd -m -s /bin/bash styx
  echo "Created user 'styx'"
fi

# Step 1: create the application tree (including keys/) and hand it to styx.
sudo mkdir -p /opt/styx-kiosk/keys
sudo chown -R styx:styx /opt/styx-kiosk
# 2) Drop the kiosk app (written below) into place
sudo tee /opt/styx-kiosk/kiosk.py >/dev/null <<'PY'
#!/usr/bin/env python3
import base64, json, os, subprocess, threading, tempfile
from datetime import datetime
import tkinter as tk
from tkinter import ttk, messagebox
APP_TITLE = "STYX Airgap Signer"
CAMERA_DEV = os.environ.get("ZBAR_DEV", "/dev/video0")
KEY_PATH = os.environ.get("STYX_KEY", "/vault/keys/signer_ed25519.pem") # in the LUKS vault
ALGO = os.environ.get("STYX_ALGO", "ed25519") # or 'secp256r1'
QR_TMP = "/tmp/styx_signed.png"
def zbar_scan_oneshot():
    """Scan one code from the camera using the ``zbarcam`` CLI.

    Returns the decoded text with surrounding whitespace stripped, or ``None``
    when the output is empty, the 30 second timeout expires, or ``zbarcam``
    fails for any other reason.
    """
    # --raw: print the payload only; --nodisplay: no preview window;
    # --oneshot: exit after the first decoded code.
    argv = ["zbarcam", "--raw", "--nodisplay", "--oneshot", CAMERA_DEV]
    try:
        decoded = subprocess.check_output(argv, text=True, timeout=30).strip()
    except Exception:
        return None
    return decoded or None
def openssl_pub_der_b64(key_path):
    """Return the public key for ``key_path`` as base64 text.

    Runs ``openssl pkey -pubout -outform DER`` and base64-encodes its binary
    output. Any failure to run openssl propagates as the exception raised by
    ``subprocess.check_output``.
    """
    argv = ["openssl", "pkey", "-in", key_path, "-pubout", "-outform", "DER"]
    public_der = subprocess.check_output(argv)
    encoded = base64.b64encode(public_der)
    return encoded.decode()
def sign_bytes(msg: bytes, key_path: str, algo: str) -> bytes:
    """Sign ``msg`` with the private key at ``key_path`` using the openssl CLI.

    Args:
        msg: Raw bytes to sign.
        key_path: Path to the PEM private key passed to openssl.
        algo: ``"ed25519"`` or one of ``"secp256r1"``/``"prime256v1"``/``"p256"``
            (case-insensitive).

    Returns:
        The signature bytes exactly as printed by openssl.

    Raises:
        RuntimeError: ``algo`` is not supported. This is checked before
            anything is written to disk.
        subprocess.CalledProcessError: openssl exited non-zero.
    """
    algo_name = algo.lower()
    if algo_name == "ed25519":
        # pkeyutl -rawin signs the raw message read from the -in file.
        argv = ["openssl", "pkeyutl", "-sign", "-inkey", key_path, "-rawin", "-in"]
    elif algo_name in ("secp256r1", "prime256v1", "p256"):
        # dgst -sha256 -sign hashes the file with SHA-256 and signs the digest.
        argv = ["openssl", "dgst", "-sha256", "-sign", key_path]
    else:
        raise RuntimeError(f"Unsupported algo: {algo}")

    # The message is handed to openssl through a temp file, which is always
    # removed afterwards, including when the write itself fails.
    tmp = tempfile.NamedTemporaryFile(delete=False)
    msg_path = tmp.name
    try:
        with tmp:
            tmp.write(msg)
        return subprocess.check_output(argv + [msg_path])
    finally:
        try:
            os.unlink(msg_path)
        except OSError:
            # Best-effort cleanup; only filesystem errors are ignored.
            pass
def make_signed_envelope(scanned_text: str, key_path: str, algo: str) -> dict:
    """Build the signed response envelope for a scanned QR payload.

    The bytes to sign are chosen as follows:
      * JSON object with ``tx_bytes``: the base64-decoded value;
      * JSON object with ``message``: that string, encoded;
      * any other JSON object: its canonical form (sorted keys, compact
        separators);
      * anything else (not JSON, JSON that is not an object, or an object
        whose field could not be decoded): the scanned text itself.

    Args:
        scanned_text: Text decoded from the QR code.
        key_path: Private key path forwarded to ``sign_bytes``.
        algo: Signature algorithm name forwarded to ``sign_bytes``.

    Returns:
        A dict with the signature, public key, payload digest, the original
        scanned text, the optional ``request_id``, host name and a UTC timestamp.
    """
    # Local import: only `datetime` itself is known to be imported at file level.
    from datetime import timezone

    # Default: sign the scanned text verbatim.
    msg = scanned_text.encode()
    request_id = None

    try:
        obj = json.loads(scanned_text)
    except Exception:
        obj = None

    # Only JSON objects carry the known fields. Scalars and arrays previously
    # reached the raw-text fallback through an incidental TypeError or
    # AttributeError; handle them explicitly instead.
    if isinstance(obj, dict):
        try:
            if "tx_bytes" in obj:
                candidate = base64.b64decode(obj["tx_bytes"])
            elif "message" in obj:
                candidate = obj["message"].encode()
            else:
                candidate = json.dumps(obj, sort_keys=True, separators=(",", ":")).encode()
            msg, request_id = candidate, obj.get("request_id")
        except Exception:
            # NOTE(review): a malformed tx_bytes/message silently falls back to
            # signing the raw scanned text (unchanged from before) — confirm
            # this is the desired policy for a signer.
            pass

    sig = sign_bytes(msg, key_path, algo)
    digest = subprocess.check_output(["openssl", "dgst", "-sha256", "-binary"], input=msg)
    env = {
        "algo": algo.lower(),
        "signature_b64": base64.b64encode(sig).decode(),
        "pubkey_spki_der_b64": openssl_pub_der_b64(key_path),
        "payload_sha256_b64": base64.b64encode(digest).decode(),
        "quote_raw": scanned_text,
        "request_id": request_id,
        "device": os.uname().nodename,
        # Same "YYYY-MM-DDTHH:MM:SSZ" text as before, without the deprecated utcnow().
        "ts_utc": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    }
    return env
def qrencode_to_file(text: str, path: str):
    """Render ``text`` as a PNG QR code at ``path`` with the ``qrencode`` CLI.

    The text is fed on stdin; a non-zero exit raises
    ``subprocess.CalledProcessError`` (``check=True``).
    """
    argv = ["qrencode", "-l", "M", "-s", "16", "-t", "PNG", "-o", path]
    payload = text.encode()
    subprocess.run(argv, input=payload, check=True)
class App(tk.Tk):
    """Fullscreen Tk kiosk: scan a QR code, sign it, display the signed envelope as a QR."""

    def __init__(self):
        """Build the widgets and warn in the status label if the signing key is absent."""
        super().__init__()
        self.title(APP_TITLE)
        self.attributes("-fullscreen", True)
        self.configure(background="black")
        self.bind("<Escape>", lambda e: self.quit())  # for maintenance only
        s = ttk.Style(self)
        s.configure("Big.TButton", font=("DejaVu Sans", 48), padding=24)
        s.configure("Big.TLabel", font=("DejaVu Sans", 32), foreground="white", background="black")
        self.container = tk.Frame(self, bg="black")
        self.container.pack(expand=True, fill="both")
        self.status = ttk.Label(self.container, text="Ready", style="Big.TLabel")
        self.status.pack(pady=20)
        self.scan_btn = ttk.Button(self.container, text="SCAN", style="Big.TButton", command=self.start_scan)
        self.scan_btn.pack(pady=20)
        self.image_label = tk.Label(self.container, bg="black")
        self.image_label.pack(pady=10)
        # The "NEW SCAN" button stays unpacked until a QR has been shown.
        self.new_btn = ttk.Button(self.container, text="NEW SCAN", style="Big.TButton", command=self.reset)
        self.new_btn.pack_forget()
        self.note = ttk.Label(self.container, text="", style="Big.TLabel")
        self.note.pack(pady=10)
        if not os.path.exists(KEY_PATH):
            self.status.config(text=f"Key not found at {KEY_PATH}\nInsert/unlock vault to proceed.")

    def reset(self):
        """Clear the displayed QR and return to the initial "Ready" state."""
        self.image_label.configure(image="")
        self.image_label.image = None
        self.new_btn.pack_forget()
        self.note.config(text="")
        self.status.config(text="Ready")
        self.scan_btn.config(state="normal")

    def start_scan(self):
        """Start a scan-and-sign cycle on a worker thread (requires the key file to exist)."""
        if not os.path.exists(KEY_PATH):
            messagebox.showerror("Key missing", f"Signing key not found at:\n{KEY_PATH}\nUnlock your vault.")
            return
        self.status.config(text="Scanning…")
        self.scan_btn.config(state="disabled")
        threading.Thread(target=self._do_scan_and_sign, daemon=True).start()

    def _do_scan_and_sign(self):
        """Worker-thread body: scan, sign, render the QR, then schedule the UI update."""
        scanned = zbar_scan_oneshot()
        if not scanned:
            self.after(0, self._scan_failed)
            return
        try:
            envelope = make_signed_envelope(scanned, KEY_PATH, ALGO)
            payload = json.dumps(envelope, separators=(",", ":"))
            qrencode_to_file(payload, QR_TMP)
        except Exception as exc:
            # Pass the message as a value: the exception name is unbound once
            # this except block ends, so a lambda referring to it would fail
            # when Tk runs it later.
            self.after(0, self._error, str(exc))
            return
        self.after(0, self._show_qr, envelope)

    def _scan_failed(self):
        """Report that nothing was decoded and re-enable the SCAN button."""
        self.status.config(text="No QR detected. Try again.")
        self.scan_btn.config(state="normal")

    def _error(self, message):
        """Show a signing/rendering failure and re-enable the SCAN button.

        This handler was referenced by ``_do_scan_and_sign`` but was missing,
        so any failure left the UI stuck on "Scanning…" with SCAN disabled.
        """
        self.status.config(text=f"Error: {message}")
        self.scan_btn.config(state="normal")

    def _show_qr(self, envelope):
        """Display the PNG written by qrencode plus a short summary of the envelope."""
        try:
            img = tk.PhotoImage(file=QR_TMP)
            self.image_label.configure(image=img)
            # Keep a reference on the widget so the image is not garbage-collected.
            self.image_label.image = img
        except Exception as e:
            self.status.config(text=f"QR render failed: {e}")
            self.scan_btn.config(state="normal")
            return
        self.status.config(text="Signed. Show this QR to your online box.")
        self.note.config(text=f"Algo: {envelope['algo']} Host: {envelope['device']}")
        self.new_btn.pack(pady=20)
if __name__ == "__main__":
App().mainloop()
PY
# Make the kiosk app executable and owned by the kiosk user.
sudo chmod +x /opt/styx-kiosk/kiosk.py
sudo chown -R styx:styx /opt/styx-kiosk

# 3) Minimal X session: openbox + kiosk; no mouse pointer
sudo -u styx tee /home/styx/.xinitrc >/dev/null <<'XRC'
xset -dpms
xset s off
xset s noblank
# If 'unclutter' is installed, uncomment the next line to hide cursor:
# unclutter -idle 0 -root &
openbox-session &
/opt/styx-kiosk/kiosk.py
XRC
sudo chown styx:styx /home/styx/.xinitrc
sudo chmod 0755 /home/styx/.xinitrc

# 4) Autologin the 'styx' user on tty1, auto-start X
sudo mkdir -p /etc/systemd/system/getty@tty1.service.d
sudo tee /etc/systemd/system/getty@tty1.service.d/override.conf >/dev/null <<'OVR'
[Service]
ExecStart=
ExecStart=-/sbin/agetty --autologin styx --noclear %I $TERM
Type=idle
OVR

# Append the startx stanza only once, so re-running this installer does not
# stack duplicate copies in ~/.bash_profile.
if ! sudo grep -qF -e 'exec startx -- -nocursor' /home/styx/.bash_profile 2>/dev/null; then
sudo -u styx tee -a /home/styx/.bash_profile >/dev/null <<'BRC'
# Start X on the first tty automatically, headless
if [ -z "$DISPLAY" ] && [ "$(tty)" = "/dev/tty1" ]; then
exec startx -- -nocursor
fi
BRC
fi

sudo systemctl daemon-reload
sudo systemctl enable getty@tty1.service
echo "Done. Reboot to try the kiosk."

195
scripts/styx_prep.sh Executable file
View File

@ -0,0 +1,195 @@
#!/usr/bin/env bash
set -euo pipefail
# === CONFIG ===
STYX_USER="styx"
STYX_PASS="TempPass#123" # change at first login
STYX_HOSTNAME="styx"
SSH_PUBKEY="" # e.g., 'ssh-ed25519 AAAA... your@host' (optional)
# === helpers ===
require_root() {
  # Already root: nothing to do. Otherwise replace this process with a sudo
  # re-run of the script, keeping the environment (-E) and forwarding the
  # arguments this function was given.
  [[ $EUID -eq 0 ]] || exec sudo -E "$0" "$@"
}
ensure_binfmt_arm64() {
  # The qemu-aarch64 binfmt handler is already registered: nothing to do.
  [[ -e /proc/sys/fs/binfmt_misc/qemu-aarch64 ]] && return 0

  # Registration is done through a privileged Docker container.
  if ! command -v docker >/dev/null; then
    echo "Docker required to register binfmt (sudo pacman -S docker)"
    exit 1
  fi
  sudo systemctl enable --now docker >/dev/null 2>&1 || true
  sudo docker run --rm --privileged tonistiigi/binfmt --install arm64
}
find_parts() {
  # Locate the boot and root partitions by filesystem label; the first match
  # of each label wins. Results are left in the globals BOOT and ROOT.
  BOOT=$(lsblk -o LABEL,PATH -nr | awk '$1=="system-boot"{print $2}' | head -n1)
  ROOT=$(lsblk -o LABEL,PATH -nr | awk '$1=="writable"{print $2}' | head -n1)

  if [[ -n "${BOOT:-}" && -n "${ROOT:-}" ]]; then
    return 0
  fi

  # Either label is missing: show what is attached and give up.
  echo "Could not find 'system-boot'/'writable' on any device."
  lsblk -o NAME,SIZE,FSTYPE,LABEL,PATH -nr
  exit 1
}
mount_parts() {
  local sub
  local resolv=/mnt/pi-root/etc/resolv.conf

  mkdir -p /mnt/pi-boot /mnt/pi-root
  mount "$ROOT" /mnt/pi-root
  mount "$BOOT" /mnt/pi-boot

  # Bind only what the chroot needs (/run is deliberately left out).
  for sub in dev dev/pts proc sys; do
    mount --bind "/$sub" "/mnt/pi-root/$sub"
  done

  # A symlinked or missing resolv.conf is replaced by a copy of the host's.
  if [[ -L "$resolv" || ! -e "$resolv" ]]; then
    rm -f "$resolv"
    cat /etc/resolv.conf > "$resolv"
  fi
}
# prep_chroot: provision the mounted rootfs at /mnt/pi-root from inside a chroot.
# Writes a policy-rc.d that returns 101, builds the provisioning script as a
# literal template (quoted heredoc), substitutes the STYX_*/SSH_PUBKEY
# placeholders with the config values, and runs the result with bash in the chroot.
prep_chroot() {
# Block service starts inside chroot (no systemd there)
cat >/mnt/pi-root/usr/sbin/policy-rc.d <<'EOF'
#!/bin/sh
exit 101
EOF
chmod +x /mnt/pi-root/usr/sbin/policy-rc.d
# All the work happens inside the ARM64 rootfs
# The heredoc is quoted ('EOS'), so nothing in it is expanded here; the words
# STYX_USER, STYX_PASS, STYX_HOSTNAME and SSH_PUBKEY are placeholders that are
# replaced textually after the template is captured.
CHCMD=$(cat <<'EOS'
set -euo pipefail
export DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC
# Ensure sbin is in PATH so user/group tools work
export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
apt-get update
apt-get -y full-upgrade
# Remove snaps and keep them gone (Ubuntu for Pi ships with snaps)
apt-get -y purge snapd || true
rm -rf /snap /var/snap /var/lib/snapd /home/*/snap || true
mkdir -p /etc/apt/preferences.d
printf 'Package: snapd\nPin: release *\nPin-Priority: -10\n' > /etc/apt/preferences.d/nosnap.pref
# Ensure user/group tools exist
apt-get install -y passwd adduser || true
getent group i2c >/dev/null || /usr/sbin/groupadd i2c
# Base packages
BASE_PKGS="openssh-server git i2c-tools python3-smbus python3-pil zbar-tools qrencode lm-sensors"
apt-get install -y $BASE_PKGS
# ------- OLED (Luma) -------
# Prefer distro package; fall back to pip if not present in this release
if ! dpkg -s python3-luma.oled >/dev/null 2>&1; then
apt-get update
if ! apt-get install -y python3-luma.oled; then
apt-get install -y python3-pip
pip3 install --no-input --break-system-packages luma.oled
fi
fi
# ------- Camera apps -------
# Ubuntu renamed libcamera-apps -> rpicam-apps for Raspberry Pi.
# Try in order; tolerate absence (the box might be display-only).
apt-get update
if ! apt-get install -y rpicam-apps; then
apt-get install -y libcamera-apps || apt-get install -y libcamera-tools || true
fi
# Enable SSH on boot (no systemctl in chroot)
mkdir -p /etc/systemd/system/multi-user.target.wants
ln -sf /lib/systemd/system/ssh.service /etc/systemd/system/multi-user.target.wants/ssh.service
# Create user and set password
if ! id -u STYX_USER >/dev/null 2>&1; then
/usr/sbin/useradd -m -s /bin/bash -G sudo,video,i2c STYX_USER
fi
echo 'STYX_USER:STYX_PASS' | /usr/sbin/chpasswd
# Optional: preload SSH key
if [ -n 'SSH_PUBKEY' ] && echo 'SSH_PUBKEY' | grep -q 'ssh-'; then
install -d -m700 /home/STYX_USER/.ssh
echo 'SSH_PUBKEY' >> /home/STYX_USER/.ssh/authorized_keys
chmod 600 /home/STYX_USER/.ssh/authorized_keys
chown -R STYX_USER:STYX_USER /home/STYX_USER/.ssh
fi
# Freenove code
git clone https://github.com/Freenove/Freenove_Computer_Case_Kit_for_Raspberry_Pi.git /opt/freenove || true
# Hostname
echo 'STYX_HOSTNAME' > /etc/hostname
if grep -q '^127\.0\.1\.1' /etc/hosts; then
sed -i 's/^127\.0\.1\.1.*/127.0.1.1\tSTYX_HOSTNAME/' /etc/hosts
else
echo -e '127.0.1.1\tSTYX_HOSTNAME' >> /etc/hosts
fi
apt-get clean
EOS
)
# Inject config values safely
# NOTE(review): this is plain text substitution. The values land inside
# single-quoted strings in the template, so a value containing a single quote
# would break the generated script — confirm the config values never do.
# The substitutions run in this order, each over the output of the previous one.
CHCMD="${CHCMD//STYX_USER/${STYX_USER}}"
CHCMD="${CHCMD//STYX_PASS/${STYX_PASS}}"
CHCMD="${CHCMD//STYX_HOSTNAME/${STYX_HOSTNAME}}"
CHCMD="${CHCMD//SSH_PUBKEY/${SSH_PUBKEY}}"
# Run the rendered script as a login shell inside the target rootfs.
chroot /mnt/pi-root /bin/bash -lc "$CHCMD"
}
install_service_host() {
  # Write the systemd unit for the Freenove example app into the target
  # rootfs and enable it by symlinking it into multi-user.target.wants.
  local unit_dir=/mnt/pi-root/etc/systemd/system
  local wants_dir="$unit_dir/multi-user.target.wants"

  mkdir -p "$wants_dir"
  cat >"$unit_dir/freenove-case.service" <<'SERVICE'
[Unit]
Description=Freenove Case OLED/Fans/LEDs
After=multi-user.target
[Service]
Type=simple
ExecStart=/usr/bin/python3 /opt/freenove/Code/application.py
Restart=on-failure
[Install]
WantedBy=multi-user.target
SERVICE

  # The link target is an absolute path, so it resolves inside the target
  # system once that system is booted.
  ln -sf /etc/systemd/system/freenove-case.service \
    "$wants_dir/freenove-case.service" || true
}
boot_tweaks() {
  local cfg=/mnt/pi-boot/config.txt
  local cmdline=/mnt/pi-boot/cmdline.txt

  # Turn on the I2C bus in config.txt unless it is already listed.
  if ! grep -q 'dtparam=i2c_arm=on' "$cfg"; then
    echo 'dtparam=i2c_arm=on' >> "$cfg"
  fi

  # Append the display arguments to line 1 of cmdline.txt, once only.
  grep -q 'DSI-1:800x480@60D' "$cmdline" 2>/dev/null \
    || sed -i '1 s#$# video=DSI-1:800x480@60D video=HDMI-A-1:off video=HDMI-A-2:off#' "$cmdline" \
    || true
}
cleanup() {
  # Best-effort teardown; every step tolerates failure.
  local sub mnt

  rm -f /mnt/pi-root/usr/sbin/policy-rc.d || true

  # Release the bind mounts (dev/pts before dev), then the partitions.
  for sub in dev/pts dev proc sys; do
    umount -lf "/mnt/pi-root/$sub" 2>/dev/null || true
  done
  for mnt in /mnt/pi-boot /mnt/pi-root; do
    umount -lf "$mnt" 2>/dev/null || true
  done

  sync || true
}
main() {
  # Entry point: become root, make sure ARM64 emulation is registered, find
  # and mount the target partitions, provision them, then unmount.

  # Forward the script arguments so a sudo re-exec receives them as well
  # (previously require_root was called with none, dropping them).
  require_root "$@"
  ensure_binfmt_arm64
  find_parts

  # From here on partitions get mounted, so unmount on error or Ctrl-C.
  trap 'echo "ERROR at line $LINENO" >&2; cleanup' ERR INT
  mount_parts
  prep_chroot
  install_service_host
  boot_tweaks
  cleanup

  echo "✅ Done. Move the NVMe to the Pi and boot."
  echo " Login: user '${STYX_USER}' pass '${STYX_PASS}' (change with 'passwd')."
  echo " Quick checks on the Pi:"
  echo " sudo i2cdetect -y 1"
  echo " rpicam-still -n -o test.jpg # (if rpicam-apps installed)"
  echo " libcamera-still -n -o test.jpg # (if legacy libcamera-apps installed)"
  echo " systemctl status freenove-case"
}
main "$@"

575
scripts/styx_prep_nvme_luks.sh Executable file
View File

@ -0,0 +1,575 @@
#!/usr/bin/env bash
set -euo pipefail
# --- CONFIG (edit if needed) ---
# Leave NVME empty → script will auto-detect the SSK dock.
NVME="${NVME:-}"
FLAVOR="${FLAVOR:-desktop}"
# Persistent cache so the image survives reboots.
IMG_DIR="${IMG_DIR:-/var/cache/styx-rpi}"
IMG_FILE="${IMG_FILE:-ubuntu-24.04.3-preinstalled-${FLAVOR}-arm64+raspi.img}"
IMG_BOOT_MNT="${IMG_BOOT_MNT:-/mnt/img-boot}"
IMG_ROOT_MNT="${IMG_ROOT_MNT:-/mnt/img-root}"
TGT_ROOT="/mnt/target-root"
TGT_BOOT="/mnt/target-boot"
STYX_USER="styx"
STYX_HOSTNAME="titan-ag"
STYX_PASS="TempPass#123" # will be forced to change on first login via cloud-init
SSH_PUBKEY="ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOb8oMX6u0z3sH/p/WBGlvPXXdbGETCKzWYwR/dd6fZb titan-bastion"
# Video / input prefs
DSI_FLAGS="video=DSI-1:800x480@60D video=HDMI-A-1:off video=HDMI-A-2:off"
# --- Helpers ---
# Print an error message to stderr and terminate the script with status 1.
fatal() {
  echo "ERROR: $*" >&2
  exit 1
}

# Abort via fatal() unless the named command is available on PATH.
need() {
  if ! command -v "$1" >/dev/null; then
    fatal "Missing tool: $1"
  fi
}

# Re-exec the script under sudo (environment preserved) when not running as root.
require_root() {
  if [[ $EUID -ne 0 ]]; then
    exec sudo -E "$0" "$@"
  fi
}
part() {
  # Print the device node for partition number $1 of $NVME. Disk names that
  # end in a digit get a "p" separator before the partition number; others
  # get the number appended directly.
  local idx="$1"
  local sep=""
  [[ "$NVME" =~ [0-9]$ ]] && sep="p"
  echo "${NVME}${sep}${idx}"
}
auto_detect_target_disk() {
  # Resolve the target disk into the global NVME.
  #   1. A caller-supplied NVME is validated and used as-is.
  #   2. Otherwise the first /dev/disk/by-id/usb-SSK* whole-disk symlink is used.
  #   3. Otherwise the first USB disk from `lsblk -S` whose vendor or model
  #      matches the SSK dock keywords (case-insensitively) is used.
  # Calls fatal() when nothing usable is found.

  # If user already set NVME, validate and return
  if [[ -n "${NVME:-}" ]]; then
    [[ -b "$NVME" ]] || fatal "NVME='$NVME' is not a block device"
    return
  fi

  # Prefer stable by-id symlinks; drop the per-partition "-partN" entries so a
  # partition can never be chosen as the disk.
  local byid
  byid=$(ls -1 /dev/disk/by-id/usb-SSK* 2>/dev/null | grep -v -- '-part[0-9]*$' | head -n1 || true)
  if [[ -n "$byid" ]]; then
    NVME=$(readlink -f "$byid")
  else
    # Heuristic via lsblk -S: look for USB with SSK/Ingram/Storage in vendor/model.
    # awk has no /re/i flag (the trailing "i" was parsed as a variable, so the
    # filter was not a case-insensitive match); lower-case the fields instead.
    # `|| true` keeps a failing pipeline from aborting under set -e/pipefail
    # before the explicit error below can be printed.
    NVME=$(lsblk -S -p -o NAME,TRAN,VENDOR,MODEL | \
      awk '/ usb / && (tolower($3) ~ /ssk|ingram/ || tolower($4) ~ /ssk|storage/){print $1; exit}' || true)
  fi

  [[ -n "${NVME:-}" && -b "$NVME" ]] || fatal "Could not auto-detect SSK USB NVMe dock. Export NVME=/dev/sdX and re-run."
  echo "Auto-detected target disk: $NVME"
}
preflight_cleanup() {
  # Best-effort removal of leftovers from an earlier run: image mounts and
  # loop devices, target mounts, and the cryptroot mapping. Every step
  # tolerates "nothing to do".
  local image_path="$IMG_DIR/$IMG_FILE"
  local target mountpoint

  # Image side: unmount, then detach only the loop devices backed by this image.
  umount -lf "$IMG_BOOT_MNT" "$IMG_ROOT_MNT" 2>/dev/null || true
  { losetup -j "$image_path" | cut -d: -f1 | xargs -r losetup -d; } 2>/dev/null || true

  # Target side: our own mountpoints first...
  umount -lf "$TGT_ROOT/boot/firmware" "$TGT_BOOT" "$TGT_ROOT" 2>/dev/null || true

  # ...then anywhere else the two target partitions happen to be mounted.
  for target in "$(part 1)" "$(part 2)"; do
    while read -r mountpoint; do
      if [ -n "$mountpoint" ]; then
        umount -lf "$mountpoint" 2>/dev/null || true
      fi
    done < <(findmnt -rno TARGET -S "$target" 2>/dev/null || true)
  done

  # Drop the dm-crypt mapping if one is still present.
  cryptsetup luksClose cryptroot 2>/dev/null || true
  dmsetup remove -f cryptroot 2>/dev/null || true

  # Give udev a chance to catch up when udevadm is available.
  if command -v udevadm >/dev/null; then
    udevadm settle || true
  fi
}
guard_target_device() {
  # Abort when $NVME is the parent disk of the device mounted at /.
  local root_src root_disk
  root_src=$(findmnt -no SOURCE /)
  root_disk=$(lsblk -no pkname "$root_src" 2>/dev/null || true)

  # No parent disk could be determined: nothing to compare against.
  [[ -z "$root_disk" ]] && return 0

  [[ "/dev/$root_disk" != "$NVME" ]] \
    || fatal "Refusing to operate on system disk ($NVME). Pick the external NVMe."
}
need_host_fido2() {
  # fido2-token must be available on the host; otherwise print install hints
  # and exit with status 1.
  command -v fido2-token >/dev/null 2>&1 && return 0

  echo "Host is missing fido2-token. On Arch: sudo pacman -S libfido2"
  echo "On Debian/Ubuntu host: sudo apt-get install fido2-tools"
  exit 1
}
ensure_image() {
  # Make sure the decompressed image exists at $IMG_DIR/$IMG_FILE, downloading
  # the .xz and decompressing it into the cache when necessary.
  #
  # Downloads and decompression go to "*.part" files that are renamed only on
  # success, so an interrupted run can no longer leave a truncated file that a
  # later run would mistake for a valid cache entry.
  mkdir -p "$IMG_DIR"
  chmod 755 "$IMG_DIR"
  local BASE="https://cdimage.ubuntu.com/releases/noble/release"
  local XZ="ubuntu-24.04.3-preinstalled-${FLAVOR}-arm64+raspi.img.xz"
  local img="$IMG_DIR/$IMG_FILE"
  local xz_path="$IMG_DIR/$XZ"

  if [[ -f "$img" ]]; then
    echo "Using cached image: $img"
    return 0
  fi

  # Either unxz or xz will do (Arch: pacman -S xz | Ubuntu: apt-get install
  # xz-utils). Requiring unxz unconditionally made the xz fallback unreachable.
  local -a decompress
  if command -v unxz >/dev/null 2>&1; then
    decompress=(unxz -c)
  elif command -v xz >/dev/null 2>&1; then
    decompress=(xz -dc)
  else
    fatal "Missing tool: unxz (or xz)"
  fi

  if [[ ! -f "$xz_path" ]]; then
    need curl # only needed when the archive is not cached yet
    echo "Fetching image…"
    rm -f "$xz_path.part"
    curl -fL -o "$xz_path.part" "$BASE/$XZ"
    mv -f "$xz_path.part" "$xz_path"
  fi

  echo "Decompressing to $img"
  # Keep the .xz for future runs; stream-decompress to the .img
  rm -f "$img.part"
  "${decompress[@]}" "$xz_path" > "$img.part"
  mv -f "$img.part" "$img"
  sync
}
ensure_binfmt_aarch64(){
  # Make ARM64 binaries runnable on this host:
  #   1. register the qemu-aarch64 binfmt handler (through Docker) if missing;
  #   2. place a qemu-aarch64-static binary in /usr/local/bin if missing.

  # Register qemu-aarch64 for chrooted ARM64 apt runs
  if [[ ! -e /proc/sys/fs/binfmt_misc/qemu-aarch64 ]]; then
    need docker
    systemctl enable --now docker >/dev/null 2>&1 || true
    docker run --rm --privileged tonistiigi/binfmt --install arm64 >/dev/null
  fi

  if [[ ! -x /usr/local/bin/qemu-aarch64-static ]]; then
    # This branch can run without the one above, so check for docker here too.
    need docker
    docker rm -f qemu-static >/dev/null 2>&1 || true
    docker create --name qemu-static docker.io/multiarch/qemu-user-static:latest >/dev/null
    docker cp qemu-static:/usr/bin/qemu-aarch64-static /usr/local/bin/qemu-aarch64-static
    # Only the mode needs fixing. The previous `install -D` copied the file
    # onto itself, which fails ("are the same file") and aborts under set -e.
    chmod 755 /usr/local/bin/qemu-aarch64-static
    docker rm qemu-static >/dev/null
  fi
}
# open_image: attach the cached image to a loop device and mount its first
# partition on $IMG_BOOT_MNT and its second on $IMG_ROOT_MNT, both read-only.
# Sets the global LOOP and installs an EXIT trap that unmounts both and
# detaches the loop device. Calls fatal() if the image is unreadable, the
# loop partitions do not appear, or the expected boot/kernel files are missing.
open_image() {
[[ -r "$IMG_DIR/$IMG_FILE" ]] || fatal "Image not found: $IMG_DIR/$IMG_FILE"
mkdir -p "$IMG_BOOT_MNT" "$IMG_ROOT_MNT"
# Pre-clean: detach any previous loop(s) for this image (tolerate absence)
umount -lf "$IMG_BOOT_MNT" 2>/dev/null || true
umount -lf "$IMG_ROOT_MNT" 2>/dev/null || true
# If no loop is attached, losetup -j returns non-zero → swallow it
mapfile -t OLD < <({ losetup -j "$IMG_DIR/$IMG_FILE" | cut -d: -f1; } 2>/dev/null || true)
# "${OLD[@]:-}" expands to a single empty word when OLD is empty; the failing
# `losetup -d ""` that results is silenced by the `|| true`.
for L in "${OLD[@]:-}"; do losetup -d "$L" 2>/dev/null || true; done
command -v udevadm >/dev/null && udevadm settle || true
# Attach with partition scan; wait for partition nodes to exist
LOOP=$(losetup --find --show --partscan "$IMG_DIR/$IMG_FILE") || fatal "losetup failed"
command -v udevadm >/dev/null && udevadm settle || true
# Poll up to 25 times, 0.1 s apart, for both partition nodes.
for _ in {1..25}; do
[[ -b "${LOOP}p1" && -b "${LOOP}p2" ]] && break
sleep 0.1
command -v udevadm >/dev/null && udevadm settle || true
done
# NOTE(review): only p1 is re-checked here although p2 is mounted below —
# a missing p2 would surface as a mount error instead of this message.
[[ -b "${LOOP}p1" ]] || fatal "loop partitions not present for $LOOP"
# Cleanup on exit: unmount first, then detach loop (tolerate absence)
# The quote juggling bakes the current values of the mountpoints and LOOP
# into the trap string at definition time.
trap 'umount -lf "'"$IMG_BOOT_MNT"'" "'"$IMG_ROOT_MNT"'" 2>/dev/null; losetup -d "'"$LOOP"'" 2>/dev/null' EXIT
# Mount image partitions read-only
mount -o ro "${LOOP}p1" "$IMG_BOOT_MNT"
mount -o ro "${LOOP}p2" "$IMG_ROOT_MNT"
# Sanity checks without using failing pipelines
# start*.elf must exist
if ! compgen -G "$IMG_BOOT_MNT/start*.elf" > /dev/null; then
fatal "start*.elf not found in image"
fi
# vmlinuz-* must exist
if ! compgen -G "$IMG_ROOT_MNT/boot/vmlinuz-*" > /dev/null; then
fatal "vmlinuz-* not found in image root"
fi
}
# confirm_and_wipe: DESTRUCTIVE. Shows the target disk, requires the operator
# to type WIPE, then erases signatures and the partition table on $NVME and
# creates a fresh GPT with a FAT32 boot partition (1) and a partition for
# LUKS (2). Only partition 1 is formatted here.
confirm_and_wipe(){
lsblk -o NAME,SIZE,MODEL,TRAN,LABEL "$NVME"
read -rp "Type EXACTLY 'WIPE' to destroy ALL DATA on $NVME: " ACK
# Anything other than the exact string aborts via fatal().
[[ "$ACK" == "WIPE" ]] || fatal "Aborted"
wipefs -a "$NVME"
sgdisk -Zo "$NVME"
# GPT: 1: 1MiB..513MiB vfat ESP; 2: rest LUKS
parted -s "$NVME" mklabel gpt \
mkpart system-boot fat32 1MiB 513MiB set 1 esp on \
mkpart cryptroot 513MiB 100%
# Ask the kernel to re-read the partition table, then pause briefly before
# the new partition node is used.
partprobe "$NVME"; sleep 1
mkfs.vfat -F32 -n system-boot "$(part 1)"
}
setup_luks(){
  # Format partition 2 as LUKS2 (cryptsetup prompts interactively), open it
  # as the "cryptroot" mapping and create an ext4 filesystem labelled rootfs.
  local luks_part
  luks_part="$(part 2)"

  echo "Create LUKS2 on ${luks_part} (you will be prompted for a passphrase; keep it as fallback)"
  need cryptsetup
  cryptsetup luksFormat --type luks2 "$luks_part"
  cryptsetup open "$luks_part" cryptroot
  mkfs.ext4 -L rootfs /dev/mapper/cryptroot
}
mount_targets(){
  # Mount the opened LUKS root on $TGT_ROOT and partition 1 on $TGT_BOOT,
  # then bind the boot mount into the root tree at boot/firmware.
  local firmware_dir="$TGT_ROOT/boot/firmware"

  mkdir -p "$TGT_ROOT" "$TGT_BOOT"
  mount /dev/mapper/cryptroot "$TGT_ROOT"

  # The bind target has to exist inside the freshly mounted root.
  mkdir -p "$firmware_dir"
  mount "$(part 1)" "$TGT_BOOT"
  mount --bind "$TGT_BOOT" "$firmware_dir"
}
rsync_root_and_boot(){
  # Copy the image's root filesystem and boot partition onto the target.
  need rsync

  # Paths left out of the root copy: the firmware mount (synced separately
  # below) and runtime/pseudo filesystems.
  local -a skip=(
    --exclude='/boot/firmware' --exclude='/boot/firmware/**'
    --exclude='/dev/*' --exclude='/proc/*' --exclude='/sys/*'
    --exclude='/run/*' --exclude='/tmp/*' --exclude='/mnt/*'
    --exclude='/media/*' --exclude='/lost+found'
  )

  rsync -aAXH --numeric-ids --delete "${skip[@]}" "$IMG_ROOT_MNT"/ "$TGT_ROOT"/
  rsync -aH --delete "$IMG_BOOT_MNT"/ "$TGT_ROOT/boot/firmware"/
}
write_crypttab_fstab(){
  # Write the target's /etc/crypttab (a cryptroot entry keyed by the UUID of
  # partition 2) and /etc/fstab (root on the mapper device, boot by label).
  # LUUID is left global, as before.
  LUUID=$(blkid -s UUID -o value "$(part 2)")

  # An empty UUID would silently produce an unusable "UUID=" crypttab entry;
  # stop here instead.
  [[ -n "$LUUID" ]] || fatal "Could not read the UUID of $(part 2)"

  printf 'cryptroot UUID=%s none luks,discard,fido2-device=auto\n' "$LUUID" > "$TGT_ROOT/etc/crypttab"
  cat > "$TGT_ROOT/etc/fstab" <<EOF
/dev/mapper/cryptroot / ext4 defaults,discard,errors=remount-ro 0 1
LABEL=system-boot /boot/firmware vfat defaults,umask=0077 0 1
EOF
}
# fix_firmware_files: adjust config.txt and regenerate cmdline.txt on the
# target's firmware partition so it boots kernel_2712.img with an initramfs
# and unlocks the LUKS root. Calls fatal() when config.txt is missing.
fix_firmware_files(){
local C="$TGT_ROOT/boot/firmware/config.txt"
local CL="$TGT_ROOT/boot/firmware/cmdline.txt"
[[ -f "$C" ]] || fatal "missing $C"
# Always boot the uncompressed Pi 5 kernel
# Replace an existing kernel= line, otherwise insert one as the first line.
if grep -q '^kernel=' "$C"; then
sed -i 's#^kernel=.*#kernel=kernel_2712.img#' "$C"
else
sed -i '1i kernel=kernel_2712.img' "$C"
fi
# Ensure initramfs and cmdline indirection are set
grep -q '^initramfs ' "$C" || echo 'initramfs initrd.img followkernel' >> "$C"
grep -q '^cmdline=cmdline.txt' "$C" || sed -i '1i cmdline=cmdline.txt' "$C"
# Display & buses (Pi 5)
# Each setting is appended only when no line already starts with it.
grep -q '^dtoverlay=vc4-kms-v3d-pi5' "$C" || echo 'dtoverlay=vc4-kms-v3d-pi5' >> "$C"
grep -q '^dtparam=i2c_arm=on' "$C" || echo 'dtparam=i2c_arm=on' >> "$C"
grep -q '^dtparam=pciex1=on' "$C" || echo 'dtparam=pciex1=on' >> "$C"
grep -q '^dtparam=pciex1_gen=2' "$C" || echo 'dtparam=pciex1_gen=2' >> "$C"
grep -q '^enable_uart=1' "$C" || echo 'enable_uart=1' >> "$C"
# Minimal, correct dracut hints using the bare UUID
local LUUID; LUUID=$(blkid -s UUID -o value "$(part 2)")
# Truncate cmdline.txt, then write the whole kernel command line as one line
# (the first two echoes use -n, so only the last one ends the line).
: > "$CL"
{
echo -n "rd.luks.uuid=$LUUID rd.luks.name=$LUUID=cryptroot "
echo -n "root=/dev/mapper/cryptroot rootfstype=ext4 rootwait fixrtc "
# NOTE(review): rd.debug is left on the command line — confirm this is wanted
# beyond bring-up.
echo "console=serial0,115200 console=tty1 ds=nocloud;s=file:///boot/firmware/ ${DSI_FLAGS} rd.debug"
} >> "$CL"
}
# seed_cloud_init: write cloud-init NoCloud seed files (user-data and
# meta-data) onto the target's firmware partition. The heredoc is unquoted,
# so $STYX_HOSTNAME, $STYX_USER, $STYX_PASS and $SSH_PUBKEY are expanded when
# the file is written.
# NOTE(review): the initial password is written in clear text to the boot
# partition; `expire: true` is set for it below — confirm this is acceptable.
# NOTE(review): newer cloud-init releases may prefer `chpasswd.users` over
# `chpasswd.list` — verify against the cloud-init version in the image.
seed_cloud_init(){
# NoCloud seed to create user, lock down SSH, set hostname, and enable avahi.
cat > "$TGT_ROOT/boot/firmware/user-data" <<EOF
#cloud-config
hostname: $STYX_HOSTNAME
manage_etc_hosts: true
users:
- name: $STYX_USER
gecos: "$STYX_USER"
shell: /bin/bash
groups: [sudo,video,i2c]
sudo: ALL=(ALL) NOPASSWD:ALL
lock_passwd: false
ssh_authorized_keys:
- $SSH_PUBKEY
chpasswd:
list: |
$STYX_USER:$STYX_PASS
expire: true
ssh_pwauth: false
package_update: true
packages: [openssh-server, avahi-daemon]
runcmd:
- systemctl enable --now ssh
- systemctl enable --now avahi-daemon || true
EOF
# Minimal meta-data for NoCloud
# The instance-id embeds the current epoch seconds; the "iid-titan-ag-"
# prefix is a fixed string, while local-hostname comes from $STYX_HOSTNAME.
date +%s | awk '{print "instance-id: iid-titan-ag-"$1"\nlocal-hostname: '"$STYX_HOSTNAME"'"}' \
> "$TGT_ROOT/boot/firmware/meta-data"
}
# prep_chroot_mounts: prepare $TGT_ROOT for chrooted package work. Binds
# /dev, /proc and /sys, mounts a devpts instance, copies the host resolv.conf,
# installs a policy-rc.d that returns 101, and copies qemu-aarch64-static into
# the target.
prep_chroot_mounts(){
for d in dev proc sys; do mount --bind "/$d" "$TGT_ROOT/$d"; done
# devpts is mounted after the /dev bind so it lands on the bound tree.
mount -t devpts devpts "$TGT_ROOT/dev/pts"
# Replace the usual resolv.conf symlink with a real file for apt to work
rm -f "$TGT_ROOT/etc/resolv.conf"
cp /etc/resolv.conf "$TGT_ROOT/etc/resolv.conf"
# Block service starts (no systemd in chroot)
cat > "$TGT_ROOT/usr/sbin/policy-rc.d" <<'EOP'
#!/bin/sh
exit 101
EOP
chmod +x "$TGT_ROOT/usr/sbin/policy-rc.d"
# Ensure qemu static is present inside chroot
# The source path is the host copy in /usr/local/bin, which must exist by now.
install -D -m755 /usr/local/bin/qemu-aarch64-static "$TGT_ROOT/usr/bin/qemu-aarch64-static"
}
# in_chroot: run the package/initramfs setup inside $TGT_ROOT through
# qemu-aarch64-static.
#
# The whole inner script is ONE single-quoted string handed to "bash -lc":
#   - '"'"'EOS'"'"' produces a quoted heredoc delimiter for the inner shell
#   - '"$STYX_HOSTNAME"' splices the host-side hostname into the script
#   - the script must not contain a bare single quote
#   - inner heredoc terminators (EOS) must stay at column 0, so the inner
#     script is deliberately kept flush-left
#
# Changes versus the previous revision:
#   - 30-fido2.conf now pads install_items with a leading and trailing space,
#     like the other += conf files here, so the list cannot fuse with entries
#     appended by other dracut conf files
#   - the copylog dracut module is created BEFORE the initramfs is built, so
#     the image is built once instead of twice
#
# Reads: TGT_ROOT, STYX_HOSTNAME
in_chroot(){
chroot "$TGT_ROOT" /usr/bin/qemu-aarch64-static /bin/bash -lc '
set -euo pipefail
export DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC
# --- APT sources (ports) ---
cat > /etc/apt/sources.list <<'"'"'EOS'"'"'
deb http://ports.ubuntu.com/ubuntu-ports noble main restricted universe multiverse
deb http://ports.ubuntu.com/ubuntu-ports noble-updates main restricted universe multiverse
deb http://ports.ubuntu.com/ubuntu-ports noble-security main restricted universe multiverse
EOS
apt-get update
# --- Remove snaps and pin them off ---
apt-get -y purge snapd || true
rm -rf /snap /var/snap /var/lib/snapd /home/*/snap || true
mkdir -p /etc/apt/preferences.d
cat > /etc/apt/preferences.d/nosnap.pref <<'"'"'EOS'"'"'
Package: snapd
Pin: release *
Pin-Priority: -10
EOS
# --- Base tools (no flash-kernel; we use dracut) ---
# Best effort: a failure here is tolerated, but a missing dracut aborts the
# script further down because of set -e.
apt-get install -y --no-install-recommends \
openssh-client openssh-server openssh-sftp-server avahi-daemon \
cryptsetup dracut fido2-tools libfido2-1 i2c-tools \
python3-smbus python3-pil zbar-tools qrencode lm-sensors \
file zstd lz4 || true
# Camera apps: try rpicam-apps; otherwise basic libcamera tools
apt-get install -y rpicam-apps || apt-get install -y libcamera-tools || true
# --- Persistent journal so we can read logs after failed boot ---
mkdir -p /etc/systemd/journald.conf.d
cat > /etc/systemd/journald.conf.d/99-persistent.conf <<'"'"'EOS'"'"'
[Journal]
Storage=persistent
EOS
# --- SSH hardening (ensure file exists even if package was half-installed) ---
if [ ! -f /etc/ssh/sshd_config ]; then
mkdir -p /etc/ssh
cat > /etc/ssh/sshd_config <<'"'"'EOS'"'"'
PermitRootLogin no
PasswordAuthentication no
KbdInteractiveAuthentication no
PubkeyAuthentication yes
# Accept defaults for the rest
EOS
fi
sed -i -e "s/^#\?PasswordAuthentication .*/PasswordAuthentication no/" \
-e "s/^#\?KbdInteractiveAuthentication .*/KbdInteractiveAuthentication no/" \
-e "s/^#\?PermitRootLogin .*/PermitRootLogin no/" \
-e "s/^#\?PubkeyAuthentication .*/PubkeyAuthentication yes/" /etc/ssh/sshd_config || true
# --- Hostname & hosts ---
echo "'"$STYX_HOSTNAME"'" > /etc/hostname
if grep -q "^127\\.0\\.1\\.1" /etc/hosts; then
sed -i "s/^127\\.0\\.1\\.1.*/127.0.1.1\t'"$STYX_HOSTNAME"'/" /etc/hosts
else
echo -e "127.0.1.1\t'"$STYX_HOSTNAME"'" >> /etc/hosts
fi
# --- Enable services on first boot ---
# systemctl cannot be used without a running systemd, so link the units by hand.
mkdir -p /etc/systemd/system/multi-user.target.wants
ln -sf /lib/systemd/system/ssh.service /etc/systemd/system/multi-user.target.wants/ssh.service
ln -sf /lib/systemd/system/avahi-daemon.service /etc/systemd/system/multi-user.target.wants/avahi-daemon.service || true
# --- Ensure i2c group ---
getent group i2c >/dev/null || groupadd i2c
# --- Dracut configuration (generic, not host-only) ---
mkdir -p /etc/dracut.conf.d
cat > /etc/dracut.conf.d/00-hostonly.conf <<'"'"'EOS'"'"'
hostonly=no
EOS
cat > /etc/dracut.conf.d/10-systemd-crypt.conf <<'"'"'EOS'"'"'
add_dracutmodules+=" systemd crypt "
EOS
cat > /etc/dracut.conf.d/20-drivers.conf <<'"'"'EOS'"'"'
add_drivers+=" nvme xhci_pci xhci_hcd usbhid hid_generic hid "
EOS
# Leading and trailing spaces matter: conf files are sourced in order and +=
# is plain string concatenation.
cat > /etc/dracut.conf.d/30-fido2.conf <<'"'"'EOS'"'"'
install_items+=" /usr/bin/systemd-cryptsetup /usr/bin/fido2-token /usr/lib/*/libfido2.so* /usr/lib/*/libcbor.so* "
EOS
# --- Dracut hook to copy rdsosreport.txt to the FAT partition on failure ---
# Installed before the initramfs build below so a single build picks it up.
mkdir -p /usr/lib/dracut/modules.d/99copylog
cat > /usr/lib/dracut/modules.d/99copylog/module-setup.sh <<'"'"'EOS'"'"'
#!/bin/bash
check() { return 0; }
depends() { echo base; return 0; }
install() {
  # Guard $moddir for nounset; derive if absent
  local mdir="${moddir:-$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)}"
  inst_hook emergency 99 "$mdir/copylog.sh"
}
EOS
chmod +x /usr/lib/dracut/modules.d/99copylog/module-setup.sh
cat > /usr/lib/dracut/modules.d/99copylog/copylog.sh <<'"'"'EOS'"'"'
#!/bin/sh
set -e
# Try each candidate boot partition; stop at the first one that mounts as vfat.
for dev in /dev/nvme0n1p1 /dev/sda1 /dev/sdb1 /dev/mmcblk0p1; do
  [ -b "$dev" ] || continue
  mkdir -p /mnt/bootfat
  if mount -t vfat "$dev" /mnt/bootfat 2>/dev/null; then
    if [ -s /run/initramfs/rdsosreport.txt ]; then
      cp -f /run/initramfs/rdsosreport.txt /mnt/bootfat/rdsosreport.txt 2>/dev/null || true
      sync || true
    fi
    umount /mnt/bootfat || true
    break
  fi
done
EOS
chmod +x /usr/lib/dracut/modules.d/99copylog/copylog.sh
# --- Build initramfs (once) and place it where firmware expects it ---
# Newest kernel by version sort of the installed module directories.
KVER=$(ls -1 /lib/modules | sort -V | tail -n1)
dracut --force /boot/initramfs-$KVER.img $KVER
ln -sf initramfs-$KVER.img /boot/initrd.img
ln -sf initramfs-$KVER.img /boot/initrd.img-$KVER
cp -a /boot/initramfs-$KVER.img /boot/firmware/initrd.img
# --- Create uncompressed kernel for Pi 5 firmware ---
if [ -f "/usr/lib/linux-image-$KVER/Image" ]; then
cp -a "/usr/lib/linux-image-$KVER/Image" /boot/firmware/kernel_2712.img
else
# No raw Image shipped: decompress vmlinuz according to what file(1) reports.
FMT=$(file -b "/boot/vmlinuz-$KVER" || true)
case "$FMT" in
*Zstandard*|*zstd*) zstd -dc "/boot/vmlinuz-$KVER" > /boot/firmware/kernel_2712.img ;;
*LZ4*) lz4 -dc "/boot/vmlinuz-$KVER" > /boot/firmware/kernel_2712.img ;;
*gzip*) zcat "/boot/vmlinuz-$KVER" > /boot/firmware/kernel_2712.img ;;
*) cp -a "/boot/vmlinuz-$KVER" /boot/firmware/kernel_2712.img ;;
esac
fi
# --- Ensure Pi 5 DTB is present on the boot partition ---
DTB=$(find /lib/firmware -type f -name "bcm2712-rpi-5-b.dtb" | sort | tail -n1 || true)
[ -n "$DTB" ] && cp -a "$DTB" /boot/firmware/
# Finish with status 0 even when the optional DTB copy above was skipped.
true
'
}
# verify_boot_assets: print a human-readable summary of what ended up on the
# target firmware (FAT) partition. Purely informational: every probe is
# followed by "|| true", so a missing file never fails the run.
verify_boot_assets(){
  echo "---- verify boot assets on FAT ----"
  local fw="$TGT_ROOT/boot/firmware"

  # Kernel image type and initramfs size.
  file "$fw/kernel_2712.img" || true
  ls -lh "$fw/initrd.img" || true

  echo "-- config.txt (key lines) --"
  grep -E '^(kernel|initramfs|cmdline)=|^dtoverlay=|^dtparam=' "$fw/config.txt" || true

  echo "-- cmdline.txt --"
  cat "$fw/cmdline.txt" || true

  echo "-- firmware blobs (sample) --"
  ls -1 "$fw"/start*.elf "$fw"/fixup*.dat | head -n 8 || true

  echo "-- Pi5 DTB --"
  ls -l "$fw/"*rpi-5-b.dtb || true
}
# enroll_fido_tokens: enroll every FIDO2 token currently plugged into the host
# as an unlock method for the LUKS volume on partition 2 of the target.
#
# Tokens are discovered through "fido2-token -L"; both output styles
# ("/dev/hidrawN: ..." and "Device: /dev/hidrawN") are accepted.
#
# Returns 0 when no token is present (enrollment is skipped) or when every
# enrollment succeeded, 1 when at least one enrollment failed.
#
# Changes versus the previous revision:
#   - dropped "--label=...": systemd-cryptenroll(1) documents no such option,
#     so each enrollment aborted on an unrecognized option
#   - slots are listed by invoking systemd-cryptenroll with only the device
#     (the documented way) instead of an undocumented "--list"
enroll_fido_tokens(){
  local luks_dev
  luks_dev="$(part 2)"
  echo "Enrolling FIDO2 Solo keys into $luks_dev ..."
  need systemd-cryptenroll
  need fido2-token

  # Collect all hidraw paths from both output styles (some distros print 'Device: /dev/hidrawX')
  mapfile -t DEVS < <(
    fido2-token -L \
      | sed -n 's,^\(/dev/hidraw[0-9]\+\):.*,\1,p; s,^Device:[[:space:]]\+/dev/hidraw\([0-9]\+\).*,/dev/hidraw\1,p' \
      | sort -u
  )

  if (( ${#DEVS[@]} == 0 )); then
    echo "No FIDO2 tokens detected; skipping enrollment (you can enroll later)."
    echo "Example later: systemd-cryptenroll $luks_dev --fido2-device=/dev/hidrawX --fido2-with-client-pin=no"
    return 0
  fi

  # Recommend keeping exactly ONE key plugged during first enrollment to avoid ambiguity.
  if (( ${#DEVS[@]} > 1 )); then
    echo "Note: multiple FIDO2 tokens present: ${DEVS[*]}"
    echo "If enrollment fails, try with only one key inserted."
  fi

  local rc=0
  for D in "${DEVS[@]}"; do
    echo "-> Enrolling $D (you should be asked to touch the key)"
    # Touch-only policy: no PIN and no user verification, presence required.
    if ! SYSTEMD_LOG_LEVEL=debug systemd-cryptenroll "$luks_dev" \
         --fido2-device="$D" \
         --fido2-with-client-pin=no \
         --fido2-with-user-presence=yes \
         --fido2-with-user-verification=no; then
      echo "WARN: enrollment failed for $D"
      rc=1
    fi
  done

  echo "Tokens enrolled (if any):"
  # With no operation given, systemd-cryptenroll prints the enrolled slots.
  systemd-cryptenroll "$luks_dev" || true
  return $rc
}
# _lazy_umount PATH: lazy, forced unmount that never fails and stays silent.
_lazy_umount(){
  umount -lf "$1" 2>/dev/null || true
}

# cleanup: undo prep_chroot_mounts and the target/image mounts, innermost
# first, then close the LUKS mapping. Every step is best effort so the
# function can run regardless of how far provisioning got.
cleanup(){
  # Service starts are allowed again on the real system.
  rm -f "$TGT_ROOT/usr/sbin/policy-rc.d" || true

  # Chroot pseudo-filesystems (devpts sits below the /dev bind mount).
  _lazy_umount "$TGT_ROOT/dev/pts"
  for d in dev proc sys; do
    _lazy_umount "$TGT_ROOT/$d"
  done

  # Target filesystems, then the mapping that backs the root filesystem.
  _lazy_umount "$TGT_ROOT/boot/firmware"
  _lazy_umount "$TGT_BOOT"
  _lazy_umount "$TGT_ROOT"
  cryptsetup close cryptroot 2>/dev/null || true

  # Source image mounts.
  _lazy_umount "$IMG_BOOT_MNT"
  _lazy_umount "$IMG_ROOT_MNT"
}
# main: end-to-end provisioning of the target NVMe, in order:
# preconditions -> image/LUKS/mount setup -> copy root and boot -> boot config
# -> cloud-init seed -> chroot package/initramfs work -> verification
# -> FIDO2 enrollment -> cleanup.
#
# NOTE(review): cleanup is only reached when every step before it returns
# success; if the script runs under "set -e" (not visible from here), a failed
# step such as enroll_fido_tokens leaves the mounts and the LUKS mapping open.
main(){
  require_root
  need losetup; need parted; need rsync
  auto_detect_target_disk
  echo "Target disk: $NVME"
  ensure_binfmt_aarch64
  ensure_image
  preflight_cleanup
  guard_target_device
  open_image
  confirm_and_wipe
  setup_luks
  mount_targets
  rsync_root_and_boot
  write_crypttab_fstab
  fix_firmware_files
  seed_cloud_init
  prep_chroot_mounts
  in_chroot
  verify_boot_assets
  need_host_fido2
  enroll_fido_tokens
  cleanup
  echo "✅ NVMe prepared."
  echo " Install in the Pi 5 and boot with no SD."
  echo " Expect LUKS to unlock automatically with a Solo key inserted;"
  echo " passphrase fallback remains. Hostname: ${STYX_HOSTNAME} User: ${STYX_USER}"
  # Use the configured user and host instead of hard-coded ones so the hint
  # stays correct when STYX_USER or STYX_HOSTNAME are overridden.
  echo " On first boot, reach it via: ssh -i ~/.ssh/id_ed25519_titan ${STYX_USER}@${STYX_HOSTNAME}.local"
}
main "$@"

View File

@ -23,6 +23,11 @@ spec:
spec:
nodeSelector:
jellyfin: "true"
securityContext:
runAsUser: 1000
fsGroup: 65532
fsGroupChangePolicy: OnRootMismatch
runAsGroup: 65532
runtimeClassName: nvidia
containers:
- name: jellyfin
@ -36,6 +41,12 @@ spec:
value: "compute,video,utility"
- name: JELLYFIN_PublishedServerUrl
value: "https://stream.bstein.dev"
- name: PUID
value: "1000"
- name: PGID
value: "65532"
- name: UMASK
value: "002"
resources:
limits:
nvidia.com/gpu: 1
@ -64,4 +75,4 @@ spec:
claimName: jellyfin-cache-astreae
- name: media
persistentVolumeClaim:
claimName: jellyfin-media-asteria
claimName: jellyfin-media-asteria-new

View File

@ -36,5 +36,19 @@ spec:
accessModes: ["ReadWriteMany"]
resources:
requests:
storage: 2Ti
storage: 4Ti
storageClassName: asteria
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: jellyfin-media-asteria-new
namespace: jellyfin
spec:
accessModes: ["ReadWriteMany"]
resources:
requests:
storage: 4Ti
storageClassName: asteria

View File

@ -0,0 +1,171 @@
# services/jitsi/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: jitsi-prosody
namespace: jitsi
spec:
replicas: 0
selector:
matchLabels: { app: jitsi-prosody }
template:
metadata:
labels: { app: jitsi-prosody }
spec:
nodeSelector:
kubernetes.io/hostname: titan-22
kubernetes.io/arch: amd64
containers:
- name: prosody
image: jitsi/prosody:stable
ports:
- { name: c2s, containerPort: 5222, protocol: TCP }
- { name: http, containerPort: 5280, protocol: TCP }
- { name: comp, containerPort: 5347, protocol: TCP }
env:
- { name: XMPP_DOMAIN, value: "meet.jitsi" }
- { name: XMPP_AUTH_DOMAIN, value: "auth.meet.jitsi" }
- { name: XMPP_MUC_DOMAIN, value: "muc.meet.jitsi" }
- { name: XMPP_INTERNAL_MUC_DOMAIN, value: "internal-muc.meet.jitsi" }
- { name: ENABLE_AUTH, value: "0" } # open instance, no auth (fastest path)
- { name: ENABLE_GUESTS, value: "1" }
- { name: JICOFO_AUTH_USER, value: "focus" }
- { name: JVB_AUTH_USER, value: "jvb" }
- name: JICOFO_AUTH_PASSWORD
valueFrom: { secretKeyRef: { name: jitsi-internal-secrets, key: JICOFO_AUTH_PASSWORD } }
- name: JICOFO_COMPONENT_SECRET
valueFrom: { secretKeyRef: { name: jitsi-internal-secrets, key: JICOFO_COMPONENT_SECRET } }
- name: JVB_AUTH_PASSWORD
valueFrom: { secretKeyRef: { name: jitsi-internal-secrets, key: JVB_AUTH_PASSWORD } }
volumeMounts:
- { name: cfg, mountPath: /config }
volumes:
- name: cfg
persistentVolumeClaim: { claimName: jitsi-prosody-config }
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: jitsi-jicofo
namespace: jitsi
spec:
replicas: 0
selector:
matchLabels: { app: jitsi-jicofo }
template:
metadata:
labels: { app: jitsi-jicofo }
spec:
nodeSelector:
kubernetes.io/hostname: titan-22
kubernetes.io/arch: amd64
containers:
- name: jicofo
image: jitsi/jicofo:stable
env:
- { name: XMPP_DOMAIN, value: "meet.jitsi" }
- { name: XMPP_AUTH_DOMAIN, value: "auth.meet.jitsi" }
- { name: XMPP_MUC_DOMAIN, value: "muc.meet.jitsi" }
- { name: XMPP_INTERNAL_MUC_DOMAIN, value: "internal-muc.meet.jitsi" }
- { name: XMPP_SERVER, value: "jitsi-prosody.jitsi.svc.cluster.local" }
- { name: JICOFO_AUTH_USER, value: "focus" }
- name: JICOFO_AUTH_PASSWORD
valueFrom: { secretKeyRef: { name: jitsi-internal-secrets, key: JICOFO_AUTH_PASSWORD } }
- name: JICOFO_COMPONENT_SECRET
valueFrom: { secretKeyRef: { name: jitsi-internal-secrets, key: JICOFO_COMPONENT_SECRET } }
- { name: JVB_BREWERY_MUC, value: "jvbbrewery" }
volumeMounts:
- { name: cfg, mountPath: /config }
volumes:
- name: cfg
persistentVolumeClaim: { claimName: jitsi-jicofo-config }
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: jitsi-jvb
namespace: jitsi
spec:
replicas: 0
selector:
matchLabels: { app: jitsi-jvb }
template:
metadata:
labels: { app: jitsi-jvb }
spec:
nodeSelector:
kubernetes.io/hostname: titan-22
kubernetes.io/arch: amd64
containers:
- name: jvb
image: jitsi/jvb:stable
ports:
- { name: colibri-ws, containerPort: 9090, protocol: TCP } # WebSocket control channel
- { name: rtp-udp, containerPort: 10000, hostPort: 10000, protocol: UDP } # media
- { name: rtp-tcp, containerPort: 4443, hostPort: 4443, protocol: TCP }
env:
- { name: XMPP_DOMAIN, value: "meet.jitsi" }
- { name: XMPP_AUTH_DOMAIN, value: "auth.meet.jitsi" }
- { name: XMPP_MUC_DOMAIN, value: "muc.meet.jitsi" }
- { name: XMPP_INTERNAL_MUC_DOMAIN, value: "internal-muc.meet.jitsi" }
- { name: XMPP_SERVER, value: "jitsi-prosody.jitsi.svc.cluster.local" }
- { name: JVB_AUTH_USER, value: "jvb" }
- name: JVB_AUTH_PASSWORD
valueFrom: { secretKeyRef: { name: jitsi-internal-secrets, key: JVB_AUTH_PASSWORD } }
- { name: JVB_BREWERY_MUC, value: "jvbbrewery" }
- { name: JVB_PORT, value: "10000" } # matches hostPort above
- { name: ENABLE_COLIBRI_WEBSOCKET, value: "1" } # enables /colibri-ws
# - { name: JVB_STUN_SERVERS, value: "stun.l.google.com:19302,stun1.l.google.com:19302,meet-jit-si-turnrelay.jitsi.net:443" }
- { name: JVB_ENABLE_APIS, value: "rest,colibri" }
- { name: JVB_WS_DOMAIN, value: "meet.bstein.dev:443" }
- { name: JVB_WS_TLS, value: "true" }
- { name: JVB_ADVERTISE_IPS, value: "38.28.125.112" }
- { name: JVB_TCP_HARVESTER_DISABLED, value: "false" }
- { name: JVB_TCP_PORT, value: "4443" }
volumeMounts:
- { name: cfg, mountPath: /config }
volumes:
- name: cfg
persistentVolumeClaim: { claimName: jitsi-jvb-config }
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: jitsi-web
namespace: jitsi
spec:
replicas: 0
selector:
matchLabels: { app: jitsi-web }
template:
metadata:
labels: { app: jitsi-web }
spec:
nodeSelector:
kubernetes.io/hostname: titan-22
kubernetes.io/arch: amd64
containers:
- name: web
image: jitsi/web:stable
ports:
- { name: http, containerPort: 80, protocol: TCP }
env:
- { name: PUBLIC_URL, value: "https://meet.bstein.dev" }
- { name: XMPP_DOMAIN, value: "meet.jitsi" }
- { name: XMPP_AUTH_DOMAIN, value: "auth.meet.jitsi" }
- { name: XMPP_MUC_DOMAIN, value: "muc.meet.jitsi" }
- { name: XMPP_INTERNAL_MUC_DOMAIN, value: "internal-muc.meet.jitsi" }
- { name: XMPP_BOSH_URL_BASE, value: "https://meet.bstein.dev" }
- { name: ENABLE_XMPP_WEBSOCKET, value: "1" }
- { name: ENABLE_COLIBRI_WEBSOCKET, value: "1" }
volumeMounts:
- { name: cfg, mountPath: /config }
volumes:
- name: cfg
persistentVolumeClaim: { claimName: jitsi-web-config }

View File

@ -0,0 +1,41 @@
# services/jitsi/ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: jitsi
namespace: jitsi
annotations:
cert-manager.io/cluster-issuer: "letsencrypt-prod"
spec:
ingressClassName: traefik
tls:
- hosts: [ "meet.bstein.dev" ]
secretName: jitsi-meet-tls
rules:
- host: meet.bstein.dev
http:
paths:
- path: /colibri-ws
pathType: Prefix
backend:
service:
name: jitsi-jvb
port: { number: 9090 }
- path: /xmpp-websocket
pathType: Prefix
backend:
service:
name: jitsi-prosody
port: { number: 5280 }
- path: /http-bind
pathType: Prefix
backend:
service:
name: jitsi-prosody
port: { number: 5280 }
- path: /
pathType: Prefix
backend:
service:
name: jitsi-web
port: { number: 80 }

View File

@ -0,0 +1,10 @@
# services/jitsi/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- deployment.yaml
- service.yaml
- pvc.yaml
- ingress.yaml
- secret.yaml

View File

@ -0,0 +1,5 @@
# services/jitsi/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: jitsi

42
services/jitsi/pvc.yaml Normal file
View File

@ -0,0 +1,42 @@
# services/jitsi/pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: jitsi-web-config
namespace: jitsi
spec:
accessModes: ["ReadWriteOnce"]
resources: { requests: { storage: 10Gi } }
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: jitsi-prosody-config
namespace: jitsi
spec:
accessModes: ["ReadWriteOnce"]
resources: { requests: { storage: 10Gi } }
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: jitsi-jicofo-config
namespace: jitsi
spec:
accessModes: ["ReadWriteOnce"]
resources: { requests: { storage: 10Gi } }
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: jitsi-jvb-config
namespace: jitsi
spec:
accessModes: ["ReadWriteOnce"]
resources: { requests: { storage: 10Gi } }

View File

@ -0,0 +1,36 @@
# services/jitsi/service.yaml
apiVersion: v1
kind: Service
metadata:
name: jitsi-prosody
namespace: jitsi
spec:
selector: { app: jitsi-prosody }
ports:
- { name: c2s, port: 5222, targetPort: 5222, protocol: TCP }
- { name: http, port: 5280, targetPort: 5280, protocol: TCP }
- { name: comp, port: 5347, targetPort: 5347, protocol: TCP }
---
apiVersion: v1
kind: Service
metadata:
name: jitsi-jvb
namespace: jitsi
spec:
selector: { app: jitsi-jvb }
ports:
- { name: colibri-ws, port: 9090, targetPort: 9090, protocol: TCP }
---
apiVersion: v1
kind: Service
metadata:
name: jitsi-web
namespace: jitsi
spec:
selector: { app: jitsi-web }
ports:
- { name: http, port: 80, targetPort: 80, protocol: TCP }

View File

@ -0,0 +1,27 @@
# services/keycloak
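Keycloak is deployed via raw manifests and backed by the shared Postgres (`postgres-service.postgres.svc.cluster.local:5432`). Create these secrets before applying. They live in the `sso` namespace, which this kustomization itself creates, so on a fresh cluster create the namespace first (`kubectl apply -f services/keycloak/namespace.yaml`):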
```bash
# DB creds (per-service DB/user in shared Postgres)
kubectl -n sso create secret generic keycloak-db \
--from-literal=username=keycloak \
--from-literal=password='<DB_PASSWORD>' \
--from-literal=database=keycloak
# Admin console creds (maps to KC admin user)
kubectl -n sso create secret generic keycloak-admin \
--from-literal=username=brad@bstein.dev \
--from-literal=password='<ADMIN_PASSWORD>'
```
Apply:
```bash
kubectl apply -k services/keycloak
```
Notes
- Service: `keycloak.sso.svc:80` (Ingress `sso.bstein.dev`, TLS via cert-manager).
- Uses Postgres schema `public`; DB/user should be provisioned in the shared Postgres instance.
- Health endpoints on :9000 are wired for probes.

View File

@ -0,0 +1,132 @@
# services/keycloak/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: keycloak
namespace: sso
labels:
app: keycloak
spec:
replicas: 1
selector:
matchLabels:
app: keycloak
template:
metadata:
labels:
app: keycloak
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values: ["rpi5","rpi4"]
- key: node-role.kubernetes.io/worker
operator: Exists
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values: ["titan-24"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 70
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
securityContext:
runAsUser: 1000
runAsGroup: 0
fsGroup: 1000
fsGroupChangePolicy: OnRootMismatch
containers:
- name: keycloak
image: quay.io/keycloak/keycloak:26.0.7
imagePullPolicy: IfNotPresent
args:
- start
env:
- name: KC_DB
value: postgres
- name: KC_DB_URL_HOST
value: postgres-service.postgres.svc.cluster.local
- name: KC_DB_URL_DATABASE
valueFrom:
secretKeyRef:
name: keycloak-db
key: database
- name: KC_DB_USERNAME
valueFrom:
secretKeyRef:
name: keycloak-db
key: username
- name: KC_DB_PASSWORD
valueFrom:
secretKeyRef:
name: keycloak-db
key: password
- name: KC_DB_SCHEMA
value: public
- name: KC_HOSTNAME
value: sso.bstein.dev
- name: KC_HOSTNAME_URL
value: https://sso.bstein.dev
- name: KC_PROXY
value: edge
- name: KC_PROXY_HEADERS
value: xforwarded
- name: KC_HTTP_ENABLED
value: "true"
- name: KC_HTTP_MANAGEMENT_PORT
value: "9000"
- name: KC_HTTP_MANAGEMENT_BIND_ADDRESS
value: 0.0.0.0
- name: KC_HEALTH_ENABLED
value: "true"
- name: KC_METRICS_ENABLED
value: "true"
- name: KEYCLOAK_ADMIN
valueFrom:
secretKeyRef:
name: keycloak-admin
key: username
- name: KEYCLOAK_ADMIN_PASSWORD
valueFrom:
secretKeyRef:
name: keycloak-admin
key: password
ports:
- containerPort: 8080
name: http
- containerPort: 9000
name: metrics
readinessProbe:
httpGet:
path: /health/ready
port: 9000
initialDelaySeconds: 15
periodSeconds: 10
failureThreshold: 6
livenessProbe:
httpGet:
path: /health/live
port: 9000
initialDelaySeconds: 60
periodSeconds: 15
failureThreshold: 6
volumeMounts:
- name: data
mountPath: /opt/keycloak/data
volumes:
- name: data
persistentVolumeClaim:
claimName: keycloak-data

View File

@ -0,0 +1,24 @@
# services/keycloak/ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: keycloak
namespace: sso
annotations:
cert-manager.io/cluster-issuer: letsencrypt
spec:
ingressClassName: traefik
rules:
- host: sso.bstein.dev
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: keycloak
port:
number: 80
tls:
- hosts: [sso.bstein.dev]
secretName: keycloak-tls

View File

@ -0,0 +1,10 @@
# services/keycloak/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: sso
resources:
- namespace.yaml
- pvc.yaml
- deployment.yaml
- service.yaml
- ingress.yaml

View File

@ -0,0 +1,5 @@
# services/keycloak/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: sso

View File

@ -0,0 +1,12 @@
# services/keycloak/pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: keycloak-data
namespace: sso
spec:
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 10Gi
storageClassName: astreae

View File

@ -0,0 +1,15 @@
# services/keycloak/service.yaml
apiVersion: v1
kind: Service
metadata:
name: keycloak
namespace: sso
labels:
app: keycloak
spec:
selector:
app: keycloak
ports:
- name: http
port: 80
targetPort: http

View File

@ -0,0 +1,28 @@
# services/monitoring
## Grafana admin secret
The Grafana Helm release expects a pre-existing secret named `grafana-admin`
in the `monitoring` namespace. Create or rotate it with:
```bash
kubectl create secret generic grafana-admin \
--namespace monitoring \
--from-literal=admin-user=admin \
--from-literal=admin-password='REPLACE_ME'
```
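Update the password whenever you rotate credentials. `kubectl create` fails if the secret already exists, so when rotating, delete the old secret first (`kubectl -n monitoring delete secret grafana-admin`) and then re-run the command above.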
## DCGM exporter image
The NVIDIA GPU metrics DaemonSet expects `registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04`, mirrored from `docker.io/nvidia/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04`. Refresh it in Zot when bumping versions:
```bash
skopeo copy \
--all \
docker://docker.io/nvidia/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04 \
docker://registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04
```
When finished mirroring from the control-plane, you can remove temporary tooling with `sudo apt-get purge -y skopeo && sudo apt-get autoremove -y` and clear `~/.config/containers/auth.json`.

View File

@ -0,0 +1,184 @@
{
"uid": "atlas-gpu",
"title": "Atlas GPU",
"folderUid": "atlas-internal",
"editable": true,
"panels": [
{
"id": 1,
"type": "piechart",
"title": "Namespace GPU Share",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"displayLabels": [
"percent"
],
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
}
},
{
"id": 2,
"type": "timeseries",
"title": "GPU Util by Namespace",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"targets": [
{
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 3,
"type": "timeseries",
"title": "GPU Util by Node",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"targets": [
{
"expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})",
"refId": "A",
"legendFormat": "{{Hostname}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 4,
"type": "table",
"title": "Top Pods by GPU Util",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
},
"targets": [
{
"expr": "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"showHeader": true
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
}
]
}
],
"time": {
"from": "now-12h",
"to": "now"
},
"annotations": {
"list": []
},
"schemaVersion": 39,
"style": "dark",
"tags": [
"atlas",
"gpu"
]
}

Some files were not shown because too many files have changed in this diff Show More