Compare commits


278 Commits

Author SHA1 Message Date
ba12854639 flux: lower controller log verbosity 2025-12-18 02:15:32 -03:00
aa1c7d62c1 flux: reset image automation log level 2025-12-18 02:15:32 -03:00
flux-bot 3de36441f4 chore(ci-demo): apply image updates 2025-12-18 02:15:32 -03:00
e5238a7f91 chore: simplify image automation commit messages 2025-12-18 02:15:32 -03:00
d8077798db chore: update image automation templates 2025-12-18 02:15:32 -03:00
5a52c8606b ci-demo: move image policy to flux-system 2025-12-18 02:15:32 -03:00
be23851878 Merge pull request 'feature/bstein-dev-home' (#7) from feature/bstein-dev-home into main
Reviewed-on: #7
2025-12-18 04:23:01 +00:00
6f6fb363b3 Add bstein-dev-home deployment and Jenkins job 2025-12-18 01:14:09 -03:00
449574d59f Merge remote-tracking branch 'origin/feature/ci-gitops' into feature/bstein-dev-home 2025-12-18 01:07:01 -03:00
5f300c47a5 flux: bump image automation api to v1 2025-12-18 00:46:25 -03:00
c04a38fac5 flux: enable debug logging for controllers 2025-12-18 00:44:11 -03:00
5d4a0814c1 flux: enable debug logging for image automation 2025-12-18 00:40:55 -03:00
61d9f05fef flux: update pegasus image automation api 2025-12-18 00:39:39 -03:00
609347991e flux: upgrade controllers to v2.7.5 2025-12-18 00:38:32 -03:00
9816354d0f ci-demo: bump to v0.0.0-2 2025-12-17 23:12:03 -03:00
39275db74e ci-demo: set tag v0.0.0-1 2025-12-17 19:49:53 -03:00
9635100675 ci-demo: fix imagepolicy tag regex 2025-12-17 19:45:15 -03:00
bbb84c1182 jenkins: add ci-demo job 2025-12-17 19:27:23 -03:00
daa354e2cd ci-demo: add flux image automation 2025-12-17 19:18:29 -03:00
0a42289516 harbor: pin components to v2.14.1-arm64 2025-12-17 17:54:50 -03:00
b7246f5835 harbor: suspend automation, pin redis 2025-12-17 17:29:03 -03:00
flux-bot b7709b3f40 chore(harbor): update images to registry.bstein.dev/infra/harbor-redis:v2.14.1-arm64.14registry.bstein.dev/infra/harbor-core:v2.14.1-arm64.14registry.bstein.dev/infra/harbor-jobservice:v2.14.1-arm64.14registry.bstein.dev/infra/harbor-portal:v2.14.1-arm64.14registry.bstein.dev/infra/harbor-registry:v2.14.1-arm64.14registry.bstein.dev/infra/harbor-registryctl:v2.14.1-arm64.14registry.bstein.dev/infra/harbor-nginx:v2.14.1-arm64.14registry.bstein.dev/infra/harbor-prepare:v2.14.1-arm64.14 2025-12-17 19:38:57 +00:00
bb8de41cdb harbor: run image automation in harbor ns 2025-12-17 16:38:37 -03:00
1d788a5dc4 harbor: fix imagepolicy tag setters 2025-12-17 16:32:42 -03:00
37a50622a2 harbor: fix image automation push schema 2025-12-17 16:25:16 -03:00
cde135c59e harbor: enable image automation push 2025-12-17 16:17:07 -03:00
543f2a9ccd harbor: fix image policy tag regex 2025-12-17 13:16:57 -03:00
efa6d92b69 harbor: automate nginx and prepare 2025-12-17 13:14:31 -03:00
2f66afd970 flux(atlas): use scoped health checks 2025-12-17 04:47:12 -03:00
f55d3fd956 flux(atlas): limit kustomization health checks 2025-12-17 04:11:26 -03:00
1a8c6857e7 harbor: re-pin workloads to titan-05 2025-12-17 03:30:31 -03:00
f28d5680f2 harbor: add image automation 2025-12-17 03:21:35 -03:00
8d04f6c6c7 jenkins: pin controller to rpi4 2025-12-17 02:53:23 -03:00
d93d24d5ef jenkins: disable chart local auth realm 2025-12-17 02:30:41 -03:00
7dcfd5f6cf jenkins: stop JCasC resetting OIDC 2025-12-17 02:23:54 -03:00
d3aa456bee jenkins: poll harbor-arm-build scm 2025-12-17 01:58:10 -03:00
a52b811e5b jenkins: source pipeline creds from secrets 2025-12-17 01:47:33 -03:00
cd1b9b57b0 harbor: add helm remediation and timeouts 2025-12-17 01:39:49 -03:00
5e6f9c6c83 chore: stop tracking NOTES.md 2025-12-17 01:29:48 -03:00
f512e0fa29 jenkins: harden oidc and timeouts 2025-12-17 01:11:07 -03:00
4e479147ec jenkins: run jcasc cleanup initcontainer as jenkins user 2025-12-17 00:43:55 -03:00
1f98a5be12 jenkins: clean stale JCasC files on startup 2025-12-17 00:37:37 -03:00
30048a9ae5 jenkins: drop invalid JCasC OIDC realm (use init script) 2025-12-17 00:28:52 -03:00
60a8192f61 jenkins: enforce OIDC via JCasC (no node move) 2025-12-17 00:23:15 -03:00
ce7631f896 jenkins: enforce OIDC via JCasC and pin to arm64 2025-12-16 23:38:08 -03:00
f3335028b1 jenkins: disable scm trigger for harbor arm build 2025-12-16 23:12:27 -03:00
0385a653af fix: use FullControlOnceLoggedIn auth strategy 2025-12-16 20:33:03 -03:00
6759871b43 fix: add casc support plugin 2025-12-16 20:27:41 -03:00
3e4a49e7fb fix: add job-dsl plugin for JCasC jobs 2025-12-16 20:21:33 -03:00
b951058dc6 fix: enforce Jenkins OIDC via init groovy only 2025-12-16 20:16:18 -03:00
cfa7bd8198 fix: jenkins casc OIDC using explicit endpoints 2025-12-16 20:13:52 -03:00
162fe3339f fix: pin Jenkins OIDC realm via JCasC 2025-12-16 20:04:21 -03:00
fc858fc8df ci: seed harbor-arm-build pipeline in Jenkins 2025-12-16 19:26:46 -03:00
8b9fc8ff1c chore: remove zot stack 2025-12-16 14:10:04 -03:00
3066db793d harbor: bootstrap arm64 images on titan-05 2025-12-16 11:16:34 -03:00
759a77c745 harbor: run arm64 images on rpi workers 2025-12-16 03:22:01 -03:00
c661658a12 Add AC Infinity ingestion plan 2025-12-16 01:45:04 -03:00
144a860a88 harbor: use project paths for crypto/pegasus images 2025-12-16 00:15:22 -03:00
bd64a36165 registry: point workloads to harbor 2025-12-16 00:08:11 -03:00
22b611f8ea harbor: set redis affinity to amd64 titan-22 first 2025-12-15 23:14:26 -03:00
a8bde2edc7 harbor: pin to amd64, prefer titan-22 2025-12-15 23:02:58 -03:00
d51a19cab9 harbor: prefer rpi nodes 2025-12-15 23:00:11 -03:00
3e3cab6845 harbor: increase helm timeout 2025-12-15 22:32:29 -03:00
9cda32c0bf harbor: use astreae storageclass for pvc 2025-12-15 22:22:48 -03:00
0f49849761 Regenerate dashboards after availability thresholds tweak 2025-12-15 22:14:26 -03:00
252743e416 harbor: use existing secrets and correct admin key 2025-12-15 22:08:52 -03:00
dba7cf00a4 harbor: deploy chart via flux 2025-12-15 22:05:40 -03:00
aa0df1f62b harbor: add helm repo and deploy via helmrelease 2025-12-15 22:05:32 -03:00
aa2bb09873 zot: allow upstream basic auth from oauth2-proxy 2025-12-15 14:22:48 -03:00
54406661f2 zot: forward authorization header to ui 2025-12-15 14:14:49 -03:00
caef505677 zot ui: send basic creds from oauth2-proxy, remove traefik header 2025-12-15 14:08:18 -03:00
54eb9e1ac5 zot: restore UI basic header middleware 2025-12-15 14:01:18 -03:00
1899bb7677 zot: move basic auth to oauth2-proxy upstream 2025-12-15 13:53:53 -03:00
0416493f49 zot: fix htpasswd volume to avoid type conflict 2025-12-15 13:00:51 -03:00
b87f06f6ff zot: add oauth proxy and user sync scripts 2025-12-15 12:57:02 -03:00
828f66d18c gitea: enable OIDC auto-registration 2025-12-14 23:08:38 -03:00
7a1f3bfc3f gitea: add proxy/session headers for OIDC 2025-12-14 22:25:46 -03:00
294542e718 gitea: reference secret via env; remove secret file 2025-12-14 22:16:49 -03:00
c3a8c7ddae gitea: remove committed secret and env refs 2025-12-14 22:10:13 -03:00
29da4be557 gitea: pin secret/internal token and include secret manifest 2025-12-14 22:06:25 -03:00
fc5b0cccf8 gitea: drop required claim constraint on keycloak auth 2025-12-14 21:58:36 -03:00
c8b89c3120 gitea: enforce keycloak auth source via init container 2025-12-14 21:54:18 -03:00
9b994111cb gitea: remove bootstrap job (immutable error) 2025-12-14 21:49:07 -03:00
a174e451d9 gitea: fix bootstrap job immutability 2025-12-14 21:47:50 -03:00
d8dab08cd8 gitea: set trace logging for oidc 2025-12-14 21:44:43 -03:00
0d93929e3d gitea: relax required signin, set admin group+skip 2fa 2025-12-14 21:42:08 -03:00
2ffc906487 gitea: enable debug logging for oauth 2025-12-14 21:38:32 -03:00
37761fa118 jenkins: fix OIDC retriever null 2025-12-14 21:23:15 -03:00
a46226bb0a ci: enable oidc for jenkins/gitops/gitea 2025-12-14 20:58:57 -03:00
04602a2914 jenkins: auto-configure OIDC via init script 2025-12-14 19:22:47 -03:00
fc0fa59981 jenkins: drop JCasC OIDC script to unblock startup 2025-12-14 18:10:49 -03:00
0286f4f317 jenkins: restore plugin list without pinned versions 2025-12-14 17:59:48 -03:00
90bf1f7d8e jenkins: start without plugin installs to unblock bootstrap 2025-12-14 16:02:05 -03:00
6def1aa479 jenkins: use latest plugin versions to avoid 404 2025-12-14 16:00:45 -03:00
4eff9ebcc1 jenkins: add helm release with ingress + astreae storage 2025-12-14 15:57:42 -03:00
ccfc473521 cleanup: stop tracking extra md files; switch gitops cert to letsencrypt 2025-12-14 15:52:12 -03:00
b575c64de1 chore: drop stray NOTES.md 2025-12-14 15:43:06 -03:00
39d732d74d git: ignore fixed 2025-12-14 15:39:27 -03:00
b28e393524 gitops-ui: open ingress for acme solver 2025-12-14 15:14:11 -03:00
694bb4d12e gitops-ui: allow acme solver from kube-system traefik 2025-12-14 15:12:38 -03:00
6993f51ef7 gitops-ui: allow acme solver ingress from traefik 2025-12-14 15:08:44 -03:00
85cea34fe8 gitops-ui: cert + switch flux to feature/ci-gitops 2025-12-14 15:04:13 -03:00
055ce7d18c Merge pull request 'feature/mailu' (#5) from feature/mailu into main
Reviewed-on: #5
2025-12-14 17:48:02 +00:00
1a161b4d3c monitoring: longer data history 2025-12-14 14:47:20 -03:00
f7bf990d62 flux: bump gitops-ui kustomization 2025-12-14 14:41:52 -03:00
63bf153c8b flux: add weave gitops ui 2025-12-14 14:38:08 -03:00
8fceebd7a7 nextcloud: integration with mailu & gitops-ui: initial install 2025-12-14 14:21:40 -03:00
0d0216c8f5 Add tests and dedupe nextcloud mail sync 2025-12-14 14:15:19 -03:00
c8b49560b6 Keep nextcloud scripts single-sourced under scripts/ 2025-12-14 14:05:01 -03:00
327a7bed57 Extract nextcloud scripts to files 2025-12-14 13:59:16 -03:00
aae09c5074 Normalize doc layout and README guidance 2025-12-14 13:47:59 -03:00
56bb4e91b9 Group namespace plurality rows to one per namespace 2025-12-13 22:17:47 -03:00
18f3a2cefe Fix namespace plurality mask and bump v26 2025-12-13 20:53:11 -03:00
1ec3ca29a4 Use OR-joined node ranks for plurality tie-break 2025-12-13 19:04:22 -03:00
4812958e82 Deduplicate namespace plurality rows with ranked tie-break 2025-12-13 18:39:31 -03:00
9ad5f7f405 Restore namespace plurality panel data 2025-12-13 18:25:03 -03:00
57ea397027 Use table format for namespace plurality panel 2025-12-13 18:23:19 -03:00
be0ac48b33 Simplify namespace plurality table rendering 2025-12-13 18:07:56 -03:00
2156b6f6aa Hide table footer on namespace plurality table 2025-12-13 18:03:51 -03:00
4fcc7c84f2 Make namespace plurality table non-filterable 2025-12-13 17:55:52 -03:00
a4b3273bab Remove filter bar from namespace plurality table 2025-12-13 17:38:57 -03:00
c536a13d55 Disable column filters on namespace plurality table 2025-12-13 17:35:52 -03:00
13eb02c19b Hide filters on namespace plurality table 2025-12-13 17:32:19 -03:00
134a4ad001 Fix namespace plurality table query 2025-12-13 17:29:55 -03:00
3e0a84b074 atlas pods: plurality table v11 (deterministic top node) 2025-12-13 17:19:03 -03:00
7f67793ee5 atlas pods: plurality table v10 2025-12-13 16:36:25 -03:00
e87d54f19d atlas pods: per-namespace top node via topk 2025-12-13 15:51:45 -03:00
6ac01e5879 atlas pods: simplify plurality table (no filter) 2025-12-13 15:29:08 -03:00
d0ed188179 monitoring: drop README per convention 2025-12-13 15:25:21 -03:00
b703e66b98 monitoring: restore README 2025-12-13 15:11:50 -03:00
68d4f43903 atlas pods: stabilize plurality query to avoid 422 2025-12-13 15:11:21 -03:00
cf9dacd4ea atlas pods: show per-namespace top node without vars 2025-12-13 15:02:52 -03:00
6eee7b8853 atlas pods: drop non-leading nodes in plurality table 2025-12-13 13:39:06 -03:00
03a4ca4d84 atlas pods: simplify plurality table query 2025-12-13 12:06:18 -03:00
c7adb0c8cb atlas pods: fix plurality table query 2025-12-13 12:00:31 -03:00
9d1163f580 atlas pods: use prom share() for plurality table 2025-12-13 11:53:27 -03:00
001f0f95a6 atlas pods: fix plurality query with bool max match 2025-12-13 11:51:18 -03:00
2177a8009e atlas pods: robust per-namespace top-node share 2025-12-13 11:48:44 -03:00
6a3d1311b9 atlas pods: select per-namespace top node via max match 2025-12-13 04:15:03 -03:00
d916e5a7f1 atlas pods: sort plurality table by node then share 2025-12-13 04:10:10 -03:00
5d6d34c274 atlas pods: simplify namespace plurality query 2025-12-13 04:06:46 -03:00
53423c7a46 atlas pods: fix namespace plurality query 2025-12-13 04:00:57 -03:00
d274738e9e restore readmes removed in last commit 2025-12-13 03:57:44 -03:00
f0265d6b94 atlas pods: add namespace plurality by node table 2025-12-13 03:57:20 -03:00
8a755e0c42 mailu: forcing version 1.4 clamav over 1.2 2025-12-13 00:11:40 -03:00
e22293db3e forcing 12-r3 over 12-r6 for redis 2025-12-12 22:09:04 -03:00
6f8a70fd58 atlas overview: include titan-db in control plane panels 2025-12-12 21:55:53 -03:00
580d1731f9 monitoring: drop duplicate titan-db scrape job 2025-12-12 21:48:03 -03:00
4def298b83 monitoring: scrape titan-db node_exporter 2025-12-12 21:38:10 -03:00
1166069640 atlas dashboards: align percent thresholds and disk bars 2025-12-12 21:13:31 -03:00
e56bed284e atlas overview: refine alert thresholds and availability colors 2025-12-12 20:50:41 -03:00
24376594ff atlas dashboards: use threshold colors for stats 2025-12-12 20:44:20 -03:00
5277c98385 atlas dashboards: fix pod share display and zero/red stat thresholds 2025-12-12 20:40:32 -03:00
056b7b7770 atlas dashboards: show pod counts (not %) and make zero-friendly stats 2025-12-12 20:30:00 -03:00
b770575b42 atlas dashboards: show pod counts with top12 bars 2025-12-12 20:20:13 -03:00
9e76277c22 atlas dashboards: drop empty nodes and enforce top12 pod bars 2025-12-12 19:09:51 -03:00
93b3c6d2ec atlas dashboards: cap pod count bars at top12 2025-12-12 18:56:13 -03:00
596bf46863 atlas dashboards: sort pod counts and add pod row to overview 2025-12-12 18:51:43 -03:00
8b703f8655 atlas pods: add pod count bar and tidy pie 2025-12-12 18:45:29 -03:00
ec59d25ad8 atlas dashboards: fix overview links and add pods-by-node pie 2025-12-12 18:32:45 -03:00
bf6179f907 atlas internal dashboards: add SLO/burn and api health panels 2025-12-12 18:00:43 -03:00
0a0966db78 atlas overview: fix availability scaling 2025-12-12 16:36:47 -03:00
87fbba0d3e atlas overview: show availability percent with 3 decimals 2025-12-12 16:15:37 -03:00
b200dba5b9 atlas overview: show availability percent and keep uptime centered 2025-12-12 16:11:28 -03:00
697ce3c18f atlas overview: center uptime and reorder top row 2025-12-12 15:56:33 -03:00
8e39c6a28b atlas overview: add uptime and crashloop panels 2025-12-12 15:23:51 -03:00
38ab8e3364 standardize cert issuers to letsencrypt 2025-12-12 15:18:40 -03:00
29d22ba539 mailu: fix unbound sidecar mounts 2025-12-12 01:19:27 -03:00
118032d2c6 mailu: use mvance unbound sidecar and current redis image 2025-12-12 01:12:48 -03:00
4cfe92feb2 mailu: remove force upgrade to avoid pvc replace 2025-12-12 01:09:25 -03:00
ca27cc95b6 mailu: add validating dns sidecar and disable vip hostports 2025-12-12 01:06:38 -03:00
6c77b8e7f8 restore docs after gitignore change 2025-12-12 00:50:02 -03:00
78195c4685 mailu: fix admin dns and tame vip 2025-12-12 00:49:45 -03:00
5ef0b4edf6 mailu: capture helm release and cert 2025-12-11 23:54:43 -03:00
9f226c1584 Merge pull request 'feature/sso' (#4) from feature/sso into main
Reviewed-on: #4
2025-12-11 20:43:34 +00:00
319b515882 zot: restore main branch config 2025-12-11 17:26:15 -03:00
cb2b2ec1cd zot: revert to unauthenticated registry 2025-12-11 17:22:16 -03:00
20cd185c0b vault: drop traefik basicauth 2025-12-11 17:09:05 -03:00
2f368f6975 zot,vault: remove oauth2-proxy sso 2025-12-11 17:04:19 -03:00
6c62d42f7a longhorn/vault: gate via oauth2-proxy 2025-12-07 19:44:02 -03:00
a7e9f1f7d8 auth: remove error middleware to allow redirect 2025-12-07 13:19:45 -03:00
ceb692f7ee oauth2-proxy: drop groups scope to avoid invalid_scope 2025-12-07 13:09:29 -03:00
24fbaad040 auth: forward-auth via external auth host (svc traffic flaky) 2025-12-07 13:03:29 -03:00
04aa32a762 oauth2-proxy: schedule on worker rpis 2025-12-07 12:49:38 -03:00
25ee698021 oauth2-proxy: ensure error middleware on auth ingress 2025-12-07 12:03:14 -03:00
4a089876ba auth: use internal oauth2-proxy svc for forward-auth 2025-12-07 11:25:29 -03:00
20bb776625 auth: add 401 redirect middleware to oauth2-proxy 2025-12-07 11:14:25 -03:00
5e59f20bc3 auth: point forward-auth to external auth host 2025-12-07 11:09:09 -03:00
dbede55ad4 oauth2-proxy: temporarily drop group restriction 2025-12-07 10:42:13 -03:00
27e5c9391c auth: add namespace-local forward-auth middlewares 2025-12-07 10:25:44 -03:00
8d5e6c267c auth: wire oauth2-proxy and enable grafana oidc 2025-12-07 02:01:21 -03:00
a55502fe27 add oauth2-proxy for SSO forward-auth 2025-12-06 14:42:24 -03:00
598bdfc727 keycloak: restrict to worker rpis with titan-24 fallback 2025-12-06 01:44:23 -03:00
88c7a1c2aa keycloak: require rpi nodes with titan-24 fallback 2025-12-06 01:40:24 -03:00
f4da27271e keycloak: prefer rpi nodes, avoid titan-24 2025-12-06 01:36:33 -03:00
141c05b08f keycloak: honor xforwarded headers and hostname url 2025-12-06 01:23:07 -03:00
f0a8f6d35e keycloak: enable health/metrics management port 2025-12-06 00:51:47 -03:00
1b01052eda keycloak: set fsGroup for data volume 2025-12-06 00:49:17 -03:00
1d346edd28 keycloak: remove optimized flag for first start 2025-12-06 00:43:24 -03:00
b14a9dcb98 chore: drop AGENTS.md from repo 2025-12-06 00:43:17 -03:00
47caf08885 notes: capture GPU share change and flux branch 2025-12-03 12:28:45 -03:00
0db149605d monitoring: show GPU share over dashboard range 2025-12-02 20:28:35 -03:00
f64e60c5a2 flux: add keycloak kustomization 2025-12-02 18:10:20 -03:00
61c5db5c99 flux: track feature/sso 2025-12-02 18:00:49 -03:00
2db550afdd keycloak: add raw manifests backed by shared postgres 2025-12-02 17:58:19 -03:00
65d389193f Merge pull request 'feature/atlas-monitoring' (#3) from feature/atlas-monitoring into main
Reviewed-on: #3
2025-12-02 20:52:35 +00:00
e80505a773 notes: add postgres centralization guidance 2025-12-02 17:36:37 -03:00
762aa7bb0f notes: add sso plan sketch 2025-12-02 17:14:45 -03:00
839fb94836 notes: update monitoring and next steps 2025-12-02 17:01:32 -03:00
6eba26b359 monitoring: show top12 root disks 2025-12-02 15:21:02 -03:00
ace383bedd monitoring: expand worker/control/root rows 2025-12-02 15:15:21 -03:00
b93636ecb9 monitoring: shrink hottest node row height 2025-12-02 15:12:16 -03:00
5df94a7937 monitoring: fix gpu share query and root bar labels 2025-12-02 14:56:36 -03:00
a3dc9391ee monitoring: polish dashboards and folders 2025-12-02 14:41:39 -03:00
eed67b3db0 monitoring: regen dashboards with gpu details 2025-12-02 13:16:00 -03:00
f1d0970aa0 monitoring: mirror dcgm-exporter as multi-arch 2025-12-02 12:36:24 -03:00
e26ef44d1a monitoring: run dcgm-exporter with nvidia runtime 2025-12-02 12:25:30 -03:00
a18c3e6f67 monitoring: always pull dcgm-exporter tag 2025-12-02 12:19:16 -03:00
ee923df567 monitoring: add registry pull secret for dcgm-exporter 2025-12-02 12:07:11 -03:00
d87a1dbc47 monitoring: allow dcgm rollout with unavailable node 2025-12-02 11:59:55 -03:00
5b89b0533e monitoring: use mirrored dcgm-exporter tag 2025-12-02 11:54:53 -03:00
d99bb06eeb monitoring: reenable dcgm exporter 2025-11-20 13:11:13 -03:00
75f6a59316 traefik: use responding timeouts only 2025-11-18 20:01:16 -03:00
630f1f2a81 traefik: extend upload timeouts 2025-11-18 19:43:19 -03:00
e4f93e85d2 monitoring: control-plane stat and namespace share tweaks 2025-11-18 17:09:13 -03:00
f06be37f44 monitoring: refine network metrics and control-plane allowance 2025-11-18 16:18:52 -03:00
c7b7bc7a6d monitoring: adjust overview spacing and net panels 2025-11-18 15:55:24 -03:00
7b2a69cfe3 monitoring: disable dcgm exporter 2025-11-18 15:10:58 -03:00
909cb4ff26 flux: disable wait for monitoring 2025-11-18 15:04:18 -03:00
5a2575d54e flux: scope monitoring health checks 2025-11-18 14:33:24 -03:00
46410c9a9d monitoring: fix dcgm image 2025-11-18 14:19:23 -03:00
ff056551c7 monitoring: refresh overview dashboards 2025-11-18 14:08:33 -03:00
8e6c0a3cfe monitoring: rework gpu share + gauges 2025-11-18 12:11:47 -03:00
497164a1ad monitoring: clean namespace gpu share and layout 2025-11-18 11:42:24 -03:00
fab5552039 monitoring: resolve pie errors and network data 2025-11-18 11:30:33 -03:00
7009a4f9ff monitoring: fix namespace gpu share and network stats 2025-11-18 11:12:03 -03:00
d7e4bcd533 monitoring: add gpu node fallback 2025-11-18 10:47:24 -03:00
ec76563a86 monitoring: source gpu pie from limits and node nets 2025-11-18 01:01:10 -03:00
5144bbe1f2 monitoring: fix gpu pie data and network panels 2025-11-18 00:31:51 -03:00
ac62387e07 monitoring: stabilize namespace pies and labels 2025-11-18 00:19:45 -03:00
2ba642d49f monitoring: add gpu pie and tidy net panels 2025-11-18 00:11:39 -03:00
beb3243839 Revert GPU pie chart additions 2025-11-17 23:42:55 -03:00
aef3176c1c monitoring: fix hottest stats and gpu share 2025-11-17 23:40:22 -03:00
f4dd1de43f monitoring: reorder namespace pies and add gpu data 2025-11-17 23:18:53 -03:00
0708522b28 monitoring: add namespace gpu share 2025-11-17 23:12:16 -03:00
c53c518301 monitoring: express namespace share as cluster percent 2025-11-17 22:58:57 -03:00
442a89d327 monitoring: fix pie colors & thresholds 2025-11-17 22:39:50 -03:00
255e014e0a monitoring: color namespace pies 2025-11-17 22:36:50 -03:00
cc62f497e9 monitoring: fix namespace share percentages 2025-11-17 22:19:01 -03:00
37e51b361b monitoring: normalize namespace share 2025-11-17 22:06:06 -03:00
be6052c47c monitoring: unify namespace share panels 2025-11-17 21:57:40 -03:00
b59677615c monitoring: worker/control-plane splits 2025-11-17 21:48:12 -03:00
76d3dc6ae2 monitoring: restore top1 hottest stats 2025-11-17 21:20:19 -03:00
53427cc8fa monitoring: fix net/io legend labels 2025-11-17 20:19:20 -03:00
b8998a3c6a monitoring: attach nodes to net/io stats 2025-11-17 20:14:11 -03:00
a67a6a1f3a monitoring: tidy hottest node labels 2025-11-17 20:04:50 -03:00
b28e7501b7 monitoring: show hottest node labels 2025-11-17 20:00:40 -03:00
4aece7e5cb monitoring: fix hottest node labels 2025-11-17 19:56:57 -03:00
bcaa0a3327 monitoring: show hottest node names 2025-11-17 19:53:39 -03:00
41e8a6a582 monitoring: reorder overview stats 2025-11-17 19:49:50 -03:00
a1e731e929 monitoring: fix hottest stats and titan-db scrape 2025-11-17 19:38:40 -03:00
fe8deea9c7 monitoring: tighten overview stats 2025-11-17 19:24:03 -03:00
349d9c56ac monitoring: polish dashboards 2025-11-17 18:55:11 -03:00
8f5781d3cf monitoring: rebuild atlas dashboards 2025-11-17 16:27:38 -03:00
a41f25e66d monitoring: restructure grafana dashboards 2025-11-17 14:22:46 -03:00
b004bf99dc monitoring: enrich dashboards 2025-11-16 12:58:08 -03:00
0b1437b77c monitoring: refresh grafana dashboards 2025-11-15 21:03:11 -03:00
eb3991b628 dashboards: improve public view and fix color 2025-11-15 11:59:48 -03:00
46b6b1f3b8 grafana: set datasource uid 2025-11-15 11:35:27 -03:00
683dc84289 grafana: use atlas metrics hostname 2025-11-15 11:18:40 -03:00
d0b6fbe763 victoria-metrics: revert storageclass change 2025-11-15 11:16:37 -03:00
3cfe639387 monitoring: fix domain 2025-11-14 19:13:40 -03:00
418329e173 monitoring: fix ingress and env formats 2025-11-14 08:51:09 -03:00
394fcf2ee4 grafana: use string host format 2025-11-14 08:37:46 -03:00
465103a57e grafana: fix dashboard provider list 2025-11-14 08:33:53 -03:00
c2cb901102 monitoring: fix grafana values 2025-11-14 08:29:59 -03:00
06337f2b9d monitoring: add grafana and alertmanager 2025-11-14 00:02:59 -03:00
a875b0a42e flux-system: track main branch 2025-11-12 01:06:26 -03:00
143 changed files with 18976 additions and 3841 deletions

.gitignore

@@ -1 +1,2 @@
-AGENTS.md
+*.md
+!README.md

README.md (new file)

@@ -0,0 +1,3 @@
# titan-iac
Flux-managed Kubernetes cluster for bstein.dev services.

@@ -9,4 +9,4 @@ resources:
 - ../../services/monitoring
 - ../../services/pegasus
 - ../../services/vault
-- ../../services/zot
+- ../../services/bstein-dev-home

@@ -0,0 +1,26 @@
# clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
  name: bstein-dev-home
  namespace: flux-system
spec:
  interval: 1m0s
  sourceRef:
    kind: GitRepository
    name: flux-system
    namespace: flux-system
  git:
    checkout:
      ref:
        branch: feature/ci-gitops
    commit:
      author:
        email: ops@bstein.dev
        name: flux-bot
      messageTemplate: "chore(bstein-dev-home): update images to {{range .Updated.Images}}{{.}}{{end}}"
    push:
      branch: feature/ci-gitops
  update:
    strategy: Setters
    path: services/bstein-dev-home
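
The Setters strategy only rewrites fields that carry an image-policy marker, so this automation depends on ImageRepository/ImagePolicy objects and marker comments that are not part of this diff. A minimal sketch, assuming the tag scheme implied by commits such as "ci-demo: set tag v0.0.0-1" and "ci-demo: fix imagepolicy tag regex" (image path, policy name, and regex here are illustrative, not taken from the repo):

# Hypothetical companion objects for the automation above.
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageRepository
metadata:
  name: bstein-dev-home
  namespace: flux-system
spec:
  image: registry.bstein.dev/apps/bstein-dev-home  # assumed image path
  interval: 1m0s
---
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImagePolicy
metadata:
  name: bstein-dev-home
  namespace: flux-system
spec:
  imageRepositoryRef:
    name: bstein-dev-home
  filterTags:
    pattern: '^v0\.0\.0-(?P<seq>\d+)$'  # assumed tag scheme
    extract: '$seq'
  policy:
    numerical:
      order: asc

The automated field in services/bstein-dev-home would then carry the setter marker Flux looks for, e.g.:

        image: registry.bstein.dev/apps/bstein-dev-home:v0.0.0-2 # {"$imagepolicy": "flux-system:bstein-dev-home"}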

@@ -0,0 +1,15 @@
# clusters/atlas/flux-system/applications/bstein-dev-home/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: bstein-dev-home
  namespace: flux-system
spec:
  interval: 10m
  path: ./services/bstein-dev-home
  prune: true
  sourceRef:
    kind: GitRepository
    name: flux-system
  targetNamespace: bstein-dev-home
  wait: false

@@ -0,0 +1,26 @@
# clusters/atlas/flux-system/applications/ci-demo/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
  name: ci-demo
  namespace: flux-system
spec:
  interval: 1m0s
  sourceRef:
    kind: GitRepository
    name: flux-system
    namespace: flux-system
  git:
    checkout:
      ref:
        branch: feature/ci-gitops
    commit:
      author:
        email: ops@bstein.dev
        name: flux-bot
      messageTemplate: "chore(ci-demo): apply image updates"
    push:
      branch: feature/ci-gitops
  update:
    strategy: Setters
    path: services/ci-demo

@@ -0,0 +1,17 @@
# clusters/atlas/flux-system/applications/ci-demo/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: ci-demo
  namespace: flux-system
spec:
  interval: 10m
  path: ./services/ci-demo
  prune: true
  sourceRef:
    kind: GitRepository
    name: flux-system
    namespace: flux-system
  dependsOn:
    - name: core
  wait: false

@@ -0,0 +1,27 @@
# clusters/atlas/flux-system/applications/harbor/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
  name: harbor
  namespace: harbor
spec:
  suspend: true
  interval: 5m0s
  sourceRef:
    kind: GitRepository
    name: flux-system
    namespace: flux-system
  git:
    checkout:
      ref:
        branch: feature/ci-gitops
    commit:
      author:
        email: ops@bstein.dev
        name: flux-bot
      messageTemplate: "chore(harbor): apply image updates"
    push:
      branch: feature/ci-gitops
  update:
    strategy: Setters
    path: ./services/harbor

@@ -0,0 +1,23 @@
# clusters/atlas/flux-system/applications/harbor/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: harbor
  namespace: flux-system
spec:
  interval: 10m
  path: ./services/harbor
  targetNamespace: harbor
  prune: false
  sourceRef:
    kind: GitRepository
    name: flux-system
    namespace: flux-system
  healthChecks:
    - apiVersion: helm.toolkit.fluxcd.io/v2
      kind: HelmRelease
      name: harbor
      namespace: harbor
  wait: false
  dependsOn:
    - name: core
@@ -0,0 +1,23 @@
# clusters/atlas/flux-system/applications/jenkins/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: jenkins
  namespace: flux-system
spec:
  interval: 10m
  path: ./services/jenkins
  prune: true
  sourceRef:
    kind: GitRepository
    name: flux-system
  targetNamespace: jenkins
  dependsOn:
    - name: helm
    - name: traefik
  healthChecks:
    - apiVersion: helm.toolkit.fluxcd.io/v2
      kind: HelmRelease
      name: jenkins
      namespace: jenkins
  wait: false

@@ -0,0 +1,15 @@
# clusters/atlas/flux-system/applications/keycloak/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: keycloak
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: flux-system
  path: ./services/keycloak
  targetNamespace: sso
  timeout: 2m

@@ -2,7 +2,6 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
-- zot/kustomization.yaml
 - gitea/kustomization.yaml
 - vault/kustomization.yaml
 - jitsi/kustomization.yaml
@@ -10,6 +9,16 @@ resources:
 - monerod/kustomization.yaml
 - pegasus/kustomization.yaml
 - pegasus/image-automation.yaml
+- bstein-dev-home/kustomization.yaml
+- bstein-dev-home/image-automation.yaml
+- harbor/kustomization.yaml
+- harbor/image-automation.yaml
 - jellyfin/kustomization.yaml
 - xmr-miner/kustomization.yaml
 - sui-metrics/kustomization.yaml
+- keycloak/kustomization.yaml
+- oauth2-proxy/kustomization.yaml
+- mailu/kustomization.yaml
+- jenkins/kustomization.yaml
+- ci-demo/kustomization.yaml
+- ci-demo/image-automation.yaml

@@ -1,18 +1,18 @@
-# clusters/atlas/flux-system/applications/zot/kustomization.yaml
+# clusters/atlas/flux-system/applications/mailu/kustomization.yaml
 apiVersion: kustomize.toolkit.fluxcd.io/v1
 kind: Kustomization
 metadata:
-  name: zot
+  name: mailu
   namespace: flux-system
 spec:
   interval: 10m
-  path: ./services/zot
-  targetNamespace: zot
-  prune: false
   sourceRef:
     kind: GitRepository
     name: flux-system
     namespace: flux-system
+  path: ./services/mailu
+  targetNamespace: mailu-mailserver
+  prune: true
+  wait: true
   dependsOn:
-    - name: core
+    - name: helm
@@ -0,0 +1,15 @@
# clusters/atlas/flux-system/applications/oauth2-proxy/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: oauth2-proxy
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: flux-system
  path: ./services/oauth2-proxy
  targetNamespace: sso
  timeout: 2m

@@ -1,5 +1,5 @@
 # clusters/atlas/flux-system/applications/pegasus/image-automation.yaml
-apiVersion: image.toolkit.fluxcd.io/v1beta1
+apiVersion: image.toolkit.fluxcd.io/v1
 kind: ImageUpdateAutomation
 metadata:
   name: pegasus
@@ -9,12 +9,18 @@ spec:
   sourceRef:
     kind: GitRepository
     name: flux-system
+    namespace: flux-system
   git:
     checkout:
       ref:
         branch: feature/ci-gitops
     commit:
       author:
         email: ops@bstein.dev
         name: flux-bot
-      messageTemplate: "chore(pegasus): update image to {{range .Updated.Images}}{{.}}{{end}}"
+      messageTemplate: "chore(pegasus): apply image updates"
     push:
       branch: feature/ci-gitops
   update:
     strategy: Setters
-    path: ./services/pegasus
+    path: services/pegasus

(file diff suppressed because it is too large)

@@ -8,7 +8,7 @@ metadata:
 spec:
   interval: 1m0s
   ref:
-    branch: restructure/hybrid-clusters
+    branch: feature/ci-gitops
   secretRef:
     name: flux-system-gitea
   url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git

@@ -0,0 +1,20 @@
# clusters/atlas/flux-system/platform/gitops-ui/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: gitops-ui
  namespace: flux-system
spec:
  interval: 10m
  timeout: 10m
  path: ./services/gitops-ui
  prune: true
  sourceRef:
    kind: GitRepository
    name: flux-system
    namespace: flux-system
  targetNamespace: flux-system
  dependsOn:
    - name: helm
    - name: traefik
  wait: true

@@ -5,5 +5,6 @@ resources:
 - core/kustomization.yaml
 - helm/kustomization.yaml
 - traefik/kustomization.yaml
+- gitops-ui/kustomization.yaml
 - monitoring/kustomization.yaml
 - longhorn-ui/kustomization.yaml

@@ -11,4 +11,4 @@ spec:
   sourceRef:
     kind: GitRepository
     name: flux-system
-  wait: true
+  wait: false

@@ -1,5 +0,0 @@
-# Oceanus Cluster Scaffold
-This directory prepares the Flux and Kustomize layout for a future Oceanus-managed cluster.
-Populate `flux-system/` with `gotk-components.yaml` and related manifests after running `flux bootstrap`.
-Define node-specific resources under `infrastructure/modules/profiles/oceanus-validator/` and reference workloads in `applications/` as they come online.

@@ -1,16 +0,0 @@
-# Titan Homelab Topology
-| Hostname | Role / Function | Managed By | Notes |
-|------------|--------------------------------|---------------------|-------|
-| titan-0a | Kubernetes control-plane | Flux (atlas cluster)| HA leader, tainted for control only |
-| titan-0b | Kubernetes control-plane | Flux (atlas cluster)| Standby control node |
-| titan-0c | Kubernetes control-plane | Flux (atlas cluster)| Standby control node |
-| titan-04-19| Raspberry Pi workers | Flux (atlas cluster)| Workload nodes, labelled per hardware |
-| titan-22 | GPU mini-PC (Jellyfin) | Flux + Ansible | NVIDIA runtime managed via `modules/profiles/atlas-ha` |
-| titan-24 | Tethys hybrid node | Flux + Ansible | Runs SUI metrics via K8s, validator via Ansible |
-| titan-db | HA control plane database | Ansible | PostgreSQL / etcd backing services |
-| titan-jh | Jumphost & bastion | Ansible | Entry point / future KVM services |
-| oceanus | Dedicated SUI validator host | Ansible / Flux prep | Baremetal validator workloads, exposes metrics to atlas; Kustomize scaffold under `clusters/oceanus/` |
-| styx | Air-gapped workstation | Manual / Scripts | Remains isolated, scripts tracked in `hosts/styx` |
-Use the `clusters/` directory for cluster-scoped state and the `hosts/` directory for baremetal orchestration.

@@ -1,2 +0,0 @@
-# hosts/styx/README.md
-Styx is air-gapped; provisioning scripts live under `scripts/`.

@@ -5,3 +5,4 @@ resources:
 - ../modules/base
 - ../modules/profiles/atlas-ha
 - ../sources/cert-manager/letsencrypt.yaml
+- ../sources/cert-manager/letsencrypt-prod.yaml

@@ -7,7 +7,7 @@ metadata:
   annotations:
     traefik.ingress.kubernetes.io/router.entrypoints: websecure
     traefik.ingress.kubernetes.io/router.tls: "true"
-    traefik.ingress.kubernetes.io/router.middlewares: longhorn-system-longhorn-basicauth@kubernetescrd,longhorn-system-longhorn-headers@kubernetescrd
+    traefik.ingress.kubernetes.io/router.middlewares: ""
 spec:
   ingressClassName: traefik
   tls:
@@ -21,6 +21,6 @@ spec:
             pathType: Prefix
             backend:
               service:
-                name: longhorn-frontend
+                name: oauth2-proxy-longhorn
                 port:
                   number: 80

@@ -4,3 +4,4 @@ kind: Kustomization
 resources:
 - middleware.yaml
 - ingress.yaml
+- oauth2-proxy-longhorn.yaml

@@ -20,3 +20,20 @@ spec:
   headers:
     customRequestHeaders:
       X-Forwarded-Proto: "https"
+---
+apiVersion: traefik.io/v1alpha1
+kind: Middleware
+metadata:
+  name: longhorn-forward-auth
+  namespace: longhorn-system
+spec:
+  forwardAuth:
+    address: https://auth.bstein.dev/oauth2/auth
+    trustForwardHeader: true
+    authResponseHeaders:
+      - Authorization
+      - X-Auth-Request-Email
+      - X-Auth-Request-User
+      - X-Auth-Request-Groups
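
The new forward-auth middleware is defined here but, per the ingress change above, not attached: router.middlewares was cleared in favour of routing through the oauth2-proxy deployment below. If it were attached, the reference would look like this sketch (annotation value only; `<namespace>-<name>@kubernetescrd` is Traefik's syntax for CRD-defined middlewares):

    traefik.ingress.kubernetes.io/router.middlewares: longhorn-system-longhorn-forward-auth@kubernetescrd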

@@ -0,0 +1,102 @@
# infrastructure/longhorn/ui-ingress/oauth2-proxy-longhorn.yaml
apiVersion: v1
kind: Service
metadata:
  name: oauth2-proxy-longhorn
  namespace: longhorn-system
  labels:
    app: oauth2-proxy-longhorn
spec:
  ports:
    - name: http
      port: 80
      targetPort: 4180
  selector:
    app: oauth2-proxy-longhorn
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: oauth2-proxy-longhorn
  namespace: longhorn-system
  labels:
    app: oauth2-proxy-longhorn
spec:
  replicas: 2
  selector:
    matchLabels:
      app: oauth2-proxy-longhorn
  template:
    metadata:
      labels:
        app: oauth2-proxy-longhorn
    spec:
      nodeSelector:
        node-role.kubernetes.io/worker: "true"
      affinity:
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 90
              preference:
                matchExpressions:
                  - key: hardware
                    operator: In
                    values: ["rpi5", "rpi4"]
      containers:
        - name: oauth2-proxy
          image: quay.io/oauth2-proxy/oauth2-proxy:v7.6.0
          imagePullPolicy: IfNotPresent
          args:
            - --provider=oidc
            - --redirect-url=https://longhorn.bstein.dev/oauth2/callback
            - --oidc-issuer-url=https://sso.bstein.dev/realms/atlas
            - --scope=openid profile email groups
            - --email-domain=*
            - --allowed-group=admin
            - --set-xauthrequest=true
            - --pass-access-token=true
            - --set-authorization-header=true
            - --cookie-secure=true
            - --cookie-samesite=lax
            - --cookie-refresh=20m
            - --cookie-expire=168h
            - --insecure-oidc-allow-unverified-email=true
            - --upstream=http://longhorn-frontend.longhorn-system.svc.cluster.local
            - --http-address=0.0.0.0:4180
            - --skip-provider-button=true
            - --skip-jwt-bearer-tokens=true
            - --oidc-groups-claim=groups
            - --cookie-domain=longhorn.bstein.dev
          env:
            - name: OAUTH2_PROXY_CLIENT_ID
              valueFrom:
                secretKeyRef:
                  name: oauth2-proxy-longhorn-oidc
                  key: client_id
            - name: OAUTH2_PROXY_CLIENT_SECRET
              valueFrom:
                secretKeyRef:
                  name: oauth2-proxy-longhorn-oidc
                  key: client_secret
            - name: OAUTH2_PROXY_COOKIE_SECRET
              valueFrom:
                secretKeyRef:
                  name: oauth2-proxy-longhorn-oidc
                  key: cookie_secret
          ports:
            - containerPort: 4180
              name: http
          readinessProbe:
            httpGet:
              path: /ping
              port: 4180
            initialDelaySeconds: 5
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /ping
              port: 4180
            initialDelaySeconds: 20
            periodSeconds: 20
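
The deployment references an oauth2-proxy-longhorn-oidc secret that is not part of this diff. A sketch of its assumed shape (all values are placeholders; oauth2-proxy requires the cookie secret to be 16, 24, or 32 bytes):

apiVersion: v1
kind: Secret
metadata:
  name: oauth2-proxy-longhorn-oidc
  namespace: longhorn-system
type: Opaque
stringData:
  client_id: longhorn                       # assumed Keycloak client id
  client_secret: "<client secret from Keycloak>"
  cookie_secret: "<random 32-byte value>"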

@@ -0,0 +1,14 @@
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: letsencrypt-prod
spec:
  acme:
    email: brad.stein@gmail.com
    server: https://acme-v02.api.letsencrypt.org/directory
    privateKeySecretRef:
      name: letsencrypt-prod-account-key
    solvers:
      - http01:
          ingress:
            class: traefik
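
Workloads opt into this issuer per ingress; the "standardize cert issuers to letsencrypt" commit suggests the usual cert-manager annotation pattern, sketched here with a hypothetical host:

apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: example
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-prod
spec:
  tls:
    - hosts: ["example.bstein.dev"]   # hypothetical host
      secretName: example-tls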

@@ -0,0 +1,9 @@
# infrastructure/sources/helm/harbor.yaml
apiVersion: source.toolkit.fluxcd.io/v1beta2
kind: HelmRepository
metadata:
  name: harbor
  namespace: flux-system
spec:
  interval: 10m
  url: https://helm.goharbor.io
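
This repository is consumed by a HelmRelease in the suppressed services/harbor diff. An illustrative shape, assuming the chart name and folding in the behaviour from the "harbor: add helm remediation and timeouts" commit (retry counts and timeout are assumptions):

apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: harbor
  namespace: harbor
spec:
  interval: 10m
  timeout: 20m                 # assumed value
  chart:
    spec:
      chart: harbor
      sourceRef:
        kind: HelmRepository
        name: harbor
        namespace: flux-system
  install:
    remediation:
      retries: 3               # assumed value
  upgrade:
    remediation:
      retries: 3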

@@ -0,0 +1,9 @@
# infrastructure/sources/helm/jenkins.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: jenkins
  namespace: flux-system
spec:
  interval: 1h
  url: https://charts.jenkins.io

@@ -0,0 +1,12 @@
# infrastructure/sources/helm/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- grafana.yaml
- hashicorp.yaml
- jetstack.yaml
- jenkins.yaml
- mailu.yaml
- harbor.yaml
- prometheus.yaml
- victoria-metrics.yaml

@@ -0,0 +1,9 @@
# infrastructure/sources/helm/mailu.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: mailu
  namespace: flux-system
spec:
  interval: 1h
  url: https://mailu.github.io/helm-charts

@@ -39,6 +39,12 @@ items:
         - --metrics.prometheus.addEntryPointsLabels=true
         - --metrics.prometheus.addRoutersLabels=true
         - --metrics.prometheus.addServicesLabels=true
+        - --entrypoints.web.transport.respondingTimeouts.readTimeout=0s
+        - --entrypoints.web.transport.respondingTimeouts.writeTimeout=0s
+        - --entrypoints.web.transport.respondingTimeouts.idleTimeout=0s
+        - --entrypoints.websecure.transport.respondingTimeouts.readTimeout=0s
+        - --entrypoints.websecure.transport.respondingTimeouts.writeTimeout=0s
+        - --entrypoints.websecure.transport.respondingTimeouts.idleTimeout=0s
         - --entrypoints.metrics.address=:9100
         - --metrics.prometheus.entryPoint=metrics
         image: traefik:v3.3.3

@@ -371,9 +371,9 @@ function xmrwallet_bootstrap --description "Interactive setup of monero-wallet-r
     echo "Skipping daemon probe due to xmrwallet_SKIP_DAEMON_CHECK=1"
   end
-  # Use your private image by default (in Zot)
-  read -P "Container image for wallet RPC [registry.bstein.dev/infra/monero-wallet-rpc:0.18.4.1]: " image
-  if test -z "$image"; set image registry.bstein.dev/infra/monero-wallet-rpc:0.18.4.1; end
+  # Use your private image by default (in Harbor)
+  read -P "Container image for wallet RPC [registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1]: " image
+  if test -z "$image"; set image registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1; end
   _require "Container image" $image; or return 1
   # --- Secrets (defaults: RPC user=wallet name, passwords auto if missing)
@@ -1375,4 +1375,3 @@ function xmrwallet_help_detailed
   echo " Probes it via a temporary port-forward so it works from your workstation."
   echo " Set xmrwallet_SKIP_DAEMON_CHECK=1 to bypass the daemon probe (not recommended)."
 end

@@ -23,7 +23,7 @@ end
 # Default image chooser (you should override with your own multi-arch image)
 function _sui_default_image -a NET
-  echo registry.bstein.dev/infra/sui-tools:1.53.2
+  echo registry.bstein.dev/crypto/sui-tools:1.53.2
 end
 # Convert any string to a k8s-safe name (RFC-1123 label-ish)

(file diff suppressed because it is too large)

scripts/gitea_cred_sync.sh (new executable file)

@@ -0,0 +1,92 @@
#!/usr/bin/env bash
# Sync Keycloak users into Gitea local accounts (for CLI + tokens).
# Requires: curl, jq, kubectl. Expects a Keycloak client with realm-management
# permissions (manage-users) and a Gitea admin token stored in a secret.
set -euo pipefail

require() { command -v "$1" >/dev/null 2>&1 || { echo "missing required binary: $1" >&2; exit 1; }; }
require curl; require jq; require kubectl

: "${KEYCLOAK_URL:=https://sso.bstein.dev}"
: "${KEYCLOAK_REALM:=atlas}"
: "${KEYCLOAK_CLIENT_ID:?set KEYCLOAK_CLIENT_ID or export via secret}"
: "${KEYCLOAK_CLIENT_SECRET:?set KEYCLOAK_CLIENT_SECRET or export via secret}"
: "${GITEA_BASE_URL:=https://scm.bstein.dev}"
: "${GITEA_NAMESPACE:=gitea}"
: "${GITEA_TOKEN_SECRET_NAME:=gitea-admin-token}"
: "${GITEA_TOKEN_SECRET_KEY:=token}"
: "${DEFAULT_PASSWORD:=TempSsoPass!2025}"

fetch_token() {
  curl -fsS -X POST \
    -d "grant_type=client_credentials" \
    -d "client_id=${KEYCLOAK_CLIENT_ID}" \
    -d "client_secret=${KEYCLOAK_CLIENT_SECRET}" \
    "${KEYCLOAK_URL}/realms/${KEYCLOAK_REALM}/protocol/openid-connect/token" \
    | jq -r '.access_token'
}

pull_users() {
  local token="$1"
  curl -fsS -H "Authorization: Bearer ${token}" \
    "${KEYCLOAK_URL}/admin/realms/${KEYCLOAK_REALM}/users?max=500" \
    | jq -r '.[] | select(.enabled == true) | select(.username | startswith("service-account-") | not) | [.username, (.email // ""), (.firstName // ""), (.lastName // "")] | @tsv'
}

get_gitea_token() {
  if [[ -n "${GITEA_ADMIN_TOKEN:-}" ]]; then
    echo "${GITEA_ADMIN_TOKEN}"
    return
  fi
  kubectl -n "${GITEA_NAMESPACE}" get secret "${GITEA_TOKEN_SECRET_NAME}" -o "jsonpath={.data.${GITEA_TOKEN_SECRET_KEY}}" \
    | base64 -d
}

user_exists() {
  local token="$1" username="$2"
  local code
  code=$(curl -s -o /dev/null -w '%{http_code}' \
    -H "Authorization: token ${token}" \
    "${GITEA_BASE_URL}/api/v1/admin/users/${username}")
  [[ "${code}" == "200" ]]
}

create_user() {
  local token="$1" username="$2" email="$3" fname="$4" lname="$5"
  local body status fullname
  fullname="$(echo "${fname} ${lname}" | xargs)"
  if [[ -z "${email}" ]]; then
    email="${username}@example.local"
  fi
  body=$(jq -n --arg u "${username}" --arg e "${email}" --arg p "${DEFAULT_PASSWORD}" \
    --arg fn "${fullname}" '{username:$u, email:$e, password:$p, must_change_password:false, full_name:$fn}')
  status=$(curl -s -o /dev/null -w '%{http_code}' \
    -H "Authorization: token ${token}" \
    -H "Content-Type: application/json" \
    -X POST \
    -d "${body}" \
    "${GITEA_BASE_URL}/api/v1/admin/users")
  if [[ "${status}" == "201" ]]; then
    echo "created gitea user ${username}"
  elif [[ "${status}" == "409" ]]; then
    echo "gitea user ${username} already exists (409)" >&2
  else
    echo "failed to create gitea user ${username} (status ${status})" >&2
  fi
}

main() {
  local kc_token gitea_token
  kc_token="$(fetch_token)"
  gitea_token="$(get_gitea_token)"
  while IFS=$'\t' read -r username email fname lname; do
    if user_exists "${gitea_token}" "${username}"; then
      continue
    fi
    create_user "${gitea_token}" "${username}" "${email}" "${fname}" "${lname}"
  done < <(pull_users "${kc_token}")
}

main "$@"

scripts/gitops_cred_sync.sh (new executable file)

@@ -0,0 +1,87 @@
#!/usr/bin/env bash
# Ensure Keycloak users are in the GitOps admin group used by weave-gitops (cd.bstein.dev).
# Weave GitOps relies on OIDC; membership in the "admin" group maps to cluster-admin via RBAC.
# Requires: curl, jq. Needs a Keycloak client with realm-management (manage-users/groups).
set -euo pipefail

require() { command -v "$1" >/dev/null 2>&1 || { echo "missing required binary: $1" >&2; exit 1; }; }
require curl; require jq

: "${KEYCLOAK_URL:=https://sso.bstein.dev}"
: "${KEYCLOAK_REALM:=atlas}"
: "${KEYCLOAK_CLIENT_ID:?set KEYCLOAK_CLIENT_ID or export via secret}"
: "${KEYCLOAK_CLIENT_SECRET:?set KEYCLOAK_CLIENT_SECRET or export via secret}"
: "${GITOPS_GROUP:=admin}"
# Comma-separated usernames to sync; set SYNC_ALL_USERS=true to include all Keycloak users.
: "${TARGET_USERNAMES:=bstein}"
: "${SYNC_ALL_USERS:=false}"

fetch_token() {
  curl -fsS -X POST \
    -d "grant_type=client_credentials" \
    -d "client_id=${KEYCLOAK_CLIENT_ID}" \
    -d "client_secret=${KEYCLOAK_CLIENT_SECRET}" \
    "${KEYCLOAK_URL}/realms/${KEYCLOAK_REALM}/protocol/openid-connect/token" \
    | jq -r '.access_token'
}

ensure_group() {
  local token="$1" group="$2" id
  id=$(curl -fsS -H "Authorization: Bearer ${token}" \
    "${KEYCLOAK_URL}/admin/realms/${KEYCLOAK_REALM}/groups?search=${group}" \
    | jq -r --arg g "${group}" '.[] | select(.name==$g) | .id' | head -n1)
  if [[ -n "${id}" ]]; then
    echo "${id}"
    return
  fi
  curl -fsS -H "Authorization: Bearer ${token}" \
    -H "Content-Type: application/json" \
    -d "{\"name\":\"${group}\"}" \
    -X POST "${KEYCLOAK_URL}/admin/realms/${KEYCLOAK_REALM}/groups"
  # Fetch again to get id
  curl -fsS -H "Authorization: Bearer ${token}" \
    "${KEYCLOAK_URL}/admin/realms/${KEYCLOAK_REALM}/groups?search=${group}" \
    | jq -r --arg g "${group}" '.[] | select(.name==$g) | .id' | head -n1
}

user_id_by_name() {
  local token="$1" username="$2"
  curl -fsS -H "Authorization: Bearer ${token}" \
    "${KEYCLOAK_URL}/admin/realms/${KEYCLOAK_REALM}/users?username=${username}" \
    | jq -r '.[0].id'
}

add_user_to_group() {
  local token="$1" user_id="$2" group_id="$3" username="$4"
  if [[ -z "${user_id}" ]]; then
    echo "user ${username} not found in Keycloak; skip" >&2
    return
  fi
  curl -fsS -o /dev/null -w '%{http_code}' \
    -H "Authorization: Bearer ${token}" \
    -X PUT "${KEYCLOAK_URL}/admin/realms/${KEYCLOAK_REALM}/users/${user_id}/groups/${group_id}" \
    | grep -qE '^(204|409)$' || echo "failed adding ${username} to group" >&2
}

main() {
  local token group_id users=()
  token="$(fetch_token)"
  group_id="$(ensure_group "${token}" "${GITOPS_GROUP}")"
  if [[ "${SYNC_ALL_USERS}" == "true" ]]; then
    readarray -t users < <(curl -fsS -H "Authorization: Bearer ${token}" \
      "${KEYCLOAK_URL}/admin/realms/${KEYCLOAK_REALM}/users?max=500" \
      | jq -r '.[] | select(.enabled==true) | .username')
  else
    IFS=',' read -ra users <<< "${TARGET_USERNAMES}"
  fi
  for user in "${users[@]}"; do
    user="$(echo "${user}" | xargs)"
    [[ -z "${user}" ]] && continue
    add_user_to_group "${token}" "$(user_id_by_name "${token}" "${user}")" "${group_id}" "${user}"
  done
}

main "$@"

scripts/jenkins_cred_sync.sh (new executable file)

@@ -0,0 +1,94 @@
#!/usr/bin/env bash
# Sync Keycloak users into Jenkins local accounts (for CLI/API use).
# Jenkins is OIDC-enabled, but local users can still be provisioned for tokens.
# Requires: curl, jq, kubectl. Needs Jenkins admin user+API token.
set -euo pipefail

require() { command -v "$1" >/dev/null 2>&1 || { echo "missing required binary: $1" >&2; exit 1; }; }
require curl; require jq; require kubectl

: "${KEYCLOAK_URL:=https://sso.bstein.dev}"
: "${KEYCLOAK_REALM:=atlas}"
: "${KEYCLOAK_CLIENT_ID:?set KEYCLOAK_CLIENT_ID or export via secret}"
: "${KEYCLOAK_CLIENT_SECRET:?set KEYCLOAK_CLIENT_SECRET or export via secret}"
: "${JENKINS_URL:=https://ci.bstein.dev}"
: "${JENKINS_NAMESPACE:=jenkins}"
: "${JENKINS_ADMIN_SECRET_NAME:=jenkins-admin-token}"
: "${JENKINS_ADMIN_USER_KEY:=username}"
: "${JENKINS_ADMIN_TOKEN_KEY:=token}"
: "${DEFAULT_PASSWORD:=TempSsoPass!2025}"

fetch_token() {
  curl -fsS -X POST \
    -d "grant_type=client_credentials" \
    -d "client_id=${KEYCLOAK_CLIENT_ID}" \
    -d "client_secret=${KEYCLOAK_CLIENT_SECRET}" \
    "${KEYCLOAK_URL}/realms/${KEYCLOAK_REALM}/protocol/openid-connect/token" \
    | jq -r '.access_token'
}

pull_users() {
  local token="$1"
  curl -fsS -H "Authorization: Bearer ${token}" \
    "${KEYCLOAK_URL}/admin/realms/${KEYCLOAK_REALM}/users?max=500" \
    | jq -r '.[] | select(.enabled == true) | select(.username | startswith("service-account-") | not) | [.id, .username, (.email // "")] | @tsv'
}

get_admin_auth() {
  local user token
  if [[ -n "${JENKINS_ADMIN_USER:-}" && -n "${JENKINS_ADMIN_TOKEN:-}" ]]; then
    echo "${JENKINS_ADMIN_USER}:${JENKINS_ADMIN_TOKEN}"
    return
  fi
  user=$(kubectl -n "${JENKINS_NAMESPACE}" get secret "${JENKINS_ADMIN_SECRET_NAME}" -o "jsonpath={.data.${JENKINS_ADMIN_USER_KEY}}" | base64 -d)
  token=$(kubectl -n "${JENKINS_NAMESPACE}" get secret "${JENKINS_ADMIN_SECRET_NAME}" -o "jsonpath={.data.${JENKINS_ADMIN_TOKEN_KEY}}" | base64 -d)
  echo "${user}:${token}"
}

get_crumb() {
  local auth="$1"
  curl -fsS -u "${auth}" "${JENKINS_URL}/crumbIssuer/api/json" | jq -r .crumb
}

user_exists() {
  local auth="$1" user="$2"
  local code
  code=$(curl -s -o /dev/null -w '%{http_code}' -u "${auth}" "${JENKINS_URL}/user/${user}/api/json")
  [[ "${code}" == "200" ]]
}

create_user() {
  local auth="$1" crumb="$2" username="$3" email="$4"
  local status
  status=$(curl -s -o /dev/null -w '%{http_code}' \
    -u "${auth}" \
    -H "Jenkins-Crumb: ${crumb}" \
    -X POST \
    --data "username=${username}&password1=${DEFAULT_PASSWORD}&password2=${DEFAULT_PASSWORD}&fullname=${username}&email=${email}" \
    "${JENKINS_URL}/securityRealm/createAccountByAdmin")
  if [[ "${status}" == "200" || "${status}" == "302" ]]; then
    echo "created jenkins user ${username}"
  elif [[ "${status}" == "400" ]]; then
    echo "jenkins user ${username} already exists (400)" >&2
  else
    echo "failed to create jenkins user ${username} (status ${status})" >&2
  fi
}

main() {
  local kc_token auth crumb
  kc_token="$(fetch_token)"
  auth="$(get_admin_auth)"
  crumb="$(get_crumb "${auth}")"
  # pull_users emits: id <tab> username <tab> email; the Keycloak id is unused here.
  while IFS=$'\t' read -r _ uid email; do
    if user_exists "${auth}" "${uid}"; then
      continue
    fi
    create_user "${auth}" "${crumb}" "${uid}" "${email}"
  done < <(pull_users "${kc_token}")
}

main "$@"

@@ -1,6 +1,6 @@
 #!/usr/bin/env fish
-function pvc-usage --description "Show Longhorn PVC usage (human-readable) mapped to namespace/name"
+function pvc-usage --description "Show Longhorn PVC usage mapped to namespace/name"
   begin
     kubectl -n longhorn-system get volumes.longhorn.io -o json \
     | jq -r '.items[] | "\(.metadata.name)\t\(.status.actualSize)\t\(.spec.size)"' \

scripts/mailu_sync.py (new file)

@@ -0,0 +1,204 @@
#!/usr/bin/env python3
"""
Sync Keycloak users to Mailu mailboxes.
- Generates/stores a mailu_app_password attribute in Keycloak (admin-only)
- Upserts the mailbox in Mailu Postgres using that password
"""
import os
import sys
import json
import time
import secrets
import string
import datetime

import requests
import psycopg2
from psycopg2.extras import RealDictCursor
from passlib.hash import bcrypt_sha256

KC_BASE = os.environ["KEYCLOAK_BASE_URL"].rstrip("/")
KC_REALM = os.environ["KEYCLOAK_REALM"]
KC_CLIENT_ID = os.environ["KEYCLOAK_CLIENT_ID"]
KC_CLIENT_SECRET = os.environ["KEYCLOAK_CLIENT_SECRET"]
MAILU_DOMAIN = os.environ["MAILU_DOMAIN"]
MAILU_DEFAULT_QUOTA = int(os.environ.get("MAILU_DEFAULT_QUOTA", "20000000000"))
DB_CONFIG = {
    "host": os.environ["MAILU_DB_HOST"],
    "port": int(os.environ.get("MAILU_DB_PORT", "5432")),
    "dbname": os.environ["MAILU_DB_NAME"],
    "user": os.environ["MAILU_DB_USER"],
    "password": os.environ["MAILU_DB_PASSWORD"],
}
SESSION = requests.Session()


def log(msg):
    sys.stdout.write(f"{msg}\n")
    sys.stdout.flush()


def get_kc_token():
    resp = SESSION.post(
        f"{KC_BASE}/realms/{KC_REALM}/protocol/openid-connect/token",
        data={
            "grant_type": "client_credentials",
            "client_id": KC_CLIENT_ID,
            "client_secret": KC_CLIENT_SECRET,
        },
        timeout=15,
    )
    resp.raise_for_status()
    return resp.json()["access_token"]


def kc_get_users(token):
    users = []
    first = 0
    max_results = 200
    headers = {"Authorization": f"Bearer {token}"}
    while True:
        resp = SESSION.get(
            f"{KC_BASE}/admin/realms/{KC_REALM}/users",
            params={"first": first, "max": max_results, "enabled": "true"},
            headers=headers,
            timeout=20,
        )
        resp.raise_for_status()
        batch = resp.json()
        users.extend(batch)
        if len(batch) < max_results:
            break
        first += max_results
    return users


def kc_update_attributes(token, user, attributes):
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    }
    payload = {
        "firstName": user.get("firstName"),
        "lastName": user.get("lastName"),
        "email": user.get("email"),
        "enabled": user.get("enabled", True),
        "username": user["username"],
        "emailVerified": user.get("emailVerified", False),
        "attributes": attributes,
    }
    user_url = f"{KC_BASE}/admin/realms/{KC_REALM}/users/{user['id']}"
    resp = SESSION.put(user_url, headers=headers, json=payload, timeout=20)
    resp.raise_for_status()
    verify = SESSION.get(
        user_url,
        headers={"Authorization": f"Bearer {token}"},
        params={"briefRepresentation": "false"},
        timeout=15,
    )
    verify.raise_for_status()
    attrs = verify.json().get("attributes") or {}
    if not attrs.get("mailu_app_password"):
        raise Exception(f"attribute not persisted for {user.get('email') or user['username']}")


def random_password():
    alphabet = string.ascii_letters + string.digits
    return "".join(secrets.choice(alphabet) for _ in range(24))


def ensure_mailu_user(cursor, email, password, display_name):
    localpart, domain = email.split("@", 1)
    if domain.lower() != MAILU_DOMAIN.lower():
        return
    hashed = bcrypt_sha256.hash(password)
    now = datetime.datetime.utcnow()
    cursor.execute(
        """
        INSERT INTO "user" (
            email, localpart, domain_name, password,
            quota_bytes, quota_bytes_used,
            global_admin, enabled, enable_imap, enable_pop, allow_spoofing,
            forward_enabled, forward_destination, forward_keep,
            reply_enabled, reply_subject, reply_body, reply_startdate, reply_enddate,
            displayed_name, spam_enabled, spam_mark_as_read, spam_threshold,
            change_pw_next_login, created_at, updated_at, comment
        )
        VALUES (
            %(email)s, %(localpart)s, %(domain)s, %(password)s,
            %(quota)s, 0,
            false, true, true, true, false,
            false, '', true,
            false, NULL, NULL, DATE '1900-01-01', DATE '2999-12-31',
            %(display)s, true, true, 80,
            false, CURRENT_DATE, %(now)s, ''
        )
        ON CONFLICT (email) DO UPDATE
        SET password = EXCLUDED.password,
            enabled = true,
            updated_at = EXCLUDED.updated_at
        """,
        {
            "email": email,
            "localpart": localpart,
            "domain": domain,
            "password": hashed,
            "quota": MAILU_DEFAULT_QUOTA,
            "display": display_name or localpart,
            "now": now,
        },
    )


def main():
    token = get_kc_token()
    users = kc_get_users(token)
    if not users:
        log("No users found; exiting.")
        return
    conn = psycopg2.connect(**DB_CONFIG)
    conn.autocommit = True
    cursor = conn.cursor(cursor_factory=RealDictCursor)
    for user in users:
        attrs = user.get("attributes", {}) or {}
        app_pw_value = attrs.get("mailu_app_password")
        if isinstance(app_pw_value, list):
            app_pw = app_pw_value[0] if app_pw_value else None
        elif isinstance(app_pw_value, str):
            app_pw = app_pw_value
        else:
            app_pw = None
        email = user.get("email")
        if not email:
            email = f"{user['username']}@{MAILU_DOMAIN}"
        if not app_pw:
            app_pw = random_password()
            attrs["mailu_app_password"] = app_pw
            kc_update_attributes(token, user, attrs)
            log(f"Set mailu_app_password for {email}")
        display_name = " ".join(
            part for part in [user.get("firstName"), user.get("lastName")] if part
        ).strip()
        ensure_mailu_user(cursor, email, app_pw, display_name)
        log(f"Synced mailbox for {email}")
    cursor.close()
    conn.close()


if __name__ == "__main__":
    try:
        main()
    except Exception as exc:
        log(f"ERROR: {exc}")
        sys.exit(1)

scripts/nextcloud-mail-sync.sh (new executable file)

@@ -0,0 +1,49 @@
#!/bin/bash
set -euo pipefail

KC_BASE="${KC_BASE:?}"
KC_REALM="${KC_REALM:?}"
KC_ADMIN_USER="${KC_ADMIN_USER:?}"
KC_ADMIN_PASS="${KC_ADMIN_PASS:?}"

if ! command -v jq >/dev/null 2>&1; then
  apt-get update && apt-get install -y jq curl >/dev/null
fi

account_exists() {
  # Skip if the account email is already present in the mail app.
  runuser -u www-data -- php occ mail:account:list 2>/dev/null | grep -Fq " ${1}" || \
    runuser -u www-data -- php occ mail:account:list 2>/dev/null | grep -Fq "${1} "
}

token=$(
  curl -s -d "grant_type=password" \
    -d "client_id=admin-cli" \
    -d "username=${KC_ADMIN_USER}" \
    -d "password=${KC_ADMIN_PASS}" \
    "${KC_BASE}/realms/master/protocol/openid-connect/token" | jq -r '.access_token'
)
if [[ -z "${token}" || "${token}" == "null" ]]; then
  echo "Failed to obtain admin token"
  exit 1
fi

users=$(curl -s -H "Authorization: Bearer ${token}" \
  "${KC_BASE}/admin/realms/${KC_REALM}/users?max=2000")

echo "${users}" | jq -c '.[]' | while read -r user; do
  username=$(echo "${user}" | jq -r '.username')
  email=$(echo "${user}" | jq -r '.email // empty')
  app_pw=$(echo "${user}" | jq -r '.attributes.mailu_app_password[0] // empty')
  [[ -z "${email}" || -z "${app_pw}" ]] && continue
  if account_exists "${email}"; then
    echo "Skipping ${email}, already exists"
    continue
  fi
  echo "Syncing ${email}"
  runuser -u www-data -- php occ mail:account:create \
    "${username}" "${username}" "${email}" \
    mail.bstein.dev 993 ssl "${email}" "${app_pw}" \
    mail.bstein.dev 587 tls "${email}" "${app_pw}" login || true
done

View File

@ -0,0 +1,65 @@
#!/bin/bash
set -euo pipefail
NC_URL="${NC_URL:-https://cloud.bstein.dev}"
ADMIN_USER="${ADMIN_USER:?}"
ADMIN_PASS="${ADMIN_PASS:?}"
export DEBIAN_FRONTEND=noninteractive
apt-get update -qq
apt-get install -y -qq curl jq >/dev/null
run_occ() {
runuser -u www-data -- php occ "$@"
}
log() { echo "[$(date -Is)] $*"; }
log "Applying Atlas theming"
run_occ theming:config name "Atlas Cloud"
run_occ theming:config slogan "Unified access to Atlas services"
run_occ theming:config url "https://cloud.bstein.dev"
run_occ theming:config color "#0f172a"
run_occ theming:config disable-user-theming yes
log "Setting default quota to 200 GB"
run_occ config:app:set files default_quota --value "200 GB"
API_BASE="${NC_URL}/ocs/v2.php/apps/external/api/v1"
AUTH=(-u "${ADMIN_USER}:${ADMIN_PASS}" -H "OCS-APIRequest: true")
log "Removing existing external links"
existing=$(curl -sf "${AUTH[@]}" "${API_BASE}?format=json" | jq -r '.ocs.data[].id // empty')
for id in ${existing}; do
curl -sf "${AUTH[@]}" -X DELETE "${API_BASE}/sites/${id}?format=json" >/dev/null || true
done
SITES=(
"Vaultwarden|https://vault.bstein.dev"
"Jellyfin|https://stream.bstein.dev"
"Gitea|https://scm.bstein.dev"
"Jenkins|https://ci.bstein.dev"
"Harbor|https://registry.bstein.dev"
"Vault|https://secret.bstein.dev"
"Jitsi|https://meet.bstein.dev"
"Grafana|https://metrics.bstein.dev"
"Chat LLM|https://chat.ai.bstein.dev"
"Vision|https://draw.ai.bstein.dev"
"STT/TTS|https://talk.ai.bstein.dev"
)
log "Seeding external links"
for entry in "${SITES[@]}"; do
IFS="|" read -r name url <<<"${entry}"
curl -sf "${AUTH[@]}" -X POST "${API_BASE}/sites?format=json" \
-d "name=${name}" \
-d "url=${url}" \
-d "lang=" \
-d "type=link" \
-d "device=" \
-d "icon=" \
-d "groups[]=" \
-d "redirect=1" >/dev/null
done
log "Maintenance run completed"

575
scripts/styx_prep_nvme_luks.sh Executable file
View File

@ -0,0 +1,575 @@
#!/usr/bin/env bash
set -euo pipefail
# --- CONFIG (edit if needed) ---
# Leave NVME empty → script will auto-detect the SSK dock.
NVME="${NVME:-}"
FLAVOR="${FLAVOR:-desktop}"
# Persistent cache so the image survives reboots.
IMG_DIR="${IMG_DIR:-/var/cache/styx-rpi}"
IMG_FILE="${IMG_FILE:-ubuntu-24.04.3-preinstalled-${FLAVOR}-arm64+raspi.img}"
IMG_BOOT_MNT="${IMG_BOOT_MNT:-/mnt/img-boot}"
IMG_ROOT_MNT="${IMG_ROOT_MNT:-/mnt/img-root}"
TGT_ROOT="/mnt/target-root"
TGT_BOOT="/mnt/target-boot"
STYX_USER="styx"
STYX_HOSTNAME="titan-ag"
STYX_PASS="TempPass#123" # will be forced to change on first login via cloud-init
SSH_PUBKEY="ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOb8oMX6u0z3sH/p/WBGlvPXXdbGETCKzWYwR/dd6fZb titan-bastion"
# Video / input prefs
DSI_FLAGS="video=DSI-1:800x480@60D video=HDMI-A-1:off video=HDMI-A-2:off"
# --- Helpers ---
fatal(){ echo "ERROR: $*" >&2; exit 1; }
need(){ command -v "$1" >/dev/null || fatal "Missing tool: $1"; }
require_root(){ [[ $EUID -eq 0 ]] || exec sudo -E "$0" "$@"; }
part() {
local n="$1"
if [[ "$NVME" =~ [0-9]$ ]]; then
echo "${NVME}p${n}"
else
echo "${NVME}${n}"
fi
}
auto_detect_target_disk() {
# If user already set NVME, validate and return
if [[ -n "${NVME:-}" ]]; then
[[ -b "$NVME" ]] || fatal "NVME='$NVME' is not a block device"
return
fi
# Prefer stable by-id symlinks
local byid
byid=$(ls -1 /dev/disk/by-id/usb-SSK* 2>/dev/null | head -n1 || true)
if [[ -n "$byid" ]]; then
NVME=$(readlink -f "$byid")
else
# Heuristic via lsblk -S: look for USB with SSK/Ingram/Storage in vendor/model
NVME=$(lsblk -S -p -o NAME,TRAN,VENDOR,MODEL | \
awk '$2 == "usb" && (toupper($3) ~ /SSK|INGRAM/ || toupper($4) ~ /SSK|STORAGE/){print $1; exit}')
fi
[[ -n "${NVME:-}" && -b "$NVME" ]] || fatal "Could not auto-detect SSK USB NVMe dock. Export NVME=/dev/sdX and re-run."
echo "Auto-detected target disk: $NVME"
}
preflight_cleanup() {
local img="$IMG_DIR/$IMG_FILE"
# 1) Unmount image mountpoints and detach only loops for this IMG
umount -lf "$IMG_BOOT_MNT" "$IMG_ROOT_MNT" 2>/dev/null || true
# losetup -j exits non-zero if no association → tolerate it
{ losetup -j "$img" | cut -d: -f1 | xargs -r losetup -d; } 2>/dev/null || true
# 2) Unmount our target mounts
umount -lf "$TGT_ROOT/boot/firmware" "$TGT_BOOT" "$TGT_ROOT" 2>/dev/null || true
# 3) Unmount the actual target partitions if mounted anywhere (tolerate 'not found')
for p in "$(part 1)" "$(part 2)"; do
# findmnt returns 1 when no match → capture and iterate if any
while read -r mnt; do
[ -n "$mnt" ] && umount -lf "$mnt" 2>/dev/null || true
done < <(findmnt -rno TARGET -S "$p" 2>/dev/null || true)
done
# 4) Close dm-crypt mapping (if it exists)
cryptsetup luksClose cryptroot 2>/dev/null || true
dmsetup remove -f cryptroot 2>/dev/null || true
# 5) Let udev settle
command -v udevadm >/dev/null && udevadm settle || true
}
guard_target_device() {
# Refuse to operate if NVME appears to be the current system disk
local root_src root_disk
root_src=$(findmnt -no SOURCE /)
root_disk=$(lsblk -no pkname "$root_src" 2>/dev/null || true)
if [[ -n "$root_disk" && "/dev/$root_disk" == "$NVME" ]]; then
fatal "Refusing to operate on system disk ($NVME). Pick the external NVMe."
fi
}
need_host_fido2() {
if ! command -v fido2-token >/dev/null 2>&1; then
echo "Host is missing fido2-token. On Arch: sudo pacman -S libfido2"
echo "On Debian/Ubuntu host: sudo apt-get install fido2-tools"
exit 1
fi
}
ensure_image() {
mkdir -p "$IMG_DIR"
chmod 755 "$IMG_DIR"
local BASE="https://cdimage.ubuntu.com/releases/noble/release"
local XZ="ubuntu-24.04.3-preinstalled-${FLAVOR}-arm64+raspi.img.xz"
# If the decompressed .img is missing, fetch/decompress into the cache.
if [[ ! -f "$IMG_DIR/$IMG_FILE" ]]; then
need curl # Arch: pacman -S curl xz | Ubuntu: apt-get install curl xz-utils
if [[ ! -f "$IMG_DIR/$XZ" ]]; then
echo "Fetching image…"
curl -fL -o "$IMG_DIR/$XZ" "$BASE/$XZ"
fi
echo "Decompressing to $IMG_DIR/$IMG_FILE"
# Keep the .xz for future runs; stream-decompress to the .img
if command -v unxz >/dev/null 2>&1; then
unxz -c "$IMG_DIR/$XZ" > "$IMG_DIR/$IMG_FILE"
else
need xz
xz -dc "$IMG_DIR/$XZ" > "$IMG_DIR/$IMG_FILE"
fi
sync
else
echo "Using cached image: $IMG_DIR/$IMG_FILE"
fi
}
ensure_binfmt_aarch64(){
# Register qemu-aarch64 for chrooted ARM64 apt runs
if [[ ! -e /proc/sys/fs/binfmt_misc/qemu-aarch64 ]]; then
need docker
systemctl enable --now docker >/dev/null 2>&1 || true
docker run --rm --privileged tonistiigi/binfmt --install arm64 >/dev/null
fi
if [[ ! -x /usr/local/bin/qemu-aarch64-static ]]; then
docker rm -f qemu-static >/dev/null 2>&1 || true
docker create --name qemu-static docker.io/multiarch/qemu-user-static:latest >/dev/null
docker cp qemu-static:/usr/bin/qemu-aarch64-static /usr/local/bin/
chmod 755 /usr/local/bin/qemu-aarch64-static
docker rm qemu-static >/dev/null
fi
}
open_image() {
[[ -r "$IMG_DIR/$IMG_FILE" ]] || fatal "Image not found: $IMG_DIR/$IMG_FILE"
mkdir -p "$IMG_BOOT_MNT" "$IMG_ROOT_MNT"
# Pre-clean: detach any previous loop(s) for this image (tolerate absence)
umount -lf "$IMG_BOOT_MNT" 2>/dev/null || true
umount -lf "$IMG_ROOT_MNT" 2>/dev/null || true
# If no loop is attached, losetup -j returns non-zero → swallow it
mapfile -t OLD < <({ losetup -j "$IMG_DIR/$IMG_FILE" | cut -d: -f1; } 2>/dev/null || true)
for L in "${OLD[@]:-}"; do losetup -d "$L" 2>/dev/null || true; done
command -v udevadm >/dev/null && udevadm settle || true
# Attach with partition scan; wait for partition nodes to exist
LOOP=$(losetup --find --show --partscan "$IMG_DIR/$IMG_FILE") || fatal "losetup failed"
command -v udevadm >/dev/null && udevadm settle || true
for _ in {1..25}; do
[[ -b "${LOOP}p1" && -b "${LOOP}p2" ]] && break
sleep 0.1
command -v udevadm >/dev/null && udevadm settle || true
done
[[ -b "${LOOP}p1" ]] || fatal "loop partitions not present for $LOOP"
# Cleanup on exit: unmount first, then detach loop (tolerate absence)
trap 'umount -lf "'"$IMG_BOOT_MNT"'" "'"$IMG_ROOT_MNT"'" 2>/dev/null; losetup -d "'"$LOOP"'" 2>/dev/null' EXIT
# Mount image partitions read-only
mount -o ro "${LOOP}p1" "$IMG_BOOT_MNT"
mount -o ro "${LOOP}p2" "$IMG_ROOT_MNT"
# Sanity checks without using failing pipelines
# start*.elf must exist
if ! compgen -G "$IMG_BOOT_MNT/start*.elf" > /dev/null; then
fatal "start*.elf not found in image"
fi
# vmlinuz-* must exist
if ! compgen -G "$IMG_ROOT_MNT/boot/vmlinuz-*" > /dev/null; then
fatal "vmlinuz-* not found in image root"
fi
}
confirm_and_wipe(){
lsblk -o NAME,SIZE,MODEL,TRAN,LABEL "$NVME"
read -rp "Type EXACTLY 'WIPE' to destroy ALL DATA on $NVME: " ACK
[[ "$ACK" == "WIPE" ]] || fatal "Aborted"
wipefs -a "$NVME"
sgdisk -Zo "$NVME"
# GPT: 1: 1MiB..513MiB vfat ESP; 2: rest LUKS
parted -s "$NVME" mklabel gpt \
mkpart system-boot fat32 1MiB 513MiB set 1 esp on \
mkpart cryptroot 513MiB 100%
partprobe "$NVME"; sleep 1
mkfs.vfat -F32 -n system-boot "$(part 1)"
}
setup_luks(){
echo "Create LUKS2 on $(part 2) (you will be prompted for a passphrase; keep it as fallback)"
need cryptsetup
cryptsetup luksFormat --type luks2 "$(part 2)"
cryptsetup open "$(part 2)" cryptroot
mkfs.ext4 -L rootfs /dev/mapper/cryptroot
}
mount_targets(){
mkdir -p "$TGT_ROOT" "$TGT_BOOT"
mount /dev/mapper/cryptroot "$TGT_ROOT"
mkdir -p "$TGT_ROOT/boot/firmware"
mount "$(part 1)" "$TGT_BOOT"
mount --bind "$TGT_BOOT" "$TGT_ROOT/boot/firmware"
}
rsync_root_and_boot(){
need rsync
rsync -aAXH --numeric-ids --delete \
--exclude='/boot/firmware' --exclude='/boot/firmware/**' \
--exclude='/dev/*' --exclude='/proc/*' --exclude='/sys/*' \
--exclude='/run/*' --exclude='/tmp/*' --exclude='/mnt/*' \
--exclude='/media/*' --exclude='/lost+found' \
"$IMG_ROOT_MNT"/ "$TGT_ROOT"/
rsync -aH --delete "$IMG_BOOT_MNT"/ "$TGT_ROOT/boot/firmware"/
}
write_crypttab_fstab(){
LUUID=$(blkid -s UUID -o value "$(part 2)")
printf 'cryptroot UUID=%s none luks,discard,fido2-device=auto\n' "$LUUID" > "$TGT_ROOT/etc/crypttab"
cat > "$TGT_ROOT/etc/fstab" <<EOF
/dev/mapper/cryptroot / ext4 defaults,discard,errors=remount-ro 0 1
LABEL=system-boot /boot/firmware vfat defaults,umask=0077 0 1
EOF
}
fix_firmware_files(){
local C="$TGT_ROOT/boot/firmware/config.txt"
local CL="$TGT_ROOT/boot/firmware/cmdline.txt"
[[ -f "$C" ]] || fatal "missing $C"
# Always boot the uncompressed Pi 5 kernel
if grep -q '^kernel=' "$C"; then
sed -i 's#^kernel=.*#kernel=kernel_2712.img#' "$C"
else
sed -i '1i kernel=kernel_2712.img' "$C"
fi
# Ensure initramfs and cmdline indirection are set
grep -q '^initramfs ' "$C" || echo 'initramfs initrd.img followkernel' >> "$C"
grep -q '^cmdline=cmdline.txt' "$C" || sed -i '1i cmdline=cmdline.txt' "$C"
# Display & buses (Pi 5)
grep -q '^dtoverlay=vc4-kms-v3d-pi5' "$C" || echo 'dtoverlay=vc4-kms-v3d-pi5' >> "$C"
grep -q '^dtparam=i2c_arm=on' "$C" || echo 'dtparam=i2c_arm=on' >> "$C"
grep -q '^dtparam=pciex1=on' "$C" || echo 'dtparam=pciex1=on' >> "$C"
grep -q '^dtparam=pciex1_gen=2' "$C" || echo 'dtparam=pciex1_gen=2' >> "$C"
grep -q '^enable_uart=1' "$C" || echo 'enable_uart=1' >> "$C"
# Minimal, correct dracut hints using the bare UUID
local LUUID; LUUID=$(blkid -s UUID -o value "$(part 2)")
: > "$CL"
{
echo -n "rd.luks.uuid=$LUUID rd.luks.name=$LUUID=cryptroot "
echo -n "root=/dev/mapper/cryptroot rootfstype=ext4 rootwait fixrtc "
echo "console=serial0,115200 console=tty1 ds=nocloud;s=file:///boot/firmware/ ${DSI_FLAGS} rd.debug"
} >> "$CL"
}
seed_cloud_init(){
# NoCloud seed to create user, lock down SSH, set hostname, and enable avahi.
cat > "$TGT_ROOT/boot/firmware/user-data" <<EOF
#cloud-config
hostname: $STYX_HOSTNAME
manage_etc_hosts: true
users:
- name: $STYX_USER
gecos: "$STYX_USER"
shell: /bin/bash
groups: [sudo,video,i2c]
sudo: ALL=(ALL) NOPASSWD:ALL
lock_passwd: false
ssh_authorized_keys:
- $SSH_PUBKEY
chpasswd:
list: |
$STYX_USER:$STYX_PASS
expire: true
ssh_pwauth: false
package_update: true
packages: [openssh-server, avahi-daemon]
runcmd:
- systemctl enable --now ssh
- systemctl enable --now avahi-daemon || true
EOF
# Minimal meta-data for NoCloud
printf 'instance-id: iid-titan-ag-%s\nlocal-hostname: %s\n' \
"$(date +%s)" "$STYX_HOSTNAME" > "$TGT_ROOT/boot/firmware/meta-data"
}
prep_chroot_mounts(){
for d in dev proc sys; do mount --bind "/$d" "$TGT_ROOT/$d"; done
mount -t devpts devpts "$TGT_ROOT/dev/pts"
# Replace the usual resolv.conf symlink with a real file for apt to work
rm -f "$TGT_ROOT/etc/resolv.conf"
cp /etc/resolv.conf "$TGT_ROOT/etc/resolv.conf"
# Block service starts (no systemd in chroot)
cat > "$TGT_ROOT/usr/sbin/policy-rc.d" <<'EOP'
#!/bin/sh
exit 101
EOP
chmod +x "$TGT_ROOT/usr/sbin/policy-rc.d"
# Ensure qemu static is present inside chroot
install -D -m755 /usr/local/bin/qemu-aarch64-static "$TGT_ROOT/usr/bin/qemu-aarch64-static"
}
in_chroot(){
chroot "$TGT_ROOT" /usr/bin/qemu-aarch64-static /bin/bash -lc '
set -euo pipefail
export DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC
# --- APT sources (ports) ---
cat > /etc/apt/sources.list <<'"'"'EOS'"'"'
deb http://ports.ubuntu.com/ubuntu-ports noble main restricted universe multiverse
deb http://ports.ubuntu.com/ubuntu-ports noble-updates main restricted universe multiverse
deb http://ports.ubuntu.com/ubuntu-ports noble-security main restricted universe multiverse
EOS
apt-get update
# --- Remove snaps and pin them off ---
apt-get -y purge snapd || true
rm -rf /snap /var/snap /var/lib/snapd /home/*/snap || true
mkdir -p /etc/apt/preferences.d
cat > /etc/apt/preferences.d/nosnap.pref <<'"'"'EOS'"'"'
Package: snapd
Pin: release *
Pin-Priority: -10
EOS
# --- Base tools (no flash-kernel; we use dracut) ---
apt-get install -y --no-install-recommends \
openssh-client openssh-server openssh-sftp-server avahi-daemon \
cryptsetup dracut fido2-tools libfido2-1 i2c-tools \
python3-smbus python3-pil zbar-tools qrencode lm-sensors \
file zstd lz4 || true
# Camera apps: try rpicam-apps; otherwise basic libcamera tools
apt-get install -y rpicam-apps || apt-get install -y libcamera-tools || true
# --- Persistent journal so we can read logs after failed boot ---
mkdir -p /etc/systemd/journald.conf.d
cat > /etc/systemd/journald.conf.d/99-persistent.conf <<'"'"'EOS'"'"'
[Journal]
Storage=persistent
EOS
# --- SSH hardening (ensure file exists even if package was half-installed) ---
if [ ! -f /etc/ssh/sshd_config ]; then
mkdir -p /etc/ssh
cat > /etc/ssh/sshd_config <<'"'"'EOS'"'"'
PermitRootLogin no
PasswordAuthentication no
KbdInteractiveAuthentication no
PubkeyAuthentication yes
# Accept defaults for the rest
EOS
fi
sed -i -e "s/^#\?PasswordAuthentication .*/PasswordAuthentication no/" \
-e "s/^#\?KbdInteractiveAuthentication .*/KbdInteractiveAuthentication no/" \
-e "s/^#\?PermitRootLogin .*/PermitRootLogin no/" \
-e "s/^#\?PubkeyAuthentication .*/PubkeyAuthentication yes/" /etc/ssh/sshd_config || true
# --- Hostname & hosts ---
echo "'"$STYX_HOSTNAME"'" > /etc/hostname
if grep -q "^127\\.0\\.1\\.1" /etc/hosts; then
sed -i "s/^127\\.0\\.1\\.1.*/127.0.1.1\t'"$STYX_HOSTNAME"'/" /etc/hosts
else
echo -e "127.0.1.1\t'"$STYX_HOSTNAME"'" >> /etc/hosts
fi
# --- Enable services on first boot ---
mkdir -p /etc/systemd/system/multi-user.target.wants
ln -sf /lib/systemd/system/ssh.service /etc/systemd/system/multi-user.target.wants/ssh.service
ln -sf /lib/systemd/system/avahi-daemon.service /etc/systemd/system/multi-user.target.wants/avahi-daemon.service || true
# --- Ensure i2c group ---
getent group i2c >/dev/null || groupadd i2c
# --- Dracut configuration (generic, not host-only) ---
mkdir -p /etc/dracut.conf.d
cat > /etc/dracut.conf.d/00-hostonly.conf <<'"'"'EOS'"'"'
hostonly=no
EOS
cat > /etc/dracut.conf.d/10-systemd-crypt.conf <<'"'"'EOS'"'"'
add_dracutmodules+=" systemd crypt "
EOS
cat > /etc/dracut.conf.d/20-drivers.conf <<'"'"'EOS'"'"'
add_drivers+=" nvme xhci_pci xhci_hcd usbhid hid_generic hid "
EOS
cat > /etc/dracut.conf.d/30-fido2.conf <<'"'"'EOS'"'"'
install_items+="/usr/bin/systemd-cryptsetup /usr/bin/fido2-token /usr/lib/*/libfido2.so* /usr/lib/*/libcbor.so*"
EOS
# --- Build initramfs and place it where firmware expects it ---
KVER=$(ls -1 /lib/modules | sort -V | tail -n1)
dracut --force /boot/initramfs-$KVER.img $KVER
ln -sf initramfs-$KVER.img /boot/initrd.img
ln -sf initramfs-$KVER.img /boot/initrd.img-$KVER
cp -a /boot/initramfs-$KVER.img /boot/firmware/initrd.img
# --- Create uncompressed kernel for Pi 5 firmware ---
if [ -f "/usr/lib/linux-image-$KVER/Image" ]; then
cp -a "/usr/lib/linux-image-$KVER/Image" /boot/firmware/kernel_2712.img
else
FMT=$(file -b "/boot/vmlinuz-$KVER" || true)
case "$FMT" in
*Zstandard*|*zstd*) zstd -dc "/boot/vmlinuz-$KVER" > /boot/firmware/kernel_2712.img ;;
*LZ4*) lz4 -dc "/boot/vmlinuz-$KVER" > /boot/firmware/kernel_2712.img ;;
*gzip*) zcat "/boot/vmlinuz-$KVER" > /boot/firmware/kernel_2712.img ;;
*) cp -a "/boot/vmlinuz-$KVER" /boot/firmware/kernel_2712.img ;;
esac
fi
# --- Ensure Pi 5 DTB is present on the boot partition ---
DTB=$(find /lib/firmware -type f -name "bcm2712-rpi-5-b.dtb" | sort | tail -n1 || true)
[ -n "$DTB" ] && cp -a "$DTB" /boot/firmware/
# --- Dracut hook to copy rdsosreport.txt to the FAT partition on failure ---
mkdir -p /usr/lib/dracut/modules.d/99copylog
cat > /usr/lib/dracut/modules.d/99copylog/module-setup.sh <<'"'"'EOS'"'"'
#!/bin/bash
check() { return 0; }
depends() { echo base; return 0; }
install() {
# Guard $moddir for nounset; derive if absent
local mdir="${moddir:-$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)}"
inst_hook emergency 99 "$mdir/copylog.sh"
}
EOS
chmod +x /usr/lib/dracut/modules.d/99copylog/module-setup.sh
cat > /usr/lib/dracut/modules.d/99copylog/copylog.sh <<'"'"'EOS'"'"'
#!/bin/sh
set -e
for dev in /dev/nvme0n1p1 /dev/sda1 /dev/sdb1 /dev/mmcblk0p1; do
[ -b "$dev" ] || continue
mkdir -p /mnt/bootfat
if mount -t vfat "$dev" /mnt/bootfat 2>/dev/null; then
if [ -s /run/initramfs/rdsosreport.txt ]; then
cp -f /run/initramfs/rdsosreport.txt /mnt/bootfat/rdsosreport.txt 2>/dev/null || true
sync || true
fi
umount /mnt/bootfat || true
break
fi
done
EOS
chmod +x /usr/lib/dracut/modules.d/99copylog/copylog.sh
# Rebuild to ensure the copylog module is included
dracut --force /boot/initramfs-$KVER.img $KVER
ln -sf initramfs-$KVER.img /boot/initrd.img
cp -a /boot/initramfs-$KVER.img /boot/firmware/initrd.img
true
'
}
verify_boot_assets(){
echo "---- verify boot assets on FAT ----"
file "$TGT_ROOT/boot/firmware/kernel_2712.img" || true
ls -lh "$TGT_ROOT/boot/firmware/initrd.img" || true
echo "-- config.txt (key lines) --"
grep -E '^(kernel|initramfs|cmdline)=|^dtoverlay=|^dtparam=' "$TGT_ROOT/boot/firmware/config.txt" || true
echo "-- cmdline.txt --"
cat "$TGT_ROOT/boot/firmware/cmdline.txt" || true
echo "-- firmware blobs (sample) --"
ls -1 "$TGT_ROOT/boot/firmware"/start*.elf "$TGT_ROOT/boot/firmware"/fixup*.dat | head -n 8 || true
echo "-- Pi5 DTB --"
ls -l "$TGT_ROOT/boot/firmware/"*rpi-5-b.dtb || true
}
enroll_fido_tokens(){
echo "Enrolling FIDO2 Solo keys into $(part 2) ..."
need systemd-cryptenroll
need fido2-token
# Collect all hidraw paths from both output styles (some distros print 'Device: /dev/hidrawX')
mapfile -t DEVS < <(
fido2-token -L \
| sed -n 's,^\(/dev/hidraw[0-9]\+\):.*,\1,p; s,^Device:[[:space:]]\+/dev/hidraw\([0-9]\+\).*,/dev/hidraw\1,p' \
| sort -u
)
if (( ${#DEVS[@]} == 0 )); then
echo "No FIDO2 tokens detected; skipping enrollment (you can enroll later)."
echo "Example later: systemd-cryptenroll $(part 2) --fido2-device=/dev/hidrawX --fido2-with-client-pin=no"
return 0
fi
# Recommend keeping exactly ONE key plugged in during first enrollment to avoid ambiguity.
if (( ${#DEVS[@]} > 1 )); then
echo "Note: multiple FIDO2 tokens present: ${DEVS[*]}"
echo "If enrollment fails, try with only one key inserted."
fi
local rc=0
for D in "${DEVS[@]}"; do
echo "-> Enrolling $D (you should be asked to touch the key)"
if ! SYSTEMD_LOG_LEVEL=debug systemd-cryptenroll "$(part 2)" \
--fido2-device="$D" \
--fido2-with-client-pin=no \
--fido2-with-user-presence=yes \
--fido2-with-user-verification=no \
--label="solo-$(basename "$D")"; then
echo "WARN: enrollment failed for $D"
rc=1
fi
done
echo "Tokens enrolled (if any):"
systemd-cryptenroll "$(part 2)" --list || true
return $rc
}
cleanup(){
rm -f "$TGT_ROOT/usr/sbin/policy-rc.d" || true
umount -lf "$TGT_ROOT/dev/pts" 2>/dev/null || true
for d in dev proc sys; do umount -lf "$TGT_ROOT/$d" 2>/dev/null || true; done
umount -lf "$TGT_ROOT/boot/firmware" 2>/dev/null || true
umount -lf "$TGT_BOOT" 2>/dev/null || true
umount -lf "$TGT_ROOT" 2>/dev/null || true
cryptsetup close cryptroot 2>/dev/null || true
umount -lf "$IMG_BOOT_MNT" 2>/dev/null || true
umount -lf "$IMG_ROOT_MNT" 2>/dev/null || true
}
main(){
require_root "$@"
need losetup; need parted; need rsync
auto_detect_target_disk
echo "Target disk: $NVME"
ensure_binfmt_aarch64
ensure_image
preflight_cleanup
guard_target_device
open_image
confirm_and_wipe
setup_luks
mount_targets
rsync_root_and_boot
write_crypttab_fstab
fix_firmware_files
seed_cloud_init
prep_chroot_mounts
in_chroot
verify_boot_assets
need_host_fido2
enroll_fido_tokens
cleanup
echo "✅ NVMe prepared."
echo " Install in the Pi 5 and boot with no SD."
echo " Expect LUKS to unlock automatically with a Solo key inserted;"
echo " passphrase fallback remains. Hostname: ${STYX_HOSTNAME} User: ${STYX_USER}"
echo " On first boot, reach it via: ssh -i ~/.ssh/id_ed25519_titan styx@titan-ag.local"
}
main "$@"

View File

@ -0,0 +1,58 @@
import importlib.util
import pathlib
def load_module():
path = pathlib.Path(__file__).resolve().parents[1] / "dashboards_render_atlas.py"
spec = importlib.util.spec_from_file_location("dashboards_render_atlas", path)
module = importlib.util.module_from_spec(spec)
assert spec.loader is not None
spec.loader.exec_module(module)
return module
def test_table_panel_options_and_filterable():
mod = load_module()
panel = mod.table_panel(
1,
"test",
"metric",
{"h": 1, "w": 1, "x": 0, "y": 0},
unit="percent",
transformations=[{"id": "labelsToFields", "options": {}}],
instant=True,
options={"showColumnFilters": False},
filterable=False,
footer={"show": False, "fields": "", "calcs": []},
format="table",
)
assert panel["fieldConfig"]["defaults"]["unit"] == "percent"
assert panel["fieldConfig"]["defaults"]["custom"]["filterable"] is False
assert panel["options"]["showHeader"] is True
assert panel["targets"][0]["format"] == "table"
def test_node_filter_and_expr_helpers():
mod = load_module()
expr = mod.node_filter("titan-.*")
assert "label_replace" in expr
cpu_expr = mod.node_cpu_expr("titan-.*")
mem_expr = mod.node_mem_expr("titan-.*")
assert "node_cpu_seconds_total" in cpu_expr
assert "node_memory_MemAvailable_bytes" in mem_expr
def test_render_configmap_writes(tmp_path):
mod = load_module()
mod.DASHBOARD_DIR = tmp_path / "dash"
mod.ROOT = tmp_path
uid = "atlas-test"
info = {"configmap": tmp_path / "cm.yaml"}
data = {"title": "Atlas Test"}
mod.write_json(uid, data)
mod.render_configmap(uid, info)
json_path = mod.DASHBOARD_DIR / f"{uid}.json"
assert json_path.exists()
content = (tmp_path / "cm.yaml").read_text()
assert "kind: ConfigMap" in content
assert f"{uid}.json" in content

View File

@ -0,0 +1,181 @@
import importlib.util
import pathlib
import pytest
def load_sync_module(monkeypatch):
# Minimal env required by module import
env = {
"KEYCLOAK_BASE_URL": "http://keycloak",
"KEYCLOAK_REALM": "atlas",
"KEYCLOAK_CLIENT_ID": "mailu-sync",
"KEYCLOAK_CLIENT_SECRET": "secret",
"MAILU_DOMAIN": "example.com",
"MAILU_DB_HOST": "localhost",
"MAILU_DB_PORT": "5432",
"MAILU_DB_NAME": "mailu",
"MAILU_DB_USER": "mailu",
"MAILU_DB_PASSWORD": "pw",
}
for k, v in env.items():
monkeypatch.setenv(k, v)
module_path = pathlib.Path(__file__).resolve().parents[1] / "mailu_sync.py"
spec = importlib.util.spec_from_file_location("mailu_sync_testmod", module_path)
module = importlib.util.module_from_spec(spec)
assert spec.loader is not None
spec.loader.exec_module(module)
return module
def test_random_password_length_and_charset(monkeypatch):
sync = load_sync_module(monkeypatch)
pw = sync.random_password()
assert len(pw) == 24
assert all(ch.isalnum() for ch in pw)
class _FakeResponse:
def __init__(self, json_data=None, status=200):
self._json_data = json_data or {}
self.status_code = status
def raise_for_status(self):
if self.status_code >= 400:
raise AssertionError(f"status {self.status_code}")
def json(self):
return self._json_data
class _FakeSession:
def __init__(self, put_resp, get_resp):
self.put_resp = put_resp
self.get_resp = get_resp
self.put_called = False
self.get_called = False
def post(self, *args, **kwargs):
return _FakeResponse({"access_token": "dummy"})
def put(self, *args, **kwargs):
self.put_called = True
return self.put_resp
def get(self, *args, **kwargs):
self.get_called = True
return self.get_resp
def test_kc_update_attributes_succeeds(monkeypatch):
sync = load_sync_module(monkeypatch)
ok_resp = _FakeResponse({"attributes": {"mailu_app_password": ["abc"]}})
sync.SESSION = _FakeSession(_FakeResponse({}), ok_resp)
sync.kc_update_attributes("token", {"id": "u1", "username": "u1"}, {"mailu_app_password": "abc"})
assert sync.SESSION.put_called and sync.SESSION.get_called
def test_kc_update_attributes_raises_without_attribute(monkeypatch):
sync = load_sync_module(monkeypatch)
missing_attr_resp = _FakeResponse({"attributes": {}}, status=200)
sync.SESSION = _FakeSession(_FakeResponse({}), missing_attr_resp)
with pytest.raises(Exception):
sync.kc_update_attributes("token", {"id": "u1", "username": "u1"}, {"mailu_app_password": "abc"})
def test_kc_get_users_paginates(monkeypatch):
sync = load_sync_module(monkeypatch)
class _PagedSession:
def __init__(self):
self.calls = 0
def post(self, *_, **__):
return _FakeResponse({"access_token": "tok"})
def get(self, *_, **__):
self.calls += 1
if self.calls == 1:
return _FakeResponse([{"id": "u1"}, {"id": "u2"}])
return _FakeResponse([]) # stop pagination
sync.SESSION = _PagedSession()
users = sync.kc_get_users("tok")
assert [u["id"] for u in users] == ["u1", "u2"]
assert sync.SESSION.calls == 2
def test_ensure_mailu_user_skips_foreign_domain(monkeypatch):
sync = load_sync_module(monkeypatch)
executed = []
class _Cursor:
def execute(self, sql, params):
executed.append((sql, params))
sync.ensure_mailu_user(_Cursor(), "user@other.com", "pw", "User")
assert not executed
def test_ensure_mailu_user_upserts(monkeypatch):
sync = load_sync_module(monkeypatch)
captured = {}
class _Cursor:
def execute(self, sql, params):
captured.update(params)
sync.ensure_mailu_user(_Cursor(), "user@example.com", "pw", "User Example")
assert captured["email"] == "user@example.com"
assert captured["localpart"] == "user"
# password should be hashed, not the raw string
assert captured["password"] != "pw"
def test_main_generates_password_and_upserts(monkeypatch):
sync = load_sync_module(monkeypatch)
users = [
{"id": "u1", "username": "user1", "email": "user1@example.com", "attributes": {}},
{"id": "u2", "username": "user2", "email": "user2@example.com", "attributes": {"mailu_app_password": ["keepme"]}},
{"id": "u3", "username": "user3", "email": "user3@other.com", "attributes": {}},
]
updated = []
class _Cursor:
def __init__(self):
self.executions = []
def execute(self, sql, params):
self.executions.append(params)
def close(self):
return None
class _Conn:
def __init__(self):
self.autocommit = False
self._cursor = _Cursor()
def cursor(self, cursor_factory=None):
return self._cursor
def close(self):
return None
monkeypatch.setattr(sync, "get_kc_token", lambda: "tok")
monkeypatch.setattr(sync, "kc_get_users", lambda token: users)
monkeypatch.setattr(sync, "kc_update_attributes", lambda token, user, attrs: updated.append((user["id"], attrs["mailu_app_password"])))
conns = []
def _connect(**kwargs):
conn = _Conn()
conns.append(conn)
return conn
monkeypatch.setattr(sync.psycopg2, "connect", _connect)
sync.main()
# Only user1's missing attribute is backfilled; user2 already has one.
assert len(updated) == 1
# Two upserts execute; user3 is skipped due to the domain mismatch.
assert conns and len(conns[0]._cursor.executions) == 2
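Both test modules load their targets by file path, so they run without packaging; a sketch, assuming the tests live in a tests/ directory beside mailu_sync.py (path illustrative):

python3 -m pytest -q scripts/tests/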

View File

@ -0,0 +1,48 @@
# services/bstein-dev-home/backend-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: bstein-dev-home-backend
namespace: bstein-dev-home
spec:
replicas: 2
revisionHistoryLimit: 3
selector:
matchLabels:
app: bstein-dev-home-backend
template:
metadata:
labels:
app: bstein-dev-home-backend
spec:
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
imagePullSecrets:
- name: harbor-bstein-robot
containers:
- name: backend
image: registry.bstein.dev/bstein/bstein-dev-home-backend:latest
imagePullPolicy: Always
ports:
- name: http
containerPort: 8080
readinessProbe:
httpGet:
path: /api/healthz
port: http
initialDelaySeconds: 2
periodSeconds: 5
livenessProbe:
httpGet:
path: /api/healthz
port: http
initialDelaySeconds: 10
periodSeconds: 10
resources:
requests:
cpu: 50m
memory: 64Mi
limits:
cpu: 300m
memory: 256Mi

View File

@ -0,0 +1,12 @@
apiVersion: v1
kind: Service
metadata:
name: bstein-dev-home-backend
namespace: bstein-dev-home
spec:
selector:
app: bstein-dev-home-backend
ports:
- name: http
port: 80
targetPort: 8080
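A quick readiness check through the Service, as a sketch (assumes a kubectl context on the cluster; the probe path comes from the backend deployment above):

kubectl -n bstein-dev-home port-forward svc/bstein-dev-home-backend 8080:80 &
curl -s http://localhost:8080/api/healthz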

View File

@ -0,0 +1,48 @@
# services/bstein-dev-home/frontend-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: bstein-dev-home-frontend
namespace: bstein-dev-home
spec:
replicas: 2
revisionHistoryLimit: 3
selector:
matchLabels:
app: bstein-dev-home-frontend
template:
metadata:
labels:
app: bstein-dev-home-frontend
spec:
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
imagePullSecrets:
- name: harbor-bstein-robot
containers:
- name: frontend
image: registry.bstein.dev/bstein/bstein-dev-home-frontend:latest
imagePullPolicy: Always
ports:
- name: http
containerPort: 80
readinessProbe:
httpGet:
path: /
port: http
initialDelaySeconds: 2
periodSeconds: 5
livenessProbe:
httpGet:
path: /
port: http
initialDelaySeconds: 10
periodSeconds: 10
resources:
requests:
cpu: 50m
memory: 64Mi
limits:
cpu: 300m
memory: 256Mi

View File

@ -0,0 +1,12 @@
apiVersion: v1
kind: Service
metadata:
name: bstein-dev-home-frontend
namespace: bstein-dev-home
spec:
selector:
app: bstein-dev-home-frontend
ports:
- name: http
port: 80
targetPort: 80

View File

@ -0,0 +1,48 @@
# services/bstein-dev-home/image.yaml
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: bstein-dev-home-frontend
namespace: bstein-dev-home
spec:
image: registry.bstein.dev/bstein/bstein-dev-home-frontend
interval: 1m0s
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: bstein-dev-home-frontend
namespace: bstein-dev-home
spec:
imageRepositoryRef:
name: bstein-dev-home-frontend
filterTags:
pattern: '^v?(?P<version>[0-9]+\.[0-9]+\.[0-9]+(?:[-.][0-9A-Za-z]+)?)$'
extract: '$version'
policy:
semver:
range: ">=0.1.0"
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: bstein-dev-home-backend
namespace: bstein-dev-home
spec:
image: registry.bstein.dev/bstein/bstein-dev-home-backend
interval: 1m0s
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: bstein-dev-home-backend
namespace: bstein-dev-home
spec:
imageRepositoryRef:
name: bstein-dev-home-backend
filterTags:
pattern: '^v?(?P<version>[0-9]+\.[0-9]+\.[0-9]+(?:[-.][0-9A-Za-z]+)?)$'
extract: '$version'
policy:
semver:
range: ">=0.1.0"

View File

@ -0,0 +1,31 @@
# services/bstein-dev-home/ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: bstein-dev-home
namespace: bstein-dev-home
annotations:
kubernetes.io/ingress.class: traefik
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.tls: "true"
cert-manager.io/cluster-issuer: letsencrypt
spec:
tls:
- hosts: [ "bstein.dev" ]
secretName: bstein-dev-home-tls
rules:
- host: bstein.dev
http:
paths:
- path: /api
pathType: Prefix
backend:
service:
name: bstein-dev-home-backend
port: { number: 80 }
- path: /
pathType: Prefix
backend:
service:
name: bstein-dev-home-frontend
port: { number: 80 }
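Both rules share the host, so the path decides the backend: /api and anything beneath it goes to the API pods, everything else falls through to the frontend. A smoke-test sketch from outside the cluster:

curl -s https://bstein.dev/api/healthz        # answered by the backend
curl -sI https://bstein.dev/ | head -n1       # answered by the frontend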

View File

@ -0,0 +1,17 @@
# services/bstein-dev-home/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: bstein-dev-home
resources:
- namespace.yaml
- image.yaml
- frontend-deployment.yaml
- frontend-service.yaml
- backend-deployment.yaml
- backend-service.yaml
- ingress.yaml
images:
- name: registry.bstein.dev/bstein/bstein-dev-home-frontend
newTag: latest # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
- name: registry.bstein.dev/bstein/bstein-dev-home-backend
newTag: latest # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}

View File

@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: bstein-dev-home

View File

@ -0,0 +1,31 @@
# services/ci-demo/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: ci-demo
namespace: ci-demo
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: ci-demo
template:
metadata:
labels:
app.kubernetes.io/name: ci-demo
spec:
nodeSelector:
hardware: rpi4
containers:
- name: ci-demo
image: registry.bstein.dev/infra/ci-demo:latest
ports:
- name: http
containerPort: 8080
readinessProbe:
httpGet:
path: /
port: http
initialDelaySeconds: 2
periodSeconds: 5

View File

@ -0,0 +1,24 @@
# services/ci-demo/image.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageRepository
metadata:
name: ci-demo
namespace: flux-system
spec:
image: registry.bstein.dev/infra/ci-demo
interval: 1m0s
---
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImagePolicy
metadata:
name: ci-demo
namespace: flux-system
spec:
imageRepositoryRef:
name: ci-demo
filterTags:
pattern: '^v(?P<version>0\.0\.0-\d+)$'
extract: '$version'
policy:
semver:
range: ">=0.0.0-0"

View File

@ -0,0 +1,11 @@
# services/ci-demo/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- image.yaml
- deployment.yaml
- service.yaml
images:
- name: registry.bstein.dev/infra/ci-demo
newTag: v0.0.0-3 # {"$imagepolicy": "flux-system:ci-demo:tag"}
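The bare policy marker makes the automation write the full image ref into newTag; with the :tag setter it rewrites only the tag value. A verification sketch with the flux CLI:

flux reconcile image repository ci-demo -n flux-system
flux get image policy ci-demo -n flux-system   # shows the latest tag the policy selected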

View File

@ -0,0 +1,6 @@
# services/ci-demo/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: ci-demo

View File

@ -0,0 +1,14 @@
# services/ci-demo/service.yaml
apiVersion: v1
kind: Service
metadata:
name: ci-demo
namespace: ci-demo
spec:
selector:
app.kubernetes.io/name: ci-demo
ports:
- name: http
port: 80
targetPort: http

View File

@ -35,7 +35,7 @@ spec:
values: ["rpi4"]
containers:
- name: monerod
image: registry.bstein.dev/infra/monerod:0.18.4.1
image: registry.bstein.dev/crypto/monerod:0.18.4.1
command: ["/opt/monero/monerod"]
args:
- --data-dir=/data

View File

@ -32,7 +32,7 @@ spec:
values: ["rpi4"]
containers:
- name: monero-p2pool
image: registry.bstein.dev/infra/monero-p2pool:4.9
image: registry.bstein.dev/crypto/monero-p2pool:4.9
imagePullPolicy: Always
command: ["p2pool"]
args:

View File

@ -21,6 +21,72 @@ spec:
labels:
app: gitea
spec:
initContainers:
- name: configure-oidc
image: gitea/gitea:1.23
securityContext:
runAsUser: 1000
runAsGroup: 1000
env:
- name: CLIENT_ID
valueFrom:
secretKeyRef:
name: gitea-oidc
key: client_id
- name: CLIENT_SECRET
valueFrom:
secretKeyRef:
name: gitea-oidc
key: client_secret
- name: DISCOVERY_URL
valueFrom:
secretKeyRef:
name: gitea-oidc
key: openid_auto_discovery_url
command:
- /bin/bash
- -c
- |
set -euo pipefail
APPINI=/data/gitea/conf/app.ini
BIN=/usr/local/bin/gitea
list="$($BIN -c "$APPINI" admin auth list)"
id=$(echo "$list" | awk '$2=="keycloak"{print $1}')
if [ -n "$id" ]; then
echo "Updating existing auth source id=$id"
$BIN -c "$APPINI" admin auth update-oauth \
--id "$id" \
--name keycloak \
--provider openidConnect \
--key "$CLIENT_ID" \
--secret "$CLIENT_SECRET" \
--auto-discover-url "$DISCOVERY_URL" \
--scopes "openid profile email groups" \
--required-claim-name "" \
--required-claim-value "" \
--group-claim-name groups \
--admin-group admin \
--skip-local-2fa
else
echo "Creating keycloak auth source"
$BIN -c "$APPINI" admin auth add-oauth \
--name keycloak \
--provider openidConnect \
--key "$CLIENT_ID" \
--secret "$CLIENT_SECRET" \
--auto-discover-url "$DISCOVERY_URL" \
--scopes "openid profile email groups" \
--required-claim-name "" \
--required-claim-value "" \
--group-claim-name groups \
--admin-group admin \
--skip-local-2fa
fi
volumeMounts:
- name: gitea-data
mountPath: /data
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
@ -55,6 +121,36 @@ spec:
value: "master"
- name: ROOT_URL
value: "https://scm.bstein.dev"
- name: GITEA__service__ENABLE_OPENID_SIGNIN
value: "true"
- name: GITEA__oauth2_client__ENABLE_AUTO_REGISTRATION
value: "true"
- name: GITEA__service__ALLOW_ONLY_EXTERNAL_REGISTRATION
value: "true"
- name: GITEA__service__DISABLE_REGISTRATION
value: "false"
- name: GITEA__log__LEVEL
value: "trace"
- name: GITEA__service__REQUIRE_SIGNIN_VIEW
value: "false"
- name: GITEA__server__PROXY_HEADERS
value: "X-Forwarded-For, X-Forwarded-Proto, X-Forwarded-Host"
- name: GITEA__session__COOKIE_SECURE
value: "true"
- name: GITEA__session__DOMAIN
value: "scm.bstein.dev"
- name: GITEA__session__SAME_SITE
value: "lax"
- name: GITEA__security__SECRET_KEY
valueFrom:
secretKeyRef:
name: gitea-secret
key: SECRET_KEY
- name: GITEA__security__INTERNAL_TOKEN
valueFrom:
secretKeyRef:
name: gitea-secret
key: INTERNAL_TOKEN
- name: DB_TYPE
value: "postgres"
- name: DB_HOST

View File

@ -5,7 +5,7 @@ metadata:
name: gitea-ingress
namespace: gitea
annotations:
cert-manager.io/cluster-issuer: "letsencrypt-prod"
cert-manager.io/cluster-issuer: letsencrypt
nginx.ingress.kubernetes.io/ssl-redirect: "true"
spec:
tls:

View File

@ -0,0 +1,13 @@
# services/gitops-ui/certificate.yaml
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: gitops-ui-tls
namespace: flux-system
spec:
secretName: gitops-ui-tls
issuerRef:
kind: ClusterIssuer
name: letsencrypt
dnsNames:
- cd.bstein.dev

View File

@ -0,0 +1,48 @@
# services/gitops-ui/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: weave-gitops
namespace: flux-system
spec:
interval: 30m
chart:
spec:
chart: ./charts/gitops-server
sourceRef:
kind: GitRepository
name: weave-gitops-upstream
namespace: flux-system
# track upstream tag; see source object for version pin
install:
remediation:
retries: 3
upgrade:
remediation:
retries: 3
remediateLastFailure: true
cleanupOnFail: true
values:
additionalArgs:
- --auth-methods=oidc
adminUser:
create: false
ingress:
enabled: true
className: traefik
annotations:
cert-manager.io/cluster-issuer: letsencrypt
traefik.ingress.kubernetes.io/router.entrypoints: websecure
hosts:
- host: cd.bstein.dev
paths:
- path: /
pathType: Prefix
tls:
- secretName: gitops-ui-tls
hosts:
- cd.bstein.dev
oidcSecret:
create: false
metrics:
enabled: true
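A convergence check, as a sketch with the flux CLI (the chart is vendored from the GitRepository defined in source.yaml, so reconciling the source first picks up tag changes):

flux reconcile source git weave-gitops-upstream -n flux-system
flux get helmreleases -n flux-system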

View File

@ -0,0 +1,10 @@
# services/gitops-ui/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: flux-system
resources:
- source.yaml
- helmrelease.yaml
- certificate.yaml
- networkpolicy-acme.yaml
- rbac.yaml

View File

@ -0,0 +1,14 @@
# services/gitops-ui/networkpolicy-acme.yaml
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: allow-acme-solver
namespace: flux-system
spec:
podSelector:
matchLabels:
acme.cert-manager.io/http01-solver: "true"
policyTypes:
- Ingress
ingress:
- {}

View File

@ -0,0 +1,15 @@
# services/gitops-ui/rbac.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: gitops-admins
labels:
app.kubernetes.io/name: weave-gitops
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: cluster-admin
subjects:
- kind: Group
name: admin
apiGroup: rbac.authorization.k8s.io

View File

@ -0,0 +1,11 @@
# services/gitops-ui/source.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
name: weave-gitops-upstream
namespace: flux-system
spec:
interval: 1h
url: https://github.com/weaveworks/weave-gitops.git
ref:
tag: v0.38.0

View File

@ -0,0 +1,12 @@
# services/harbor/certificate.yaml
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: registry-bstein-dev
namespace: harbor
spec:
secretName: registry-bstein-dev-tls
dnsNames: [ "registry.bstein.dev" ]
issuerRef:
name: letsencrypt
kind: ClusterIssuer

View File

@ -0,0 +1,259 @@
# services/harbor/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: harbor
namespace: harbor
spec:
interval: 10m
install:
timeout: 20m
remediation:
retries: 3
upgrade:
timeout: 20m
remediation:
retries: 3
remediateLastFailure: true
cleanupOnFail: true
rollback:
timeout: 20m
chart:
spec:
chart: harbor
version: 1.18.1
sourceRef:
kind: HelmRepository
name: harbor
namespace: flux-system
values:
externalURL: https://registry.bstein.dev
imagePullPolicy: IfNotPresent
expose:
type: ingress
tls:
enabled: true
certSource: secret
secret:
secretName: registry-bstein-dev-tls
ingress:
className: traefik
annotations:
cert-manager.io/cluster-issuer: letsencrypt
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.tls: "true"
hosts:
core: registry.bstein.dev
persistence:
enabled: true
resourcePolicy: keep
persistentVolumeClaim:
registry:
existingClaim: harbor-registry
accessMode: ReadWriteOnce
size: 50Gi
jobservice:
jobLog:
existingClaim: harbor-jobservice-logs
accessMode: ReadWriteOnce
size: 5Gi
imageChartStorage:
type: filesystem
filesystem:
rootdirectory: /storage
database:
type: external
external:
host: postgres-service.postgres.svc.cluster.local
port: "5432"
username: harbor
coreDatabase: harbor
existingSecret: harbor-db
sslmode: disable
redis:
type: internal
internal:
image:
repository: registry.bstein.dev/infra/harbor-redis
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-redis:tag"}
nodeSelector:
kubernetes.io/hostname: titan-05
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
trivy:
enabled: false
metrics:
enabled: false
cache:
enabled: false
existingSecretAdminPassword: harbor-core
existingSecretAdminPasswordKey: harbor_admin_password
existingSecretSecretKey: harbor-core
core:
image:
repository: registry.bstein.dev/infra/harbor-core
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-core:tag"}
nodeSelector:
kubernetes.io/hostname: titan-05
existingSecret: harbor-core
existingXsrfSecret: harbor-core
existingXsrfSecretKey: CSRF_KEY
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
jobservice:
image:
repository: registry.bstein.dev/infra/harbor-jobservice
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-jobservice:tag"}
nodeSelector:
kubernetes.io/hostname: titan-05
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
portal:
image:
repository: registry.bstein.dev/infra/harbor-portal
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-portal:tag"}
nodeSelector:
kubernetes.io/hostname: titan-05
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
registry:
registry:
image:
repository: registry.bstein.dev/infra/harbor-registry
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-registry:tag"}
controller:
image:
repository: registry.bstein.dev/infra/harbor-registryctl
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-registryctl:tag"}
nodeSelector:
kubernetes.io/hostname: titan-05
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
nginx:
image:
repository: registry.bstein.dev/infra/harbor-nginx
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-nginx:tag"}
nodeSelector:
kubernetes.io/hostname: titan-05
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
prepare:
image:
repository: registry.bstein.dev/infra/harbor-prepare
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-prepare:tag"}
updateStrategy:
type: Recreate
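Each tag above carries an imagepolicy marker so image-automation-controller can bump the tag strings in place; a status sketch with the flux CLI:

flux get images all -n harbor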

192
services/harbor/image.yaml Normal file
View File

@ -0,0 +1,192 @@
# services/harbor/image.yaml
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: harbor-core
namespace: harbor
spec:
image: registry.bstein.dev/infra/harbor-core
interval: 5m0s
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: harbor-core
namespace: harbor
spec:
imageRepositoryRef:
name: harbor-core
filterTags:
pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$'
extract: '$version'
policy:
semver:
range: ">=2.14.0-0 <2.15.0-0"
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: harbor-jobservice
namespace: harbor
spec:
image: registry.bstein.dev/infra/harbor-jobservice
interval: 5m0s
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: harbor-jobservice
namespace: harbor
spec:
imageRepositoryRef:
name: harbor-jobservice
filterTags:
pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$'
extract: '$version'
policy:
semver:
range: ">=2.14.0-0 <2.15.0-0"
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: harbor-portal
namespace: harbor
spec:
image: registry.bstein.dev/infra/harbor-portal
interval: 5m0s
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: harbor-portal
namespace: harbor
spec:
imageRepositoryRef:
name: harbor-portal
filterTags:
pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$'
extract: '$version'
policy:
semver:
range: ">=2.14.0-0 <2.15.0-0"
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: harbor-registry
namespace: harbor
spec:
image: registry.bstein.dev/infra/harbor-registry
interval: 5m0s
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: harbor-registry
namespace: harbor
spec:
imageRepositoryRef:
name: harbor-registry
filterTags:
pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$'
extract: '$version'
policy:
semver:
range: ">=2.14.0-0 <2.15.0-0"
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: harbor-registryctl
namespace: harbor
spec:
image: registry.bstein.dev/infra/harbor-registryctl
interval: 5m0s
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: harbor-registryctl
namespace: harbor
spec:
imageRepositoryRef:
name: harbor-registryctl
filterTags:
pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$'
extract: '$version'
policy:
semver:
range: ">=2.14.0-0 <2.15.0-0"
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: harbor-redis
namespace: harbor
spec:
image: registry.bstein.dev/infra/harbor-redis
interval: 5m0s
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: harbor-redis
namespace: harbor
spec:
imageRepositoryRef:
name: harbor-redis
filterTags:
pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$'
extract: '$version'
policy:
semver:
range: ">=2.14.0-0 <2.15.0-0"
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: harbor-nginx
namespace: harbor
spec:
image: registry.bstein.dev/infra/harbor-nginx
interval: 5m0s
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: harbor-nginx
namespace: harbor
spec:
imageRepositoryRef:
name: harbor-nginx
filterTags:
pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$'
extract: '$version'
policy:
semver:
range: ">=2.14.0-0 <2.15.0-0"
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: harbor-prepare
namespace: harbor
spec:
image: registry.bstein.dev/infra/harbor-prepare
interval: 5m0s
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: harbor-prepare
namespace: harbor
spec:
imageRepositoryRef:
name: harbor-prepare
filterTags:
pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$'
extract: '$version'
policy:
semver:
range: ">=2.14.0-0 <2.15.0-0"

View File

@ -0,0 +1,10 @@
# services/harbor/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: harbor
resources:
- namespace.yaml
- pvc.yaml
- certificate.yaml
- helmrelease.yaml
- image.yaml

View File

@ -0,0 +1,5 @@
# services/harbor/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: harbor

24
services/harbor/pvc.yaml Normal file
View File

@ -0,0 +1,24 @@
# services/harbor/pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: harbor-registry
namespace: harbor
spec:
accessModes: [ "ReadWriteOnce" ]
resources:
requests:
storage: 50Gi
storageClassName: astreae
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: harbor-jobservice-logs
namespace: harbor
spec:
accessModes: [ "ReadWriteOnce" ]
resources:
requests:
storage: 5Gi
storageClassName: astreae

View File

@ -0,0 +1,314 @@
# services/jenkins/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: jenkins
namespace: jenkins
spec:
interval: 30m
chart:
spec:
chart: jenkins
version: 5.8.114
sourceRef:
kind: HelmRepository
name: jenkins
namespace: flux-system
install:
timeout: 15m
remediation:
retries: 3
upgrade:
timeout: 15m
remediation:
retries: 3
remediateLastFailure: true
cleanupOnFail: true
rollback:
timeout: 15m
values:
controller:
nodeSelector:
hardware: rpi4
jenkinsUrl: https://ci.bstein.dev
ingress:
enabled: true
hostName: ci.bstein.dev
ingressClassName: traefik
annotations:
cert-manager.io/cluster-issuer: letsencrypt
traefik.ingress.kubernetes.io/router.entrypoints: websecure
tls:
- secretName: jenkins-tls
hosts:
- ci.bstein.dev
installPlugins:
- kubernetes
- workflow-aggregator
- git
- configuration-as-code
- oic-auth
- job-dsl
- configuration-as-code-support
containerEnv:
- name: ENABLE_OIDC
value: "true"
- name: OIDC_ISSUER
value: "https://sso.bstein.dev/realms/atlas"
- name: OIDC_CLIENT_ID
valueFrom:
secretKeyRef:
name: jenkins-oidc
key: clientId
- name: OIDC_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: jenkins-oidc
key: clientSecret
- name: OIDC_AUTH_URL
valueFrom:
secretKeyRef:
name: jenkins-oidc
key: authorizationUrl
- name: OIDC_TOKEN_URL
valueFrom:
secretKeyRef:
name: jenkins-oidc
key: tokenUrl
- name: OIDC_USERINFO_URL
valueFrom:
secretKeyRef:
name: jenkins-oidc
key: userInfoUrl
- name: OIDC_LOGOUT_URL
valueFrom:
secretKeyRef:
name: jenkins-oidc
key: logoutUrl
- name: GITEA_PAT_USERNAME
valueFrom:
secretKeyRef:
name: gitea-pat
key: username
- name: GITEA_PAT_TOKEN
valueFrom:
secretKeyRef:
name: gitea-pat
key: token
customInitContainers:
- name: clean-jcasc-stale
image: alpine:3.20
imagePullPolicy: IfNotPresent
command:
- sh
- -c
- |
set -euo pipefail
rm -f /var/jenkins_home/casc_configs/* || true
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 1000
volumeMounts:
- name: jenkins-home
mountPath: /var/jenkins_home
initScripts:
oidc.groovy: |
import hudson.util.Secret
import jenkins.model.IdStrategy
import jenkins.model.Jenkins
import org.jenkinsci.plugins.oic.OicSecurityRealm
import org.jenkinsci.plugins.oic.OicServerWellKnownConfiguration
import hudson.security.FullControlOnceLoggedInAuthorizationStrategy
def env = System.getenv()
if (!(env['ENABLE_OIDC'] ?: 'false').toBoolean()) {
println("OIDC disabled (ENABLE_OIDC=false); keeping default security realm")
return
}
def required = ['OIDC_CLIENT_ID','OIDC_CLIENT_SECRET','OIDC_ISSUER']
if (!required.every { env[it] }) {
throw new IllegalStateException("OIDC enabled but missing vars: ${required.findAll { !env[it] }}")
}
try {
def wellKnown = "${env['OIDC_ISSUER']}/.well-known/openid-configuration"
def serverCfg = new OicServerWellKnownConfiguration(wellKnown)
serverCfg.setScopesOverride('openid profile email')
def realm = new OicSecurityRealm(
env['OIDC_CLIENT_ID'],
Secret.fromString(env['OIDC_CLIENT_SECRET']),
serverCfg,
false,
IdStrategy.CASE_INSENSITIVE,
IdStrategy.CASE_INSENSITIVE
)
realm.createProxyAwareResourceRetriver()
realm.setLogoutFromOpenidProvider(true)
realm.setPostLogoutRedirectUrl('https://ci.bstein.dev')
realm.setUserNameField('preferred_username')
realm.setFullNameFieldName('name')
realm.setEmailFieldName('email')
realm.setGroupsFieldName('groups')
realm.setRootURLFromRequest(true)
realm.setSendScopesInTokenRequest(true)
def j = Jenkins.get()
j.setSecurityRealm(realm)
def auth = new FullControlOnceLoggedInAuthorizationStrategy()
auth.setAllowAnonymousRead(false)
j.setAuthorizationStrategy(auth)
j.save()
println("Configured OIDC realm from init script (well-known)")
} catch (Exception e) {
println("Failed to configure OIDC realm: ${e}")
throw e
}
JCasC:
defaultConfig: false
securityRealm: ""
authorizationStrategy: ""
configScripts:
base.yaml: |
jenkins:
disableRememberMe: false
mode: NORMAL
numExecutors: 0
labelString: ""
projectNamingStrategy: "standard"
markupFormatter: "plainText"
clouds:
- kubernetes:
containerCapStr: "10"
defaultsProviderTemplate: ""
connectTimeout: "5"
readTimeout: "15"
jenkinsUrl: "http://jenkins.jenkins.svc.cluster.local:8080"
jenkinsTunnel: "jenkins-agent.jenkins.svc.cluster.local:50000"
skipTlsVerify: false
usageRestricted: false
maxRequestsPerHostStr: "32"
retentionTimeout: "5"
waitForPodSec: "600"
name: "kubernetes"
namespace: "jenkins"
restrictedPssSecurityContext: false
serverUrl: "https://kubernetes.default"
credentialsId: ""
podLabels:
- key: "jenkins/jenkins-jenkins-agent"
value: "true"
templates:
- name: "default"
namespace: "jenkins"
id: a23c9bbcd21e360a77d51b426f05bd7b8032d8fdedd6ffb97c436883ce6c5ffa
containers:
- name: "jnlp"
alwaysPullImage: false
args: "^${computer.jnlpmac} ^${computer.name}"
envVars:
- envVar:
key: "JENKINS_URL"
value: "http://jenkins.jenkins.svc.cluster.local:8080/"
image: "jenkins/inbound-agent:3355.v388858a_47b_33-3"
privileged: "false"
resourceLimitCpu: 512m
resourceLimitMemory: 512Mi
resourceRequestCpu: 512m
resourceRequestMemory: 512Mi
ttyEnabled: false
workingDir: /home/jenkins/agent
idleMinutes: 0
instanceCap: 2147483647
label: "jenkins-jenkins-agent "
nodeUsageMode: "NORMAL"
podRetention: Never
showRawYaml: true
serviceAccount: "default"
slaveConnectTimeoutStr: "100"
yamlMergeStrategy: override
inheritYamlMergeStrategy: false
slaveAgentPort: 50000
crumbIssuer:
standard:
excludeClientIPFromCrumb: true
security:
apiToken:
creationOfLegacyTokenEnabled: false
tokenGenerationOnCreationEnabled: false
usageStatisticsEnabled: true
unclassified:
creds.yaml: |
credentials:
system:
domainCredentials:
- credentials:
- usernamePassword:
scope: GLOBAL
id: gitea-pat
username: "${GITEA_PAT_USERNAME}"
password: "${GITEA_PAT_TOKEN}"
description: "Gitea PAT for pipelines"
jobs.yaml: |
jobs:
- script: |
pipelineJob('harbor-arm-build') {
triggers {
scm('H/5 * * * *')
}
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/harbor-arm-build.git')
credentials('gitea-pat')
}
branches('*/master')
}
}
}
}
}
pipelineJob('ci-demo') {
triggers {
scm('H/1 * * * *')
}
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/ci-demo.git')
credentials('gitea-pat')
}
branches('*/master')
}
}
scriptPath('Jenkinsfile')
}
}
}
pipelineJob('bstein-dev-home') {
triggers {
scm('H/2 * * * *')
}
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/bstein-dev-home.git')
credentials('gitea-pat')
}
branches('*/master')
}
}
scriptPath('Jenkinsfile')
}
}
}
persistence:
enabled: true
storageClass: astreae
size: 50Gi
serviceAccount:
create: true
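To confirm the Groovy init script configured the realm after a rollout, a sketch (workload names assume the chart defaults for this release):

kubectl -n jenkins rollout status statefulset/jenkins
kubectl -n jenkins logs statefulset/jenkins -c jenkins | grep -F 'Configured OIDC realm'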

View File

@ -0,0 +1,7 @@
# services/jenkins/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: jenkins
resources:
- namespace.yaml
- helmrelease.yaml

View File

@ -0,0 +1,5 @@
# services/jenkins/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: jenkins

View File

@ -5,7 +5,7 @@ metadata:
name: jitsi
namespace: jitsi
annotations:
cert-manager.io/cluster-issuer: "letsencrypt-prod"
cert-manager.io/cluster-issuer: letsencrypt
spec:
ingressClassName: traefik
tls:

View File

@ -0,0 +1,152 @@
# services/keycloak/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: keycloak
namespace: sso
labels:
app: keycloak
spec:
replicas: 1
selector:
matchLabels:
app: keycloak
template:
metadata:
labels:
app: keycloak
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values: ["rpi5","rpi4"]
- key: node-role.kubernetes.io/worker
operator: Exists
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values: ["titan-24"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 70
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
securityContext:
runAsUser: 1000
runAsGroup: 0
fsGroup: 1000
fsGroupChangePolicy: OnRootMismatch
initContainers:
- name: mailu-http-listener
image: registry.bstein.dev/sso/mailu-http-listener:0.1.0
imagePullPolicy: IfNotPresent
command: ["/bin/sh", "-c"]
args:
- |
cp /plugin/mailu-http-listener-0.1.0.jar /providers/
cp -r /plugin/src /providers/src
volumeMounts:
- name: providers
mountPath: /providers
containers:
- name: keycloak
image: quay.io/keycloak/keycloak:26.0.7
imagePullPolicy: IfNotPresent
args:
- start
env:
- name: KC_DB
value: postgres
- name: KC_DB_URL_HOST
value: postgres-service.postgres.svc.cluster.local
- name: KC_DB_URL_DATABASE
valueFrom:
secretKeyRef:
name: keycloak-db
key: database
- name: KC_DB_USERNAME
valueFrom:
secretKeyRef:
name: keycloak-db
key: username
- name: KC_DB_PASSWORD
valueFrom:
secretKeyRef:
name: keycloak-db
key: password
- name: KC_DB_SCHEMA
value: public
- name: KC_HOSTNAME
value: sso.bstein.dev
- name: KC_HOSTNAME_URL
value: https://sso.bstein.dev
- name: KC_PROXY
value: edge
- name: KC_PROXY_HEADERS
value: xforwarded
- name: KC_HTTP_ENABLED
value: "true"
- name: KC_HTTP_MANAGEMENT_PORT
value: "9000"
- name: KC_HTTP_MANAGEMENT_BIND_ADDRESS
value: 0.0.0.0
- name: KC_HEALTH_ENABLED
value: "true"
- name: KC_METRICS_ENABLED
value: "true"
- name: KEYCLOAK_ADMIN
valueFrom:
secretKeyRef:
name: keycloak-admin
key: username
- name: KEYCLOAK_ADMIN_PASSWORD
valueFrom:
secretKeyRef:
name: keycloak-admin
key: password
- name: KC_EVENTS_LISTENERS
value: jboss-logging,mailu-http
- name: KC_SPI_EVENTS_LISTENER_MAILU-HTTP_ENDPOINT
value: http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events
ports:
- containerPort: 8080
name: http
- containerPort: 9000
name: metrics
readinessProbe:
httpGet:
path: /health/ready
port: 9000
initialDelaySeconds: 15
periodSeconds: 10
failureThreshold: 6
livenessProbe:
httpGet:
path: /health/live
port: 9000
initialDelaySeconds: 60
periodSeconds: 15
failureThreshold: 6
volumeMounts:
- name: data
mountPath: /opt/keycloak/data
- name: providers
mountPath: /opt/keycloak/providers
volumes:
- name: data
persistentVolumeClaim:
claimName: keycloak-data
- name: providers
emptyDir: {}
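
The mailu-http events listener wired in above (KC_EVENTS_LISTENERS plus the SPI endpoint env var) pushes Keycloak events to the mailu-sync-listener service defined later in services/mailu/mailu-sync-listener.yaml. Its contract is deliberately thin: any valid-JSON POST returns 202 and triggers a debounced sync. A minimal in-cluster smoke test, assuming only requests; the payload shape is illustrative, since the listener only checks that the body parses:

import requests

resp = requests.post(
    "http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events",
    json={"type": "REGISTER", "realmId": "atlas"},  # illustrative payload; content is ignored
    timeout=5,
)
assert resp.status_code == 202  # accepted; sync.py runs asynchronously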

View File

@ -0,0 +1,24 @@
# services/keycloak/ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: keycloak
namespace: sso
annotations:
cert-manager.io/cluster-issuer: letsencrypt
spec:
ingressClassName: traefik
rules:
- host: sso.bstein.dev
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: keycloak
port:
number: 80
tls:
- hosts: [sso.bstein.dev]
secretName: keycloak-tls

View File

@ -1,11 +1,10 @@
# services/zot/kustomization.yaml
# services/keycloak/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: sso
resources:
- namespace.yaml
- pvc.yaml
- deployment.yaml
- configmap.yaml
- service.yaml
- ingress.yaml
- middleware.yaml

View File

@ -0,0 +1,5 @@
# services/keycloak/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: sso

View File

@ -0,0 +1,12 @@
# services/keycloak/pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: keycloak-data
namespace: sso
spec:
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 10Gi
storageClassName: astreae

View File

@ -0,0 +1,15 @@
# services/keycloak/service.yaml
apiVersion: v1
kind: Service
metadata:
name: keycloak
namespace: sso
labels:
app: keycloak
spec:
selector:
app: keycloak
ports:
- name: http
port: 80
targetPort: http

View File

@ -0,0 +1,13 @@
# services/mailu/certificate.yaml
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: mailu-tls
namespace: mailu-mailserver
spec:
secretName: mailu-certificates
issuerRef:
kind: ClusterIssuer
name: letsencrypt-prod
dnsNames:
- mail.bstein.dev

View File

@ -0,0 +1,287 @@
# services/mailu/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: mailu
namespace: mailu-mailserver
spec:
interval: 30m
chart:
spec:
chart: mailu
version: 2.1.2
sourceRef:
kind: HelmRepository
name: mailu
namespace: flux-system
install:
remediation: { retries: 3 }
timeout: 10m
upgrade:
remediation:
retries: 3
remediateLastFailure: true
cleanupOnFail: true
timeout: 10m
values:
mailuVersion: "2024.06"
domain: bstein.dev
hostnames: [mail.bstein.dev]
domains:
- name: bstein.dev
enabled: true
dkim:
enabled: true
externalRelay:
host: "[email-smtp.us-east-2.amazonaws.com]:587"
existingSecret: mailu-ses-relay
usernameKey: relay-username
passwordKey: relay-password
timezone: Etc/UTC
subnet: 10.42.0.0/16
existingSecret: mailu-secret
tls:
outboundLevel: encrypt
externalDatabase:
enabled: true
type: postgresql
host: postgres-service.postgres.svc.cluster.local
port: 5432
database: mailu
username: mailu
existingSecret: mailu-db-secret
existingSecretUsernameKey: username
existingSecretPasswordKey: password
existingSecretDatabaseKey: database
initialAccount:
enabled: true
username: test
domain: bstein.dev
existingSecret: mailu-initial-account-secret
existingSecretPasswordKey: password
persistence:
accessModes: [ReadWriteMany]
size: 100Gi
storageClass: astreae
single_pvc: true
front:
hostnames: [mail.bstein.dev]
proxied: true
hostPort:
enabled: false
https:
enabled: false
external: false
forceHttps: false
externalService:
enabled: true
type: LoadBalancer
externalTrafficPolicy: Cluster
ports:
submission: true
nodePorts:
pop3: 30010
pop3s: 30011
imap: 30143
imaps: 30993
manageSieve: 30419
smtp: 30025
smtps: 30465
submission: 30587
logLevel: DEBUG
nodeSelector:
hardware: rpi4
admin:
logLevel: DEBUG
nodeSelector:
hardware: rpi4
podLivenessProbe:
enabled: true
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 6
successThreshold: 1
podReadinessProbe:
enabled: true
initialDelaySeconds: 20
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 6
successThreshold: 1
extraEnvVars:
- name: FLASK_DEBUG
value: "1"
- name: ACCESSLOG
value: /dev/stdout
- name: ERRORLOG
value: /dev/stderr
- name: WEBROOT_REDIRECT
value: ""
- name: FORWARDED_ALLOW_IPS
value: 127.0.0.1,10.42.0.0/16
- name: DNS_RESOLVERS
value: 1.1.1.1,9.9.9.9
extraVolumes:
- name: unbound-config
configMap:
name: mailu-unbound
- name: unbound-run
emptyDir: {}
extraVolumeMounts:
- name: unbound-run
mountPath: /var/lib/unbound
extraContainers:
- name: unbound
image: docker.io/alpine:3.20
command: ["/bin/sh", "-c"]
args:
- |
while :; do
printf "nameserver 10.43.0.10\n" > /etc/resolv.conf
if apk add --no-cache unbound bind-tools; then
break
fi
echo "apk failed, retrying" >&2
sleep 10
done
cat >/etc/resolv.conf <<'EOF'
search mailu-mailserver.svc.cluster.local svc.cluster.local cluster.local
nameserver 127.0.0.1
EOF
unbound-anchor -a /var/lib/unbound/root.key || true
exec unbound -d -c /opt/unbound/etc/unbound/unbound.conf
ports:
- containerPort: 53
protocol: UDP
- containerPort: 53
protocol: TCP
volumeMounts:
- name: unbound-config
mountPath: /opt/unbound/etc/unbound
- name: unbound-run
mountPath: /var/lib/unbound
dnsPolicy: None
dnsConfig:
nameservers:
- 127.0.0.1
searches:
- mailu-mailserver.svc.cluster.local
- svc.cluster.local
- cluster.local
clamav:
image:
repository: clamav/clamav-debian
tag: "1.4"
logLevel: DEBUG
nodeSelector:
hardware: rpi5
resources:
requests:
cpu: 200m
memory: 1Gi
limits:
cpu: 500m
memory: 3Gi
livenessProbe:
enabled: false
initialDelaySeconds: 300
periodSeconds: 30
timeoutSeconds: 5
failureThreshold: 6
successThreshold: 1
startupProbe:
enabled: false
initialDelaySeconds: 60
periodSeconds: 30
timeoutSeconds: 5
failureThreshold: 20
successThreshold: 1
readinessProbe:
enabled: false
initialDelaySeconds: 300
periodSeconds: 30
timeoutSeconds: 5
failureThreshold: 6
successThreshold: 1
dovecot:
logLevel: DEBUG
nodeSelector:
hardware: rpi4
oletools:
logLevel: DEBUG
nodeSelector:
hardware: rpi4
postfix:
logLevel: DEBUG
nodeSelector:
hardware: rpi4
overrides:
smtp_use_tls: "yes"
smtp_tls_security_level: "encrypt"
smtp_sasl_security_options: "noanonymous"
redis:
enabled: true
architecture: standalone
logLevel: DEBUG
image:
repository: bitnamilegacy/redis
tag: 8.0.3-debian-12-r3
master:
nodeSelector:
hardware: rpi4
persistence:
enabled: true
accessModes: [ReadWriteMany]
size: 8Gi
storageClass: astreae
rspamd:
logLevel: DEBUG
nodeSelector:
hardware: rpi4
persistence:
accessModes: [ReadWriteOnce]
size: 8Gi
storageClass: astreae
tika:
logLevel: DEBUG
nodeSelector:
hardware: rpi4
global:
logLevel: DEBUG
storageClass: astreae
webmail:
enabled: false
nodeSelector:
hardware: rpi4
ingress:
enabled: false
ingressClassName: traefik
tls: true
existingSecret: mailu-certificates
annotations:
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/service.serversscheme: https
traefik.ingress.kubernetes.io/service.serverstransport: mailu-transport@kubernetescrd
extraRules:
- host: mail.bstein.dev
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: mailu-front
port:
number: 443
service:
ports:
smtp:
port: 25
targetPort: 25
smtps:
port: 465
targetPort: 465
submission:
port: 587
targetPort: 587

View File

@ -0,0 +1,19 @@
# services/mailu/ingressroute.yaml
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
name: mailu
namespace: mailu-mailserver
spec:
entryPoints:
- websecure
routes:
- match: Host(`mail.bstein.dev`)
kind: Rule
services:
- name: mailu-front
port: 443
scheme: https
serversTransport: mailu-transport
tls:
secretName: mailu-certificates

View File

@ -0,0 +1,23 @@
# services/mailu/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: mailu-mailserver
resources:
- namespace.yaml
- helmrelease.yaml
- certificate.yaml
- vip-controller.yaml
- unbound-configmap.yaml
- serverstransport.yaml
- ingressroute.yaml
- mailu-sync-job.yaml
- mailu-sync-cronjob.yaml
- mailu-sync-listener.yaml
configMapGenerator:
- name: mailu-sync-script
namespace: mailu-mailserver
files:
- sync.py=../../scripts/mailu_sync.py
options:
disableNameSuffixHash: true

View File

@ -0,0 +1,77 @@
# services/mailu/mailu-sync-cronjob.yaml
apiVersion: batch/v1
kind: CronJob
metadata:
name: mailu-sync-nightly
namespace: mailu-mailserver
spec:
schedule: "30 4 * * *"
concurrencyPolicy: Forbid
jobTemplate:
spec:
template:
spec:
restartPolicy: OnFailure
containers:
- name: mailu-sync
image: python:3.11-alpine
imagePullPolicy: IfNotPresent
command: ["/bin/sh", "-c"]
args:
- |
pip install --no-cache-dir requests psycopg2-binary passlib >/tmp/pip.log \
&& python /app/sync.py
env:
- name: KEYCLOAK_BASE_URL
value: http://keycloak.sso.svc.cluster.local
- name: KEYCLOAK_REALM
value: atlas
- name: MAILU_DOMAIN
value: bstein.dev
- name: MAILU_DEFAULT_QUOTA
value: "20000000000"
- name: MAILU_DB_HOST
value: postgres-service.postgres.svc.cluster.local
- name: MAILU_DB_PORT
value: "5432"
- name: MAILU_DB_NAME
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: database
- name: MAILU_DB_USER
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: username
- name: MAILU_DB_PASSWORD
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: password
- name: KEYCLOAK_CLIENT_ID
valueFrom:
secretKeyRef:
name: mailu-sync-credentials
key: client-id
- name: KEYCLOAK_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: mailu-sync-credentials
key: client-secret
volumeMounts:
- name: sync-script
mountPath: /app/sync.py
subPath: sync.py
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
volumes:
- name: sync-script
configMap:
name: mailu-sync-script
defaultMode: 0444

View File

@ -0,0 +1,73 @@
# services/mailu/mailu-sync-job.yaml
apiVersion: batch/v1
kind: Job
metadata:
name: mailu-sync
namespace: mailu-mailserver
spec:
template:
spec:
restartPolicy: OnFailure
containers:
- name: mailu-sync
image: python:3.11-alpine
imagePullPolicy: IfNotPresent
command: ["/bin/sh", "-c"]
args:
- |
pip install --no-cache-dir requests psycopg2-binary passlib >/tmp/pip.log \
&& python /app/sync.py
env:
- name: KEYCLOAK_BASE_URL
value: http://keycloak.sso.svc.cluster.local
- name: KEYCLOAK_REALM
value: atlas
- name: MAILU_DOMAIN
value: bstein.dev
- name: MAILU_DEFAULT_QUOTA
value: "20000000000"
- name: MAILU_DB_HOST
value: postgres-service.postgres.svc.cluster.local
- name: MAILU_DB_PORT
value: "5432"
- name: MAILU_DB_NAME
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: database
- name: MAILU_DB_USER
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: username
- name: MAILU_DB_PASSWORD
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: password
- name: KEYCLOAK_CLIENT_ID
valueFrom:
secretKeyRef:
name: mailu-sync-credentials
key: client-id
- name: KEYCLOAK_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: mailu-sync-credentials
key: client-secret
volumeMounts:
- name: sync-script
mountPath: /app/sync.py
subPath: sync.py
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
volumes:
- name: sync-script
configMap:
name: mailu-sync-script
defaultMode: 0444
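
sync.py itself is not part of this diff, but its environment contract is fully visible in the Job and CronJob specs above. A minimal config-loading sketch under those assumptions (field names mirror the env vars; the internals of sync.py are not shown here):

import os
from dataclasses import dataclass

@dataclass
class SyncConfig:
    keycloak_base_url: str
    keycloak_realm: str
    mailu_domain: str
    default_quota: int
    db_host: str
    db_port: int

def load_config() -> SyncConfig:
    env = os.environ
    return SyncConfig(
        keycloak_base_url=env["KEYCLOAK_BASE_URL"],
        keycloak_realm=env["KEYCLOAK_REALM"],
        mailu_domain=env["MAILU_DOMAIN"],
        default_quota=int(env["MAILU_DEFAULT_QUOTA"]),  # 20000000000 bytes = 20 GB per mailbox
        db_host=env["MAILU_DB_HOST"],
        db_port=int(env["MAILU_DB_PORT"]),
    )

The database name, user, and password follow the same pattern via the mailu-db-secret keys, as do the Keycloak client credentials via mailu-sync-credentials.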

View File

@ -0,0 +1,154 @@
# services/mailu/mailu-sync-listener.yaml
apiVersion: v1
kind: Service
metadata:
name: mailu-sync-listener
namespace: mailu-mailserver
spec:
selector:
app: mailu-sync-listener
ports:
- name: http
port: 8080
targetPort: 8080
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: mailu-sync-listener
namespace: mailu-mailserver
labels:
app: mailu-sync-listener
spec:
replicas: 1
selector:
matchLabels:
app: mailu-sync-listener
template:
metadata:
labels:
app: mailu-sync-listener
spec:
restartPolicy: Always
containers:
- name: listener
image: python:3.11-alpine
imagePullPolicy: IfNotPresent
command: ["/bin/sh", "-c"]
args:
- |
pip install --no-cache-dir requests psycopg2-binary passlib >/tmp/pip.log \
&& python /app/listener.py
env:
- name: KEYCLOAK_BASE_URL
value: http://keycloak.sso.svc.cluster.local
- name: KEYCLOAK_REALM
value: atlas
- name: MAILU_DOMAIN
value: bstein.dev
- name: MAILU_DEFAULT_QUOTA
value: "20000000000"
- name: MAILU_DB_HOST
value: postgres-service.postgres.svc.cluster.local
- name: MAILU_DB_PORT
value: "5432"
- name: MAILU_DB_NAME
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: database
- name: MAILU_DB_USER
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: username
- name: MAILU_DB_PASSWORD
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: password
- name: KEYCLOAK_CLIENT_ID
valueFrom:
secretKeyRef:
name: mailu-sync-credentials
key: client-id
- name: KEYCLOAK_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: mailu-sync-credentials
key: client-secret
volumeMounts:
- name: sync-script
mountPath: /app/sync.py
subPath: sync.py
- name: listener-script
mountPath: /app/listener.py
subPath: listener.py
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
volumes:
- name: sync-script
configMap:
name: mailu-sync-script
defaultMode: 0444
- name: listener-script
configMap:
name: mailu-sync-listener
defaultMode: 0444
---
apiVersion: v1
kind: ConfigMap
metadata:
name: mailu-sync-listener
namespace: mailu-mailserver
data:
listener.py: |
import http.server
import json
import subprocess
import threading
from time import time

# Simple debounce to avoid hammering on bursts
MIN_INTERVAL_SECONDS = 10
last_run = 0.0
lock = threading.Lock()

def trigger_sync():
    global last_run
    with lock:
        now = time()
        if now - last_run < MIN_INTERVAL_SECONDS:
            return
        last_run = now
    # Fire and forget; inherit the container's stdout so sync output lands in pod logs
    # (piping stdout without reading it could block sync.py once the pipe buffer fills)
    subprocess.Popen(["python", "/app/sync.py"], stderr=subprocess.STDOUT)

class Handler(http.server.BaseHTTPRequestHandler):
    def do_POST(self):
        length = int(self.headers.get("Content-Length", 0))
        body = self.rfile.read(length) if length else b""
        try:
            json.loads(body or b"{}")
        except json.JSONDecodeError:
            self.send_response(400)
            self.end_headers()
            return
        trigger_sync()
        self.send_response(202)
        self.end_headers()

    def log_message(self, fmt, *args):
        # Quiet logging
        return

if __name__ == "__main__":
    server = http.server.ThreadingHTTPServer(("", 8080), Handler)
    server.serve_forever()

View File

@ -0,0 +1,5 @@
# services/mailu/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: mailu-mailserver

View File

@ -0,0 +1,10 @@
# services/mailu/serverstransport.yaml
apiVersion: traefik.io/v1alpha1
kind: ServersTransport
metadata:
name: mailu-transport
namespace: mailu-mailserver
spec:
# Force SNI to mail.bstein.dev and skip backend cert verification (backend cert is for the host, not the pod IP).
serverName: mail.bstein.dev
insecureSkipVerify: true
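
The ServersTransport above exists because Traefik dials the mailu-front pod by IP while the pod serves the mail.bstein.dev certificate; pinning serverName keeps SNI correct, and insecureSkipVerify tolerates the resulting hostname mismatch. To see what the backend actually presents for that SNI, a minimal sketch using only the standard library (mirroring the skip-verify behavior):

import socket
import ssl

HOST = "mailu-front.mailu-mailserver.svc.cluster.local"
context = ssl.create_default_context()
context.check_hostname = False  # mirror insecureSkipVerify
context.verify_mode = ssl.CERT_NONE
with socket.create_connection((HOST, 443), timeout=5) as sock:
    with context.wrap_socket(sock, server_hostname="mail.bstein.dev") as tls:
        der = tls.getpeercert(binary_form=True)  # parsed form is empty under CERT_NONE
        print("negotiated", tls.version(), "- cert DER bytes:", len(der))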

View File

@ -0,0 +1,49 @@
# services/mailu/unbound-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: mailu-unbound
namespace: mailu-mailserver
data:
unbound.conf: |
server:
verbosity: 1
interface: 0.0.0.0
do-ip4: yes
do-ip6: no
do-udp: yes
do-tcp: yes
auto-trust-anchor-file: "/var/lib/unbound/root.key"
prefetch: yes
qname-minimisation: yes
harden-dnssec-stripped: yes
val-clean-additional: yes
domain-insecure: "mailu-mailserver.svc.cluster.local."
domain-insecure: "svc.cluster.local."
domain-insecure: "cluster.local."
cache-min-ttl: 120
cache-max-ttl: 86400
access-control: 0.0.0.0/0 allow
forward-zone:
name: "mailu-mailserver.svc.cluster.local."
forward-addr: 10.43.0.10
forward-no-cache: yes
forward-first: yes
forward-zone:
name: "svc.cluster.local."
forward-addr: 10.43.0.10
forward-no-cache: yes
forward-first: yes
forward-zone:
name: "cluster.local."
forward-addr: 10.43.0.10
forward-no-cache: yes
forward-first: yes
forward-zone:
name: "."
forward-addr: 9.9.9.9
forward-addr: 1.1.1.1
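
The forward-zone layout keeps cluster-internal names on CoreDNS (10.43.0.10, marked domain-insecure so DNSSEC validation is skipped for them) while everything else goes out via Quad9/Cloudflare with validation. A resolution spot-check from inside the front pod, a sketch assuming dnspython is available:

import dns.resolver  # assumption: pip install dnspython

r = dns.resolver.Resolver(configure=False)
r.nameservers = ["127.0.0.1"]  # the unbound sidecar
for qname in ("mailu-front.mailu-mailserver.svc.cluster.local.", "bstein.dev."):
    answer = r.resolve(qname, "A")
    print(qname, [a.address for a in answer])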

View File

@ -0,0 +1,71 @@
# services/mailu/vip-controller.yaml
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: vip-controller
namespace: mailu-mailserver
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: vip-controller-role
namespace: mailu-mailserver
rules:
- apiGroups: ["apps"]
resources: ["deployments"]
verbs: ["get", "list", "patch", "update"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: vip-controller-binding
namespace: mailu-mailserver
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: vip-controller-role
subjects:
- kind: ServiceAccount
name: vip-controller
namespace: mailu-mailserver
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: vip-controller
namespace: mailu-mailserver
spec:
selector:
matchLabels:
app: vip-controller
template:
metadata:
labels:
app: vip-controller
spec:
serviceAccountName: vip-controller
hostNetwork: true
nodeSelector:
mailu.bstein.dev/vip: "true"
containers:
- name: vip-controller
image: lachlanevenson/k8s-kubectl:latest
imagePullPolicy: IfNotPresent
command:
- /bin/sh
- -c
args:
- |
set -e
while true; do
if ip addr show end0 | grep -q 'inet 192\.168\.22\.9/32'; then
NODE=$(hostname)
echo "VIP found on node ${NODE}."
kubectl patch deployment mailu-front -n mailu-mailserver --type='merge' \
-p "{\"spec\":{\"template\":{\"spec\":{\"nodeSelector\":{\"kubernetes.io/hostname\":\"${NODE}\"}}}}}"
else
echo "No VIP on node ${HOSTNAME}."
fi
sleep 60
done
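
The DaemonSet repins mailu-front to whichever labelled node currently holds the 192.168.22.9 VIP on end0, so traffic terminating at the VIP lands on the same node as the front pod. The detection half of that loop in Python, a sketch assuming iproute2 on the host:

import socket
import subprocess

VIP = "192.168.22.9/32"

out = subprocess.run(
    ["ip", "-o", "addr", "show", "dev", "end0"],
    capture_output=True, text=True, check=True,
).stdout
if f"inet {VIP}" in out:
    print(f"VIP found on node {socket.gethostname()}.")
else:
    print("No VIP on this node.")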

View File

@ -0,0 +1,186 @@
{
"uid": "atlas-gpu",
"title": "Atlas GPU",
"folderUid": "atlas-internal",
"editable": true,
"panels": [
{
"id": 1,
"type": "piechart",
"title": "Namespace GPU Share",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"displayLabels": [],
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
}
},
{
"id": 2,
"type": "timeseries",
"title": "GPU Util by Namespace",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"targets": [
{
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 3,
"type": "timeseries",
"title": "GPU Util by Node",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"targets": [
{
"expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})",
"refId": "A",
"legendFormat": "{{Hostname}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 4,
"type": "table",
"title": "Top Pods by GPU Util",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
},
"targets": [
{
"expr": "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true,
"columnFilters": false
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
}
]
}
],
"time": {
"from": "now-12h",
"to": "now"
},
"annotations": {
"list": []
},
"schemaVersion": 39,
"style": "dark",
"tags": [
"atlas",
"gpu"
]
}
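
The Namespace GPU Share expression above is a single long PromQL line; when iterating on queries like it, hitting the datasource's HTTP API directly is faster than round-tripping through Grafana. A minimal sketch against the atlas-vm datasource, with a simplified query and a hypothetical in-cluster URL:

import requests

PROM_URL = "http://victoria-metrics.monitoring.svc.cluster.local:8428"  # hypothetical; use the real atlas-vm endpoint
query = 'sum by (namespace) (DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""})'

resp = requests.get(f"{PROM_URL}/api/v1/query", params={"query": query}, timeout=10)
resp.raise_for_status()
for result in resp.json()["data"]["result"]:
    print(result["metric"].get("namespace"), result["value"][1])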

View File

@ -0,0 +1,668 @@
{
"uid": "atlas-network",
"title": "Atlas Network",
"folderUid": "atlas-internal",
"editable": true,
"panels": [
{
"id": 1,
"type": "stat",
"title": "Ingress Success Rate (5m)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 0.995
},
{
"color": "yellow",
"value": 0.999
},
{
"color": "green",
"value": 0.9995
}
]
},
"unit": "percentunit",
"custom": {
"displayMode": "auto"
},
"decimals": 2
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 2,
"type": "stat",
"title": "Error Budget Burn (1h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 6,
"y": 0
},
"targets": [
{
"expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 4
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
},
"decimals": 2
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 3,
"type": "stat",
"title": "Error Budget Burn (6h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 12,
"y": 0
},
"targets": [
{
"expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 4
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
},
"decimals": 2
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "stat",
"title": "Edge P99 Latency (ms)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 18,
"y": 0
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 200
},
{
"color": "orange",
"value": 350
},
{
"color": "red",
"value": 500
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 5,
"type": "stat",
"title": "Ingress Traffic",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"y": 4
},
"targets": [
{
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "Bps",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 6,
"type": "stat",
"title": "Egress Traffic",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 8,
"y": 4
},
"targets": [
{
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "Bps",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 7,
"type": "stat",
"title": "Intra-Cluster Traffic",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 4
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "Bps",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 8,
"type": "timeseries",
"title": "Per-Node Throughput",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 8
},
"targets": [
{
"expr": "avg by (node) ((sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0) + sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "Bps"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 9,
"type": "table",
"title": "Top Namespaces",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 16
},
"targets": [
{
"expr": "topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]) + rate(container_network_receive_bytes_total{namespace!=\"\"}[5m])) by (namespace))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "Bps",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true,
"columnFilters": false
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
}
]
},
{
"id": 10,
"type": "table",
"title": "Top Pods",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 16
},
"targets": [
{
"expr": "topk(10, sum(rate(container_network_transmit_bytes_total{pod!=\"\"}[5m]) + rate(container_network_receive_bytes_total{pod!=\"\"}[5m])) by (namespace,pod))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "Bps",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true,
"columnFilters": false
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
}
]
},
{
"id": 11,
"type": "timeseries",
"title": "Traefik Routers (req/s)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 25
},
"targets": [
{
"expr": "topk(10, sum by (router) (rate(traefik_router_requests_total[5m])))",
"refId": "A",
"legendFormat": "{{router}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "req/s"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 12,
"type": "timeseries",
"title": "Traefik Entrypoints (req/s)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 25
},
"targets": [
{
"expr": "sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))",
"refId": "A",
"legendFormat": "{{entrypoint}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "req/s"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
}
}
],
"time": {
"from": "now-12h",
"to": "now"
},
"annotations": {
"list": []
},
"schemaVersion": 39,
"style": "dark",
"tags": [
"atlas",
"network"
]
}
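
The two burn-rate stats divide the observed error ratio by the 0.001 error budget of a 99.9% availability SLO, so 1.0 means errors arrive exactly fast enough to exhaust the budget over the SLO window, and the red threshold (4) corresponds to a fast burn. The arithmetic, as a small sketch:

# burn rate = (1 - availability) / (1 - SLO), with SLO = 99.9%
SLO = 0.999

def burn_rate(good: float, total: float) -> float:
    availability = good / max(total, 1.0)  # clamp_min analogue from the panel queries
    return (1.0 - availability) / (1.0 - SLO)

# e.g. 10 failed requests out of 2000 in the window:
print(round(burn_rate(1990.0, 2000.0), 2))  # 5.0 -> above the red threshold of 4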

View File

@ -0,0 +1,602 @@
{
"uid": "atlas-nodes",
"title": "Atlas Nodes",
"folderUid": "atlas-internal",
"editable": true,
"panels": [
{
"id": 1,
"type": "stat",
"title": "Worker Nodes Ready",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto",
"valueSuffix": "/18"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 2,
"type": "stat",
"title": "Control Plane Ready",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 8,
"y": 0
},
"targets": [
{
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto",
"valueSuffix": "/3"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 3,
"type": "stat",
"title": "Control Plane Workloads",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 0
},
"targets": [
{
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 9,
"type": "stat",
"title": "API Server 5xx rate",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"y": 4
},
"targets": [
{
"expr": "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 0.05
},
{
"color": "orange",
"value": 0.2
},
{
"color": "red",
"value": 0.5
}
]
},
"unit": "req/s",
"custom": {
"displayMode": "auto"
},
"decimals": 3
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 10,
"type": "stat",
"title": "API Server P99 latency",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 8,
"y": 4
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 250
},
{
"color": "orange",
"value": 400
},
{
"color": "red",
"value": 600
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 11,
"type": "stat",
"title": "etcd P99 latency",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 4
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "orange",
"value": 100
},
{
"color": "red",
"value": 200
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "timeseries",
"title": "Node CPU",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 24,
"x": 0,
"y": 8
},
"targets": [
{
"expr": "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right",
"calcs": [
"last"
]
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 5,
"type": "timeseries",
"title": "Node RAM",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 24,
"x": 0,
"y": 17
},
"targets": [
{
"expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right",
"calcs": [
"last"
]
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 6,
"type": "timeseries",
"title": "Control Plane (incl. titan-db) CPU",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 26
},
"targets": [
{
"expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 7,
"type": "timeseries",
"title": "Control Plane (incl. titan-db) RAM",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 26
},
"targets": [
{
"expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 8,
"type": "timeseries",
"title": "Root Filesystem Usage",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 24,
"x": 0,
"y": 35
},
"targets": [
{
"expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
},
"timeFrom": "30d"
}
],
"time": {
"from": "now-12h",
"to": "now"
},
"annotations": {
"list": []
},
"schemaVersion": 39,
"style": "dark",
"tags": [
"atlas",
"nodes"
]
}

Some files were not shown because too many files have changed in this diff.