Compare commits


206 Commits

Author SHA1 Message Date
ba12854639 flux: lower controller log verbosity 2025-12-18 02:15:32 -03:00
aa1c7d62c1 flux: reset image automation log level 2025-12-18 02:15:32 -03:00
flux-bot 3de36441f4 chore(ci-demo): apply image updates 2025-12-18 02:15:32 -03:00
e5238a7f91 chore: simplify image automation commit messages 2025-12-18 02:15:32 -03:00
d8077798db chore: update image automation templates 2025-12-18 02:15:32 -03:00
5a52c8606b ci-demo: move image policy to flux-system 2025-12-18 02:15:32 -03:00
be23851878 Merge pull request 'feature/bstein-dev-home' (#7) from feature/bstein-dev-home into main
Reviewed-on: #7
2025-12-18 04:23:01 +00:00
6f6fb363b3 Add bstein-dev-home deployment and Jenkins job 2025-12-18 01:14:09 -03:00
449574d59f Merge remote-tracking branch 'origin/feature/ci-gitops' into feature/bstein-dev-home 2025-12-18 01:07:01 -03:00
5f300c47a5 flux: bump image automation api to v1 2025-12-18 00:46:25 -03:00
c04a38fac5 flux: enable debug logging for controllers 2025-12-18 00:44:11 -03:00
5d4a0814c1 flux: enable debug logging for image automation 2025-12-18 00:40:55 -03:00
61d9f05fef flux: update pegasus image automation api 2025-12-18 00:39:39 -03:00
609347991e flux: upgrade controllers to v2.7.5 2025-12-18 00:38:32 -03:00
9816354d0f ci-demo: bump to v0.0.0-2 2025-12-17 23:12:03 -03:00
39275db74e ci-demo: set tag v0.0.0-1 2025-12-17 19:49:53 -03:00
9635100675 ci-demo: fix imagepolicy tag regex 2025-12-17 19:45:15 -03:00
bbb84c1182 jenkins: add ci-demo job 2025-12-17 19:27:23 -03:00
daa354e2cd ci-demo: add flux image automation 2025-12-17 19:18:29 -03:00
0a42289516 harbor: pin components to v2.14.1-arm64 2025-12-17 17:54:50 -03:00
b7246f5835 harbor: suspend automation, pin redis 2025-12-17 17:29:03 -03:00
flux-bot b7709b3f40 chore(harbor): update images to registry.bstein.dev/infra/harbor-redis:v2.14.1-arm64.14registry.bstein.dev/infra/harbor-core:v2.14.1-arm64.14registry.bstein.dev/infra/harbor-jobservice:v2.14.1-arm64.14registry.bstein.dev/infra/harbor-portal:v2.14.1-arm64.14registry.bstein.dev/infra/harbor-registry:v2.14.1-arm64.14registry.bstein.dev/infra/harbor-registryctl:v2.14.1-arm64.14registry.bstein.dev/infra/harbor-nginx:v2.14.1-arm64.14registry.bstein.dev/infra/harbor-prepare:v2.14.1-arm64.14 2025-12-17 19:38:57 +00:00
bb8de41cdb harbor: run image automation in harbor ns 2025-12-17 16:38:37 -03:00
1d788a5dc4 harbor: fix imagepolicy tag setters 2025-12-17 16:32:42 -03:00
37a50622a2 harbor: fix image automation push schema 2025-12-17 16:25:16 -03:00
cde135c59e harbor: enable image automation push 2025-12-17 16:17:07 -03:00
543f2a9ccd harbor: fix image policy tag regex 2025-12-17 13:16:57 -03:00
efa6d92b69 harbor: automate nginx and prepare 2025-12-17 13:14:31 -03:00
2f66afd970 flux(atlas): use scoped health checks 2025-12-17 04:47:12 -03:00
f55d3fd956 flux(atlas): limit kustomization health checks 2025-12-17 04:11:26 -03:00
1a8c6857e7 harbor: re-pin workloads to titan-05 2025-12-17 03:30:31 -03:00
f28d5680f2 harbor: add image automation 2025-12-17 03:21:35 -03:00
8d04f6c6c7 jenkins: pin controller to rpi4 2025-12-17 02:53:23 -03:00
d93d24d5ef jenkins: disable chart local auth realm 2025-12-17 02:30:41 -03:00
7dcfd5f6cf jenkins: stop JCasC resetting OIDC 2025-12-17 02:23:54 -03:00
d3aa456bee jenkins: poll harbor-arm-build scm 2025-12-17 01:58:10 -03:00
a52b811e5b jenkins: source pipeline creds from secrets 2025-12-17 01:47:33 -03:00
cd1b9b57b0 harbor: add helm remediation and timeouts 2025-12-17 01:39:49 -03:00
5e6f9c6c83 chore: stop tracking NOTES.md 2025-12-17 01:29:48 -03:00
f512e0fa29 jenkins: harden oidc and timeouts 2025-12-17 01:11:07 -03:00
4e479147ec jenkins: run jcasc cleanup initcontainer as jenkins user 2025-12-17 00:43:55 -03:00
1f98a5be12 jenkins: clean stale JCasC files on startup 2025-12-17 00:37:37 -03:00
30048a9ae5 jenkins: drop invalid JCasC OIDC realm (use init script) 2025-12-17 00:28:52 -03:00
60a8192f61 jenkins: enforce OIDC via JCasC (no node move) 2025-12-17 00:23:15 -03:00
ce7631f896 jenkins: enforce OIDC via JCasC and pin to arm64 2025-12-16 23:38:08 -03:00
f3335028b1 jenkins: disable scm trigger for harbor arm build 2025-12-16 23:12:27 -03:00
0385a653af fix: use FullControlOnceLoggedIn auth strategy 2025-12-16 20:33:03 -03:00
6759871b43 fix: add casc support plugin 2025-12-16 20:27:41 -03:00
3e4a49e7fb fix: add job-dsl plugin for JCasC jobs 2025-12-16 20:21:33 -03:00
b951058dc6 fix: enforce Jenkins OIDC via init groovy only 2025-12-16 20:16:18 -03:00
cfa7bd8198 fix: jenkins casc OIDC using explicit endpoints 2025-12-16 20:13:52 -03:00
162fe3339f fix: pin Jenkins OIDC realm via JCasC 2025-12-16 20:04:21 -03:00
fc858fc8df ci: seed harbor-arm-build pipeline in Jenkins 2025-12-16 19:26:46 -03:00
8b9fc8ff1c chore: remove zot stack 2025-12-16 14:10:04 -03:00
3066db793d harbor: bootstrap arm64 images on titan-05 2025-12-16 11:16:34 -03:00
759a77c745 harbor: run arm64 images on rpi workers 2025-12-16 03:22:01 -03:00
c661658a12 Add AC Infinity ingestion plan 2025-12-16 01:45:04 -03:00
144a860a88 harbor: use project paths for crypto/pegasus images 2025-12-16 00:15:22 -03:00
bd64a36165 registry: point workloads to harbor 2025-12-16 00:08:11 -03:00
22b611f8ea harbor: set redis affinity to amd64 titan-22 first 2025-12-15 23:14:26 -03:00
a8bde2edc7 harbor: pin to amd64, prefer titan-22 2025-12-15 23:02:58 -03:00
d51a19cab9 harbor: prefer rpi nodes 2025-12-15 23:00:11 -03:00
3e3cab6845 harbor: increase helm timeout 2025-12-15 22:32:29 -03:00
9cda32c0bf harbor: use astreae storageclass for pvc 2025-12-15 22:22:48 -03:00
0f49849761 Regenerate dashboards after availability thresholds tweak 2025-12-15 22:14:26 -03:00
252743e416 harbor: use existing secrets and correct admin key 2025-12-15 22:08:52 -03:00
dba7cf00a4 harbor: deploy chart via flux 2025-12-15 22:05:40 -03:00
aa0df1f62b harbor: add helm repo and deploy via helmrelease 2025-12-15 22:05:32 -03:00
aa2bb09873 zot: allow upstream basic auth from oauth2-proxy 2025-12-15 14:22:48 -03:00
54406661f2 zot: forward authorization header to ui 2025-12-15 14:14:49 -03:00
caef505677 zot ui: send basic creds from oauth2-proxy, remove traefik header 2025-12-15 14:08:18 -03:00
54eb9e1ac5 zot: restore UI basic header middleware 2025-12-15 14:01:18 -03:00
1899bb7677 zot: move basic auth to oauth2-proxy upstream 2025-12-15 13:53:53 -03:00
0416493f49 zot: fix htpasswd volume to avoid type conflict 2025-12-15 13:00:51 -03:00
b87f06f6ff zot: add oauth proxy and user sync scripts 2025-12-15 12:57:02 -03:00
828f66d18c gitea: enable OIDC auto-registration 2025-12-14 23:08:38 -03:00
7a1f3bfc3f gitea: add proxy/session headers for OIDC 2025-12-14 22:25:46 -03:00
294542e718 gitea: reference secret via env; remove secret file 2025-12-14 22:16:49 -03:00
c3a8c7ddae gitea: remove committed secret and env refs 2025-12-14 22:10:13 -03:00
29da4be557 gitea: pin secret/internal token and include secret manifest 2025-12-14 22:06:25 -03:00
fc5b0cccf8 gitea: drop required claim constraint on keycloak auth 2025-12-14 21:58:36 -03:00
c8b89c3120 gitea: enforce keycloak auth source via init container 2025-12-14 21:54:18 -03:00
9b994111cb gitea: remove bootstrap job (immutable error) 2025-12-14 21:49:07 -03:00
a174e451d9 gitea: fix bootstrap job immutability 2025-12-14 21:47:50 -03:00
d8dab08cd8 gitea: set trace logging for oidc 2025-12-14 21:44:43 -03:00
0d93929e3d gitea: relax required signin, set admin group+skip 2fa 2025-12-14 21:42:08 -03:00
2ffc906487 gitea: enable debug logging for oauth 2025-12-14 21:38:32 -03:00
37761fa118 jenkins: fix OIDC retriever null 2025-12-14 21:23:15 -03:00
a46226bb0a ci: enable oidc for jenkins/gitops/gitea 2025-12-14 20:58:57 -03:00
04602a2914 jenkins: auto-configure OIDC via init script 2025-12-14 19:22:47 -03:00
fc0fa59981 jenkins: drop JCasC OIDC script to unblock startup 2025-12-14 18:10:49 -03:00
0286f4f317 jenkins: restore plugin list without pinned versions 2025-12-14 17:59:48 -03:00
90bf1f7d8e jenkins: start without plugin installs to unblock bootstrap 2025-12-14 16:02:05 -03:00
6def1aa479 jenkins: use latest plugin versions to avoid 404 2025-12-14 16:00:45 -03:00
4eff9ebcc1 jenkins: add helm release with ingress + astreae storage 2025-12-14 15:57:42 -03:00
ccfc473521 cleanup: stop tracking extra md files; switch gitops cert to letsencrypt 2025-12-14 15:52:12 -03:00
b575c64de1 chore: drop stray NOTES.md 2025-12-14 15:43:06 -03:00
39d732d74d git: ignore fixed 2025-12-14 15:39:27 -03:00
b28e393524 gitops-ui: open ingress for acme solver 2025-12-14 15:14:11 -03:00
694bb4d12e gitops-ui: allow acme solver from kube-system traefik 2025-12-14 15:12:38 -03:00
6993f51ef7 gitops-ui: allow acme solver ingress from traefik 2025-12-14 15:08:44 -03:00
85cea34fe8 gitops-ui: cert + switch flux to feature/ci-gitops 2025-12-14 15:04:13 -03:00
055ce7d18c Merge pull request 'feature/mailu' (#5) from feature/mailu into main
Reviewed-on: #5
2025-12-14 17:48:02 +00:00
1a161b4d3c monitoring: longer data history 2025-12-14 14:47:20 -03:00
f7bf990d62 flux: bump gitops-ui kustomization 2025-12-14 14:41:52 -03:00
63bf153c8b flux: add weave gitops ui 2025-12-14 14:38:08 -03:00
8fceebd7a7 nextcloud: integration with mailu & gitops-ui: initial install 2025-12-14 14:21:40 -03:00
0d0216c8f5 Add tests and dedupe nextcloud mail sync 2025-12-14 14:15:19 -03:00
c8b49560b6 Keep nextcloud scripts single-sourced under scripts/ 2025-12-14 14:05:01 -03:00
327a7bed57 Extract nextcloud scripts to files 2025-12-14 13:59:16 -03:00
aae09c5074 Normalize doc layout and README guidance 2025-12-14 13:47:59 -03:00
56bb4e91b9 Group namespace plurality rows to one per namespace 2025-12-13 22:17:47 -03:00
18f3a2cefe Fix namespace plurality mask and bump v26 2025-12-13 20:53:11 -03:00
1ec3ca29a4 Use OR-joined node ranks for plurality tie-break 2025-12-13 19:04:22 -03:00
4812958e82 Deduplicate namespace plurality rows with ranked tie-break 2025-12-13 18:39:31 -03:00
9ad5f7f405 Restore namespace plurality panel data 2025-12-13 18:25:03 -03:00
57ea397027 Use table format for namespace plurality panel 2025-12-13 18:23:19 -03:00
be0ac48b33 Simplify namespace plurality table rendering 2025-12-13 18:07:56 -03:00
2156b6f6aa Hide table footer on namespace plurality table 2025-12-13 18:03:51 -03:00
4fcc7c84f2 Make namespace plurality table non-filterable 2025-12-13 17:55:52 -03:00
a4b3273bab Remove filter bar from namespace plurality table 2025-12-13 17:38:57 -03:00
c536a13d55 Disable column filters on namespace plurality table 2025-12-13 17:35:52 -03:00
13eb02c19b Hide filters on namespace plurality table 2025-12-13 17:32:19 -03:00
134a4ad001 Fix namespace plurality table query 2025-12-13 17:29:55 -03:00
3e0a84b074 atlas pods: plurality table v11 (deterministic top node) 2025-12-13 17:19:03 -03:00
7f67793ee5 atlas pods: plurality table v10 2025-12-13 16:36:25 -03:00
e87d54f19d atlas pods: per-namespace top node via topk 2025-12-13 15:51:45 -03:00
6ac01e5879 atlas pods: simplify plurality table (no filter) 2025-12-13 15:29:08 -03:00
d0ed188179 monitoring: drop README per convention 2025-12-13 15:25:21 -03:00
b703e66b98 monitoring: restore README 2025-12-13 15:11:50 -03:00
68d4f43903 atlas pods: stabilize plurality query to avoid 422 2025-12-13 15:11:21 -03:00
cf9dacd4ea atlas pods: show per-namespace top node without vars 2025-12-13 15:02:52 -03:00
6eee7b8853 atlas pods: drop non-leading nodes in plurality table 2025-12-13 13:39:06 -03:00
03a4ca4d84 atlas pods: simplify plurality table query 2025-12-13 12:06:18 -03:00
c7adb0c8cb atlas pods: fix plurality table query 2025-12-13 12:00:31 -03:00
9d1163f580 atlas pods: use prom share() for plurality table 2025-12-13 11:53:27 -03:00
001f0f95a6 atlas pods: fix plurality query with bool max match 2025-12-13 11:51:18 -03:00
2177a8009e atlas pods: robust per-namespace top-node share 2025-12-13 11:48:44 -03:00
6a3d1311b9 atlas pods: select per-namespace top node via max match 2025-12-13 04:15:03 -03:00
d916e5a7f1 atlas pods: sort plurality table by node then share 2025-12-13 04:10:10 -03:00
5d6d34c274 atlas pods: simplify namespace plurality query 2025-12-13 04:06:46 -03:00
53423c7a46 atlas pods: fix namespace plurality query 2025-12-13 04:00:57 -03:00
d274738e9e restore readmes removed in last commit 2025-12-13 03:57:44 -03:00
f0265d6b94 atlas pods: add namespace plurality by node table 2025-12-13 03:57:20 -03:00
8a755e0c42 mailu: forcing version 1.4 clamav over 1.2 2025-12-13 00:11:40 -03:00
e22293db3e forcing 12-r3 over 12-r6 for redis 2025-12-12 22:09:04 -03:00
6f8a70fd58 atlas overview: include titan-db in control plane panels 2025-12-12 21:55:53 -03:00
580d1731f9 monitoring: drop duplicate titan-db scrape job 2025-12-12 21:48:03 -03:00
4def298b83 monitoring: scrape titan-db node_exporter 2025-12-12 21:38:10 -03:00
1166069640 atlas dashboards: align percent thresholds and disk bars 2025-12-12 21:13:31 -03:00
e56bed284e atlas overview: refine alert thresholds and availability colors 2025-12-12 20:50:41 -03:00
24376594ff atlas dashboards: use threshold colors for stats 2025-12-12 20:44:20 -03:00
5277c98385 atlas dashboards: fix pod share display and zero/red stat thresholds 2025-12-12 20:40:32 -03:00
056b7b7770 atlas dashboards: show pod counts (not %) and make zero-friendly stats 2025-12-12 20:30:00 -03:00
b770575b42 atlas dashboards: show pod counts with top12 bars 2025-12-12 20:20:13 -03:00
9e76277c22 atlas dashboards: drop empty nodes and enforce top12 pod bars 2025-12-12 19:09:51 -03:00
93b3c6d2ec atlas dashboards: cap pod count bars at top12 2025-12-12 18:56:13 -03:00
596bf46863 atlas dashboards: sort pod counts and add pod row to overview 2025-12-12 18:51:43 -03:00
8b703f8655 atlas pods: add pod count bar and tidy pie 2025-12-12 18:45:29 -03:00
ec59d25ad8 atlas dashboards: fix overview links and add pods-by-node pie 2025-12-12 18:32:45 -03:00
bf6179f907 atlas internal dashboards: add SLO/burn and api health panels 2025-12-12 18:00:43 -03:00
0a0966db78 atlas overview: fix availability scaling 2025-12-12 16:36:47 -03:00
87fbba0d3e atlas overview: show availability percent with 3 decimals 2025-12-12 16:15:37 -03:00
b200dba5b9 atlas overview: show availability percent and keep uptime centered 2025-12-12 16:11:28 -03:00
697ce3c18f atlas overview: center uptime and reorder top row 2025-12-12 15:56:33 -03:00
8e39c6a28b atlas overview: add uptime and crashloop panels 2025-12-12 15:23:51 -03:00
38ab8e3364 standardize cert issuers to letsencrypt 2025-12-12 15:18:40 -03:00
29d22ba539 mailu: fix unbound sidecar mounts 2025-12-12 01:19:27 -03:00
118032d2c6 mailu: use mvance unbound sidecar and current redis image 2025-12-12 01:12:48 -03:00
4cfe92feb2 mailu: remove force upgrade to avoid pvc replace 2025-12-12 01:09:25 -03:00
ca27cc95b6 mailu: add validating dns sidecar and disable vip hostports 2025-12-12 01:06:38 -03:00
6c77b8e7f8 restore docs after gitignore change 2025-12-12 00:50:02 -03:00
78195c4685 mailu: fix admin dns and tame vip 2025-12-12 00:49:45 -03:00
5ef0b4edf6 mailu: capture helm release and cert 2025-12-11 23:54:43 -03:00
9f226c1584 Merge pull request 'feature/sso' (#4) from feature/sso into main
Reviewed-on: #4
2025-12-11 20:43:34 +00:00
319b515882 zot: restore main branch config 2025-12-11 17:26:15 -03:00
cb2b2ec1cd zot: revert to unauthenticated registry 2025-12-11 17:22:16 -03:00
20cd185c0b vault: drop traefik basicauth 2025-12-11 17:09:05 -03:00
2f368f6975 zot,vault: remove oauth2-proxy sso 2025-12-11 17:04:19 -03:00
6c62d42f7a longhorn/vault: gate via oauth2-proxy 2025-12-07 19:44:02 -03:00
a7e9f1f7d8 auth: remove error middleware to allow redirect 2025-12-07 13:19:45 -03:00
ceb692f7ee oauth2-proxy: drop groups scope to avoid invalid_scope 2025-12-07 13:09:29 -03:00
24fbaad040 auth: forward-auth via external auth host (svc traffic flaky) 2025-12-07 13:03:29 -03:00
04aa32a762 oauth2-proxy: schedule on worker rpis 2025-12-07 12:49:38 -03:00
25ee698021 oauth2-proxy: ensure error middleware on auth ingress 2025-12-07 12:03:14 -03:00
4a089876ba auth: use internal oauth2-proxy svc for forward-auth 2025-12-07 11:25:29 -03:00
20bb776625 auth: add 401 redirect middleware to oauth2-proxy 2025-12-07 11:14:25 -03:00
5e59f20bc3 auth: point forward-auth to external auth host 2025-12-07 11:09:09 -03:00
dbede55ad4 oauth2-proxy: temporarily drop group restriction 2025-12-07 10:42:13 -03:00
27e5c9391c auth: add namespace-local forward-auth middlewares 2025-12-07 10:25:44 -03:00
8d5e6c267c auth: wire oauth2-proxy and enable grafana oidc 2025-12-07 02:01:21 -03:00
a55502fe27 add oauth2-proxy for SSO forward-auth 2025-12-06 14:42:24 -03:00
598bdfc727 keycloak: restrict to worker rpis with titan-24 fallback 2025-12-06 01:44:23 -03:00
88c7a1c2aa keycloak: require rpi nodes with titan-24 fallback 2025-12-06 01:40:24 -03:00
f4da27271e keycloak: prefer rpi nodes, avoid titan-24 2025-12-06 01:36:33 -03:00
141c05b08f keycloak: honor xforwarded headers and hostname url 2025-12-06 01:23:07 -03:00
f0a8f6d35e keycloak: enable health/metrics management port 2025-12-06 00:51:47 -03:00
1b01052eda keycloak: set fsGroup for data volume 2025-12-06 00:49:17 -03:00
1d346edd28 keycloak: remove optimized flag for first start 2025-12-06 00:43:24 -03:00
b14a9dcb98 chore: drop AGENTS.md from repo 2025-12-06 00:43:17 -03:00
47caf08885 notes: capture GPU share change and flux branch 2025-12-03 12:28:45 -03:00
0db149605d monitoring: show GPU share over dashboard range 2025-12-02 20:28:35 -03:00
f64e60c5a2 flux: add keycloak kustomization 2025-12-02 18:10:20 -03:00
61c5db5c99 flux: track feature/sso 2025-12-02 18:00:49 -03:00
2db550afdd keycloak: add raw manifests backed by shared postgres 2025-12-02 17:58:19 -03:00
65d389193f Merge pull request 'feature/atlas-monitoring' (#3) from feature/atlas-monitoring into main
Reviewed-on: #3
2025-12-02 20:52:35 +00:00
140 changed files with 10631 additions and 4671 deletions

.gitignore (vendored)

@@ -1 +1,2 @@
-AGENTS.md
+*.md
+!README.md

AGENTS.md (deleted)

@@ -1,68 +0,0 @@
Repository Guidelines
## Project Structure & Module Organization
- `infrastructure/`: cluster-scoped building blocks (core, flux-system, traefik, longhorn). Add new platform features by mirroring this layout.
- `services/`: workload manifests per app (`services/gitea/`, etc.) with `kustomization.yaml` plus one file per kind; keep diffs small and focused.
- `dockerfiles/` hosts bespoke images, while `scripts/` stores operational Fish/Bash helpers—extend these directories instead of relying on ad-hoc commands.
## Build, Test, and Development Commands
- `kustomize build services/<app>` (or `kubectl kustomize ...`) renders manifests exactly as Flux will.
- `kubectl apply --server-side --dry-run=client -k services/<app>` checks schema compatibility without touching the cluster.
- `flux reconcile kustomization <name> --namespace flux-system --with-source` pulls the latest Git state after merges or hotfixes.
- `fish scripts/flux_hammer.fish --help` explains the recovery tool; read it before running against production workloads.
## Coding Style & Naming Conventions
- YAML uses two-space indents; retain the leading path comment (e.g. `# services/gitea/deployment.yaml`) to speed code review.
- Keep resource names lowercase kebab-case, align labels/selectors, and mirror namespaces with directory names.
- List resources in `kustomization.yaml` from namespace/config, through storage, then workloads and networking for predictable diffs.
- Scripts start with `#!/usr/bin/env fish` or bash, stay executable, and follow snake_case names such as `flux_hammer.fish`.
## Testing Guidelines
- Run `kustomize build` and the dry-run apply for every service you touch; capture failures before opening a PR.
- `flux diff kustomization <name> --path services/<app>` previews reconciliations—link notable output when behavior shifts.
- Docker edits: `docker build -f dockerfiles/Dockerfile.monerod .` (swap the file you changed) to verify image builds.
## Commit & Pull Request Guidelines
- Keep commit subjects short, present-tense, and optionally scoped (`gpu(titan-24): add RuntimeClass`); squash fixups before review.
- Describe linked issues, affected services, and required operator steps (e.g. `flux reconcile kustomization services-gitea`) in the PR body.
- Focus each PR on one kustomization or service and update `infrastructure/flux-system` when Flux must track new folders.
- Record the validation you ran (dry-runs, diffs, builds) and add screenshots only when ingress or UI behavior changes.
## Security & Configuration Tips
- Never commit credentials; use Vault workflows (`services/vault/`) or SOPS-encrypted manifests wired through `infrastructure/flux-system`.
- Node selectors and tolerations gate workloads to hardware like `hardware: rpi4`; confirm labels before scaling or renaming nodes.
- Pin external images by digest or rely on Flux image automation to follow approved tags and avoid drift.
## Dashboard roadmap / context (2025-12-02)
- Atlas dashboards are generated via `scripts/dashboards_render_atlas.py --build`, which writes JSON under `services/monitoring/dashboards/` and ConfigMaps under `services/monitoring/`. Keep the Grafana manifests in sync by regenerating after edits.
- Atlas Overview panels are paired with internal dashboards (pods, nodes, storage, network, GPU). A new `atlas-gpu` internal dashboard holds the detailed GPU metrics that feed the overview share pie.
- Old Grafana folders (`Atlas Storage`, `Atlas SRE`, `Atlas Public`, `Atlas Nodes`) should be removed in Grafana UI when convenient; only `Atlas Overview` and `Atlas Internal` should remain provisioned.
- Future work: add a separate generator (e.g., `dashboards_render_oceanus.py`) for SUI/oceanus validation dashboards, mirroring the atlas pattern of internal dashboards feeding a public overview.
## Monitoring state (2025-12-03)
- dcgm-exporter DaemonSet pulls `registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04` with nvidia runtime/imagePullSecret; titan-24 exports metrics, titan-22 remains NotReady.
- Atlas Overview is the Grafana home (1h range, 1m refresh), Overview folder UID `overview`, internal folder `atlas-internal` (oceanus-internal stub).
- Panels standardized via generator; hottest row compressed, worker/control rows taller, root disk row taller and top12 bar gauge with labels. GPU share pie uses 1h avg_over_time to persist idle activity.
- Internal dashboards are provisioned without Viewer role; if anonymous still sees them, restart Grafana and tighten auth if needed.
## Upcoming priorities (SSO/storage/mail)
- Establish SSO (Keycloak or similar) and federate Grafana, Gitea, Zot, Nextcloud, Pegasus/Jellyfin; keep Vaultwarden separate until safe.
- Add Nextcloud (limit to rpi5 workers) with office suite; integrate with SSO; plan storage class and ingress.
- Plan mail: mostly self-hosted, relay through trusted provider for outbound; integrate with services (Nextcloud, Vaultwarden, etc.) for notifications and account flows.
## SSO plan sketch (2025-12-03)
- IdP: use Keycloak (preferred) in a new `sso` namespace, Bitnami or codecentric chart with Postgres backing store (single PVC), ingress `sso.bstein.dev`, admin user bound to brad@bstein.dev; stick with local DB initially (no external IdP).
- Auth flow goals: Grafana (OIDC), Gitea (OAuth2/Keycloak), Zot (via Traefik forward-auth/oauth2-proxy), Jellyfin/Pegasus via Jellyfin OAuth/OpenID plugin (map existing usernames; run migration to pre-create users in Keycloak with same usernames/emails and temporary passwords), Pegasus keeps using Jellyfin tokens.
- Steps to implement:
1) Add service folder `services/keycloak/` (namespace, PVC, HelmRelease, ingress, secret for admin creds). Verify with kustomize + Flux reconcile.
2) Seed realm `atlas` with users (import CSV/realm). Create client for Grafana (public/implicit), Gitea (confidential), and a “jellyfin” client for the OAuth plugin; set email for brad@bstein.dev as admin.
3) Reconfigure Grafana to OIDC (disable anonymous to internal folders, leave Overview public via folder permissions). Reconfigure Gitea to OIDC (app.ini).
4) Add Traefik forward-auth (oauth2-proxy) in front of Zot and any other services needing headers-based auth.
5) Deploy Jellyfin OpenID plugin; map Keycloak users to existing Jellyfin usernames; communicate password reset path.
- Migration caution: do not delete existing local creds until SSO validated; keep Pegasus working via Jellyfin tokens during transition.
## Postgres centralization (2025-12-03)
- Prefer a shared in-cluster Postgres deployment with per-service databases to reduce resource sprawl on Pi nodes. Use it for services that can easily point at an external DB.
- Candidates to migrate to shared Postgres: Keycloak (realm DB), Gitea (git DB), Nextcloud (app DB), possibly Grafana (if persistence needed beyond current provisioner), Jitsi prosody/JVB state (if external DB supported). Keep tightly-coupled or lightweight embedded DBs as-is when migration is painful or not supported.

README.md (new file)

@@ -0,0 +1,3 @@
# titan-iac
Flux-managed Kubernetes cluster for bstein.dev services.

@@ -9,4 +9,4 @@ resources:
 - ../../services/monitoring
 - ../../services/pegasus
 - ../../services/vault
-- ../../services/zot
+- ../../services/bstein-dev-home

@@ -0,0 +1,26 @@
# clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
  name: bstein-dev-home
  namespace: flux-system
spec:
  interval: 1m0s
  sourceRef:
    kind: GitRepository
    name: flux-system
    namespace: flux-system
  git:
    checkout:
      ref:
        branch: feature/ci-gitops
    commit:
      author:
        email: ops@bstein.dev
        name: flux-bot
      messageTemplate: "chore(bstein-dev-home): update images to {{range .Updated.Images}}{{.}}{{end}}"
    push:
      branch: feature/ci-gitops
  update:
    strategy: Setters
    path: services/bstein-dev-home
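
For orientation: with the Setters strategy, Flux only rewrites image fields that carry an image-policy marker comment. A minimal sketch of what a manifest under services/bstein-dev-home would need for this automation to act on it (the container name, image path, and policy name are assumptions, not taken from this diff):

# hypothetical excerpt of services/bstein-dev-home/deployment.yaml
spec:
  template:
    spec:
      containers:
        - name: bstein-dev-home
          image: registry.bstein.dev/apps/bstein-dev-home:v0.0.1 # {"$imagepolicy": "flux-system:bstein-dev-home"}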

@@ -0,0 +1,15 @@
# clusters/atlas/flux-system/applications/bstein-dev-home/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: bstein-dev-home
  namespace: flux-system
spec:
  interval: 10m
  path: ./services/bstein-dev-home
  prune: true
  sourceRef:
    kind: GitRepository
    name: flux-system
  targetNamespace: bstein-dev-home
  wait: false

@@ -0,0 +1,26 @@
# clusters/atlas/flux-system/applications/ci-demo/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
  name: ci-demo
  namespace: flux-system
spec:
  interval: 1m0s
  sourceRef:
    kind: GitRepository
    name: flux-system
    namespace: flux-system
  git:
    checkout:
      ref:
        branch: feature/ci-gitops
    commit:
      author:
        email: ops@bstein.dev
        name: flux-bot
      messageTemplate: "chore(ci-demo): apply image updates"
    push:
      branch: feature/ci-gitops
  update:
    strategy: Setters
    path: services/ci-demo
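
This automation only commits what an ImagePolicy selects; commits 5a52c8606b and 9635100675 above move and fix that policy. A sketch of a flux-system ImagePolicy matching the v0.0.0-1, v0.0.0-2 tags seen in the commit log (the exact regex used in the repo may differ):

apiVersion: image.toolkit.fluxcd.io/v1
kind: ImagePolicy
metadata:
  name: ci-demo
  namespace: flux-system
spec:
  imageRepositoryRef:
    name: ci-demo
  filterTags:
    pattern: '^v0\.0\.0-(?P<build>[0-9]+)$'
    extract: '$build'
  policy:
    numerical:
      order: asc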

@@ -0,0 +1,17 @@
# clusters/atlas/flux-system/applications/ci-demo/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: ci-demo
  namespace: flux-system
spec:
  interval: 10m
  path: ./services/ci-demo
  prune: true
  sourceRef:
    kind: GitRepository
    name: flux-system
    namespace: flux-system
  dependsOn:
    - name: core
  wait: false

@@ -0,0 +1,27 @@
# clusters/atlas/flux-system/applications/harbor/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
  name: harbor
  namespace: harbor
spec:
  suspend: true
  interval: 5m0s
  sourceRef:
    kind: GitRepository
    name: flux-system
    namespace: flux-system
  git:
    checkout:
      ref:
        branch: feature/ci-gitops
    commit:
      author:
        email: ops@bstein.dev
        name: flux-bot
      messageTemplate: "chore(harbor): apply image updates"
    push:
      branch: feature/ci-gitops
  update:
    strategy: Setters
    path: ./services/harbor
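
Note spec.suspend: true, which matches commit b7246f5835 ("harbor: suspend automation, pin redis"). The automation also presupposes an ImageRepository scan per Harbor image; a sketch for one of them (resource name and scan interval are assumptions):

apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageRepository
metadata:
  name: harbor-core
  namespace: flux-system
spec:
  image: registry.bstein.dev/infra/harbor-core
  interval: 5m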

@@ -0,0 +1,23 @@
# clusters/atlas/flux-system/applications/harbor/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: harbor
  namespace: flux-system
spec:
  interval: 10m
  path: ./services/harbor
  targetNamespace: harbor
  prune: false
  sourceRef:
    kind: GitRepository
    name: flux-system
    namespace: flux-system
  healthChecks:
    - apiVersion: helm.toolkit.fluxcd.io/v2
      kind: HelmRelease
      name: harbor
      namespace: harbor
  wait: false
  dependsOn:
    - name: core

@@ -0,0 +1,23 @@
# clusters/atlas/flux-system/applications/jenkins/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: jenkins
  namespace: flux-system
spec:
  interval: 10m
  path: ./services/jenkins
  prune: true
  sourceRef:
    kind: GitRepository
    name: flux-system
  targetNamespace: jenkins
  dependsOn:
    - name: helm
    - name: traefik
  healthChecks:
    - apiVersion: helm.toolkit.fluxcd.io/v2
      kind: HelmRelease
      name: jenkins
      namespace: jenkins
  wait: false

@@ -0,0 +1,15 @@
# clusters/atlas/flux-system/applications/keycloak/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: keycloak
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: flux-system
  path: ./services/keycloak
  targetNamespace: sso
  timeout: 2m

@@ -2,7 +2,6 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
-- zot/kustomization.yaml
 - gitea/kustomization.yaml
 - vault/kustomization.yaml
 - jitsi/kustomization.yaml
@@ -10,6 +9,16 @@ resources:
 - monerod/kustomization.yaml
 - pegasus/kustomization.yaml
 - pegasus/image-automation.yaml
+- bstein-dev-home/kustomization.yaml
+- bstein-dev-home/image-automation.yaml
+- harbor/kustomization.yaml
+- harbor/image-automation.yaml
 - jellyfin/kustomization.yaml
 - xmr-miner/kustomization.yaml
 - sui-metrics/kustomization.yaml
+- keycloak/kustomization.yaml
+- oauth2-proxy/kustomization.yaml
+- mailu/kustomization.yaml
+- jenkins/kustomization.yaml
+- ci-demo/kustomization.yaml
+- ci-demo/image-automation.yaml

@@ -1,18 +1,18 @@
-# clusters/atlas/flux-system/applications/zot/kustomization.yaml
+# clusters/atlas/flux-system/applications/mailu/kustomization.yaml
 apiVersion: kustomize.toolkit.fluxcd.io/v1
 kind: Kustomization
 metadata:
-  name: zot
+  name: mailu
   namespace: flux-system
 spec:
   interval: 10m
-  path: ./services/zot
-  targetNamespace: zot
-  prune: false
   sourceRef:
     kind: GitRepository
     name: flux-system
     namespace: flux-system
+  path: ./services/mailu
+  targetNamespace: mailu-mailserver
+  prune: true
   wait: true
   dependsOn:
-    - name: core
+    - name: helm

@@ -0,0 +1,15 @@
# clusters/atlas/flux-system/applications/oauth2-proxy/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: oauth2-proxy
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: flux-system
  path: ./services/oauth2-proxy
  targetNamespace: sso
  timeout: 2m

@@ -1,5 +1,5 @@
 # clusters/atlas/flux-system/applications/pegasus/image-automation.yaml
-apiVersion: image.toolkit.fluxcd.io/v1beta1
+apiVersion: image.toolkit.fluxcd.io/v1
 kind: ImageUpdateAutomation
 metadata:
   name: pegasus
@@ -9,12 +9,18 @@ spec:
   sourceRef:
     kind: GitRepository
     name: flux-system
+    namespace: flux-system
   git:
+    checkout:
+      ref:
+        branch: feature/ci-gitops
     commit:
       author:
         email: ops@bstein.dev
         name: flux-bot
-      messageTemplate: "chore(pegasus): update image to {{range .Updated.Images}}{{.}}{{end}}"
+      messageTemplate: "chore(pegasus): apply image updates"
+    push:
+      branch: feature/ci-gitops
   update:
     strategy: Setters
-    path: ./services/pegasus
+    path: services/pegasus

File diff suppressed because it is too large

@@ -8,7 +8,7 @@
 spec:
   interval: 1m0s
   ref:
-    branch: feature/atlas-monitoring
+    branch: feature/ci-gitops
   secretRef:
     name: flux-system-gitea
   url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git

@@ -0,0 +1,20 @@
# clusters/atlas/flux-system/platform/gitops-ui/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: gitops-ui
  namespace: flux-system
spec:
  interval: 10m
  timeout: 10m
  path: ./services/gitops-ui
  prune: true
  sourceRef:
    kind: GitRepository
    name: flux-system
    namespace: flux-system
  targetNamespace: flux-system
  dependsOn:
    - name: helm
    - name: traefik
  wait: true

@@ -5,5 +5,6 @@ resources:
 - core/kustomization.yaml
 - helm/kustomization.yaml
 - traefik/kustomization.yaml
+- gitops-ui/kustomization.yaml
 - monitoring/kustomization.yaml
 - longhorn-ui/kustomization.yaml

@@ -1,5 +0,0 @@
# Oceanus Cluster Scaffold
This directory prepares the Flux and Kustomize layout for a future Oceanus-managed cluster.
Populate `flux-system/` with `gotk-components.yaml` and related manifests after running `flux bootstrap`.
Define node-specific resources under `infrastructure/modules/profiles/oceanus-validator/` and reference workloads in `applications/` as they come online.

@@ -1,16 +0,0 @@
# Titan Homelab Topology
| Hostname | Role / Function | Managed By | Notes |
|------------|--------------------------------|---------------------|-------|
| titan-0a | Kubernetes control-plane | Flux (atlas cluster)| HA leader, tainted for control only |
| titan-0b | Kubernetes control-plane | Flux (atlas cluster)| Standby control node |
| titan-0c | Kubernetes control-plane | Flux (atlas cluster)| Standby control node |
| titan-04-19| Raspberry Pi workers | Flux (atlas cluster)| Workload nodes, labelled per hardware |
| titan-22 | GPU mini-PC (Jellyfin) | Flux + Ansible | NVIDIA runtime managed via `modules/profiles/atlas-ha` |
| titan-24 | Tethys hybrid node | Flux + Ansible | Runs SUI metrics via K8s, validator via Ansible |
| titan-db | HA control plane database | Ansible | PostgreSQL / etcd backing services |
| titan-jh | Jumphost & bastion | Ansible | Entry point / future KVM services |
| oceanus | Dedicated SUI validator host | Ansible / Flux prep | Baremetal validator workloads, exposes metrics to atlas; Kustomize scaffold under `clusters/oceanus/` |
| styx | Air-gapped workstation | Manual / Scripts | Remains isolated, scripts tracked in `hosts/styx` |
Use the `clusters/` directory for cluster-scoped state and the `hosts/` directory for baremetal orchestration.

@@ -1,2 +0,0 @@
# hosts/styx/README.md
Styx is air-gapped; provisioning scripts live under `scripts/`.

@@ -5,3 +5,4 @@ resources:
 - ../modules/base
 - ../modules/profiles/atlas-ha
 - ../sources/cert-manager/letsencrypt.yaml
+- ../sources/cert-manager/letsencrypt-prod.yaml

@@ -7,7 +7,7 @@ metadata:
   annotations:
     traefik.ingress.kubernetes.io/router.entrypoints: websecure
     traefik.ingress.kubernetes.io/router.tls: "true"
-    traefik.ingress.kubernetes.io/router.middlewares: longhorn-system-longhorn-basicauth@kubernetescrd,longhorn-system-longhorn-headers@kubernetescrd
+    traefik.ingress.kubernetes.io/router.middlewares: ""
 spec:
   ingressClassName: traefik
   tls:
@@ -21,6 +21,6 @@ spec:
             pathType: Prefix
             backend:
               service:
-                name: longhorn-frontend
+                name: oauth2-proxy-longhorn
                 port:
                   number: 80

@@ -4,3 +4,4 @@ kind: Kustomization
 resources:
 - middleware.yaml
 - ingress.yaml
+- oauth2-proxy-longhorn.yaml

@@ -20,3 +20,20 @@
   headers:
     customRequestHeaders:
       X-Forwarded-Proto: "https"
+---
+apiVersion: traefik.io/v1alpha1
+kind: Middleware
+metadata:
+  name: longhorn-forward-auth
+  namespace: longhorn-system
+spec:
+  forwardAuth:
+    address: https://auth.bstein.dev/oauth2/auth
+    trustForwardHeader: true
+    authResponseHeaders:
+      - Authorization
+      - X-Auth-Request-Email
+      - X-Auth-Request-User
+      - X-Auth-Request-Groups
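
A Middleware is inert until a router references it; with Traefik's kubernetescrd provider the reference takes the form <namespace>-<name>@kubernetescrd. A sketch of how an ingress would opt in (this change actually routes the Longhorn ingress straight to oauth2-proxy instead, so the annotation here is illustrative only):

metadata:
  annotations:
    traefik.ingress.kubernetes.io/router.middlewares: longhorn-system-longhorn-forward-auth@kubernetescrd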

@@ -0,0 +1,102 @@
# infrastructure/longhorn/ui-ingress/oauth2-proxy-longhorn.yaml
apiVersion: v1
kind: Service
metadata:
  name: oauth2-proxy-longhorn
  namespace: longhorn-system
  labels:
    app: oauth2-proxy-longhorn
spec:
  ports:
    - name: http
      port: 80
      targetPort: 4180
  selector:
    app: oauth2-proxy-longhorn
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: oauth2-proxy-longhorn
  namespace: longhorn-system
  labels:
    app: oauth2-proxy-longhorn
spec:
  replicas: 2
  selector:
    matchLabels:
      app: oauth2-proxy-longhorn
  template:
    metadata:
      labels:
        app: oauth2-proxy-longhorn
    spec:
      nodeSelector:
        node-role.kubernetes.io/worker: "true"
      affinity:
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 90
              preference:
                matchExpressions:
                  - key: hardware
                    operator: In
                    values: ["rpi5","rpi4"]
      containers:
        - name: oauth2-proxy
          image: quay.io/oauth2-proxy/oauth2-proxy:v7.6.0
          imagePullPolicy: IfNotPresent
          args:
            - --provider=oidc
            - --redirect-url=https://longhorn.bstein.dev/oauth2/callback
            - --oidc-issuer-url=https://sso.bstein.dev/realms/atlas
            - --scope=openid profile email groups
            - --email-domain=*
            - --allowed-group=admin
            - --set-xauthrequest=true
            - --pass-access-token=true
            - --set-authorization-header=true
            - --cookie-secure=true
            - --cookie-samesite=lax
            - --cookie-refresh=20m
            - --cookie-expire=168h
            - --insecure-oidc-allow-unverified-email=true
            - --upstream=http://longhorn-frontend.longhorn-system.svc.cluster.local
            - --http-address=0.0.0.0:4180
            - --skip-provider-button=true
            - --skip-jwt-bearer-tokens=true
            - --oidc-groups-claim=groups
            - --cookie-domain=longhorn.bstein.dev
          env:
            - name: OAUTH2_PROXY_CLIENT_ID
              valueFrom:
                secretKeyRef:
                  name: oauth2-proxy-longhorn-oidc
                  key: client_id
            - name: OAUTH2_PROXY_CLIENT_SECRET
              valueFrom:
                secretKeyRef:
                  name: oauth2-proxy-longhorn-oidc
                  key: client_secret
            - name: OAUTH2_PROXY_COOKIE_SECRET
              valueFrom:
                secretKeyRef:
                  name: oauth2-proxy-longhorn-oidc
                  key: cookie_secret
          ports:
            - containerPort: 4180
              name: http
          readinessProbe:
            httpGet:
              path: /ping
              port: 4180
            initialDelaySeconds: 5
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /ping
              port: 4180
            initialDelaySeconds: 20
            periodSeconds: 20
@@ -0,0 +1,14 @@
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: letsencrypt-prod
spec:
  acme:
    email: brad.stein@gmail.com
    server: https://acme-v02.api.letsencrypt.org/directory
    privateKeySecretRef:
      name: letsencrypt-prod-account-key
    solvers:
      - http01:
          ingress:
            class: traefik
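
Ingresses opt in to this issuer with the standard cert-manager annotation (see commits 38ab8e3364 and ccfc473521 above for the letsencrypt switchover); a sketch:

metadata:
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-prod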

@@ -0,0 +1,9 @@
# infrastructure/sources/helm/harbor.yaml
apiVersion: source.toolkit.fluxcd.io/v1beta2
kind: HelmRepository
metadata:
  name: harbor
  namespace: flux-system
spec:
  interval: 10m
  url: https://helm.goharbor.io
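
This repository is what the harbor Kustomization's HelmRelease health check (earlier in this diff) ultimately resolves against. A trimmed sketch of a HelmRelease consuming it, with chart version and values omitted (see commits dba7cf00a4 and aa0df1f62b for the actual deployment):

apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: harbor
  namespace: harbor
spec:
  interval: 10m
  chart:
    spec:
      chart: harbor
      sourceRef:
        kind: HelmRepository
        name: harbor
        namespace: flux-system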

@@ -0,0 +1,9 @@
# infrastructure/sources/helm/jenkins.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: jenkins
  namespace: flux-system
spec:
  interval: 1h
  url: https://charts.jenkins.io

@@ -0,0 +1,12 @@
# infrastructure/sources/helm/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- grafana.yaml
- hashicorp.yaml
- jetstack.yaml
- jenkins.yaml
- mailu.yaml
- harbor.yaml
- prometheus.yaml
- victoria-metrics.yaml

@@ -0,0 +1,9 @@
# infrastructure/sources/helm/mailu.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: mailu
  namespace: flux-system
spec:
  interval: 1h
  url: https://mailu.github.io/helm-charts

@@ -371,9 +371,9 @@ function xmrwallet_bootstrap --description "Interactive setup of monero-wallet-r
         echo "Skipping daemon probe due to xmrwallet_SKIP_DAEMON_CHECK=1"
     end

-    # Use your private image by default (in Zot)
-    read -P "Container image for wallet RPC [registry.bstein.dev/infra/monero-wallet-rpc:0.18.4.1]: " image
-    if test -z "$image"; set image registry.bstein.dev/infra/monero-wallet-rpc:0.18.4.1; end
+    # Use your private image by default (in Harbor)
+    read -P "Container image for wallet RPC [registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1]: " image
+    if test -z "$image"; set image registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1; end
     _require "Container image" $image; or return 1

     # --- Secrets (defaults: RPC user=wallet name, passwords auto if missing)
@@ -1375,4 +1375,3 @@ function xmrwallet_help_detailed
     echo "  Probes it via a temporary port-forward so it works from your workstation."
     echo "  Set xmrwallet_SKIP_DAEMON_CHECK=1 to bypass the daemon probe (not recommended)."
 end
-

@@ -23,7 +23,7 @@ end

 # Default image chooser (you should override with your own multi-arch image)
 function _sui_default_image -a NET
-    echo registry.bstein.dev/infra/sui-tools:1.53.2
+    echo registry.bstein.dev/crypto/sui-tools:1.53.2
 end

 # Convert any string to a k8s-safe name (RFC-1123 label-ish)

@@ -36,11 +36,12 @@ PUBLIC_FOLDER = "overview"
 PRIVATE_FOLDER = "atlas-internal"

 PERCENT_THRESHOLDS = {
-    "mode": "percentage",
+    "mode": "absolute",
     "steps": [
         {"color": "green", "value": None},
-        {"color": "yellow", "value": 70},
-        {"color": "red", "value": 85},
+        {"color": "yellow", "value": 50},
+        {"color": "orange", "value": 75},
+        {"color": "red", "value": 91.5},
     ],
 }
@@ -81,7 +82,7 @@ CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
 WORKER_SUFFIX = f"/{WORKER_TOTAL}"
 CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system"
 LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
-GAUGE_WIDTHS = [5, 5, 5, 5, 4]
+GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4]
 CONTROL_WORKLOADS_EXPR = (
     f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}}) or on() vector(0)'
 )
@@ -187,17 +188,65 @@ def namespace_gpu_share_expr():
     return namespace_share_expr(NAMESPACE_GPU_RAW)

-PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
+PROBLEM_PODS_EXPR = (
+    'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"})) '
+    "or on() vector(0)"
+)
 CRASHLOOP_EXPR = (
     'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
-    '{reason=~"CrashLoopBackOff|ImagePullBackOff"}))'
+    '{reason=~"CrashLoopBackOff|ImagePullBackOff"})) '
+    "or on() vector(0)"
 )
 STUCK_TERMINATING_EXPR = (
     'sum(max by (namespace,pod) ('
     '((time() - kube_pod_deletion_timestamp{pod!=""}) > bool 600)'
     ' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)'
-    '))'
+    ')) '
+    "or on() vector(0)"
 )
+UPTIME_WINDOW = "30d"
+TRAEFIK_READY_EXPR = (
+    "("
+    'sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})'
+    " / clamp_min("
+    'sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)'
+    ")"
+)
+CONTROL_READY_FRACTION_EXPR = (
+    f"(sum(kube_node_status_condition{{condition=\"Ready\",status=\"true\",node=~\"{CONTROL_REGEX}\"}})"
+    f" / {CONTROL_TOTAL})"
+)
+UPTIME_AVAIL_EXPR = (
+    f"min(({CONTROL_READY_FRACTION_EXPR}), ({TRAEFIK_READY_EXPR}))"
+)
+# Tie-breaker to deterministically pick one node per namespace when shares tie.
+NODE_TIEBREAKER = " + ".join(
+    f"({node_filter(node)}) * 1e-6 * {idx}"
+    for idx, node in enumerate(CONTROL_ALL + WORKER_NODES, start=1)
+)
+UPTIME_AVG_EXPR = f"avg_over_time(({UPTIME_AVAIL_EXPR})[{UPTIME_WINDOW}:5m])"
+UPTIME_PERCENT_EXPR = UPTIME_AVG_EXPR
+UPTIME_NINES_EXPR = f"-log10(1 - clamp_max({UPTIME_AVG_EXPR}, 0.999999999))"
+UPTIME_THRESHOLDS = {
+    "mode": "absolute",
+    "steps": [
+        {"color": "red", "value": None},
+        {"color": "orange", "value": 2},
+        {"color": "yellow", "value": 3},
+        {"color": "green", "value": 3.5},
+    ],
+}
+UPTIME_PERCENT_THRESHOLDS = {
+    "mode": "absolute",
+    "steps": [
+        {"color": "red", "value": None},
+        {"color": "orange", "value": 0.99},
+        {"color": "yellow", "value": 0.999},
+        {"color": "green", "value": 0.9999},
+        {"color": "blue", "value": 0.99999},
+    ],
+}
 PROBLEM_TABLE_EXPR = (
     "(time() - kube_pod_created{pod!=\"\"}) "
     "* on(namespace,pod) group_left(node) kube_pod_info "
@@ -232,7 +281,7 @@ NAMESPACE_GPU_ALLOC = (
     ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
 )
 NAMESPACE_GPU_USAGE_SHARE = (
-    'sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))'
+    'sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))'
 )
 NAMESPACE_GPU_USAGE_INSTANT = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)'
 NAMESPACE_GPU_RAW = (
@@ -291,6 +340,34 @@ NET_INTERNAL_EXPR = (
     '+ rate(container_network_transmit_bytes_total{namespace!="traefik",pod!=""}[5m]))'
     ' or on() vector(0)'
 )
+APISERVER_5XX_RATE = 'sum(rate(apiserver_request_total{code=~"5.."}[5m]))'
+APISERVER_P99_LATENCY_MS = (
+    "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000"
+)
+ETCD_P99_LATENCY_MS = (
+    "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000"
+)
+TRAEFIK_TOTAL_5M = "sum(rate(traefik_entrypoint_requests_total[5m]))"
+TRAEFIK_SUCCESS_5M = 'sum(rate(traefik_entrypoint_requests_total{code!~"5.."}[5m]))'
+TRAEFIK_SLI_5M = f"({TRAEFIK_SUCCESS_5M}) / clamp_min({TRAEFIK_TOTAL_5M}, 1)"
+TRAEFIK_P99_LATENCY_MS = (
+    "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000"
+)
+TRAEFIK_P95_LATENCY_MS = (
+    "histogram_quantile(0.95, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000"
+)
+SLO_AVAILABILITY = 0.999
+
+def traefik_sli(window):
+    total = f'sum(rate(traefik_entrypoint_requests_total[{window}]))'
+    success = f'sum(rate(traefik_entrypoint_requests_total{{code!~"5.."}}[{window}]))'
+    return f"({success}) / clamp_min({total}, 1)"
+
+def traefik_burn(window):
+    sli = traefik_sli(window)
+    return f"(1 - ({sli})) / {1 - SLO_AVAILABILITY}"

 # ---------------------------------------------------------------------------
 # Panel factories
@@ -304,6 +381,7 @@ def stat_panel(
     grid,
     *,
     unit="none",
+    decimals=None,
     thresholds=None,
     text_mode="value",
     legend=None,
@@ -313,7 +391,7 @@
 ):
     """Return a Grafana stat panel definition."""
     defaults = {
-        "color": {"mode": "palette-classic"},
+        "color": {"mode": "thresholds"},
         "mappings": [],
         "thresholds": thresholds
         or {
@@ -328,6 +406,8 @@
     }
     if value_suffix:
         defaults["custom"]["valueSuffix"] = value_suffix
+    if decimals is not None:
+        defaults["decimals"] = decimals
     panel = {
         "id": panel_id,
         "type": "stat",
@@ -446,17 +526,32 @@
     *,
     unit="none",
     transformations=None,
+    instant=False,
+    options=None,
+    filterable=True,
+    footer=None,
+    format=None,
 ):
     """Return a Grafana table panel definition."""
+    # Optional PromQL subquery helpers in expr: share(), etc.
+    panel_options = {"showHeader": True, "columnFilters": False}
+    if options:
+        panel_options.update(options)
+    if footer is not None:
+        panel_options["footer"] = footer
+    field_defaults = {"unit": unit, "custom": {"filterable": filterable}}
+    target = {"expr": expr, "refId": "A", **({"instant": True} if instant else {})}
+    if format:
+        target["format"] = format
     panel = {
         "id": panel_id,
         "type": "table",
         "title": title,
         "datasource": PROM_DS,
         "gridPos": grid,
-        "targets": [{"expr": expr, "refId": "A"}],
-        "fieldConfig": {"defaults": {"unit": unit}, "overrides": []},
-        "options": {"showHeader": True},
+        "targets": [target],
+        "fieldConfig": {"defaults": field_defaults, "overrides": []},
+        "options": panel_options,
     }
     if transformations:
         panel["transformations"] = transformations
@@ -482,7 +577,7 @@ def pie_panel(panel_id, title, expr, grid):
         "options": {
             "legend": {"displayMode": "list", "placement": "right"},
             "pieType": "pie",
-            "displayLabels": ["percent"],
+            "displayLabels": [],
             "tooltip": {"mode": "single"},
             "colorScheme": "interpolateSpectral",
             "colorBy": "value",
@@ -491,7 +586,19 @@
         },
     }

-def bargauge_panel(panel_id, title, expr, grid, *, unit="none", links=None):
+def bargauge_panel(
+    panel_id,
+    title,
+    expr,
+    grid,
+    *,
+    unit="none",
+    links=None,
+    limit=None,
+    thresholds=None,
+    decimals=None,
+    instant=False,
+):
     """Return a bar gauge panel with label-aware reduction."""
     panel = {
         "id": panel_id,
@@ -499,13 +606,16 @@ def bargauge_panel(panel_id, title, expr, grid, *, unit="none", links=None):
         "title": title,
         "datasource": PROM_DS,
         "gridPos": grid,
-        "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{node}}"}],
+        "targets": [
+            {"expr": expr, "refId": "A", "legendFormat": "{{node}}", **({"instant": True} if instant else {})}
+        ],
         "fieldConfig": {
             "defaults": {
                 "unit": unit,
                 "min": 0,
                 "max": 100 if unit == "percent" else None,
-                "thresholds": {
+                "thresholds": thresholds
+                or {
                     "mode": "absolute",
                     "steps": [
                         {"color": "green", "value": None},
@@ -527,8 +637,19 @@
             },
         },
     }
+    if decimals is not None:
+        panel["fieldConfig"]["defaults"]["decimals"] = decimals
     if links:
         panel["links"] = links
+    # Keep bars ordered by value descending for readability.
+    panel["transformations"] = [
+        {
+            "id": "sortBy",
+            "options": {"fields": ["Value"], "order": "desc"},
+        }
+    ]
+    if limit:
+        panel["transformations"].append({"id": "limit", "options": {"limit": limit}})
     return panel
@ -555,81 +676,37 @@ def link_to(uid):
def build_overview(): def build_overview():
panels = [] panels = []
count_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 3},
],
}
row1_stats = [ row1_stats = [
( {
1, "id": 2,
"Workers Ready", "title": "Control Plane Ready",
f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})', "expr": f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
WORKER_SUFFIX, "kind": "gauge",
WORKER_TOTAL, "max_value": CONTROL_TOTAL,
None, "thresholds": {
),
(
2,
"Control Plane Ready",
f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
CONTROL_SUFFIX,
CONTROL_TOTAL,
None,
),
(
3,
"Control Plane Workloads",
CONTROL_WORKLOADS_EXPR,
None,
4,
link_to("atlas-pods"),
),
(
4,
"Problem Pods",
PROBLEM_PODS_EXPR,
None,
1,
link_to("atlas-pods"),
),
(
5,
"Stuck Terminating",
STUCK_TERMINATING_EXPR,
None,
1,
link_to("atlas-pods"),
),
]
def gauge_grid(idx):
width = GAUGE_WIDTHS[idx] if idx < len(GAUGE_WIDTHS) else 4
x = sum(GAUGE_WIDTHS[:idx])
return width, x
for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats):
thresholds = None
min_value = 0
max_value = ok_value or 5
if panel_id == 1:
max_value = WORKER_TOTAL
thresholds = {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": WORKER_TOTAL - 2},
{"color": "yellow", "value": WORKER_TOTAL - 1},
{"color": "green", "value": WORKER_TOTAL},
],
}
elif panel_id == 2:
max_value = CONTROL_TOTAL
thresholds = {
"mode": "absolute", "mode": "absolute",
"steps": [ "steps": [
{"color": "red", "value": None}, {"color": "red", "value": None},
{"color": "green", "value": CONTROL_TOTAL}, {"color": "green", "value": CONTROL_TOTAL},
], ],
} },
elif panel_id in (3, 4, 5): },
max_value = 4 {
thresholds = { "id": 3,
"title": "Control Plane Workloads",
"expr": CONTROL_WORKLOADS_EXPR,
"kind": "stat",
"thresholds": {
"mode": "absolute", "mode": "absolute",
"steps": [ "steps": [
{"color": "green", "value": None}, {"color": "green", "value": None},
@ -637,40 +714,122 @@ def build_overview():
{"color": "orange", "value": 2}, {"color": "orange", "value": 2},
{"color": "red", "value": 3}, {"color": "red", "value": 3},
], ],
} },
else: "links": link_to("atlas-pods"),
thresholds = { },
{
"id": 5,
"title": "Stuck Terminating",
"expr": STUCK_TERMINATING_EXPR,
"kind": "stat",
"thresholds": {
"mode": "absolute", "mode": "absolute",
"steps": [ "steps": [
{"color": "green", "value": None}, {"color": "green", "value": None},
{"color": "red", "value": max_value}, {"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 3},
], ],
} },
"links": link_to("atlas-pods"),
},
{
"id": 27,
"title": "Atlas Availability (30d)",
"expr": UPTIME_PERCENT_EXPR,
"kind": "stat",
"thresholds": UPTIME_PERCENT_THRESHOLDS,
"unit": "percentunit",
"decimals": 3,
"text_mode": "value",
},
{
"id": 4,
"title": "Problem Pods",
"expr": PROBLEM_PODS_EXPR,
"kind": "stat",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 3},
],
},
"links": link_to("atlas-pods"),
},
{
"id": 6,
"title": "CrashLoop / ImagePull",
"expr": CRASHLOOP_EXPR,
"kind": "stat",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 3},
],
},
"links": link_to("atlas-pods"),
},
{
"id": 1,
"title": "Workers Ready",
"expr": f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
"kind": "gauge",
"max_value": WORKER_TOTAL,
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": WORKER_TOTAL - 2},
{"color": "yellow", "value": WORKER_TOTAL - 1},
{"color": "green", "value": WORKER_TOTAL},
],
},
},
]
def gauge_grid(idx):
width = GAUGE_WIDTHS[idx] if idx < len(GAUGE_WIDTHS) else 4
x = sum(GAUGE_WIDTHS[:idx])
return width, x
for idx, item in enumerate(row1_stats):
panel_id = item["id"]
        width, x = gauge_grid(idx)
-        if panel_id in (3, 4, 5):
-            panels.append(
-                stat_panel(
-                    panel_id,
-                    title,
-                    expr,
-                    {"h": 5, "w": width, "x": x, "y": 0},
-                    thresholds=thresholds,
-                    legend=None,
-                    links=links,
-                    text_mode="value",
-                )
-            )
-        else:
-            panels.append(
-                gauge_panel(
-                    panel_id,
-                    title,
-                    expr,
-                    {"h": 5, "w": width, "x": x, "y": 0},
-                    min_value=min_value,
-                    max_value=max_value,
-                    thresholds=thresholds,
-                    links=links,
-                )
-            )
        grid = {"h": 5, "w": width, "x": x, "y": 0}
        kind = item.get("kind", "gauge")
        if kind == "stat":
            panels.append(
                stat_panel(
                    panel_id,
                    item["title"],
                    item["expr"],
                    grid,
                    thresholds=item.get("thresholds"),
                    legend=None,
                    links=item.get("links"),
                    text_mode=item.get("text_mode", "value"),
                    value_suffix=item.get("value_suffix"),
                    unit=item.get("unit", "none"),
                    decimals=item.get("decimals"),
                )
            )
        else:
            panels.append(
                gauge_panel(
                    panel_id,
                    item["title"],
                    item["expr"],
                    grid,
                    min_value=0,
                    max_value=item.get("max_value", 5),
                    thresholds=item.get("thresholds"),
                    links=item.get("links"),
                )
            )
@ -774,7 +933,7 @@ def build_overview():
        timeseries_panel(
            16,
            "Control plane CPU",
-            node_cpu_expr(CONTROL_REGEX),
+            node_cpu_expr(CONTROL_ALL_REGEX),
            {"h": 10, "w": 12, "x": 0, "y": 44},
            unit="percent",
            legend="{{node}}",
@ -786,7 +945,7 @@ def build_overview():
        timeseries_panel(
            17,
            "Control plane RAM",
-            node_mem_expr(CONTROL_REGEX),
+            node_mem_expr(CONTROL_ALL_REGEX),
            {"h": 10, "w": 12, "x": 12, "y": 44},
            unit="percent",
            legend="{{node}}",
@ -795,6 +954,36 @@ def build_overview():
        )
    )
panels.append(
pie_panel(
28,
"Node Pod Share",
'(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100',
{"h": 10, "w": 12, "x": 0, "y": 54},
)
)
panels.append(
bargauge_panel(
29,
"Top Nodes by Pod Count",
'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))',
{"h": 10, "w": 12, "x": 12, "y": 54},
unit="none",
limit=12,
decimals=0,
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 50},
{"color": "orange", "value": 75},
{"color": "red", "value": 100},
],
},
instant=True,
)
)
    panels.append(
        timeseries_panel(
            18,
@ -840,7 +1029,7 @@ def build_overview():
            21,
            "Root Filesystem Usage",
            root_usage_expr(),
-            {"h": 16, "w": 12, "x": 0, "y": 54},
+            {"h": 16, "w": 12, "x": 0, "y": 64},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
@ -855,8 +1044,9 @@ def build_overview():
            22,
            "Nodes Closest to Full Root Disks",
            f"topk(12, {root_usage_expr()})",
-            {"h": 16, "w": 12, "x": 12, "y": 54},
+            {"h": 16, "w": 12, "x": 12, "y": 64},
            unit="percent",
+            thresholds=PERCENT_THRESHOLDS,
            links=link_to("atlas-storage"),
        )
    )
@ -874,13 +1064,7 @@ def build_overview():
        "templating": {"list": []},
        "time": {"from": "now-1h", "to": "now"},
        "refresh": "1m",
-        "links": [
-            {"title": "Atlas Pods", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": False},
-            {"title": "Atlas Nodes", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False},
-            {"title": "Atlas Storage", "type": "dashboard", "dashboardUid": "atlas-storage", "keepTime": False},
-            {"title": "Atlas Network", "type": "dashboard", "dashboardUid": "atlas-network", "keepTime": False},
-            {"title": "Atlas GPU", "type": "dashboard", "dashboardUid": "atlas-gpu", "keepTime": False},
-        ],
+        "links": [],
    }
@ -980,6 +1164,91 @@ def build_pods_dashboard():
            ],
        )
) )
panels.append(
pie_panel(
8,
"Node Pod Share",
'(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100',
{"h": 8, "w": 12, "x": 12, "y": 34},
)
)
panels.append(
bargauge_panel(
9,
"Top Nodes by Pod Count",
'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))',
{"h": 8, "w": 12, "x": 0, "y": 34},
unit="none",
limit=12,
decimals=0,
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 50},
{"color": "orange", "value": 75},
{"color": "red", "value": 100},
],
},
instant=True,
)
)
share_expr = (
'(sum by (namespace,node) (kube_pod_info{pod!="" , node!=""}) '
'/ on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=""}), 1) * 100)'
)
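    # Tie-breaker: each node gets a tiny deterministic offset (idx * 1e-3) below,
    # so that when two nodes hold an equal share of a namespace's pods, the
    # max-mask join still keeps exactly one winning row per namespace.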
rank_terms = [
f"(sum by (node) (kube_node_info{{node=\"{node}\"}}) * 0 + {idx * 1e-3})"
for idx, node in enumerate(CONTROL_ALL + WORKER_NODES, start=1)
]
rank_expr = " or ".join(rank_terms)
score_expr = f"{share_expr} + on(node) group_left() ({rank_expr})"
mask_expr = (
f"{score_expr} == bool on(namespace) group_left() "
f"(max by (namespace) ({score_expr}))"
)
panels.append(
table_panel(
10,
"Namespace Plurality by Node v27",
(
f"{share_expr} * on(namespace,node) group_left() "
f"({mask_expr})"
),
{"h": 8, "w": 24, "x": 0, "y": 42},
unit="percent",
transformations=[
{"id": "labelsToFields", "options": {}},
{"id": "organize", "options": {"excludeByName": {"Time": True}}},
{"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 0}},
{
"id": "sortBy",
"options": {"fields": ["Value"], "order": "desc"},
},
{
"id": "groupBy",
"options": {
"fields": {
"namespace": {
"aggregations": [
{"field": "Value", "operation": "max"},
{"field": "node", "operation": "first"},
]
}
},
"rowBy": ["namespace"],
},
},
],
instant=True,
options={"showColumnFilters": False},
filterable=False,
footer={"show": False, "fields": "", "calcs": []},
format="table",
)
)
    return {
        "uid": "atlas-pods",
        "title": "Atlas Pods",
@ -1022,12 +1291,69 @@ def build_nodes_dashboard():
{"h": 4, "w": 8, "x": 16, "y": 0}, {"h": 4, "w": 8, "x": 16, "y": 0},
) )
) )
panels.append(
stat_panel(
9,
"API Server 5xx rate",
APISERVER_5XX_RATE,
{"h": 4, "w": 8, "x": 0, "y": 4},
unit="req/s",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 0.05},
{"color": "orange", "value": 0.2},
{"color": "red", "value": 0.5},
],
},
decimals=3,
)
)
panels.append(
stat_panel(
10,
"API Server P99 latency",
APISERVER_P99_LATENCY_MS,
{"h": 4, "w": 8, "x": 8, "y": 4},
unit="ms",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 250},
{"color": "orange", "value": 400},
{"color": "red", "value": 600},
],
},
decimals=1,
)
)
panels.append(
stat_panel(
11,
"etcd P99 latency",
ETCD_P99_LATENCY_MS,
{"h": 4, "w": 8, "x": 16, "y": 4},
unit="ms",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 50},
{"color": "orange", "value": 100},
{"color": "red", "value": 200},
],
},
decimals=1,
)
)
    panels.append(
        timeseries_panel(
            4,
            "Node CPU",
            node_cpu_expr(),
-            {"h": 9, "w": 24, "x": 0, "y": 4},
+            {"h": 9, "w": 24, "x": 0, "y": 8},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
@ -1040,7 +1366,7 @@ def build_nodes_dashboard():
            5,
            "Node RAM",
            node_mem_expr(),
-            {"h": 9, "w": 24, "x": 0, "y": 13},
+            {"h": 9, "w": 24, "x": 0, "y": 17},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
@ -1053,7 +1379,7 @@ def build_nodes_dashboard():
            6,
            "Control Plane (incl. titan-db) CPU",
            node_cpu_expr(CONTROL_ALL_REGEX),
-            {"h": 9, "w": 12, "x": 0, "y": 22},
+            {"h": 9, "w": 12, "x": 0, "y": 26},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
@ -1065,7 +1391,7 @@ def build_nodes_dashboard():
            7,
            "Control Plane (incl. titan-db) RAM",
            node_mem_expr(CONTROL_ALL_REGEX),
-            {"h": 9, "w": 12, "x": 12, "y": 22},
+            {"h": 9, "w": 12, "x": 12, "y": 26},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
@ -1077,7 +1403,7 @@ def build_nodes_dashboard():
            8,
            "Root Filesystem Usage",
            root_usage_expr(),
-            {"h": 9, "w": 24, "x": 0, "y": 31},
+            {"h": 9, "w": 24, "x": 0, "y": 35},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
@ -1204,43 +1530,107 @@ def build_network_dashboard():
-    panels.append(
-        stat_panel(
-            1,
-            "Ingress Traffic",
-            NET_INGRESS_EXPR,
-            {"h": 4, "w": 8, "x": 0, "y": 0},
-            unit="Bps",
-        )
-    )
-    panels.append(
-        stat_panel(
-            2,
-            "Egress Traffic",
-            NET_EGRESS_EXPR,
-            {"h": 4, "w": 8, "x": 8, "y": 0},
-            unit="Bps",
-        )
-    )
-    panels.append(
-        stat_panel(
-            3,
-            "Intra-Cluster Traffic",
-            NET_INTERNAL_EXPR,
-            {"h": 4, "w": 8, "x": 16, "y": 0},
-            unit="Bps",
-        )
-    )
-    panels.append(
-        stat_panel(
-            4,
-            "Top Router req/s",
-            f"topk(1, {TRAEFIK_ROUTER_EXPR})",
-            {"h": 4, "w": 8, "x": 0, "y": 4},
-            unit="req/s",
-            legend="{{router}}",
-        )
-    )
    panels.append(
        stat_panel(
            1,
            "Ingress Success Rate (5m)",
            TRAEFIK_SLI_5M,
            {"h": 4, "w": 6, "x": 0, "y": 0},
            unit="percentunit",
            decimals=2,
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "red", "value": None},
                    {"color": "orange", "value": 0.995},
                    {"color": "yellow", "value": 0.999},
                    {"color": "green", "value": 0.9995},
                ],
            },
        )
    )
    panels.append(
        stat_panel(
            2,
            "Error Budget Burn (1h)",
            traefik_burn("1h"),
            {"h": 4, "w": 6, "x": 6, "y": 0},
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 1},
                    {"color": "orange", "value": 2},
                    {"color": "red", "value": 4},
                ],
            },
            decimals=2,
        )
    )
    panels.append(
        stat_panel(
            3,
            "Error Budget Burn (6h)",
            traefik_burn("6h"),
            {"h": 4, "w": 6, "x": 12, "y": 0},
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 1},
                    {"color": "orange", "value": 2},
                    {"color": "red", "value": 4},
                ],
            },
            decimals=2,
        )
    )
    panels.append(
        stat_panel(
            4,
            "Edge P99 Latency (ms)",
            TRAEFIK_P99_LATENCY_MS,
            {"h": 4, "w": 6, "x": 18, "y": 0},
            unit="ms",
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 200},
                    {"color": "orange", "value": 350},
                    {"color": "red", "value": 500},
                ],
            },
            decimals=1,
        )
    )
    panels.append(
        stat_panel(
            5,
            "Ingress Traffic",
            NET_INGRESS_EXPR,
            {"h": 4, "w": 8, "x": 0, "y": 4},
            unit="Bps",
        )
    )
    panels.append(
        stat_panel(
            6,
            "Egress Traffic",
            NET_EGRESS_EXPR,
            {"h": 4, "w": 8, "x": 8, "y": 4},
            unit="Bps",
        )
    )
    panels.append(
        stat_panel(
            7,
            "Intra-Cluster Traffic",
            NET_INTERNAL_EXPR,
            {"h": 4, "w": 8, "x": 16, "y": 4},
            unit="Bps",
        )
    )
    panels.append(
        timeseries_panel(
-            5,
+            8,
            "Per-Node Throughput",
            f'avg by (node) (({NET_NODE_TX_PHYS} + {NET_NODE_RX_PHYS}) * on(instance) group_left(node) {NODE_INFO})',
            {"h": 8, "w": 24, "x": 0, "y": 8},
@ -1252,7 +1642,7 @@ def build_network_dashboard():
    )
    panels.append(
        table_panel(
-            6,
+            9,
            "Top Namespaces",
            'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) '
            '+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))',
@ -1263,7 +1653,7 @@ def build_network_dashboard():
    )
    panels.append(
        table_panel(
-            7,
+            10,
            "Top Pods",
            'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) '
            '+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))',
@ -1274,7 +1664,7 @@ def build_network_dashboard():
    )
    panels.append(
        timeseries_panel(
-            8,
+            11,
            "Traefik Routers (req/s)",
            f"topk(10, {TRAEFIK_ROUTER_EXPR})",
            {"h": 9, "w": 12, "x": 0, "y": 25},
@ -1286,7 +1676,7 @@ def build_network_dashboard():
    )
    panels.append(
        timeseries_panel(
-            9,
+            12,
            "Traefik Entrypoints (req/s)",
            'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))',
            {"h": 9, "w": 12, "x": 12, "y": 25},

scripts/gitea_cred_sync.sh Executable file

@ -0,0 +1,92 @@
#!/usr/bin/env bash
# Sync Keycloak users into Gitea local accounts (for CLI + tokens).
# Requires: curl, jq, kubectl. Expects a Keycloak client with realm-management
# permissions (manage-users) and a Gitea admin token stored in a secret.
set -euo pipefail
require() { command -v "$1" >/dev/null 2>&1 || { echo "missing required binary: $1" >&2; exit 1; }; }
require curl; require jq; require kubectl
: "${KEYCLOAK_URL:=https://sso.bstein.dev}"
: "${KEYCLOAK_REALM:=atlas}"
: "${KEYCLOAK_CLIENT_ID:?set KEYCLOAK_CLIENT_ID or export via secret}"
: "${KEYCLOAK_CLIENT_SECRET:?set KEYCLOAK_CLIENT_SECRET or export via secret}"
: "${GITEA_BASE_URL:=https://scm.bstein.dev}"
: "${GITEA_NAMESPACE:=gitea}"
: "${GITEA_TOKEN_SECRET_NAME:=gitea-admin-token}"
: "${GITEA_TOKEN_SECRET_KEY:=token}"
: "${DEFAULT_PASSWORD:=TempSsoPass!2025}"
fetch_token() {
curl -fsS -X POST \
-d "grant_type=client_credentials" \
-d "client_id=${KEYCLOAK_CLIENT_ID}" \
-d "client_secret=${KEYCLOAK_CLIENT_SECRET}" \
"${KEYCLOAK_URL}/realms/${KEYCLOAK_REALM}/protocol/openid-connect/token" \
| jq -r '.access_token'
}
pull_users() {
local token="$1"
curl -fsS -H "Authorization: Bearer ${token}" \
"${KEYCLOAK_URL}/admin/realms/${KEYCLOAK_REALM}/users?max=500" \
| jq -r '.[] | select(.enabled == true) | select(.username | startswith("service-account-") | not) | [.username, (.email // ""), (.firstName // ""), (.lastName // "")] | @tsv'
}
get_gitea_token() {
if [[ -n "${GITEA_ADMIN_TOKEN:-}" ]]; then
echo "${GITEA_ADMIN_TOKEN}"
return
fi
kubectl -n "${GITEA_NAMESPACE}" get secret "${GITEA_TOKEN_SECRET_NAME}" -o "jsonpath={.data.${GITEA_TOKEN_SECRET_KEY}}" \
| base64 -d
}
user_exists() {
local token="$1" username="$2"
local code
code=$(curl -s -o /dev/null -w '%{http_code}' \
-H "Authorization: token ${token}" \
"${GITEA_BASE_URL}/api/v1/admin/users/${username}")
[[ "${code}" == "200" ]]
}
create_user() {
local token="$1" username="$2" email="$3" fname="$4" lname="$5"
local body status fullname
fullname="$(echo "${fname} ${lname}" | xargs)"
if [[ -z "${email}" ]]; then
email="${username}@example.local"
fi
body=$(jq -n --arg u "${username}" --arg e "${email}" --arg p "${DEFAULT_PASSWORD}" \
--arg fn "${fullname}" '{username:$u, email:$e, password:$p, must_change_password:false, full_name:$fn}')
status=$(curl -s -o /dev/null -w '%{http_code}' \
-H "Authorization: token ${token}" \
-H "Content-Type: application/json" \
-X POST \
-d "${body}" \
"${GITEA_BASE_URL}/api/v1/admin/users")
if [[ "${status}" == "201" ]]; then
echo "created gitea user ${username}"
elif [[ "${status}" == "409" ]]; then
echo "gitea user ${username} already exists (409)" >&2
else
echo "failed to create gitea user ${username} (status ${status})" >&2
fi
}
main() {
local kc_token gitea_token
kc_token="$(fetch_token)"
gitea_token="$(get_gitea_token)"
while IFS=$'\t' read -r username email fname lname; do
if user_exists "${gitea_token}" "${username}"; then
continue
fi
create_user "${gitea_token}" "${username}" "${email}" "${fname}" "${lname}"
done < <(pull_users "${kc_token}")
}
main "$@"

scripts/gitops_cred_sync.sh Executable file

@ -0,0 +1,87 @@
#!/usr/bin/env bash
# Ensure Keycloak users are in the GitOps admin group used by weave-gitops (cd.bstein.dev).
# Weave GitOps relies on OIDC; membership in the "admin" group maps to cluster-admin via RBAC.
# Requires: curl, jq. Needs a Keycloak client with realm-management (manage-users/groups).
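# Example (a sketch; the extra username is illustrative only):
#   TARGET_USERNAMES="bstein,alice" ./scripts/gitops_cred_sync.sh
#   SYNC_ALL_USERS=true ./scripts/gitops_cred_sync.sh   # or sync every enabled user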
set -euo pipefail
require() { command -v "$1" >/dev/null 2>&1 || { echo "missing required binary: $1" >&2; exit 1; }; }
require curl; require jq
: "${KEYCLOAK_URL:=https://sso.bstein.dev}"
: "${KEYCLOAK_REALM:=atlas}"
: "${KEYCLOAK_CLIENT_ID:?set KEYCLOAK_CLIENT_ID or export via secret}"
: "${KEYCLOAK_CLIENT_SECRET:?set KEYCLOAK_CLIENT_SECRET or export via secret}"
: "${GITOPS_GROUP:=admin}"
# Comma-separated usernames to sync; set SYNC_ALL_USERS=true to include all Keycloak users.
: "${TARGET_USERNAMES:=bstein}"
: "${SYNC_ALL_USERS:=false}"
fetch_token() {
curl -fsS -X POST \
-d "grant_type=client_credentials" \
-d "client_id=${KEYCLOAK_CLIENT_ID}" \
-d "client_secret=${KEYCLOAK_CLIENT_SECRET}" \
"${KEYCLOAK_URL}/realms/${KEYCLOAK_REALM}/protocol/openid-connect/token" \
| jq -r '.access_token'
}
ensure_group() {
local token="$1" group="$2" id
id=$(curl -fsS -H "Authorization: Bearer ${token}" \
"${KEYCLOAK_URL}/admin/realms/${KEYCLOAK_REALM}/groups?search=${group}" \
| jq -r --arg g "${group}" '.[] | select(.name==$g) | .id' | head -n1)
if [[ -n "${id}" ]]; then
echo "${id}"
return
fi
curl -fsS -H "Authorization: Bearer ${token}" \
-H "Content-Type: application/json" \
-d "{\"name\":\"${group}\"}" \
-X POST "${KEYCLOAK_URL}/admin/realms/${KEYCLOAK_REALM}/groups"
# Fetch again to get id
curl -fsS -H "Authorization: Bearer ${token}" \
"${KEYCLOAK_URL}/admin/realms/${KEYCLOAK_REALM}/groups?search=${group}" \
| jq -r --arg g "${group}" '.[] | select(.name==$g) | .id' | head -n1
}
user_id_by_name() {
local token="$1" username="$2"
curl -fsS -H "Authorization: Bearer ${token}" \
"${KEYCLOAK_URL}/admin/realms/${KEYCLOAK_REALM}/users?username=${username}" \
| jq -r '.[0].id'
}
add_user_to_group() {
local token="$1" user_id="$2" group_id="$3" username="$4"
if [[ -z "${user_id}" ]]; then
echo "user ${username} not found in Keycloak; skip" >&2
return
fi
curl -fsS -o /dev/null -w '%{http_code}' \
-H "Authorization: Bearer ${token}" \
-X PUT "${KEYCLOAK_URL}/admin/realms/${KEYCLOAK_REALM}/users/${user_id}/groups/${group_id}" \
| grep -qE '^(204|409)$' || echo "failed adding ${username} to group" >&2
}
main() {
local token group_id users=()
token="$(fetch_token)"
group_id="$(ensure_group "${token}" "${GITOPS_GROUP}")"
if [[ "${SYNC_ALL_USERS}" == "true" ]]; then
readarray -t users < <(curl -fsS -H "Authorization: Bearer ${token}" \
"${KEYCLOAK_URL}/admin/realms/${KEYCLOAK_REALM}/users?max=500" \
| jq -r '.[] | select(.enabled==true) | .username')
else
IFS=',' read -ra users <<< "${TARGET_USERNAMES}"
fi
for user in "${users[@]}"; do
user="$(echo "${user}" | xargs)"
[[ -z "${user}" ]] && continue
add_user_to_group "${token}" "$(user_id_by_name "${token}" "${user}")" "${group_id}" "${user}"
done
}
main "$@"

scripts/jenkins_cred_sync.sh Executable file

@ -0,0 +1,94 @@
#!/usr/bin/env bash
# Sync Keycloak users into Jenkins local accounts (for CLI/API use).
# Jenkins is OIDC-enabled, but local users can still be provisioned for tokens.
# Requires: curl, jq, kubectl. Needs Jenkins admin user+API token.
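# Example (a sketch; assumes the Keycloak client vars are already exported and the
# admin token is read from a local file rather than the in-cluster secret):
#   JENKINS_ADMIN_USER=admin JENKINS_ADMIN_TOKEN="$(cat ~/.jenkins-token)" \
#     ./scripts/jenkins_cred_sync.sh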
set -euo pipefail
require() { command -v "$1" >/dev/null 2>&1 || { echo "missing required binary: $1" >&2; exit 1; }; }
require curl; require jq; require kubectl
: "${KEYCLOAK_URL:=https://sso.bstein.dev}"
: "${KEYCLOAK_REALM:=atlas}"
: "${KEYCLOAK_CLIENT_ID:?set KEYCLOAK_CLIENT_ID or export via secret}"
: "${KEYCLOAK_CLIENT_SECRET:?set KEYCLOAK_CLIENT_SECRET or export via secret}"
: "${JENKINS_URL:=https://ci.bstein.dev}"
: "${JENKINS_NAMESPACE:=jenkins}"
: "${JENKINS_ADMIN_SECRET_NAME:=jenkins-admin-token}"
: "${JENKINS_ADMIN_USER_KEY:=username}"
: "${JENKINS_ADMIN_TOKEN_KEY:=token}"
: "${DEFAULT_PASSWORD:=TempSsoPass!2025}"
fetch_token() {
curl -fsS -X POST \
-d "grant_type=client_credentials" \
-d "client_id=${KEYCLOAK_CLIENT_ID}" \
-d "client_secret=${KEYCLOAK_CLIENT_SECRET}" \
"${KEYCLOAK_URL}/realms/${KEYCLOAK_REALM}/protocol/openid-connect/token" \
| jq -r '.access_token'
}
pull_users() {
local token="$1"
curl -fsS -H "Authorization: Bearer ${token}" \
"${KEYCLOAK_URL}/admin/realms/${KEYCLOAK_REALM}/users?max=500" \
| jq -r '.[] | select(.enabled == true) | select(.username | startswith("service-account-") | not) | [.id, .username, (.email // "")] | @tsv'
}
get_admin_auth() {
local user token
if [[ -n "${JENKINS_ADMIN_USER:-}" && -n "${JENKINS_ADMIN_TOKEN:-}" ]]; then
echo "${JENKINS_ADMIN_USER}:${JENKINS_ADMIN_TOKEN}"
return
fi
user=$(kubectl -n "${JENKINS_NAMESPACE}" get secret "${JENKINS_ADMIN_SECRET_NAME}" -o "jsonpath={.data.${JENKINS_ADMIN_USER_KEY}}" | base64 -d)
token=$(kubectl -n "${JENKINS_NAMESPACE}" get secret "${JENKINS_ADMIN_SECRET_NAME}" -o "jsonpath={.data.${JENKINS_ADMIN_TOKEN_KEY}}" | base64 -d)
echo "${user}:${token}"
}
get_crumb() {
local auth="$1"
curl -fsS -u "${auth}" "${JENKINS_URL}/crumbIssuer/api/json" | jq -r .crumb
}
user_exists() {
local auth="$1" user="$2"
local code
code=$(curl -s -o /dev/null -w '%{http_code}' -u "${auth}" "${JENKINS_URL}/user/${user}/api/json")
[[ "${code}" == "200" ]]
}
create_user() {
local auth="$1" crumb="$2" username="$3" email="$4"
local status
status=$(curl -s -o /dev/null -w '%{http_code}' \
-u "${auth}" \
-H "Jenkins-Crumb: ${crumb}" \
-X POST \
--data "username=${username}&password1=${DEFAULT_PASSWORD}&password2=${DEFAULT_PASSWORD}&fullname=${username}&email=${email}" \
"${JENKINS_URL}/securityRealm/createAccountByAdmin")
if [[ "${status}" == "200" || "${status}" == "302" ]]; then
echo "created jenkins user ${username}"
elif [[ "${status}" == "400" ]]; then
echo "jenkins user ${username} already exists (400)" >&2
else
echo "failed to create jenkins user ${username} (status ${status})" >&2
fi
}
main() {
local kc_token auth crumb
kc_token="$(fetch_token)"
auth="$(get_admin_auth)"
crumb="$(get_crumb "${auth}")"
while IFS=$'\t' read -r _ uid email; do
if user_exists "${auth}" "${uid}"; then
continue
fi
create_user "${auth}" "${crumb}" "${uid}" "${email}"
done < <(pull_users "${kc_token}")
}
main "$@"


@ -1,6 +1,6 @@
#!/usr/bin/env fish
-function pvc-usage --description "Show Longhorn PVC usage (human-readable) mapped to namespace/name"
+function pvc-usage --description "Show Longhorn PVC usage mapped to namespace/name"
    begin
        kubectl -n longhorn-system get volumes.longhorn.io -o json \
        | jq -r '.items[] | "\(.metadata.name)\t\(.status.actualSize)\t\(.spec.size)"' \

scripts/mailu_sync.py Normal file

@ -0,0 +1,204 @@
#!/usr/bin/env python3
"""
Sync Keycloak users to Mailu mailboxes.
- Generates/stores a mailu_app_password attribute in Keycloak (admin-only)
- Upserts the mailbox in Mailu Postgres using that password
"""
import os
import sys
import json
import time
import secrets
import string
import datetime
import requests
import psycopg2
from psycopg2.extras import RealDictCursor
from passlib.hash import bcrypt_sha256
KC_BASE = os.environ["KEYCLOAK_BASE_URL"].rstrip("/")
KC_REALM = os.environ["KEYCLOAK_REALM"]
KC_CLIENT_ID = os.environ["KEYCLOAK_CLIENT_ID"]
KC_CLIENT_SECRET = os.environ["KEYCLOAK_CLIENT_SECRET"]
MAILU_DOMAIN = os.environ["MAILU_DOMAIN"]
MAILU_DEFAULT_QUOTA = int(os.environ.get("MAILU_DEFAULT_QUOTA", "20000000000"))
DB_CONFIG = {
"host": os.environ["MAILU_DB_HOST"],
"port": int(os.environ.get("MAILU_DB_PORT", "5432")),
"dbname": os.environ["MAILU_DB_NAME"],
"user": os.environ["MAILU_DB_USER"],
"password": os.environ["MAILU_DB_PASSWORD"],
}
SESSION = requests.Session()
def log(msg):
sys.stdout.write(f"{msg}\n")
sys.stdout.flush()
def get_kc_token():
resp = SESSION.post(
f"{KC_BASE}/realms/{KC_REALM}/protocol/openid-connect/token",
data={
"grant_type": "client_credentials",
"client_id": KC_CLIENT_ID,
"client_secret": KC_CLIENT_SECRET,
},
timeout=15,
)
resp.raise_for_status()
return resp.json()["access_token"]
def kc_get_users(token):
users = []
first = 0
max_results = 200
headers = {"Authorization": f"Bearer {token}"}
while True:
resp = SESSION.get(
f"{KC_BASE}/admin/realms/{KC_REALM}/users",
params={"first": first, "max": max_results, "enabled": "true"},
headers=headers,
timeout=20,
)
resp.raise_for_status()
batch = resp.json()
users.extend(batch)
if len(batch) < max_results:
break
first += max_results
return users
def kc_update_attributes(token, user, attributes):
headers = {
"Authorization": f"Bearer {token}",
"Content-Type": "application/json",
}
payload = {
"firstName": user.get("firstName"),
"lastName": user.get("lastName"),
"email": user.get("email"),
"enabled": user.get("enabled", True),
"username": user["username"],
"emailVerified": user.get("emailVerified", False),
"attributes": attributes,
}
user_url = f"{KC_BASE}/admin/realms/{KC_REALM}/users/{user['id']}"
resp = SESSION.put(user_url, headers=headers, json=payload, timeout=20)
resp.raise_for_status()
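    # Read the user back and confirm the attribute really persisted: the PUT can
    # return 2xx while Keycloak drops attributes it refuses to store, and a lost
    # attribute would leave the mailbox password below unrecorded.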
verify = SESSION.get(
user_url,
headers={"Authorization": f"Bearer {token}"},
params={"briefRepresentation": "false"},
timeout=15,
)
verify.raise_for_status()
attrs = verify.json().get("attributes") or {}
if not attrs.get("mailu_app_password"):
raise Exception(f"attribute not persisted for {user.get('email') or user['username']}")
def random_password():
alphabet = string.ascii_letters + string.digits
return "".join(secrets.choice(alphabet) for _ in range(24))
def ensure_mailu_user(cursor, email, password, display_name):
localpart, domain = email.split("@", 1)
if domain.lower() != MAILU_DOMAIN.lower():
return
hashed = bcrypt_sha256.hash(password)
now = datetime.datetime.utcnow()
cursor.execute(
"""
INSERT INTO "user" (
email, localpart, domain_name, password,
quota_bytes, quota_bytes_used,
global_admin, enabled, enable_imap, enable_pop, allow_spoofing,
forward_enabled, forward_destination, forward_keep,
reply_enabled, reply_subject, reply_body, reply_startdate, reply_enddate,
displayed_name, spam_enabled, spam_mark_as_read, spam_threshold,
change_pw_next_login, created_at, updated_at, comment
)
VALUES (
%(email)s, %(localpart)s, %(domain)s, %(password)s,
%(quota)s, 0,
false, true, true, true, false,
false, '', true,
false, NULL, NULL, DATE '1900-01-01', DATE '2999-12-31',
%(display)s, true, true, 80,
false, CURRENT_DATE, %(now)s, ''
)
ON CONFLICT (email) DO UPDATE
SET password = EXCLUDED.password,
enabled = true,
updated_at = EXCLUDED.updated_at
""",
{
"email": email,
"localpart": localpart,
"domain": domain,
"password": hashed,
"quota": MAILU_DEFAULT_QUOTA,
"display": display_name or localpart,
"now": now,
},
)
def main():
token = get_kc_token()
users = kc_get_users(token)
if not users:
log("No users found; exiting.")
return
conn = psycopg2.connect(**DB_CONFIG)
conn.autocommit = True
cursor = conn.cursor(cursor_factory=RealDictCursor)
for user in users:
attrs = user.get("attributes", {}) or {}
app_pw_value = attrs.get("mailu_app_password")
if isinstance(app_pw_value, list):
app_pw = app_pw_value[0] if app_pw_value else None
elif isinstance(app_pw_value, str):
app_pw = app_pw_value
else:
app_pw = None
email = user.get("email")
if not email:
email = f"{user['username']}@{MAILU_DOMAIN}"
if not app_pw:
app_pw = random_password()
attrs["mailu_app_password"] = app_pw
kc_update_attributes(token, user, attrs)
log(f"Set mailu_app_password for {email}")
display_name = " ".join(
part for part in [user.get("firstName"), user.get("lastName")] if part
).strip()
ensure_mailu_user(cursor, email, app_pw, display_name)
log(f"Synced mailbox for {email}")
cursor.close()
conn.close()
if __name__ == "__main__":
try:
main()
except Exception as exc:
log(f"ERROR: {exc}")
sys.exit(1)

scripts/nextcloud-mail-sync.sh Executable file

@ -0,0 +1,49 @@
#!/bin/bash
set -euo pipefail
KC_BASE="${KC_BASE:?}"
KC_REALM="${KC_REALM:?}"
KC_ADMIN_USER="${KC_ADMIN_USER:?}"
KC_ADMIN_PASS="${KC_ADMIN_PASS:?}"
if ! command -v jq >/dev/null 2>&1; then
apt-get update && apt-get install -y jq curl >/dev/null
fi
account_exists() {
# Skip if the account email is already present in the mail app.
runuser -u www-data -- php occ mail:account:list 2>/dev/null | grep -Fq " ${1}" || \
runuser -u www-data -- php occ mail:account:list 2>/dev/null | grep -Fq "${1} "
}
token=$(
curl -s -d "grant_type=password" \
-d "client_id=admin-cli" \
-d "username=${KC_ADMIN_USER}" \
-d "password=${KC_ADMIN_PASS}" \
"${KC_BASE}/realms/master/protocol/openid-connect/token" | jq -r '.access_token'
)
if [[ -z "${token}" || "${token}" == "null" ]]; then
echo "Failed to obtain admin token"
exit 1
fi
users=$(curl -s -H "Authorization: Bearer ${token}" \
"${KC_BASE}/admin/realms/${KC_REALM}/users?max=2000")
echo "${users}" | jq -c '.[]' | while read -r user; do
username=$(echo "${user}" | jq -r '.username')
email=$(echo "${user}" | jq -r '.email // empty')
app_pw=$(echo "${user}" | jq -r '.attributes.mailu_app_password[0] // empty')
[[ -z "${email}" || -z "${app_pw}" ]] && continue
if account_exists "${email}"; then
echo "Skipping ${email}, already exists"
continue
fi
echo "Syncing ${email}"
runuser -u www-data -- php occ mail:account:create \
"${username}" "${username}" "${email}" \
mail.bstein.dev 993 ssl "${email}" "${app_pw}" \
mail.bstein.dev 587 tls "${email}" "${app_pw}" login || true
done


@ -0,0 +1,65 @@
#!/bin/bash
set -euo pipefail
NC_URL="${NC_URL:-https://cloud.bstein.dev}"
ADMIN_USER="${ADMIN_USER:?}"
ADMIN_PASS="${ADMIN_PASS:?}"
export DEBIAN_FRONTEND=noninteractive
apt-get update -qq
apt-get install -y -qq curl jq >/dev/null
run_occ() {
runuser -u www-data -- php occ "$@"
}
log() { echo "[$(date -Is)] $*"; }
log "Applying Atlas theming"
run_occ theming:config name "Atlas Cloud"
run_occ theming:config slogan "Unified access to Atlas services"
run_occ theming:config url "https://cloud.bstein.dev"
run_occ theming:config color "#0f172a"
run_occ theming:config disable-user-theming yes
log "Setting default quota to 200 GB"
run_occ config:app:set files default_quota --value "200 GB"
API_BASE="${NC_URL}/ocs/v2.php/apps/external/api/v1"
AUTH=(-u "${ADMIN_USER}:${ADMIN_PASS}" -H "OCS-APIRequest: true")
log "Removing existing external links"
existing=$(curl -sf "${AUTH[@]}" "${API_BASE}?format=json" | jq -r '.ocs.data[].id // empty')
for id in ${existing}; do
curl -sf "${AUTH[@]}" -X DELETE "${API_BASE}/sites/${id}?format=json" >/dev/null || true
done
SITES=(
"Vaultwarden|https://vault.bstein.dev"
"Jellyfin|https://stream.bstein.dev"
"Gitea|https://scm.bstein.dev"
"Jenkins|https://ci.bstein.dev"
"Harbor|https://registry.bstein.dev"
"Vault|https://secret.bstein.dev"
"Jitsi|https://meet.bstein.dev"
"Grafana|https://metrics.bstein.dev"
"Chat LLM|https://chat.ai.bstein.dev"
"Vision|https://draw.ai.bstein.dev"
"STT/TTS|https://talk.ai.bstein.dev"
)
log "Seeding external links"
for entry in "${SITES[@]}"; do
IFS="|" read -r name url <<<"${entry}"
curl -sf "${AUTH[@]}" -X POST "${API_BASE}/sites?format=json" \
-d "name=${name}" \
-d "url=${url}" \
-d "lang=" \
-d "type=link" \
-d "device=" \
-d "icon=" \
-d "groups[]=" \
-d "redirect=1" >/dev/null
done
log "Maintenance run completed"


@ -0,0 +1,58 @@
import importlib.util
import pathlib
def load_module():
path = pathlib.Path(__file__).resolve().parents[1] / "dashboards_render_atlas.py"
spec = importlib.util.spec_from_file_location("dashboards_render_atlas", path)
module = importlib.util.module_from_spec(spec)
assert spec.loader is not None
spec.loader.exec_module(module)
return module
def test_table_panel_options_and_filterable():
mod = load_module()
panel = mod.table_panel(
1,
"test",
"metric",
{"h": 1, "w": 1, "x": 0, "y": 0},
unit="percent",
transformations=[{"id": "labelsToFields", "options": {}}],
instant=True,
options={"showColumnFilters": False},
filterable=False,
footer={"show": False, "fields": "", "calcs": []},
format="table",
)
assert panel["fieldConfig"]["defaults"]["unit"] == "percent"
assert panel["fieldConfig"]["defaults"]["custom"]["filterable"] is False
assert panel["options"]["showHeader"] is True
assert panel["targets"][0]["format"] == "table"
def test_node_filter_and_expr_helpers():
mod = load_module()
expr = mod.node_filter("titan-.*")
assert "label_replace" in expr
cpu_expr = mod.node_cpu_expr("titan-.*")
mem_expr = mod.node_mem_expr("titan-.*")
assert "node_cpu_seconds_total" in cpu_expr
assert "node_memory_MemAvailable_bytes" in mem_expr
def test_render_configmap_writes(tmp_path):
mod = load_module()
mod.DASHBOARD_DIR = tmp_path / "dash"
mod.ROOT = tmp_path
uid = "atlas-test"
info = {"configmap": tmp_path / "cm.yaml"}
data = {"title": "Atlas Test"}
mod.write_json(uid, data)
mod.render_configmap(uid, info)
json_path = mod.DASHBOARD_DIR / f"{uid}.json"
assert json_path.exists()
content = (tmp_path / "cm.yaml").read_text()
assert "kind: ConfigMap" in content
assert f"{uid}.json" in content


@ -0,0 +1,181 @@
import importlib.util
import pathlib
import pytest
def load_sync_module(monkeypatch):
# Minimal env required by module import
env = {
"KEYCLOAK_BASE_URL": "http://keycloak",
"KEYCLOAK_REALM": "atlas",
"KEYCLOAK_CLIENT_ID": "mailu-sync",
"KEYCLOAK_CLIENT_SECRET": "secret",
"MAILU_DOMAIN": "example.com",
"MAILU_DB_HOST": "localhost",
"MAILU_DB_PORT": "5432",
"MAILU_DB_NAME": "mailu",
"MAILU_DB_USER": "mailu",
"MAILU_DB_PASSWORD": "pw",
}
for k, v in env.items():
monkeypatch.setenv(k, v)
module_path = pathlib.Path(__file__).resolve().parents[1] / "mailu_sync.py"
spec = importlib.util.spec_from_file_location("mailu_sync_testmod", module_path)
module = importlib.util.module_from_spec(spec)
assert spec.loader is not None
spec.loader.exec_module(module)
return module
def test_random_password_length_and_charset(monkeypatch):
sync = load_sync_module(monkeypatch)
pw = sync.random_password()
assert len(pw) == 24
assert all(ch.isalnum() for ch in pw)
class _FakeResponse:
def __init__(self, json_data=None, status=200):
self._json_data = json_data or {}
self.status_code = status
def raise_for_status(self):
if self.status_code >= 400:
raise AssertionError(f"status {self.status_code}")
def json(self):
return self._json_data
class _FakeSession:
def __init__(self, put_resp, get_resp):
self.put_resp = put_resp
self.get_resp = get_resp
self.put_called = False
self.get_called = False
def post(self, *args, **kwargs):
return _FakeResponse({"access_token": "dummy"})
def put(self, *args, **kwargs):
self.put_called = True
return self.put_resp
def get(self, *args, **kwargs):
self.get_called = True
return self.get_resp
def test_kc_update_attributes_succeeds(monkeypatch):
sync = load_sync_module(monkeypatch)
ok_resp = _FakeResponse({"attributes": {"mailu_app_password": ["abc"]}})
sync.SESSION = _FakeSession(_FakeResponse({}), ok_resp)
sync.kc_update_attributes("token", {"id": "u1", "username": "u1"}, {"mailu_app_password": "abc"})
assert sync.SESSION.put_called and sync.SESSION.get_called
def test_kc_update_attributes_raises_without_attribute(monkeypatch):
sync = load_sync_module(monkeypatch)
missing_attr_resp = _FakeResponse({"attributes": {}}, status=200)
sync.SESSION = _FakeSession(_FakeResponse({}), missing_attr_resp)
with pytest.raises(Exception):
sync.kc_update_attributes("token", {"id": "u1", "username": "u1"}, {"mailu_app_password": "abc"})
def test_kc_get_users_paginates(monkeypatch):
sync = load_sync_module(monkeypatch)
class _PagedSession:
def __init__(self):
self.calls = 0
def post(self, *_, **__):
return _FakeResponse({"access_token": "tok"})
def get(self, *_, **__):
self.calls += 1
if self.calls == 1:
return _FakeResponse([{"id": "u1"}, {"id": "u2"}])
return _FakeResponse([]) # stop pagination
sync.SESSION = _PagedSession()
users = sync.kc_get_users("tok")
assert [u["id"] for u in users] == ["u1", "u2"]
assert sync.SESSION.calls == 2
def test_ensure_mailu_user_skips_foreign_domain(monkeypatch):
sync = load_sync_module(monkeypatch)
executed = []
class _Cursor:
def execute(self, sql, params):
executed.append((sql, params))
sync.ensure_mailu_user(_Cursor(), "user@other.com", "pw", "User")
assert not executed
def test_ensure_mailu_user_upserts(monkeypatch):
sync = load_sync_module(monkeypatch)
captured = {}
class _Cursor:
def execute(self, sql, params):
captured.update(params)
sync.ensure_mailu_user(_Cursor(), "user@example.com", "pw", "User Example")
assert captured["email"] == "user@example.com"
assert captured["localpart"] == "user"
# password should be hashed, not the raw string
assert captured["password"] != "pw"
def test_main_generates_password_and_upserts(monkeypatch):
sync = load_sync_module(monkeypatch)
users = [
{"id": "u1", "username": "user1", "email": "user1@example.com", "attributes": {}},
{"id": "u2", "username": "user2", "email": "user2@example.com", "attributes": {"mailu_app_password": ["keepme"]}},
{"id": "u3", "username": "user3", "email": "user3@other.com", "attributes": {}},
]
updated = []
class _Cursor:
def __init__(self):
self.executions = []
def execute(self, sql, params):
self.executions.append(params)
def close(self):
return None
class _Conn:
def __init__(self):
self.autocommit = False
self._cursor = _Cursor()
def cursor(self, cursor_factory=None):
return self._cursor
def close(self):
return None
monkeypatch.setattr(sync, "get_kc_token", lambda: "tok")
monkeypatch.setattr(sync, "kc_get_users", lambda token: users)
monkeypatch.setattr(sync, "kc_update_attributes", lambda token, user, attrs: updated.append((user["id"], attrs["mailu_app_password"])))
conns = []
def _connect(**kwargs):
conn = _Conn()
conns.append(conn)
return conn
monkeypatch.setattr(sync.psycopg2, "connect", _connect)
sync.main()
    # Two inserts expected (third user skipped by domain mismatch); u1 and u3 both
    # lacked the stored attribute and were backfilled, u2 kept its existing password.
    assert len(updated) == 2
assert conns and len(conns[0]._cursor.executions) == 2


@ -0,0 +1,48 @@
# services/bstein-dev-home/backend-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: bstein-dev-home-backend
namespace: bstein-dev-home
spec:
replicas: 2
revisionHistoryLimit: 3
selector:
matchLabels:
app: bstein-dev-home-backend
template:
metadata:
labels:
app: bstein-dev-home-backend
spec:
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
imagePullSecrets:
- name: harbor-bstein-robot
containers:
- name: backend
image: registry.bstein.dev/bstein/bstein-dev-home-backend:latest
imagePullPolicy: Always
ports:
- name: http
containerPort: 8080
readinessProbe:
httpGet:
path: /api/healthz
port: http
initialDelaySeconds: 2
periodSeconds: 5
livenessProbe:
httpGet:
path: /api/healthz
port: http
initialDelaySeconds: 10
periodSeconds: 10
resources:
requests:
cpu: 50m
memory: 64Mi
limits:
cpu: 300m
memory: 256Mi


@ -0,0 +1,12 @@
apiVersion: v1
kind: Service
metadata:
name: bstein-dev-home-backend
namespace: bstein-dev-home
spec:
selector:
app: bstein-dev-home-backend
ports:
- name: http
port: 80
targetPort: 8080


@ -0,0 +1,48 @@
# services/bstein-dev-home/frontend-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: bstein-dev-home-frontend
namespace: bstein-dev-home
spec:
replicas: 2
revisionHistoryLimit: 3
selector:
matchLabels:
app: bstein-dev-home-frontend
template:
metadata:
labels:
app: bstein-dev-home-frontend
spec:
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
imagePullSecrets:
- name: harbor-bstein-robot
containers:
- name: frontend
image: registry.bstein.dev/bstein/bstein-dev-home-frontend:latest
imagePullPolicy: Always
ports:
- name: http
containerPort: 80
readinessProbe:
httpGet:
path: /
port: http
initialDelaySeconds: 2
periodSeconds: 5
livenessProbe:
httpGet:
path: /
port: http
initialDelaySeconds: 10
periodSeconds: 10
resources:
requests:
cpu: 50m
memory: 64Mi
limits:
cpu: 300m
memory: 256Mi


@ -0,0 +1,12 @@
apiVersion: v1
kind: Service
metadata:
name: bstein-dev-home-frontend
namespace: bstein-dev-home
spec:
selector:
app: bstein-dev-home-frontend
ports:
- name: http
port: 80
targetPort: 80


@ -0,0 +1,48 @@
# services/bstein-dev-home/image.yaml
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: bstein-dev-home-frontend
namespace: bstein-dev-home
spec:
image: registry.bstein.dev/bstein/bstein-dev-home-frontend
interval: 1m0s
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: bstein-dev-home-frontend
namespace: bstein-dev-home
spec:
imageRepositoryRef:
name: bstein-dev-home-frontend
filterTags:
    pattern: '^v?(?P<version>[0-9]+\.[0-9]+\.[0-9]+(?:[-.][0-9A-Za-z]+)?)$'
extract: '$version'
policy:
semver:
range: ">=0.1.0"
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: bstein-dev-home-backend
namespace: bstein-dev-home
spec:
image: registry.bstein.dev/bstein/bstein-dev-home-backend
interval: 1m0s
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: bstein-dev-home-backend
namespace: bstein-dev-home
spec:
imageRepositoryRef:
name: bstein-dev-home-backend
filterTags:
    pattern: '^v?(?P<version>[0-9]+\.[0-9]+\.[0-9]+(?:[-.][0-9A-Za-z]+)?)$'
extract: '$version'
policy:
semver:
range: ">=0.1.0"


@ -0,0 +1,31 @@
# services/bstein-dev-home/ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: bstein-dev-home
namespace: bstein-dev-home
annotations:
kubernetes.io/ingress.class: traefik
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.tls: "true"
cert-manager.io/cluster-issuer: letsencrypt
spec:
tls:
- hosts: [ "bstein.dev" ]
secretName: bstein-dev-home-tls
rules:
- host: bstein.dev
http:
paths:
- path: /api
pathType: Prefix
backend:
service:
name: bstein-dev-home-backend
port: { number: 80 }
- path: /
pathType: Prefix
backend:
service:
name: bstein-dev-home-frontend
port: { number: 80 }


@ -0,0 +1,17 @@
# services/bstein-dev-home/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: bstein-dev-home
resources:
- namespace.yaml
- image.yaml
- frontend-deployment.yaml
- frontend-service.yaml
- backend-deployment.yaml
- backend-service.yaml
- ingress.yaml
images:
- name: registry.bstein.dev/bstein/bstein-dev-home-frontend
newTag: latest # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"}
- name: registry.bstein.dev/bstein/bstein-dev-home-backend
newTag: latest # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"}


@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: bstein-dev-home


@ -0,0 +1,31 @@
# services/ci-demo/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: ci-demo
namespace: ci-demo
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: ci-demo
template:
metadata:
labels:
app.kubernetes.io/name: ci-demo
spec:
nodeSelector:
hardware: rpi4
containers:
- name: ci-demo
image: registry.bstein.dev/infra/ci-demo:latest
ports:
- name: http
containerPort: 8080
readinessProbe:
httpGet:
path: /
port: http
initialDelaySeconds: 2
periodSeconds: 5


@ -0,0 +1,24 @@
# services/ci-demo/image.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageRepository
metadata:
name: ci-demo
namespace: flux-system
spec:
image: registry.bstein.dev/infra/ci-demo
interval: 1m0s
---
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImagePolicy
metadata:
name: ci-demo
namespace: flux-system
spec:
imageRepositoryRef:
name: ci-demo
filterTags:
pattern: '^v(?P<version>0\.0\.0-\d+)$'
extract: '$version'
policy:
semver:
range: ">=0.0.0-0"


@ -0,0 +1,11 @@
# services/ci-demo/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- image.yaml
- deployment.yaml
- service.yaml
images:
- name: registry.bstein.dev/infra/ci-demo
    newTag: v0.0.0-3 # {"$imagepolicy": "flux-system:ci-demo:tag"}


@ -0,0 +1,6 @@
# services/ci-demo/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: ci-demo


@ -0,0 +1,14 @@
# services/ci-demo/service.yaml
apiVersion: v1
kind: Service
metadata:
name: ci-demo
namespace: ci-demo
spec:
selector:
app.kubernetes.io/name: ci-demo
ports:
- name: http
port: 80
targetPort: http


@ -35,7 +35,7 @@ spec:
values: ["rpi4"] values: ["rpi4"]
containers: containers:
- name: monerod - name: monerod
image: registry.bstein.dev/infra/monerod:0.18.4.1 image: registry.bstein.dev/crypto/monerod:0.18.4.1
command: ["/opt/monero/monerod"] command: ["/opt/monero/monerod"]
args: args:
- --data-dir=/data - --data-dir=/data


@ -32,7 +32,7 @@ spec:
values: ["rpi4"] values: ["rpi4"]
containers: containers:
- name: monero-p2pool - name: monero-p2pool
image: registry.bstein.dev/infra/monero-p2pool:4.9 image: registry.bstein.dev/crypto/monero-p2pool:4.9
imagePullPolicy: Always imagePullPolicy: Always
command: ["p2pool"] command: ["p2pool"]
args: args:


@ -21,6 +21,72 @@ spec:
      labels:
        app: gitea
    spec:
initContainers:
- name: configure-oidc
image: gitea/gitea:1.23
securityContext:
runAsUser: 1000
runAsGroup: 1000
env:
- name: CLIENT_ID
valueFrom:
secretKeyRef:
name: gitea-oidc
key: client_id
- name: CLIENT_SECRET
valueFrom:
secretKeyRef:
name: gitea-oidc
key: client_secret
- name: DISCOVERY_URL
valueFrom:
secretKeyRef:
name: gitea-oidc
key: openid_auto_discovery_url
command:
- /bin/bash
- -c
- |
set -euo pipefail
APPINI=/data/gitea/conf/app.ini
BIN=/usr/local/bin/gitea
list="$($BIN -c "$APPINI" admin auth list)"
id=$(echo "$list" | awk '$2=="keycloak"{print $1}')
if [ -n "$id" ]; then
echo "Updating existing auth source id=$id"
$BIN -c "$APPINI" admin auth update-oauth \
--id "$id" \
--name keycloak \
--provider openidConnect \
--key "$CLIENT_ID" \
--secret "$CLIENT_SECRET" \
--auto-discover-url "$DISCOVERY_URL" \
--scopes "openid profile email groups" \
--required-claim-name "" \
--required-claim-value "" \
--group-claim-name groups \
--admin-group admin \
--skip-local-2fa
else
echo "Creating keycloak auth source"
$BIN -c "$APPINI" admin auth add-oauth \
--name keycloak \
--provider openidConnect \
--key "$CLIENT_ID" \
--secret "$CLIENT_SECRET" \
--auto-discover-url "$DISCOVERY_URL" \
--scopes "openid profile email groups" \
--required-claim-name "" \
--required-claim-value "" \
--group-claim-name groups \
--admin-group admin \
--skip-local-2fa
fi
volumeMounts:
- name: gitea-data
mountPath: /data
      nodeSelector:
        node-role.kubernetes.io/worker: "true"
      affinity:
@ -55,6 +121,36 @@ spec:
value: "master" value: "master"
- name: ROOT_URL - name: ROOT_URL
value: "https://scm.bstein.dev" value: "https://scm.bstein.dev"
- name: GITEA__service__ENABLE_OPENID_SIGNIN
value: "true"
- name: GITEA__oauth2_client__ENABLE_AUTO_REGISTRATION
value: "true"
- name: GITEA__service__ALLOW_ONLY_EXTERNAL_REGISTRATION
value: "true"
- name: GITEA__service__DISABLE_REGISTRATION
value: "false"
- name: GITEA__log__LEVEL
value: "trace"
- name: GITEA__service__REQUIRE_SIGNIN_VIEW
value: "false"
- name: GITEA__server__PROXY_HEADERS
value: "X-Forwarded-For, X-Forwarded-Proto, X-Forwarded-Host"
- name: GITEA__session__COOKIE_SECURE
value: "true"
- name: GITEA__session__DOMAIN
value: "scm.bstein.dev"
- name: GITEA__session__SAME_SITE
value: "lax"
- name: GITEA__security__SECRET_KEY
valueFrom:
secretKeyRef:
name: gitea-secret
key: SECRET_KEY
- name: GITEA__security__INTERNAL_TOKEN
valueFrom:
secretKeyRef:
name: gitea-secret
key: INTERNAL_TOKEN
          - name: DB_TYPE
            value: "postgres"
          - name: DB_HOST


@ -5,7 +5,7 @@ metadata:
  name: gitea-ingress
  namespace: gitea
  annotations:
-    cert-manager.io/cluster-issuer: "letsencrypt-prod"
+    cert-manager.io/cluster-issuer: letsencrypt
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
spec:
  tls:


@ -0,0 +1,13 @@
# services/gitops-ui/certificate.yaml
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: gitops-ui-tls
namespace: flux-system
spec:
secretName: gitops-ui-tls
issuerRef:
kind: ClusterIssuer
name: letsencrypt
dnsNames:
- cd.bstein.dev


@ -0,0 +1,48 @@
# services/gitops-ui/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: weave-gitops
namespace: flux-system
spec:
interval: 30m
chart:
spec:
chart: ./charts/gitops-server
sourceRef:
kind: GitRepository
name: weave-gitops-upstream
namespace: flux-system
# track upstream tag; see source object for version pin
install:
remediation:
retries: 3
upgrade:
remediation:
retries: 3
remediateLastFailure: true
cleanupOnFail: true
values:
additionalArgs:
- --auth-methods=oidc
adminUser:
create: false
ingress:
enabled: true
className: traefik
annotations:
cert-manager.io/cluster-issuer: letsencrypt
traefik.ingress.kubernetes.io/router.entrypoints: websecure
hosts:
- host: cd.bstein.dev
paths:
- path: /
pathType: Prefix
tls:
- secretName: gitops-ui-tls
hosts:
- cd.bstein.dev
oidcSecret:
create: false
metrics:
enabled: true


@ -0,0 +1,10 @@
# services/gitops-ui/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: flux-system
resources:
- source.yaml
- helmrelease.yaml
- certificate.yaml
- networkpolicy-acme.yaml
- rbac.yaml


@ -0,0 +1,14 @@
# services/gitops-ui/networkpolicy-acme.yaml
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: allow-acme-solver
namespace: flux-system
spec:
podSelector:
matchLabels:
acme.cert-manager.io/http01-solver: "true"
policyTypes:
- Ingress
ingress:
- {}


@ -0,0 +1,15 @@
# services/gitops-ui/rbac.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: gitops-admins
labels:
app.kubernetes.io/name: weave-gitops
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: cluster-admin
subjects:
- kind: Group
name: admin
apiGroup: rbac.authorization.k8s.io


@ -0,0 +1,11 @@
# services/gitops-ui/source.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
name: weave-gitops-upstream
namespace: flux-system
spec:
interval: 1h
url: https://github.com/weaveworks/weave-gitops.git
ref:
tag: v0.38.0


@ -0,0 +1,12 @@
# services/harbor/certificate.yaml
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: registry-bstein-dev
namespace: harbor
spec:
secretName: registry-bstein-dev-tls
dnsNames: [ "registry.bstein.dev" ]
issuerRef:
name: letsencrypt
kind: ClusterIssuer


@ -0,0 +1,259 @@
# services/harbor/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: harbor
namespace: harbor
spec:
interval: 10m
install:
timeout: 20m
remediation:
retries: 3
upgrade:
timeout: 20m
remediation:
retries: 3
remediateLastFailure: true
cleanupOnFail: true
rollback:
timeout: 20m
chart:
spec:
chart: harbor
version: 1.18.1
sourceRef:
kind: HelmRepository
name: harbor
namespace: flux-system
values:
externalURL: https://registry.bstein.dev
imagePullPolicy: IfNotPresent
expose:
type: ingress
tls:
enabled: true
certSource: secret
secret:
secretName: registry-bstein-dev-tls
ingress:
className: traefik
annotations:
cert-manager.io/cluster-issuer: letsencrypt
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.tls: "true"
hosts:
core: registry.bstein.dev
persistence:
enabled: true
resourcePolicy: keep
persistentVolumeClaim:
registry:
existingClaim: harbor-registry
accessMode: ReadWriteOnce
size: 50Gi
jobservice:
jobLog:
existingClaim: harbor-jobservice-logs
accessMode: ReadWriteOnce
size: 5Gi
imageChartStorage:
type: filesystem
filesystem:
rootdirectory: /storage
database:
type: external
external:
host: postgres-service.postgres.svc.cluster.local
port: "5432"
username: harbor
coreDatabase: harbor
existingSecret: harbor-db
sslmode: disable
redis:
type: internal
internal:
image:
repository: registry.bstein.dev/infra/harbor-redis
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-redis:tag"}
nodeSelector:
kubernetes.io/hostname: titan-05
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
trivy:
enabled: false
metrics:
enabled: false
cache:
enabled: false
existingSecretAdminPassword: harbor-core
existingSecretAdminPasswordKey: harbor_admin_password
existingSecretSecretKey: harbor-core
core:
image:
repository: registry.bstein.dev/infra/harbor-core
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-core:tag"}
nodeSelector:
kubernetes.io/hostname: titan-05
existingSecret: harbor-core
existingXsrfSecret: harbor-core
existingXsrfSecretKey: CSRF_KEY
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
jobservice:
image:
repository: registry.bstein.dev/infra/harbor-jobservice
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-jobservice:tag"}
nodeSelector:
kubernetes.io/hostname: titan-05
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
portal:
image:
repository: registry.bstein.dev/infra/harbor-portal
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-portal:tag"}
nodeSelector:
kubernetes.io/hostname: titan-05
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
registry:
registry:
image:
repository: registry.bstein.dev/infra/harbor-registry
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-registry:tag"}
controller:
image:
repository: registry.bstein.dev/infra/harbor-registryctl
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-registryctl:tag"}
nodeSelector:
kubernetes.io/hostname: titan-05
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
nginx:
image:
repository: registry.bstein.dev/infra/harbor-nginx
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-nginx:tag"}
nodeSelector:
kubernetes.io/hostname: titan-05
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
prepare:
image:
repository: registry.bstein.dev/infra/harbor-prepare
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-prepare:tag"}
updateStrategy:
type: Recreate
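
All of the Harbor image policies that feed these tag setters share one filter: only rebuilt arm64 tags such as v2.14.1-arm64.14 are considered, and the semver range ">=2.14.0-0 <2.15.0-0" pins updates to the 2.14.x line (the -0 lower bound is what admits the -arm64.N pre-release-style suffix). A sketch of the tag-filter side only, using Python's re module:

    import re

    # Same shape as the harbor ImagePolicy filterTags pattern below.
    PATTERN = re.compile(r"^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$")

    for tag in ["v2.14.1-arm64.14", "v2.14.1-arm64", "v2.14.1", "v2.15.0-arm64.1"]:
        m = PATTERN.match(tag)
        # v2.15.0-arm64.1 passes the regex; it is the semver range, not this
        # pattern, that rejects it.
        print(tag, "->", m.group("version") if m else "filtered out")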

services/harbor/image.yaml Normal file

@ -0,0 +1,192 @@
# services/harbor/image.yaml
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: harbor-core
namespace: harbor
spec:
image: registry.bstein.dev/infra/harbor-core
interval: 5m0s
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: harbor-core
namespace: harbor
spec:
imageRepositoryRef:
name: harbor-core
filterTags:
pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$'
extract: '$version'
policy:
semver:
range: ">=2.14.0-0 <2.15.0-0"
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: harbor-jobservice
namespace: harbor
spec:
image: registry.bstein.dev/infra/harbor-jobservice
interval: 5m0s
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: harbor-jobservice
namespace: harbor
spec:
imageRepositoryRef:
name: harbor-jobservice
filterTags:
pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$'
extract: '$version'
policy:
semver:
range: ">=2.14.0-0 <2.15.0-0"
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: harbor-portal
namespace: harbor
spec:
image: registry.bstein.dev/infra/harbor-portal
interval: 5m0s
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: harbor-portal
namespace: harbor
spec:
imageRepositoryRef:
name: harbor-portal
filterTags:
pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$'
extract: '$version'
policy:
semver:
range: ">=2.14.0-0 <2.15.0-0"
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: harbor-registry
namespace: harbor
spec:
image: registry.bstein.dev/infra/harbor-registry
interval: 5m0s
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: harbor-registry
namespace: harbor
spec:
imageRepositoryRef:
name: harbor-registry
filterTags:
pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$'
extract: '$version'
policy:
semver:
range: ">=2.14.0-0 <2.15.0-0"
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: harbor-registryctl
namespace: harbor
spec:
image: registry.bstein.dev/infra/harbor-registryctl
interval: 5m0s
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: harbor-registryctl
namespace: harbor
spec:
imageRepositoryRef:
name: harbor-registryctl
filterTags:
pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$'
extract: '$version'
policy:
semver:
range: ">=2.14.0-0 <2.15.0-0"
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: harbor-redis
namespace: harbor
spec:
image: registry.bstein.dev/infra/harbor-redis
interval: 5m0s
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: harbor-redis
namespace: harbor
spec:
imageRepositoryRef:
name: harbor-redis
filterTags:
pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$'
extract: '$version'
policy:
semver:
range: ">=2.14.0-0 <2.15.0-0"
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: harbor-nginx
namespace: harbor
spec:
image: registry.bstein.dev/infra/harbor-nginx
interval: 5m0s
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: harbor-nginx
namespace: harbor
spec:
imageRepositoryRef:
name: harbor-nginx
filterTags:
pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$'
extract: '$version'
policy:
semver:
range: ">=2.14.0-0 <2.15.0-0"
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: harbor-prepare
namespace: harbor
spec:
image: registry.bstein.dev/infra/harbor-prepare
interval: 5m0s
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: harbor-prepare
namespace: harbor
spec:
imageRepositoryRef:
name: harbor-prepare
filterTags:
pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$'
extract: '$version'
policy:
semver:
range: ">=2.14.0-0 <2.15.0-0"
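Each repository/policy pair above works the same way: `filterTags` keeps only tags shaped like `vX.Y.Z-arm64` or `vX.Y.Z-arm64.N` and extracts the part after the leading `v`, and the semver range pins selection to the 2.14 line (the `-0` bounds admit the `-arm64.N` pre-release-style suffixes). A quick sketch of which tags survive; the tag list and the crude major/minor check are illustrative, not Flux's actual evaluator:

```python
# Sketch: approximate the ImagePolicy tag selection above (illustrative only).
import re

PATTERN = re.compile(r'^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$')
tags = ["v2.14.1-arm64", "v2.14.1-arm64.14", "v2.15.0-arm64.1", "v2.14.1", "latest"]

candidates = []
for tag in tags:
    m = PATTERN.match(tag)
    if not m:
        continue  # "v2.14.1" and "latest" fail filterTags and are never considered
    version = m.group("version")  # e.g. "2.14.1-arm64.14"
    major, minor = (int(p) for p in version.split(".")[:2])
    if (major, minor) == (2, 14):  # crude stand-in for ">=2.14.0-0 <2.15.0-0"
        candidates.append(version)

# semver ranks arm64.14 above plain arm64, so the .14 build wins here
print(sorted(candidates)[-1])  # 2.14.1-arm64.14 (lexical order happens to agree)
```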

@ -0,0 +1,10 @@
# services/harbor/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: harbor
resources:
- namespace.yaml
- pvc.yaml
- certificate.yaml
- helmrelease.yaml
- image.yaml

@ -0,0 +1,5 @@
# services/harbor/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: harbor

@ -0,0 +1,24 @@
# services/harbor/pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: harbor-registry
namespace: harbor
spec:
accessModes: [ "ReadWriteOnce" ]
resources:
requests:
storage: 50Gi
storageClassName: astreae
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: harbor-jobservice-logs
namespace: harbor
spec:
accessModes: [ "ReadWriteOnce" ]
resources:
requests:
storage: 5Gi
storageClassName: astreae

@ -0,0 +1,314 @@
# services/jenkins/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: jenkins
namespace: jenkins
spec:
interval: 30m
chart:
spec:
chart: jenkins
version: 5.8.114
sourceRef:
kind: HelmRepository
name: jenkins
namespace: flux-system
install:
timeout: 15m
remediation:
retries: 3
upgrade:
timeout: 15m
remediation:
retries: 3
remediateLastFailure: true
cleanupOnFail: true
rollback:
timeout: 15m
values:
controller:
nodeSelector:
hardware: rpi4
jenkinsUrl: https://ci.bstein.dev
ingress:
enabled: true
hostName: ci.bstein.dev
ingressClassName: traefik
annotations:
cert-manager.io/cluster-issuer: letsencrypt
traefik.ingress.kubernetes.io/router.entrypoints: websecure
tls:
- secretName: jenkins-tls
hosts:
- ci.bstein.dev
installPlugins:
- kubernetes
- workflow-aggregator
- git
- configuration-as-code
- oic-auth
- job-dsl
- configuration-as-code-support
containerEnv:
- name: ENABLE_OIDC
value: "true"
- name: OIDC_ISSUER
value: "https://sso.bstein.dev/realms/atlas"
- name: OIDC_CLIENT_ID
valueFrom:
secretKeyRef:
name: jenkins-oidc
key: clientId
- name: OIDC_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: jenkins-oidc
key: clientSecret
- name: OIDC_AUTH_URL
valueFrom:
secretKeyRef:
name: jenkins-oidc
key: authorizationUrl
- name: OIDC_TOKEN_URL
valueFrom:
secretKeyRef:
name: jenkins-oidc
key: tokenUrl
- name: OIDC_USERINFO_URL
valueFrom:
secretKeyRef:
name: jenkins-oidc
key: userInfoUrl
- name: OIDC_LOGOUT_URL
valueFrom:
secretKeyRef:
name: jenkins-oidc
key: logoutUrl
- name: GITEA_PAT_USERNAME
valueFrom:
secretKeyRef:
name: gitea-pat
key: username
- name: GITEA_PAT_TOKEN
valueFrom:
secretKeyRef:
name: gitea-pat
key: token
customInitContainers:
- name: clean-jcasc-stale
image: alpine:3.20
imagePullPolicy: IfNotPresent
command:
- sh
- -c
- |
set -euo pipefail
rm -f /var/jenkins_home/casc_configs/* || true
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 1000
volumeMounts:
- name: jenkins-home
mountPath: /var/jenkins_home
initScripts:
oidc.groovy: |
import hudson.util.Secret
import jenkins.model.IdStrategy
import jenkins.model.Jenkins
import org.jenkinsci.plugins.oic.OicSecurityRealm
import org.jenkinsci.plugins.oic.OicServerWellKnownConfiguration
import hudson.security.FullControlOnceLoggedInAuthorizationStrategy
def env = System.getenv()
if (!(env['ENABLE_OIDC'] ?: 'false').toBoolean()) {
println("OIDC disabled (ENABLE_OIDC=false); keeping default security realm")
return
}
def required = ['OIDC_CLIENT_ID','OIDC_CLIENT_SECRET','OIDC_ISSUER']
if (!required.every { env[it] }) {
throw new IllegalStateException("OIDC enabled but missing vars: ${required.findAll { !env[it] }}")
}
try {
def wellKnown = "${env['OIDC_ISSUER']}/.well-known/openid-configuration"
def serverCfg = new OicServerWellKnownConfiguration(wellKnown)
serverCfg.setScopesOverride('openid profile email')
def realm = new OicSecurityRealm(
env['OIDC_CLIENT_ID'],
Secret.fromString(env['OIDC_CLIENT_SECRET']),
serverCfg,
false,
IdStrategy.CASE_INSENSITIVE,
IdStrategy.CASE_INSENSITIVE
)
realm.createProxyAwareResourceRetriver()
realm.setLogoutFromOpenidProvider(true)
realm.setPostLogoutRedirectUrl('https://ci.bstein.dev')
realm.setUserNameField('preferred_username')
realm.setFullNameFieldName('name')
realm.setEmailFieldName('email')
realm.setGroupsFieldName('groups')
realm.setRootURLFromRequest(true)
realm.setSendScopesInTokenRequest(true)
def j = Jenkins.get()
j.setSecurityRealm(realm)
def auth = new FullControlOnceLoggedInAuthorizationStrategy()
auth.setAllowAnonymousRead(false)
j.setAuthorizationStrategy(auth)
j.save()
println("Configured OIDC realm from init script (well-known)")
} catch (Exception e) {
println("Failed to configure OIDC realm: ${e}")
throw e
}
JCasC:
defaultConfig: false
securityRealm: ""
authorizationStrategy: ""
configScripts:
base.yaml: |
jenkins:
disableRememberMe: false
mode: NORMAL
numExecutors: 0
labelString: ""
projectNamingStrategy: "standard"
markupFormatter: plainText
clouds:
- kubernetes:
containerCapStr: "10"
defaultsProviderTemplate: ""
connectTimeout: "5"
readTimeout: "15"
jenkinsUrl: "http://jenkins.jenkins.svc.cluster.local:8080"
jenkinsTunnel: "jenkins-agent.jenkins.svc.cluster.local:50000"
skipTlsVerify: false
usageRestricted: false
maxRequestsPerHostStr: "32"
retentionTimeout: "5"
waitForPodSec: "600"
name: "kubernetes"
namespace: "jenkins"
restrictedPssSecurityContext: false
serverUrl: "https://kubernetes.default"
credentialsId: ""
podLabels:
- key: "jenkins/jenkins-jenkins-agent"
value: "true"
templates:
- name: "default"
namespace: "jenkins"
id: a23c9bbcd21e360a77d51b426f05bd7b8032d8fdedd6ffb97c436883ce6c5ffa
containers:
- name: "jnlp"
alwaysPullImage: false
args: "^${computer.jnlpmac} ^${computer.name}"
envVars:
- envVar:
key: "JENKINS_URL"
value: "http://jenkins.jenkins.svc.cluster.local:8080/"
image: "jenkins/inbound-agent:3355.v388858a_47b_33-3"
privileged: "false"
resourceLimitCpu: 512m
resourceLimitMemory: 512Mi
resourceRequestCpu: 512m
resourceRequestMemory: 512Mi
ttyEnabled: false
workingDir: /home/jenkins/agent
idleMinutes: 0
instanceCap: 2147483647
label: "jenkins-jenkins-agent "
nodeUsageMode: "NORMAL"
podRetention: Never
showRawYaml: true
serviceAccount: "default"
slaveConnectTimeoutStr: "100"
yamlMergeStrategy: override
inheritYamlMergeStrategy: false
slaveAgentPort: 50000
crumbIssuer:
standard:
excludeClientIPFromCrumb: true
security:
apiToken:
creationOfLegacyTokenEnabled: false
tokenGenerationOnCreationEnabled: false
usageStatisticsEnabled: true
unclassified:
creds.yaml: |
credentials:
system:
domainCredentials:
- credentials:
- usernamePassword:
scope: GLOBAL
id: gitea-pat
username: "${GITEA_PAT_USERNAME}"
password: "${GITEA_PAT_TOKEN}"
description: "Gitea PAT for pipelines"
jobs.yaml: |
jobs:
- script: |
pipelineJob('harbor-arm-build') {
triggers {
scm('H/5 * * * *')
}
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/harbor-arm-build.git')
credentials('gitea-pat')
}
branches('*/master')
}
}
}
}
}
pipelineJob('ci-demo') {
triggers {
scm('H/1 * * * *')
}
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/ci-demo.git')
credentials('gitea-pat')
}
branches('*/master')
}
}
scriptPath('Jenkinsfile')
}
}
}
pipelineJob('bstein-dev-home') {
triggers {
scm('H/2 * * * *')
}
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/bstein-dev-home.git')
credentials('gitea-pat')
}
branches('*/master')
}
}
scriptPath('Jenkinsfile')
}
}
}
persistence:
enabled: true
storageClass: astreae
size: 50Gi
serviceAccount:
create: true

@ -0,0 +1,7 @@
# services/jenkins/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: jenkins
resources:
- namespace.yaml
- helmrelease.yaml

@ -0,0 +1,5 @@
# services/jenkins/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: jenkins

@ -5,7 +5,7 @@ metadata:
name: jitsi
namespace: jitsi
annotations:
-cert-manager.io/cluster-issuer: "letsencrypt-prod"
+cert-manager.io/cluster-issuer: letsencrypt
spec:
ingressClassName: traefik
tls:

@ -0,0 +1,152 @@
# services/keycloak/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: keycloak
namespace: sso
labels:
app: keycloak
spec:
replicas: 1
selector:
matchLabels:
app: keycloak
template:
metadata:
labels:
app: keycloak
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values: ["rpi5","rpi4"]
- key: node-role.kubernetes.io/worker
operator: Exists
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values: ["titan-24"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 70
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
securityContext:
runAsUser: 1000
runAsGroup: 0
fsGroup: 1000
fsGroupChangePolicy: OnRootMismatch
initContainers:
- name: mailu-http-listener
image: registry.bstein.dev/sso/mailu-http-listener:0.1.0
imagePullPolicy: IfNotPresent
command: ["/bin/sh", "-c"]
args:
- |
cp /plugin/mailu-http-listener-0.1.0.jar /providers/
cp -r /plugin/src /providers/src
volumeMounts:
- name: providers
mountPath: /providers
containers:
- name: keycloak
image: quay.io/keycloak/keycloak:26.0.7
imagePullPolicy: IfNotPresent
args:
- start
env:
- name: KC_DB
value: postgres
- name: KC_DB_URL_HOST
value: postgres-service.postgres.svc.cluster.local
- name: KC_DB_URL_DATABASE
valueFrom:
secretKeyRef:
name: keycloak-db
key: database
- name: KC_DB_USERNAME
valueFrom:
secretKeyRef:
name: keycloak-db
key: username
- name: KC_DB_PASSWORD
valueFrom:
secretKeyRef:
name: keycloak-db
key: password
- name: KC_DB_SCHEMA
value: public
- name: KC_HOSTNAME
value: sso.bstein.dev
- name: KC_HOSTNAME_URL
value: https://sso.bstein.dev
- name: KC_PROXY
value: edge
- name: KC_PROXY_HEADERS
value: xforwarded
- name: KC_HTTP_ENABLED
value: "true"
- name: KC_HTTP_MANAGEMENT_PORT
value: "9000"
- name: KC_HTTP_MANAGEMENT_BIND_ADDRESS
value: 0.0.0.0
- name: KC_HEALTH_ENABLED
value: "true"
- name: KC_METRICS_ENABLED
value: "true"
- name: KEYCLOAK_ADMIN
valueFrom:
secretKeyRef:
name: keycloak-admin
key: username
- name: KEYCLOAK_ADMIN_PASSWORD
valueFrom:
secretKeyRef:
name: keycloak-admin
key: password
- name: KC_EVENTS_LISTENERS
value: jboss-logging,mailu-http
- name: KC_SPI_EVENTS_LISTENER_MAILU-HTTP_ENDPOINT
value: http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events
ports:
- containerPort: 8080
name: http
- containerPort: 9000
name: metrics
readinessProbe:
httpGet:
path: /health/ready
port: 9000
initialDelaySeconds: 15
periodSeconds: 10
failureThreshold: 6
livenessProbe:
httpGet:
path: /health/live
port: 9000
initialDelaySeconds: 60
periodSeconds: 15
failureThreshold: 6
volumeMounts:
- name: data
mountPath: /opt/keycloak/data
- name: providers
mountPath: /opt/keycloak/providers
volumes:
- name: data
persistentVolumeClaim:
claimName: keycloak-data
- name: providers
emptyDir: {}
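Both probes target Keycloak's management interface on port 9000 (enabled by `KC_HTTP_MANAGEMENT_PORT` and `KC_HEALTH_ENABLED` above) rather than the public HTTP port. A quick manual spot check of the same endpoints; the `kubectl port-forward` workflow is an assumption, not something the manifest prescribes:

```python
# Sketch: hit the same health endpoints the probes use, e.g. after
#   kubectl -n sso port-forward deploy/keycloak 9000:9000
import json
import urllib.request

for path in ("/health/ready", "/health/live"):
    with urllib.request.urlopen(f"http://127.0.0.1:9000{path}", timeout=5) as resp:
        body = json.load(resp)  # Keycloak returns {"status": "UP", ...} when healthy
        print(path, resp.status, body.get("status"))
```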

@ -0,0 +1,24 @@
# services/keycloak/ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: keycloak
namespace: sso
annotations:
cert-manager.io/cluster-issuer: letsencrypt
spec:
ingressClassName: traefik
rules:
- host: sso.bstein.dev
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: keycloak
port:
number: 80
tls:
- hosts: [sso.bstein.dev]
secretName: keycloak-tls

@ -1,11 +1,10 @@
-# services/zot/kustomization.yaml
+# services/keycloak/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
+namespace: sso
resources:
- namespace.yaml
- pvc.yaml
- deployment.yaml
-- configmap.yaml
- service.yaml
- ingress.yaml
-- middleware.yaml

@ -0,0 +1,5 @@
# services/keycloak/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: sso

@ -0,0 +1,12 @@
# services/keycloak/pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: keycloak-data
namespace: sso
spec:
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 10Gi
storageClassName: astreae

@ -0,0 +1,15 @@
# services/keycloak/service.yaml
apiVersion: v1
kind: Service
metadata:
name: keycloak
namespace: sso
labels:
app: keycloak
spec:
selector:
app: keycloak
ports:
- name: http
port: 80
targetPort: http

@ -0,0 +1,13 @@
# services/mailu/certificate.yaml
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: mailu-tls
namespace: mailu-mailserver
spec:
secretName: mailu-certificates
issuerRef:
kind: ClusterIssuer
name: letsencrypt-prod
dnsNames:
- mail.bstein.dev

@ -0,0 +1,287 @@
# services/mailu/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: mailu
namespace: mailu-mailserver
spec:
interval: 30m
chart:
spec:
chart: mailu
version: 2.1.2
sourceRef:
kind: HelmRepository
name: mailu
namespace: flux-system
install:
remediation: { retries: 3 }
timeout: 10m
upgrade:
remediation:
retries: 3
remediateLastFailure: true
cleanupOnFail: true
timeout: 10m
values:
mailuVersion: "2024.06"
domain: bstein.dev
hostnames: [mail.bstein.dev]
domains:
- name: bstein.dev
enabled: true
dkim:
enabled: true
externalRelay:
host: "[email-smtp.us-east-2.amazonaws.com]:587"
existingSecret: mailu-ses-relay
usernameKey: relay-username
passwordKey: relay-password
timezone: Etc/UTC
subnet: 10.42.0.0/16
existingSecret: mailu-secret
tls:
outboundLevel: encrypt
externalDatabase:
enabled: true
type: postgresql
host: postgres-service.postgres.svc.cluster.local
port: 5432
database: mailu
username: mailu
existingSecret: mailu-db-secret
existingSecretUsernameKey: username
existingSecretPasswordKey: password
existingSecretDatabaseKey: database
initialAccount:
enabled: true
username: test
domain: bstein.dev
existingSecret: mailu-initial-account-secret
existingSecretPasswordKey: password
persistence:
accessModes: [ReadWriteMany]
size: 100Gi
storageClass: astreae
single_pvc: true
front:
hostnames: [mail.bstein.dev]
proxied: true
hostPort:
enabled: false
https:
enabled: false
external: false
forceHttps: false
externalService:
enabled: true
type: LoadBalancer
externalTrafficPolicy: Cluster
ports:
submission: true
nodePorts:
pop3: 30010
pop3s: 30011
imap: 30143
imaps: 30993
manageSieve: 30419
smtp: 30025
smtps: 30465
submission: 30587
logLevel: DEBUG
nodeSelector:
hardware: rpi4
admin:
logLevel: DEBUG
nodeSelector:
hardware: rpi4
podLivenessProbe:
enabled: true
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 6
successThreshold: 1
podReadinessProbe:
enabled: true
initialDelaySeconds: 20
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 6
successThreshold: 1
extraEnvVars:
- name: FLASK_DEBUG
value: "1"
- name: ACCESSLOG
value: /dev/stdout
- name: ERRORLOG
value: /dev/stderr
- name: WEBROOT_REDIRECT
value: ""
- name: FORWARDED_ALLOW_IPS
value: 127.0.0.1,10.42.0.0/16
- name: DNS_RESOLVERS
value: 1.1.1.1,9.9.9.9
extraVolumes:
- name: unbound-config
configMap:
name: mailu-unbound
- name: unbound-run
emptyDir: {}
extraVolumeMounts:
- name: unbound-run
mountPath: /var/lib/unbound
extraContainers:
- name: unbound
image: docker.io/alpine:3.20
command: ["/bin/sh", "-c"]
args:
- |
while :; do
printf "nameserver 10.43.0.10\n" > /etc/resolv.conf
if apk add --no-cache unbound bind-tools; then
break
fi
echo "apk failed, retrying" >&2
sleep 10
done
cat >/etc/resolv.conf <<'EOF'
search mailu-mailserver.svc.cluster.local svc.cluster.local cluster.local
nameserver 127.0.0.1
EOF
unbound-anchor -a /var/lib/unbound/root.key || true
exec unbound -d -c /opt/unbound/etc/unbound/unbound.conf
ports:
- containerPort: 53
protocol: UDP
- containerPort: 53
protocol: TCP
volumeMounts:
- name: unbound-config
mountPath: /opt/unbound/etc/unbound
- name: unbound-run
mountPath: /var/lib/unbound
dnsPolicy: None
dnsConfig:
nameservers:
- 127.0.0.1
searches:
- mailu-mailserver.svc.cluster.local
- svc.cluster.local
- cluster.local
clamav:
image:
repository: clamav/clamav-debian
tag: "1.4"
logLevel: DEBUG
nodeSelector:
hardware: rpi5
resources:
requests:
cpu: 200m
memory: 1Gi
limits:
cpu: 500m
memory: 3Gi
livenessProbe:
enabled: false
initialDelaySeconds: 300
periodSeconds: 30
timeoutSeconds: 5
failureThreshold: 6
successThreshold: 1
startupProbe:
enabled: false
initialDelaySeconds: 60
periodSeconds: 30
timeoutSeconds: 5
failureThreshold: 20
successThreshold: 1
readinessProbe:
enabled: false
initialDelaySeconds: 300
periodSeconds: 30
timeoutSeconds: 5
failureThreshold: 6
successThreshold: 1
dovecot:
logLevel: DEBUG
nodeSelector:
hardware: rpi4
oletools:
logLevel: DEBUG
nodeSelector:
hardware: rpi4
postfix:
logLevel: DEBUG
nodeSelector:
hardware: rpi4
overrides:
smtp_use_tls: "yes"
smtp_tls_security_level: "encrypt"
smtp_sasl_security_options: "noanonymous"
redis:
enabled: true
architecture: standalone
logLevel: DEBUG
image:
repository: bitnamilegacy/redis
tag: 8.0.3-debian-12-r3
master:
nodeSelector:
hardware: rpi4
persistence:
enabled: true
accessModes: [ReadWriteMany]
size: 8Gi
storageClass: astreae
rspamd:
logLevel: DEBUG
nodeSelector:
hardware: rpi4
persistence:
accessModes: [ReadWriteOnce]
size: 8Gi
storageClass: astreae
tika:
logLevel: DEBUG
nodeSelector:
hardware: rpi4
global:
logLevel: DEBUG
storageClass: astreae
webmail:
enabled: false
nodeSelector:
hardware: rpi4
ingress:
enabled: false
ingressClassName: traefik
tls: true
existingSecret: mailu-certificates
annotations:
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/service.serversscheme: https
traefik.ingress.kubernetes.io/service.serverstransport: mailu-transport@kubernetescrd
extraRules:
- host: mail.bstein.dev
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: mailu-front
port:
number: 443
service:
ports:
smtp:
port: 25
targetPort: 25
smtps:
port: 465
targetPort: 465
submission:
port: 587
targetPort: 587

@ -0,0 +1,19 @@
# services/mailu/ingressroute.yaml
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
name: mailu
namespace: mailu-mailserver
spec:
entryPoints:
- websecure
routes:
- match: Host(`mail.bstein.dev`)
kind: Rule
services:
- name: mailu-front
port: 443
scheme: https
serversTransport: mailu-transport
tls:
secretName: mailu-certificates

@ -0,0 +1,23 @@
# services/mailu/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: mailu-mailserver
resources:
- namespace.yaml
- helmrelease.yaml
- certificate.yaml
- vip-controller.yaml
- unbound-configmap.yaml
- serverstransport.yaml
- ingressroute.yaml
- mailu-sync-job.yaml
- mailu-sync-cronjob.yaml
- mailu-sync-listener.yaml
configMapGenerator:
- name: mailu-sync-script
namespace: mailu-mailserver
files:
- sync.py=../../scripts/mailu_sync.py
options:
disableNameSuffixHash: true

@ -0,0 +1,77 @@
# services/mailu/mailu-sync-cronjob.yaml
apiVersion: batch/v1
kind: CronJob
metadata:
name: mailu-sync-nightly
namespace: mailu-mailserver
spec:
schedule: "30 4 * * *"
concurrencyPolicy: Forbid
jobTemplate:
spec:
template:
spec:
restartPolicy: OnFailure
containers:
- name: mailu-sync
image: python:3.11-alpine
imagePullPolicy: IfNotPresent
command: ["/bin/sh", "-c"]
args:
- |
pip install --no-cache-dir requests psycopg2-binary passlib >/tmp/pip.log \
&& python /app/sync.py
env:
- name: KEYCLOAK_BASE_URL
value: http://keycloak.sso.svc.cluster.local
- name: KEYCLOAK_REALM
value: atlas
- name: MAILU_DOMAIN
value: bstein.dev
- name: MAILU_DEFAULT_QUOTA
value: "20000000000"
- name: MAILU_DB_HOST
value: postgres-service.postgres.svc.cluster.local
- name: MAILU_DB_PORT
value: "5432"
- name: MAILU_DB_NAME
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: database
- name: MAILU_DB_USER
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: username
- name: MAILU_DB_PASSWORD
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: password
- name: KEYCLOAK_CLIENT_ID
valueFrom:
secretKeyRef:
name: mailu-sync-credentials
key: client-id
- name: KEYCLOAK_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: mailu-sync-credentials
key: client-secret
volumeMounts:
- name: sync-script
mountPath: /app/sync.py
subPath: sync.py
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
volumes:
- name: sync-script
configMap:
name: mailu-sync-script
defaultMode: 0444

@ -0,0 +1,73 @@
# services/mailu/mailu-sync-job.yaml
apiVersion: batch/v1
kind: Job
metadata:
name: mailu-sync
namespace: mailu-mailserver
spec:
template:
spec:
restartPolicy: OnFailure
containers:
- name: mailu-sync
image: python:3.11-alpine
imagePullPolicy: IfNotPresent
command: ["/bin/sh", "-c"]
args:
- |
pip install --no-cache-dir requests psycopg2-binary passlib >/tmp/pip.log \
&& python /app/sync.py
env:
- name: KEYCLOAK_BASE_URL
value: http://keycloak.sso.svc.cluster.local
- name: KEYCLOAK_REALM
value: atlas
- name: MAILU_DOMAIN
value: bstein.dev
- name: MAILU_DEFAULT_QUOTA
value: "20000000000"
- name: MAILU_DB_HOST
value: postgres-service.postgres.svc.cluster.local
- name: MAILU_DB_PORT
value: "5432"
- name: MAILU_DB_NAME
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: database
- name: MAILU_DB_USER
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: username
- name: MAILU_DB_PASSWORD
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: password
- name: KEYCLOAK_CLIENT_ID
valueFrom:
secretKeyRef:
name: mailu-sync-credentials
key: client-id
- name: KEYCLOAK_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: mailu-sync-credentials
key: client-secret
volumeMounts:
- name: sync-script
mountPath: /app/sync.py
subPath: sync.py
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
volumes:
- name: sync-script
configMap:
name: mailu-sync-script
defaultMode: 0444

@ -0,0 +1,154 @@
# services/mailu/mailu-sync-listener.yaml
apiVersion: v1
kind: Service
metadata:
name: mailu-sync-listener
namespace: mailu-mailserver
spec:
selector:
app: mailu-sync-listener
ports:
- name: http
port: 8080
targetPort: 8080
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: mailu-sync-listener
namespace: mailu-mailserver
labels:
app: mailu-sync-listener
spec:
replicas: 1
selector:
matchLabels:
app: mailu-sync-listener
template:
metadata:
labels:
app: mailu-sync-listener
spec:
restartPolicy: Always
containers:
- name: listener
image: python:3.11-alpine
imagePullPolicy: IfNotPresent
command: ["/bin/sh", "-c"]
args:
- |
pip install --no-cache-dir requests psycopg2-binary passlib >/tmp/pip.log \
&& python /app/listener.py
env:
- name: KEYCLOAK_BASE_URL
value: http://keycloak.sso.svc.cluster.local
- name: KEYCLOAK_REALM
value: atlas
- name: MAILU_DOMAIN
value: bstein.dev
- name: MAILU_DEFAULT_QUOTA
value: "20000000000"
- name: MAILU_DB_HOST
value: postgres-service.postgres.svc.cluster.local
- name: MAILU_DB_PORT
value: "5432"
- name: MAILU_DB_NAME
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: database
- name: MAILU_DB_USER
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: username
- name: MAILU_DB_PASSWORD
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: password
- name: KEYCLOAK_CLIENT_ID
valueFrom:
secretKeyRef:
name: mailu-sync-credentials
key: client-id
- name: KEYCLOAK_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: mailu-sync-credentials
key: client-secret
volumeMounts:
- name: sync-script
mountPath: /app/sync.py
subPath: sync.py
- name: listener-script
mountPath: /app/listener.py
subPath: listener.py
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
volumes:
- name: sync-script
configMap:
name: mailu-sync-script
defaultMode: 0444
- name: listener-script
configMap:
name: mailu-sync-listener
defaultMode: 0444
---
apiVersion: v1
kind: ConfigMap
metadata:
name: mailu-sync-listener
namespace: mailu-mailserver
data:
listener.py: |
import http.server
import json
import subprocess
import threading
from time import time

# Simple debounce to avoid hammering on bursts
MIN_INTERVAL_SECONDS = 10
last_run = 0.0
lock = threading.Lock()

def trigger_sync():
    global last_run
    with lock:
        now = time()
        if now - last_run < MIN_INTERVAL_SECONDS:
            return
        last_run = now
    # Fire and forget; let output go to the pod's stdout/stderr
    # (capturing into an unread PIPE could block the child once the buffer fills)
    subprocess.Popen(["python", "/app/sync.py"])

class Handler(http.server.BaseHTTPRequestHandler):
    def do_POST(self):
        length = int(self.headers.get("Content-Length", 0))
        body = self.rfile.read(length) if length else b""
        try:
            json.loads(body or b"{}")
        except json.JSONDecodeError:
            self.send_response(400)
            self.end_headers()
            return
        trigger_sync()
        self.send_response(202)
        self.end_headers()

    def log_message(self, fmt, *args):
        # Quiet logging
        return

if __name__ == "__main__":
    server = http.server.ThreadingHTTPServer(("", 8080), Handler)
    server.serve_forever()
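The handler accepts any well-formed JSON body, so exercising it only requires a POST; the `/events` path matches the `KC_SPI_EVENTS_LISTENER_MAILU-HTTP_ENDPOINT` value in the Keycloak deployment, though the handler itself ignores the path. A sketch of a manual trigger from inside the cluster; the event fields are illustrative, since only JSON validity is checked:

```python
# Sketch: manually trigger a debounced sync the same way the Keycloak
# event-listener plugin is assumed to (fields are illustrative).
import json
import urllib.request

URL = "http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events"
event = {"type": "REGISTER", "realmId": "atlas", "userId": "example-user-id"}

req = urllib.request.Request(
    URL,
    data=json.dumps(event).encode(),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req, timeout=5) as resp:
    print(resp.status)  # 202 whether the sync ran or was debounced
```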

@ -0,0 +1,5 @@
# services/mailu/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: mailu-mailserver

@ -0,0 +1,10 @@
# services/mailu/serverstransport.yaml
apiVersion: traefik.io/v1alpha1
kind: ServersTransport
metadata:
name: mailu-transport
namespace: mailu-mailserver
spec:
# Force SNI to mail.bstein.dev and skip backend cert verification (backend cert is for the host, not the pod IP).
serverName: mail.bstein.dev
insecureSkipVerify: true
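What this buys Traefik can be reproduced by hand: dial the backend address directly, but present SNI for `mail.bstein.dev` and skip certificate verification. A sketch with Python's `ssl` module; the pod IP is hypothetical:

```python
# Sketch: hand-rolled equivalent of the ServersTransport above.
import socket
import ssl

ctx = ssl.create_default_context()
ctx.check_hostname = False        # insecureSkipVerify: true
ctx.verify_mode = ssl.CERT_NONE

# 10.42.0.123 stands in for a mailu-front pod IP (hypothetical)
with socket.create_connection(("10.42.0.123", 443), timeout=5) as sock:
    with ctx.wrap_socket(sock, server_hostname="mail.bstein.dev") as tls:  # serverName
        print(tls.version(), tls.cipher())
```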

@ -0,0 +1,49 @@
# services/mailu/unbound-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: mailu-unbound
namespace: mailu-mailserver
data:
unbound.conf: |
server:
verbosity: 1
interface: 0.0.0.0
do-ip4: yes
do-ip6: no
do-udp: yes
do-tcp: yes
auto-trust-anchor-file: "/var/lib/unbound/root.key"
prefetch: yes
qname-minimisation: yes
harden-dnssec-stripped: yes
val-clean-additional: yes
domain-insecure: "mailu-mailserver.svc.cluster.local."
domain-insecure: "svc.cluster.local."
domain-insecure: "cluster.local."
cache-min-ttl: 120
cache-max-ttl: 86400
access-control: 0.0.0.0/0 allow
forward-zone:
name: "mailu-mailserver.svc.cluster.local."
forward-addr: 10.43.0.10
forward-no-cache: yes
forward-first: yes
forward-zone:
name: "svc.cluster.local."
forward-addr: 10.43.0.10
forward-no-cache: yes
forward-first: yes
forward-zone:
name: "cluster.local."
forward-addr: 10.43.0.10
forward-no-cache: yes
forward-first: yes
forward-zone:
name: "."
forward-addr: 9.9.9.9
forward-addr: 1.1.1.1
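A quick way to confirm the split horizon from inside the front pod (the only place this resolver is wired into resolv.conf): cluster names should come back via the `10.43.0.10` forward zones, external names via the DNSSEC-validating root forwarders. A minimal sketch using only the stdlib resolver path:

```python
# Sketch: resolve one in-cluster and one external name through the sidecar
# resolver (run inside the mailu-front pod, where resolv.conf points at 127.0.0.1).
import socket

for name in ("keycloak.sso.svc.cluster.local", "mail.bstein.dev"):
    try:
        addrs = {ai[4][0] for ai in socket.getaddrinfo(name, None)}
        print(name, "->", sorted(addrs))
    except socket.gaierror as exc:
        print(name, "failed:", exc)
```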

@ -0,0 +1,71 @@
# services/mailu/vip-controller.yaml
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: vip-controller
namespace: mailu-mailserver
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: vip-controller-role
namespace: mailu-mailserver
rules:
- apiGroups: ["apps"]
resources: ["deployments"]
verbs: ["get", "list", "patch", "update"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: vip-controller-binding
namespace: mailu-mailserver
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: vip-controller-role
subjects:
- kind: ServiceAccount
name: vip-controller
namespace: mailu-mailserver
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: vip-controller
namespace: mailu-mailserver
spec:
selector:
matchLabels:
app: vip-controller
template:
metadata:
labels:
app: vip-controller
spec:
serviceAccountName: vip-controller
hostNetwork: true
nodeSelector:
mailu.bstein.dev/vip: "true"
containers:
- name: vip-controller
image: lachlanevenson/k8s-kubectl:latest
imagePullPolicy: IfNotPresent
command:
- /bin/sh
- -c
args:
- |
set -e
while true; do
NODE=$(hostname)
if ip addr show end0 | grep -q 'inet 192\.168\.22\.9/32'; then
echo "VIP found on node ${NODE}."
kubectl patch deployment mailu-front -n mailu-mailserver --type='merge' \
-p "{\"spec\":{\"template\":{\"spec\":{\"nodeSelector\":{\"kubernetes.io/hostname\":\"${NODE}\"}}}}}"
else
echo "No VIP on node ${NODE}."
fi
sleep 60
done
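For reference, the patch the loop issues is a plain merge patch against `mailu-front`'s pod template. The same call via the official Kubernetes Python client, which is an assumption here (the DaemonSet itself only ships kubectl):

```python
# Sketch: the vip-controller's patch, expressed with the kubernetes client.
from kubernetes import client, config

config.load_incluster_config()  # runs under the vip-controller ServiceAccount
apps = client.AppsV1Api()

node = "titan-09"  # hypothetical: whichever node currently holds 192.168.22.9
patch = {"spec": {"template": {"spec": {"nodeSelector": {"kubernetes.io/hostname": node}}}}}
apps.patch_namespaced_deployment("mailu-front", "mailu-mailserver", patch)
```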

@ -1,28 +0,0 @@
# services/monitoring
## Grafana admin secret
The Grafana Helm release expects a pre-existing secret named `grafana-admin`
in the `monitoring` namespace. Create or rotate it with:
```bash
kubectl create secret generic grafana-admin \
--namespace monitoring \
--from-literal=admin-user=admin \
--from-literal=admin-password='REPLACE_ME'
```
Update the password whenever you rotate credentials.
## DCGM exporter image
The NVIDIA GPU metrics DaemonSet expects `registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04`, mirrored from `docker.io/nvidia/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04`. Refresh it in Zot when bumping versions:
```bash
skopeo copy \
--all \
docker://docker.io/nvidia/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04 \
docker://registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04
```
When finished mirroring from the control-plane, you can remove temporary tooling with `sudo apt-get purge -y skopeo && sudo apt-get autoremove -y` and clear `~/.config/containers/auth.json`.

@ -20,7 +20,7 @@
},
"targets": [
{
-"expr": "100 * ( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+"expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -40,9 +40,7 @@
"placement": "right" "placement": "right"
}, },
"pieType": "pie", "pieType": "pie",
"displayLabels": [ "displayLabels": [],
"percent"
],
"tooltip": { "tooltip": {
"mode": "single" "mode": "single"
}, },
@ -153,12 +151,16 @@
],
"fieldConfig": {
"defaults": {
-"unit": "percent"
+"unit": "percent",
+"custom": {
+"filterable": true
+}
},
"overrides": []
},
"options": {
-"showHeader": true
+"showHeader": true,
+"columnFilters": false
},
"transformations": [
{

@ -7,46 +7,55 @@
{
"id": 1,
"type": "stat",
-"title": "Ingress Traffic",
+"title": "Ingress Success Rate (5m)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
-"w": 8,
+"w": 6,
"x": 0,
"y": 0
},
"targets": [
{
-"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
+"expr": "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
-"mode": "palette-classic"
+"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
-"color": "rgba(115, 115, 115, 1)",
+"color": "red",
"value": null
},
+{
+"color": "orange",
+"value": 0.995
+},
+{
+"color": "yellow",
+"value": 0.999
+},
{
"color": "green",
-"value": 1
+"value": 0.9995
}
]
},
-"unit": "Bps",
+"unit": "percentunit",
"custom": {
"displayMode": "auto"
-}
+},
+"decimals": 2
},
"overrides": []
},
@ -67,46 +76,55 @@
{
"id": 2,
"type": "stat",
-"title": "Egress Traffic",
+"title": "Error Budget Burn (1h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
-"w": 8,
+"w": 6,
-"x": 8,
+"x": 6,
"y": 0
},
"targets": [
{
-"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
+"expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
-"mode": "palette-classic"
+"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
-"color": "rgba(115, 115, 115, 1)",
+"color": "green",
"value": null
},
{
-"color": "green",
+"color": "yellow",
"value": 1
+},
+{
+"color": "orange",
+"value": 2
+},
+{
+"color": "red",
+"value": 4
}
]
},
-"unit": "Bps",
+"unit": "none",
"custom": {
"displayMode": "auto"
-}
+},
+"decimals": 2
},
"overrides": []
},
@ -127,7 +145,145 @@
{
"id": 3,
"type": "stat",
-"title": "Intra-Cluster Traffic",
+"title": "Error Budget Burn (6h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 12,
"y": 0
},
"targets": [
{
"expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 4
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
},
"decimals": 2
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "stat",
"title": "Edge P99 Latency (ms)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 18,
"y": 0
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 200
},
{
"color": "orange",
"value": 350
},
{
"color": "red",
"value": 500
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 5,
"type": "stat",
"title": "Ingress Traffic",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -135,19 +291,19 @@
"gridPos": { "gridPos": {
"h": 4, "h": 4,
"w": 8, "w": 8,
"x": 16, "x": 0,
"y": 0 "y": 4
}, },
"targets": [ "targets": [
{ {
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)", "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
"refId": "A" "refId": "A"
} }
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "color": {
"mode": "palette-classic" "mode": "thresholds"
}, },
"mappings": [], "mappings": [],
"thresholds": { "thresholds": {
@ -185,9 +341,9 @@
}
},
{
-"id": 4,
+"id": 6,
"type": "stat",
-"title": "Top Router req/s",
+"title": "Egress Traffic",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -195,20 +351,19 @@
"gridPos": { "gridPos": {
"h": 4, "h": 4,
"w": 8, "w": 8,
"x": 0, "x": 8,
"y": 4 "y": 4
}, },
"targets": [ "targets": [
{ {
"expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))", "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
"refId": "A", "refId": "A"
"legendFormat": "{{router}}"
} }
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "color": {
"mode": "palette-classic" "mode": "thresholds"
}, },
"mappings": [], "mappings": [],
"thresholds": { "thresholds": {
@ -224,7 +379,7 @@
}
]
},
-"unit": "req/s",
+"unit": "Bps",
"custom": {
"displayMode": "auto"
}
@ -246,7 +401,67 @@
}
},
{
-"id": 5,
+"id": 7,
"type": "stat",
"title": "Intra-Cluster Traffic",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 4
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "Bps",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 8,
"type": "timeseries", "type": "timeseries",
"title": "Per-Node Throughput", "title": "Per-Node Throughput",
"datasource": { "datasource": {
@ -283,7 +498,7 @@
}
},
{
-"id": 6,
+"id": 9,
"type": "table",
"title": "Top Namespaces",
"datasource": {
@ -304,12 +519,16 @@
],
"fieldConfig": {
"defaults": {
-"unit": "Bps"
+"unit": "Bps",
+"custom": {
+"filterable": true
+}
},
"overrides": []
},
"options": {
-"showHeader": true
+"showHeader": true,
+"columnFilters": false
},
"transformations": [
{
@ -319,7 +538,7 @@
]
},
{
-"id": 7,
+"id": 10,
"type": "table",
"title": "Top Pods",
"datasource": {
@ -340,12 +559,16 @@
],
"fieldConfig": {
"defaults": {
-"unit": "Bps"
+"unit": "Bps",
+"custom": {
+"filterable": true
+}
},
"overrides": []
},
"options": {
-"showHeader": true
+"showHeader": true,
+"columnFilters": false
},
"transformations": [
{
@ -355,7 +578,7 @@
]
},
{
-"id": 8,
+"id": 11,
"type": "timeseries",
"title": "Traefik Routers (req/s)",
"datasource": {
@ -392,7 +615,7 @@
}
},
{
-"id": 9,
+"id": 12,
"type": "timeseries",
"title": "Traefik Entrypoints (req/s)",
"datasource": {

@ -27,7 +27,7 @@
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "color": {
"mode": "palette-classic" "mode": "thresholds"
}, },
"mappings": [], "mappings": [],
"thresholds": { "thresholds": {
@ -88,7 +88,7 @@
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "color": {
"mode": "palette-classic" "mode": "thresholds"
}, },
"mappings": [], "mappings": [],
"thresholds": { "thresholds": {
@ -149,7 +149,7 @@
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "color": {
"mode": "palette-classic" "mode": "thresholds"
}, },
"mappings": [], "mappings": [],
"thresholds": { "thresholds": {
@ -186,6 +186,213 @@
"textMode": "value" "textMode": "value"
} }
}, },
{
"id": 9,
"type": "stat",
"title": "API Server 5xx rate",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"y": 4
},
"targets": [
{
"expr": "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 0.05
},
{
"color": "orange",
"value": 0.2
},
{
"color": "red",
"value": 0.5
}
]
},
"unit": "req/s",
"custom": {
"displayMode": "auto"
},
"decimals": 3
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 10,
"type": "stat",
"title": "API Server P99 latency",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 8,
"y": 4
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 250
},
{
"color": "orange",
"value": 400
},
{
"color": "red",
"value": 600
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 11,
"type": "stat",
"title": "etcd P99 latency",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 4
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "orange",
"value": 100
},
{
"color": "red",
"value": 200
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "timeseries",
@ -198,7 +405,7 @@
"h": 9, "h": 9,
"w": 24, "w": 24,
"x": 0, "x": 0,
"y": 4 "y": 8
}, },
"targets": [ "targets": [
{ {
@ -238,7 +445,7 @@
"h": 9, "h": 9,
"w": 24, "w": 24,
"x": 0, "x": 0,
"y": 13 "y": 17
}, },
"targets": [ "targets": [
{ {
@ -278,7 +485,7 @@
"h": 9, "h": 9,
"w": 12, "w": 12,
"x": 0, "x": 0,
"y": 22 "y": 26
}, },
"targets": [ "targets": [
{ {
@ -315,7 +522,7 @@
"h": 9, "h": 9,
"w": 12, "w": 12,
"x": 12, "x": 12,
"y": 22 "y": 26
}, },
"targets": [ "targets": [
{ {
@ -352,7 +559,7 @@
"h": 9, "h": 9,
"w": 24, "w": 24,
"x": 0, "x": 0,
"y": 31 "y": 35
}, },
"targets": [ "targets": [
{ {

@ -7,67 +7,6 @@
"list": [] "list": []
}, },
"panels": [ "panels": [
{
"id": 1,
"type": "gauge",
"title": "Workers Ready",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 5,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"min": 0,
"max": 18,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 16
},
{
"color": "yellow",
"value": 17
},
{
"color": "green",
"value": 18
}
]
}
},
"overrides": []
},
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
}
},
{
"id": 2,
"type": "gauge",
@ -78,8 +17,8 @@
},
"gridPos": {
"h": 5,
-"w": 5,
+"w": 4,
-"x": 5,
+"x": 0,
"y": 0
},
"targets": [
@ -131,8 +70,8 @@
},
"gridPos": {
"h": 5,
-"w": 5,
+"w": 3,
-"x": 10,
+"x": 4,
"y": 0
},
"targets": [
@ -144,82 +83,7 @@
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "color": {
"mode": "palette-classic" "mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-pods dashboard",
"url": "/d/atlas-pods",
"targetBlank": true
}
]
},
{
"id": 4,
"type": "stat",
"title": "Problem Pods",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 5,
"x": 15,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
@ -281,20 +145,20 @@
},
"gridPos": {
"h": 5,
-"w": 4,
+"w": 3,
-"x": 20,
+"x": 7,
"y": 0
},
"targets": [
{
-"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))",
+"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
-"mode": "palette-classic"
+"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -346,6 +210,290 @@
}
]
},
{
"id": 27,
"type": "stat",
"title": "Atlas Availability (30d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 4,
"x": 10,
"y": 0
},
"targets": [
{
"expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:5m])",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 0.99
},
{
"color": "yellow",
"value": 0.999
},
{
"color": "green",
"value": 0.9999
},
{
"color": "blue",
"value": 0.99999
}
]
},
"unit": "percentunit",
"custom": {
"displayMode": "auto"
},
"decimals": 3
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "stat",
"title": "Problem Pods",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 3,
"x": 14,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-pods dashboard",
"url": "/d/atlas-pods",
"targetBlank": true
}
]
},
{
"id": 6,
"type": "stat",
"title": "CrashLoop / ImagePull",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 3,
"x": 17,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-pods dashboard",
"url": "/d/atlas-pods",
"targetBlank": true
}
]
},
{
"id": 1,
"type": "gauge",
"title": "Workers Ready",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 4,
"x": 20,
"y": 0
},
"targets": [
{
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"min": 0,
"max": 18,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 16
},
{
"color": "yellow",
"value": 17
},
{
"color": "green",
"value": 18
}
]
}
},
"overrides": []
},
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
}
},
{
"id": 7,
"type": "stat",
@ -371,11 +519,11 @@
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "color": {
"mode": "palette-classic" "mode": "thresholds"
}, },
"mappings": [], "mappings": [],
"thresholds": { "thresholds": {
"mode": "percentage", "mode": "absolute",
"steps": [ "steps": [
{ {
"color": "green", "color": "green",
@ -383,11 +531,15 @@
}, },
{ {
"color": "yellow", "color": "yellow",
"value": 70 "value": 50
},
{
"color": "orange",
"value": 75
}, },
{ {
"color": "red", "color": "red",
"value": 85 "value": 91.5
} }
] ]
}, },
@ -444,11 +596,11 @@
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "color": {
"mode": "palette-classic" "mode": "thresholds"
}, },
"mappings": [], "mappings": [],
"thresholds": { "thresholds": {
"mode": "percentage", "mode": "absolute",
"steps": [ "steps": [
{ {
"color": "green", "color": "green",
@ -456,11 +608,15 @@
}, },
{ {
"color": "yellow", "color": "yellow",
"value": 70 "value": 50
},
{
"color": "orange",
"value": 75
}, },
{ {
"color": "red", "color": "red",
"value": 85 "value": 91.5
} }
] ]
}, },
@ -517,7 +673,7 @@
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "color": {
"mode": "palette-classic" "mode": "thresholds"
}, },
"mappings": [], "mappings": [],
"thresholds": { "thresholds": {
@ -586,7 +742,7 @@
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "color": {
"mode": "palette-classic" "mode": "thresholds"
}, },
"mappings": [], "mappings": [],
"thresholds": { "thresholds": {
@ -653,11 +809,11 @@
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "color": {
"mode": "palette-classic" "mode": "thresholds"
}, },
"mappings": [], "mappings": [],
"thresholds": { "thresholds": {
"mode": "percentage", "mode": "absolute",
"steps": [ "steps": [
{ {
"color": "green", "color": "green",
@ -665,11 +821,15 @@
}, },
{ {
"color": "yellow", "color": "yellow",
"value": 70 "value": 50
},
{
"color": "orange",
"value": 75
}, },
{ {
"color": "red", "color": "red",
"value": 85 "value": 91.5
} }
] ]
}, },
@ -724,11 +884,11 @@
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "color": {
"mode": "palette-classic" "mode": "thresholds"
}, },
"mappings": [], "mappings": [],
"thresholds": { "thresholds": {
"mode": "percentage", "mode": "absolute",
"steps": [ "steps": [
{ {
"color": "green", "color": "green",
@ -736,11 +896,15 @@
}, },
{ {
"color": "yellow", "color": "yellow",
"value": 70 "value": 50
},
{
"color": "orange",
"value": 75
}, },
{ {
"color": "red", "color": "red",
"value": 85 "value": 91.5
} }
] ]
}, },
@ -795,7 +959,7 @@
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "color": {
"mode": "palette-classic" "mode": "thresholds"
}, },
"mappings": [], "mappings": [],
"thresholds": { "thresholds": {
@ -862,7 +1026,7 @@
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "color": {
"mode": "palette-classic" "mode": "thresholds"
}, },
"mappings": [], "mappings": [],
"thresholds": { "thresholds": {
@ -942,9 +1106,7 @@
"placement": "right" "placement": "right"
}, },
"pieType": "pie", "pieType": "pie",
"displayLabels": [ "displayLabels": [],
"percent"
],
"tooltip": { "tooltip": {
"mode": "single" "mode": "single"
}, },
@ -975,7 +1137,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "100 * ( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", "expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }
@ -995,9 +1157,7 @@
"placement": "right" "placement": "right"
}, },
"pieType": "pie", "pieType": "pie",
"displayLabels": [ "displayLabels": [],
"percent"
],
"tooltip": { "tooltip": {
"mode": "single" "mode": "single"
}, },
@ -1048,9 +1208,7 @@
"placement": "right" "placement": "right"
}, },
"pieType": "pie", "pieType": "pie",
"displayLabels": [ "displayLabels": [],
"percent"
],
"tooltip": { "tooltip": {
"mode": "single" "mode": "single"
}, },
@ -1175,7 +1333,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"refId": "A", "refId": "A",
"legendFormat": "{{node}}" "legendFormat": "{{node}}"
} }
@ -1212,7 +1370,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"refId": "A", "refId": "A",
"legendFormat": "{{node}}" "legendFormat": "{{node}}"
} }
@ -1233,6 +1391,138 @@
} }
} }
}, },
{
"id": 28,
"type": "piechart",
"title": "Node Pod Share",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 54
},
"targets": [
{
"expr": "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100",
"refId": "A",
"legendFormat": "{{namespace}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"displayLabels": [],
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
}
},
{
"id": 29,
"type": "bargauge",
"title": "Top Nodes by Pod Count",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 54
},
"targets": [
{
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
"refId": "A",
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "none",
"min": 0,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 100
}
]
},
"decimals": 0
},
"overrides": []
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
},
{
"id": "limit",
"options": {
"limit": 12
}
}
]
},
{
"id": 18,
"type": "timeseries",
@@ -1377,7 +1667,7 @@
"h": 16,
"w": 12,
"x": 0,
- "y": 54
+ "y": 64
},
"targets": [
{
@@ -1425,7 +1715,7 @@
"h": 16,
"w": 12,
"x": 12,
- "y": 54
+ "y": 64
},
"targets": [
{
@@ -1452,11 +1742,11 @@
},
{
"color": "orange",
- "value": 70
+ "value": 75
},
{
"color": "red",
- "value": 85
+ "value": 91.5
}
]
}
@@ -1480,6 +1770,17 @@
"url": "/d/atlas-storage",
"targetBlank": true
}
+ ],
+ "transformations": [
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": [
+ "Value"
+ ],
+ "order": "desc"
+ }
+ }
]
}
],
@@ -1497,36 +1798,5 @@
"to": "now"
},
"refresh": "1m",
- "links": [
- {
- "title": "Atlas Pods",
- "type": "dashboard",
- "dashboardUid": "atlas-pods",
- "keepTime": false
- },
- {
- "title": "Atlas Nodes",
- "type": "dashboard",
- "dashboardUid": "atlas-nodes",
- "keepTime": false
- },
- {
- "title": "Atlas Storage",
- "type": "dashboard",
- "dashboardUid": "atlas-storage",
- "keepTime": false
- },
- {
- "title": "Atlas Network",
- "type": "dashboard",
- "dashboardUid": "atlas-network",
- "keepTime": false
- },
- {
- "title": "Atlas GPU",
- "type": "dashboard",
- "dashboardUid": "atlas-gpu",
- "keepTime": false
- }
- ]
+ "links": []
}

Some files were not shown because too many files have changed in this diff.