Compare commits

...

104 Commits

Author SHA1 Message Date
055ce7d18c Merge pull request 'feature/mailu' (#5) from feature/mailu into main
Reviewed-on: #5
2025-12-14 17:48:02 +00:00
1a161b4d3c monitoring: longer data history 2025-12-14 14:47:20 -03:00
f7bf990d62 flux: bump gitops-ui kustomization 2025-12-14 14:41:52 -03:00
63bf153c8b flux: add weave gitops ui 2025-12-14 14:38:08 -03:00
8fceebd7a7 nextcloud: integration with mailu & gitops-ui: initial install 2025-12-14 14:21:40 -03:00
0d0216c8f5 Add tests and dedupe nextcloud mail sync 2025-12-14 14:15:19 -03:00
c8b49560b6 Keep nextcloud scripts single-sourced under scripts/ 2025-12-14 14:05:01 -03:00
327a7bed57 Extract nextcloud scripts to files 2025-12-14 13:59:16 -03:00
aae09c5074 Normalize doc layout and README guidance 2025-12-14 13:47:59 -03:00
56bb4e91b9 Group namespace plurality rows to one per namespace 2025-12-13 22:17:47 -03:00
18f3a2cefe Fix namespace plurality mask and bump v26 2025-12-13 20:53:11 -03:00
1ec3ca29a4 Use OR-joined node ranks for plurality tie-break 2025-12-13 19:04:22 -03:00
4812958e82 Deduplicate namespace plurality rows with ranked tie-break 2025-12-13 18:39:31 -03:00
9ad5f7f405 Restore namespace plurality panel data 2025-12-13 18:25:03 -03:00
57ea397027 Use table format for namespace plurality panel 2025-12-13 18:23:19 -03:00
be0ac48b33 Simplify namespace plurality table rendering 2025-12-13 18:07:56 -03:00
2156b6f6aa Hide table footer on namespace plurality table 2025-12-13 18:03:51 -03:00
4fcc7c84f2 Make namespace plurality table non-filterable 2025-12-13 17:55:52 -03:00
a4b3273bab Remove filter bar from namespace plurality table 2025-12-13 17:38:57 -03:00
c536a13d55 Disable column filters on namespace plurality table 2025-12-13 17:35:52 -03:00
13eb02c19b Hide filters on namespace plurality table 2025-12-13 17:32:19 -03:00
134a4ad001 Fix namespace plurality table query 2025-12-13 17:29:55 -03:00
3e0a84b074 atlas pods: plurality table v11 (deterministic top node) 2025-12-13 17:19:03 -03:00
7f67793ee5 atlas pods: plurality table v10 2025-12-13 16:36:25 -03:00
e87d54f19d atlas pods: per-namespace top node via topk 2025-12-13 15:51:45 -03:00
6ac01e5879 atlas pods: simplify plurality table (no filter) 2025-12-13 15:29:08 -03:00
d0ed188179 monitoring: drop README per convention 2025-12-13 15:25:21 -03:00
b703e66b98 monitoring: restore README 2025-12-13 15:11:50 -03:00
68d4f43903 atlas pods: stabilize plurality query to avoid 422 2025-12-13 15:11:21 -03:00
cf9dacd4ea atlas pods: show per-namespace top node without vars 2025-12-13 15:02:52 -03:00
6eee7b8853 atlas pods: drop non-leading nodes in plurality table 2025-12-13 13:39:06 -03:00
03a4ca4d84 atlas pods: simplify plurality table query 2025-12-13 12:06:18 -03:00
c7adb0c8cb atlas pods: fix plurality table query 2025-12-13 12:00:31 -03:00
9d1163f580 atlas pods: use prom share() for plurality table 2025-12-13 11:53:27 -03:00
001f0f95a6 atlas pods: fix plurality query with bool max match 2025-12-13 11:51:18 -03:00
2177a8009e atlas pods: robust per-namespace top-node share 2025-12-13 11:48:44 -03:00
6a3d1311b9 atlas pods: select per-namespace top node via max match 2025-12-13 04:15:03 -03:00
d916e5a7f1 atlas pods: sort plurality table by node then share 2025-12-13 04:10:10 -03:00
5d6d34c274 atlas pods: simplify namespace plurality query 2025-12-13 04:06:46 -03:00
53423c7a46 atlas pods: fix namespace plurality query 2025-12-13 04:00:57 -03:00
d274738e9e restore readmes removed in last commit 2025-12-13 03:57:44 -03:00
f0265d6b94 atlas pods: add namespace plurality by node table 2025-12-13 03:57:20 -03:00
8a755e0c42 mailu: forcing version 1.4 clamav over 1.2 2025-12-13 00:11:40 -03:00
e22293db3e forcing 12-r3 over 12-r6 for redis 2025-12-12 22:09:04 -03:00
6f8a70fd58 atlas overview: include titan-db in control plane panels 2025-12-12 21:55:53 -03:00
580d1731f9 monitoring: drop duplicate titan-db scrape job 2025-12-12 21:48:03 -03:00
4def298b83 monitoring: scrape titan-db node_exporter 2025-12-12 21:38:10 -03:00
1166069640 atlas dashboards: align percent thresholds and disk bars 2025-12-12 21:13:31 -03:00
e56bed284e atlas overview: refine alert thresholds and availability colors 2025-12-12 20:50:41 -03:00
24376594ff atlas dashboards: use threshold colors for stats 2025-12-12 20:44:20 -03:00
5277c98385 atlas dashboards: fix pod share display and zero/red stat thresholds 2025-12-12 20:40:32 -03:00
056b7b7770 atlas dashboards: show pod counts (not %) and make zero-friendly stats 2025-12-12 20:30:00 -03:00
b770575b42 atlas dashboards: show pod counts with top12 bars 2025-12-12 20:20:13 -03:00
9e76277c22 atlas dashboards: drop empty nodes and enforce top12 pod bars 2025-12-12 19:09:51 -03:00
93b3c6d2ec atlas dashboards: cap pod count bars at top12 2025-12-12 18:56:13 -03:00
596bf46863 atlas dashboards: sort pod counts and add pod row to overview 2025-12-12 18:51:43 -03:00
8b703f8655 atlas pods: add pod count bar and tidy pie 2025-12-12 18:45:29 -03:00
ec59d25ad8 atlas dashboards: fix overview links and add pods-by-node pie 2025-12-12 18:32:45 -03:00
bf6179f907 atlas internal dashboards: add SLO/burn and api health panels 2025-12-12 18:00:43 -03:00
0a0966db78 atlas overview: fix availability scaling 2025-12-12 16:36:47 -03:00
87fbba0d3e atlas overview: show availability percent with 3 decimals 2025-12-12 16:15:37 -03:00
b200dba5b9 atlas overview: show availability percent and keep uptime centered 2025-12-12 16:11:28 -03:00
697ce3c18f atlas overview: center uptime and reorder top row 2025-12-12 15:56:33 -03:00
8e39c6a28b atlas overview: add uptime and crashloop panels 2025-12-12 15:23:51 -03:00
38ab8e3364 standardize cert issuers to letsencrypt 2025-12-12 15:18:40 -03:00
29d22ba539 mailu: fix unbound sidecar mounts 2025-12-12 01:19:27 -03:00
118032d2c6 mailu: use mvance unbound sidecar and current redis image 2025-12-12 01:12:48 -03:00
4cfe92feb2 mailu: remove force upgrade to avoid pvc replace 2025-12-12 01:09:25 -03:00
ca27cc95b6 mailu: add validating dns sidecar and disable vip hostports 2025-12-12 01:06:38 -03:00
6c77b8e7f8 restore docs after gitignore change 2025-12-12 00:50:02 -03:00
78195c4685 mailu: fix admin dns and tame vip 2025-12-12 00:49:45 -03:00
5ef0b4edf6 mailu: capture helm release and cert 2025-12-11 23:54:43 -03:00
9f226c1584 Merge pull request 'feature/sso' (#4) from feature/sso into main
Reviewed-on: #4
2025-12-11 20:43:34 +00:00
319b515882 zot: restore main branch config 2025-12-11 17:26:15 -03:00
cb2b2ec1cd zot: revert to unauthenticated registry 2025-12-11 17:22:16 -03:00
20cd185c0b vault: drop traefik basicauth 2025-12-11 17:09:05 -03:00
2f368f6975 zot,vault: remove oauth2-proxy sso 2025-12-11 17:04:19 -03:00
6c62d42f7a longhorn/vault: gate via oauth2-proxy 2025-12-07 19:44:02 -03:00
a7e9f1f7d8 auth: remove error middleware to allow redirect 2025-12-07 13:19:45 -03:00
ceb692f7ee oauth2-proxy: drop groups scope to avoid invalid_scope 2025-12-07 13:09:29 -03:00
24fbaad040 auth: forward-auth via external auth host (svc traffic flaky) 2025-12-07 13:03:29 -03:00
04aa32a762 oauth2-proxy: schedule on worker rpis 2025-12-07 12:49:38 -03:00
25ee698021 oauth2-proxy: ensure error middleware on auth ingress 2025-12-07 12:03:14 -03:00
4a089876ba auth: use internal oauth2-proxy svc for forward-auth 2025-12-07 11:25:29 -03:00
20bb776625 auth: add 401 redirect middleware to oauth2-proxy 2025-12-07 11:14:25 -03:00
5e59f20bc3 auth: point forward-auth to external auth host 2025-12-07 11:09:09 -03:00
dbede55ad4 oauth2-proxy: temporarily drop group restriction 2025-12-07 10:42:13 -03:00
27e5c9391c auth: add namespace-local forward-auth middlewares 2025-12-07 10:25:44 -03:00
8d5e6c267c auth: wire oauth2-proxy and enable grafana oidc 2025-12-07 02:01:21 -03:00
a55502fe27 add oauth2-proxy for SSO forward-auth 2025-12-06 14:42:24 -03:00
598bdfc727 keycloak: restrict to worker rpis with titan-24 fallback 2025-12-06 01:44:23 -03:00
88c7a1c2aa keycloak: require rpi nodes with titan-24 fallback 2025-12-06 01:40:24 -03:00
f4da27271e keycloak: prefer rpi nodes, avoid titan-24 2025-12-06 01:36:33 -03:00
141c05b08f keycloak: honor xforwarded headers and hostname url 2025-12-06 01:23:07 -03:00
f0a8f6d35e keycloak: enable health/metrics management port 2025-12-06 00:51:47 -03:00
1b01052eda keycloak: set fsGroup for data volume 2025-12-06 00:49:17 -03:00
1d346edd28 keycloak: remove optimized flag for first start 2025-12-06 00:43:24 -03:00
b14a9dcb98 chore: drop AGENTS.md from repo 2025-12-06 00:43:17 -03:00
47caf08885 notes: capture GPU share change and flux branch 2025-12-03 12:28:45 -03:00
0db149605d monitoring: show GPU share over dashboard range 2025-12-02 20:28:35 -03:00
f64e60c5a2 flux: add keycloak kustomization 2025-12-02 18:10:20 -03:00
61c5db5c99 flux: track feature/sso 2025-12-02 18:00:49 -03:00
2db550afdd keycloak: add raw manifests backed by shared postgres 2025-12-02 17:58:19 -03:00
65d389193f Merge pull request 'feature/atlas-monitoring' (#3) from feature/atlas-monitoring into main
Reviewed-on: #3
2025-12-02 20:52:35 +00:00
87 changed files with 5527 additions and 795 deletions

6
.gitignore vendored
View File

@ -1 +1,5 @@
AGENTS.md
# Ignore markdown by default, but keep top-level docs
*.md
!README.md
!AGENTS.md
!**/NOTES.md

View File

@ -2,6 +2,8 @@
Repository Guidelines
> Local-only note: apply changes through Flux-tracked manifests, not by manual kubectl edits in-cluster—manual tweaks will be reverted by Flux.
## Project Structure & Module Organization
- `infrastructure/`: cluster-scoped building blocks (core, flux-system, traefik, longhorn). Add new platform features by mirroring this layout.
- `services/`: workload manifests per app (`services/gitea/`, etc.) with `kustomization.yaml` plus one file per kind; keep diffs small and focused.
@ -46,6 +48,7 @@ Repository Guidelines
- Atlas Overview is the Grafana home (1h range, 1m refresh), Overview folder UID `overview`, internal folder `atlas-internal` (oceanus-internal stub).
- Panels standardized via generator; hottest row compressed, worker/control rows taller, root disk row taller and top12 bar gauge with labels. GPU share pie uses 1h avg_over_time to persist idle activity.
- Internal dashboards are provisioned without Viewer role; if anonymous still sees them, restart Grafana and tighten auth if needed.
- GPU share panel updated (feature/sso) to use `max_over_time(…[$__range])`, so longer ranges (e.g., 12h) keep recent activity visible. Flux tracking `feature/sso`.
## Upcoming priorities (SSO/storage/mail)
- Establish SSO (Keycloak or similar) and federate Grafana, Gitea, Zot, Nextcloud, Pegasus/Jellyfin; keep Vaultwarden separate until safe.
@ -66,3 +69,13 @@ Repository Guidelines
## Postgres centralization (2025-12-03)
- Prefer a shared in-cluster Postgres deployment with per-service databases to reduce resource sprawl on Pi nodes. Use it for services that can easily point at an external DB.
- Candidates to migrate to shared Postgres: Keycloak (realm DB), Gitea (git DB), Nextcloud (app DB), possibly Grafana (if persistence needed beyond current provisioner), Jitsi prosody/JVB state (if external DB supported). Keep tightly-coupled or lightweight embedded DBs as-is when migration is painful or not supported.
## SSO integration snapshot (2025-12-08)
- Current blockers: Zot still prompts for basic auth/double-login; Vault still wants the token UI after Keycloak (previously 502/404 when vault-0 sealed). Forward-auth middleware on Zot Ingress likely still causing the 401/Found hop; Vault OIDC mount not completing UI flow unless unsealed and preferred login is set.
- Flux-only changes required: remove zot forward-auth middleware from Ingress (let oauth2-proxy handle redirect), ensure Vault OIDC mount is preferred UI login and bound to admin group; keep all edits in repo so Flux enforces them.
- Secrets present (per user): `zot-oidc-client` (client_secret only), `oauth2-proxy-zot-oidc`, `oauth2-proxy-vault-oidc`, `vault-oidc-admin-token`. Zot needs its regcred in the zot namespace if image pulls fail.
- Cluster validation blocked here: `kubectl get nodes` fails (403/permission) and DNS to `*.bstein.dev` fails in this session, so no live curl verification could be run. Re-test on a host with cluster/DNS access after Flux applies fixes.
## Docs hygiene
- Do not add per-service `README.md` files; use `NOTES.md` if documentation is needed inside service folders. Keep only the top-level repo README.
- Keep comments succinct and in a human voice—no AI-sounding notes. Use `NOTES.md` for scratch notes instead of sprinkling reminders into code or extra READMEs.

3
NOTES.md Normal file
View File

@ -0,0 +1,3 @@
# Rotation reminders (temporary secrets set by automation)
- Weave GitOps UI (`cd.bstein.dev`) admin: `admin` / `G1tOps!2025` — rotate immediately after first login.

3
README.md Normal file
View File

@ -0,0 +1,3 @@
# titan-iac
Flux-managed Kubernetes cluster for bstein.dev services.

View File

@ -0,0 +1,15 @@
# clusters/atlas/flux-system/applications/keycloak/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: keycloak
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: flux-system
path: ./services/keycloak
targetNamespace: sso
timeout: 2m

View File

@ -13,3 +13,6 @@ resources:
- jellyfin/kustomization.yaml
- xmr-miner/kustomization.yaml
- sui-metrics/kustomization.yaml
- keycloak/kustomization.yaml
- oauth2-proxy/kustomization.yaml
- mailu/kustomization.yaml

View File

@ -0,0 +1,18 @@
# clusters/atlas/flux-system/applications/mailu/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: mailu
namespace: flux-system
spec:
interval: 10m
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
path: ./services/mailu
targetNamespace: mailu-mailserver
prune: true
wait: true
dependsOn:
- name: helm

View File

@ -0,0 +1,15 @@
# clusters/atlas/flux-system/applications/oauth2-proxy/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: oauth2-proxy
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: flux-system
path: ./services/oauth2-proxy
targetNamespace: sso
timeout: 2m

View File

@ -8,7 +8,7 @@ metadata:
spec:
interval: 1m0s
ref:
branch: feature/atlas-monitoring
branch: feature/mailu
secretRef:
name: flux-system-gitea
url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git

View File

@ -0,0 +1,20 @@
# clusters/atlas/flux-system/platform/gitops-ui/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: gitops-ui
namespace: flux-system
spec:
interval: 10m
timeout: 10m
path: ./services/gitops-ui
prune: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
targetNamespace: flux-system
dependsOn:
- name: helm
- name: traefik
wait: true

View File

@ -5,5 +5,6 @@ resources:
- core/kustomization.yaml
- helm/kustomization.yaml
- traefik/kustomization.yaml
- gitops-ui/kustomization.yaml
- monitoring/kustomization.yaml
- longhorn-ui/kustomization.yaml

View File

@ -1,5 +0,0 @@
# Oceanus Cluster Scaffold
This directory prepares the Flux and Kustomize layout for a future Oceanus-managed cluster.
Populate `flux-system/` with `gotk-components.yaml` and related manifests after running `flux bootstrap`.
Define node-specific resources under `infrastructure/modules/profiles/oceanus-validator/` and reference workloads in `applications/` as they come online.

View File

@ -2,15 +2,14 @@
| Hostname | Role / Function | Managed By | Notes |
|------------|--------------------------------|---------------------|-------|
| titan-db | HA control plane database | Ansible | PostgreSQL / etcd backing services |
| titan-0a | Kubernetes control-plane | Flux (atlas cluster)| HA leader, tainted for control only |
| titan-0b | Kubernetes control-plane | Flux (atlas cluster)| Standby control node |
| titan-0c | Kubernetes control-plane | Flux (atlas cluster)| Standby control node |
| titan-04-19| Raspberry Pi workers | Flux (atlas cluster)| Workload nodes, labelled per hardware |
| titan-20&21| NVIDIA Jetson workers | Flux (atlas cluster)| Workload nodes, labelled per hardware |
| titan-22 | GPU mini-PC (Jellyfin) | Flux + Ansible | NVIDIA runtime managed via `modules/profiles/atlas-ha` |
| titan-23 | Dedicated SUI validator Oceanus| Manual + Ansible | Baremetal validator workloads, exposes metrics to atlas |
| titan-24 | Tethys hybrid node | Flux + Ansible | Runs SUI metrics via K8s, validator via Ansible |
| titan-db | HA control plane database | Ansible | PostgreSQL / etcd backing services |
| titan-jh | Jumphost & bastion | Ansible | Entry point / future KVM services |
| oceanus | Dedicated SUI validator host | Ansible / Flux prep | Baremetal validator workloads, exposes metrics to atlas; Kustomize scaffold under `clusters/oceanus/` |
| titan-jh | Jumphost & bastion & lesavka | Ansible | Entry point / future KVM services / custom kvm - lesavaka |
| styx | Air-gapped workstation | Manual / Scripts | Remains isolated, scripts tracked in `hosts/styx` |
Use the `clusters/` directory for cluster-scoped state and the `hosts/` directory for baremetal orchestration.

View File

@ -5,3 +5,4 @@ resources:
- ../modules/base
- ../modules/profiles/atlas-ha
- ../sources/cert-manager/letsencrypt.yaml
- ../sources/cert-manager/letsencrypt-prod.yaml

View File

@ -7,7 +7,7 @@ metadata:
annotations:
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.tls: "true"
traefik.ingress.kubernetes.io/router.middlewares: longhorn-system-longhorn-basicauth@kubernetescrd,longhorn-system-longhorn-headers@kubernetescrd
traefik.ingress.kubernetes.io/router.middlewares: ""
spec:
ingressClassName: traefik
tls:
@ -21,6 +21,6 @@ spec:
pathType: Prefix
backend:
service:
name: longhorn-frontend
name: oauth2-proxy-longhorn
port:
number: 80

View File

@ -4,3 +4,4 @@ kind: Kustomization
resources:
- middleware.yaml
- ingress.yaml
- oauth2-proxy-longhorn.yaml

View File

@ -20,3 +20,20 @@ spec:
headers:
customRequestHeaders:
X-Forwarded-Proto: "https"
---
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
name: longhorn-forward-auth
namespace: longhorn-system
spec:
forwardAuth:
address: https://auth.bstein.dev/oauth2/auth
trustForwardHeader: true
authResponseHeaders:
- Authorization
- X-Auth-Request-Email
- X-Auth-Request-User
- X-Auth-Request-Groups

View File

@ -0,0 +1,102 @@
# infrastructure/longhorn/ui-ingress/oauth2-proxy-longhorn.yaml
apiVersion: v1
kind: Service
metadata:
name: oauth2-proxy-longhorn
namespace: longhorn-system
labels:
app: oauth2-proxy-longhorn
spec:
ports:
- name: http
port: 80
targetPort: 4180
selector:
app: oauth2-proxy-longhorn
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: oauth2-proxy-longhorn
namespace: longhorn-system
labels:
app: oauth2-proxy-longhorn
spec:
replicas: 2
selector:
matchLabels:
app: oauth2-proxy-longhorn
template:
metadata:
labels:
app: oauth2-proxy-longhorn
spec:
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5","rpi4"]
containers:
- name: oauth2-proxy
image: quay.io/oauth2-proxy/oauth2-proxy:v7.6.0
imagePullPolicy: IfNotPresent
args:
- --provider=oidc
- --redirect-url=https://longhorn.bstein.dev/oauth2/callback
- --oidc-issuer-url=https://sso.bstein.dev/realms/atlas
- --scope=openid profile email groups
- --email-domain=*
- --allowed-group=admin
- --set-xauthrequest=true
- --pass-access-token=true
- --set-authorization-header=true
- --cookie-secure=true
- --cookie-samesite=lax
- --cookie-refresh=20m
- --cookie-expire=168h
- --insecure-oidc-allow-unverified-email=true
- --upstream=http://longhorn-frontend.longhorn-system.svc.cluster.local
- --http-address=0.0.0.0:4180
- --skip-provider-button=true
- --skip-jwt-bearer-tokens=true
- --oidc-groups-claim=groups
- --cookie-domain=longhorn.bstein.dev
env:
- name: OAUTH2_PROXY_CLIENT_ID
valueFrom:
secretKeyRef:
name: oauth2-proxy-longhorn-oidc
key: client_id
- name: OAUTH2_PROXY_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: oauth2-proxy-longhorn-oidc
key: client_secret
- name: OAUTH2_PROXY_COOKIE_SECRET
valueFrom:
secretKeyRef:
name: oauth2-proxy-longhorn-oidc
key: cookie_secret
ports:
- containerPort: 4180
name: http
readinessProbe:
httpGet:
path: /ping
port: 4180
initialDelaySeconds: 5
periodSeconds: 10
livenessProbe:
httpGet:
path: /ping
port: 4180
initialDelaySeconds: 20
periodSeconds: 20

View File

@ -0,0 +1,14 @@
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
name: letsencrypt-prod
spec:
acme:
email: brad.stein@gmail.com
server: https://acme-v02.api.letsencrypt.org/directory
privateKeySecretRef:
name: letsencrypt-prod-account-key
solvers:
- http01:
ingress:
class: traefik

View File

@ -0,0 +1,10 @@
# infrastructure/sources/helm/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- grafana.yaml
- hashicorp.yaml
- jetstack.yaml
- mailu.yaml
- prometheus.yaml
- victoria-metrics.yaml

View File

@ -0,0 +1,9 @@
# infrastructure/sources/helm/mailu.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: mailu
namespace: flux-system
spec:
interval: 1h
url: https://mailu.github.io/helm-charts

View File

@ -36,11 +36,12 @@ PUBLIC_FOLDER = "overview"
PRIVATE_FOLDER = "atlas-internal"
PERCENT_THRESHOLDS = {
"mode": "percentage",
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 70},
{"color": "red", "value": 85},
{"color": "yellow", "value": 50},
{"color": "orange", "value": 75},
{"color": "red", "value": 91.5},
],
}
@ -81,7 +82,7 @@ CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
WORKER_SUFFIX = f"/{WORKER_TOTAL}"
CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system"
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
GAUGE_WIDTHS = [5, 5, 5, 5, 4]
GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4]
CONTROL_WORKLOADS_EXPR = (
f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}}) or on() vector(0)'
)
@ -187,17 +188,64 @@ def namespace_gpu_share_expr():
return namespace_share_expr(NAMESPACE_GPU_RAW)
PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
PROBLEM_PODS_EXPR = (
'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"})) '
"or on() vector(0)"
)
CRASHLOOP_EXPR = (
'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
'{reason=~"CrashLoopBackOff|ImagePullBackOff"}))'
'{reason=~"CrashLoopBackOff|ImagePullBackOff"})) '
"or on() vector(0)"
)
STUCK_TERMINATING_EXPR = (
'sum(max by (namespace,pod) ('
'((time() - kube_pod_deletion_timestamp{pod!=""}) > bool 600)'
' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)'
'))'
')) '
"or on() vector(0)"
)
UPTIME_WINDOW = "30d"
TRAEFIK_READY_EXPR = (
"("
'sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})'
" / clamp_min("
'sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)'
")"
)
CONTROL_READY_FRACTION_EXPR = (
f"(sum(kube_node_status_condition{{condition=\"Ready\",status=\"true\",node=~\"{CONTROL_REGEX}\"}})"
f" / {CONTROL_TOTAL})"
)
UPTIME_AVAIL_EXPR = (
f"min(({CONTROL_READY_FRACTION_EXPR}), ({TRAEFIK_READY_EXPR}))"
)
# Tie-breaker to deterministically pick one node per namespace when shares tie.
NODE_TIEBREAKER = " + ".join(
f"({node_filter(node)}) * 1e-6 * {idx}"
for idx, node in enumerate(CONTROL_ALL + WORKER_NODES, start=1)
)
UPTIME_AVG_EXPR = f"avg_over_time(({UPTIME_AVAIL_EXPR})[{UPTIME_WINDOW}:5m])"
UPTIME_PERCENT_EXPR = UPTIME_AVG_EXPR
UPTIME_NINES_EXPR = f"-log10(1 - clamp_max({UPTIME_AVG_EXPR}, 0.999999999))"
UPTIME_THRESHOLDS = {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": 2},
{"color": "yellow", "value": 3},
{"color": "green", "value": 3.5},
],
}
UPTIME_PERCENT_THRESHOLDS = {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": 0.999},
{"color": "yellow", "value": 0.9999},
{"color": "green", "value": 0.99999},
],
}
PROBLEM_TABLE_EXPR = (
"(time() - kube_pod_created{pod!=\"\"}) "
"* on(namespace,pod) group_left(node) kube_pod_info "
@ -232,7 +280,7 @@ NAMESPACE_GPU_ALLOC = (
' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
)
NAMESPACE_GPU_USAGE_SHARE = (
'sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))'
'sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))'
)
NAMESPACE_GPU_USAGE_INSTANT = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)'
NAMESPACE_GPU_RAW = (
@ -291,6 +339,34 @@ NET_INTERNAL_EXPR = (
'+ rate(container_network_transmit_bytes_total{namespace!="traefik",pod!=""}[5m]))'
' or on() vector(0)'
)
APISERVER_5XX_RATE = 'sum(rate(apiserver_request_total{code=~"5.."}[5m]))'
APISERVER_P99_LATENCY_MS = (
"histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000"
)
ETCD_P99_LATENCY_MS = (
"histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000"
)
TRAEFIK_TOTAL_5M = "sum(rate(traefik_entrypoint_requests_total[5m]))"
TRAEFIK_SUCCESS_5M = 'sum(rate(traefik_entrypoint_requests_total{code!~"5.."}[5m]))'
TRAEFIK_SLI_5M = f"({TRAEFIK_SUCCESS_5M}) / clamp_min({TRAEFIK_TOTAL_5M}, 1)"
TRAEFIK_P99_LATENCY_MS = (
"histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000"
)
TRAEFIK_P95_LATENCY_MS = (
"histogram_quantile(0.95, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000"
)
SLO_AVAILABILITY = 0.999
def traefik_sli(window):
total = f'sum(rate(traefik_entrypoint_requests_total[{window}]))'
success = f'sum(rate(traefik_entrypoint_requests_total{{code!~"5.."}}[{window}]))'
return f"({success}) / clamp_min({total}, 1)"
def traefik_burn(window):
sli = traefik_sli(window)
return f"(1 - ({sli})) / {1 - SLO_AVAILABILITY}"
# ---------------------------------------------------------------------------
# Panel factories
@ -304,6 +380,7 @@ def stat_panel(
grid,
*,
unit="none",
decimals=None,
thresholds=None,
text_mode="value",
legend=None,
@ -313,7 +390,7 @@ def stat_panel(
):
"""Return a Grafana stat panel definition."""
defaults = {
"color": {"mode": "palette-classic"},
"color": {"mode": "thresholds"},
"mappings": [],
"thresholds": thresholds
or {
@ -328,6 +405,8 @@ def stat_panel(
}
if value_suffix:
defaults["custom"]["valueSuffix"] = value_suffix
if decimals is not None:
defaults["decimals"] = decimals
panel = {
"id": panel_id,
"type": "stat",
@ -446,17 +525,32 @@ def table_panel(
*,
unit="none",
transformations=None,
instant=False,
options=None,
filterable=True,
footer=None,
format=None,
):
"""Return a Grafana table panel definition."""
# Optional PromQL subquery helpers in expr: share(), etc.
panel_options = {"showHeader": True, "columnFilters": False}
if options:
panel_options.update(options)
if footer is not None:
panel_options["footer"] = footer
field_defaults = {"unit": unit, "custom": {"filterable": filterable}}
target = {"expr": expr, "refId": "A", **({"instant": True} if instant else {})}
if format:
target["format"] = format
panel = {
"id": panel_id,
"type": "table",
"title": title,
"datasource": PROM_DS,
"gridPos": grid,
"targets": [{"expr": expr, "refId": "A"}],
"fieldConfig": {"defaults": {"unit": unit}, "overrides": []},
"options": {"showHeader": True},
"targets": [target],
"fieldConfig": {"defaults": field_defaults, "overrides": []},
"options": panel_options,
}
if transformations:
panel["transformations"] = transformations
@ -482,7 +576,7 @@ def pie_panel(panel_id, title, expr, grid):
"options": {
"legend": {"displayMode": "list", "placement": "right"},
"pieType": "pie",
"displayLabels": ["percent"],
"displayLabels": [],
"tooltip": {"mode": "single"},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
@ -491,7 +585,19 @@ def pie_panel(panel_id, title, expr, grid):
}
def bargauge_panel(panel_id, title, expr, grid, *, unit="none", links=None):
def bargauge_panel(
panel_id,
title,
expr,
grid,
*,
unit="none",
links=None,
limit=None,
thresholds=None,
decimals=None,
instant=False,
):
"""Return a bar gauge panel with label-aware reduction."""
panel = {
"id": panel_id,
@ -499,13 +605,16 @@ def bargauge_panel(panel_id, title, expr, grid, *, unit="none", links=None):
"title": title,
"datasource": PROM_DS,
"gridPos": grid,
"targets": [{"expr": expr, "refId": "A", "legendFormat": "{{node}}"}],
"targets": [
{"expr": expr, "refId": "A", "legendFormat": "{{node}}", **({"instant": True} if instant else {})}
],
"fieldConfig": {
"defaults": {
"unit": unit,
"min": 0,
"max": 100 if unit == "percent" else None,
"thresholds": {
"thresholds": thresholds
or {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
@ -527,8 +636,19 @@ def bargauge_panel(panel_id, title, expr, grid, *, unit="none", links=None):
},
},
}
if decimals is not None:
panel["fieldConfig"]["defaults"]["decimals"] = decimals
if links:
panel["links"] = links
# Keep bars ordered by value descending for readability.
panel["transformations"] = [
{
"id": "sortBy",
"options": {"fields": ["Value"], "order": "desc"},
}
]
if limit:
panel["transformations"].append({"id": "limit", "options": {"limit": limit}})
return panel
@ -555,81 +675,37 @@ def link_to(uid):
def build_overview():
panels = []
count_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 3},
],
}
row1_stats = [
(
1,
"Workers Ready",
f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
WORKER_SUFFIX,
WORKER_TOTAL,
None,
),
(
2,
"Control Plane Ready",
f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
CONTROL_SUFFIX,
CONTROL_TOTAL,
None,
),
(
3,
"Control Plane Workloads",
CONTROL_WORKLOADS_EXPR,
None,
4,
link_to("atlas-pods"),
),
(
4,
"Problem Pods",
PROBLEM_PODS_EXPR,
None,
1,
link_to("atlas-pods"),
),
(
5,
"Stuck Terminating",
STUCK_TERMINATING_EXPR,
None,
1,
link_to("atlas-pods"),
),
]
def gauge_grid(idx):
width = GAUGE_WIDTHS[idx] if idx < len(GAUGE_WIDTHS) else 4
x = sum(GAUGE_WIDTHS[:idx])
return width, x
for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats):
thresholds = None
min_value = 0
max_value = ok_value or 5
if panel_id == 1:
max_value = WORKER_TOTAL
thresholds = {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": WORKER_TOTAL - 2},
{"color": "yellow", "value": WORKER_TOTAL - 1},
{"color": "green", "value": WORKER_TOTAL},
],
}
elif panel_id == 2:
max_value = CONTROL_TOTAL
thresholds = {
{
"id": 2,
"title": "Control Plane Ready",
"expr": f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
"kind": "gauge",
"max_value": CONTROL_TOTAL,
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "green", "value": CONTROL_TOTAL},
],
}
elif panel_id in (3, 4, 5):
max_value = 4
thresholds = {
},
},
{
"id": 3,
"title": "Control Plane Workloads",
"expr": CONTROL_WORKLOADS_EXPR,
"kind": "stat",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
@ -637,40 +713,122 @@ def build_overview():
{"color": "orange", "value": 2},
{"color": "red", "value": 3},
],
}
else:
thresholds = {
},
"links": link_to("atlas-pods"),
},
{
"id": 5,
"title": "Stuck Terminating",
"expr": STUCK_TERMINATING_EXPR,
"kind": "stat",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "red", "value": max_value},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 3},
],
}
},
"links": link_to("atlas-pods"),
},
{
"id": 27,
"title": "Atlas Availability (30d)",
"expr": UPTIME_PERCENT_EXPR,
"kind": "stat",
"thresholds": UPTIME_PERCENT_THRESHOLDS,
"unit": "percentunit",
"decimals": 3,
"text_mode": "value",
},
{
"id": 4,
"title": "Problem Pods",
"expr": PROBLEM_PODS_EXPR,
"kind": "stat",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 3},
],
},
"links": link_to("atlas-pods"),
},
{
"id": 6,
"title": "CrashLoop / ImagePull",
"expr": CRASHLOOP_EXPR,
"kind": "stat",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 3},
],
},
"links": link_to("atlas-pods"),
},
{
"id": 1,
"title": "Workers Ready",
"expr": f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
"kind": "gauge",
"max_value": WORKER_TOTAL,
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": WORKER_TOTAL - 2},
{"color": "yellow", "value": WORKER_TOTAL - 1},
{"color": "green", "value": WORKER_TOTAL},
],
},
},
]
def gauge_grid(idx):
width = GAUGE_WIDTHS[idx] if idx < len(GAUGE_WIDTHS) else 4
x = sum(GAUGE_WIDTHS[:idx])
return width, x
for idx, item in enumerate(row1_stats):
panel_id = item["id"]
width, x = gauge_grid(idx)
if panel_id in (3, 4, 5):
grid = {"h": 5, "w": width, "x": x, "y": 0}
kind = item.get("kind", "gauge")
if kind == "stat":
panels.append(
stat_panel(
panel_id,
title,
expr,
{"h": 5, "w": width, "x": x, "y": 0},
thresholds=thresholds,
legend=None,
links=links,
text_mode="value",
)
)
item["title"],
item["expr"],
grid,
thresholds=item.get("thresholds"),
legend=None,
links=item.get("links"),
text_mode=item.get("text_mode", "value"),
value_suffix=item.get("value_suffix"),
unit=item.get("unit", "none"),
decimals=item.get("decimals"),
)
)
else:
panels.append(
gauge_panel(
panel_id,
title,
expr,
{"h": 5, "w": width, "x": x, "y": 0},
min_value=min_value,
max_value=max_value,
thresholds=thresholds,
links=links,
item["title"],
item["expr"],
grid,
min_value=0,
max_value=item.get("max_value", 5),
thresholds=item.get("thresholds"),
links=item.get("links"),
)
)
@ -774,7 +932,7 @@ def build_overview():
timeseries_panel(
16,
"Control plane CPU",
node_cpu_expr(CONTROL_REGEX),
node_cpu_expr(CONTROL_ALL_REGEX),
{"h": 10, "w": 12, "x": 0, "y": 44},
unit="percent",
legend="{{node}}",
@ -786,7 +944,7 @@ def build_overview():
timeseries_panel(
17,
"Control plane RAM",
node_mem_expr(CONTROL_REGEX),
node_mem_expr(CONTROL_ALL_REGEX),
{"h": 10, "w": 12, "x": 12, "y": 44},
unit="percent",
legend="{{node}}",
@ -795,6 +953,36 @@ def build_overview():
)
)
panels.append(
pie_panel(
28,
"Node Pod Share",
'(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100',
{"h": 10, "w": 12, "x": 0, "y": 54},
)
)
panels.append(
bargauge_panel(
29,
"Top Nodes by Pod Count",
'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))',
{"h": 10, "w": 12, "x": 12, "y": 54},
unit="none",
limit=12,
decimals=0,
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 50},
{"color": "orange", "value": 75},
{"color": "red", "value": 100},
],
},
instant=True,
)
)
panels.append(
timeseries_panel(
18,
@ -840,7 +1028,7 @@ def build_overview():
21,
"Root Filesystem Usage",
root_usage_expr(),
{"h": 16, "w": 12, "x": 0, "y": 54},
{"h": 16, "w": 12, "x": 0, "y": 64},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
@ -855,8 +1043,9 @@ def build_overview():
22,
"Nodes Closest to Full Root Disks",
f"topk(12, {root_usage_expr()})",
{"h": 16, "w": 12, "x": 12, "y": 54},
{"h": 16, "w": 12, "x": 12, "y": 64},
unit="percent",
thresholds=PERCENT_THRESHOLDS,
links=link_to("atlas-storage"),
)
)
@ -874,13 +1063,7 @@ def build_overview():
"templating": {"list": []},
"time": {"from": "now-1h", "to": "now"},
"refresh": "1m",
"links": [
{"title": "Atlas Pods", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": False},
{"title": "Atlas Nodes", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False},
{"title": "Atlas Storage", "type": "dashboard", "dashboardUid": "atlas-storage", "keepTime": False},
{"title": "Atlas Network", "type": "dashboard", "dashboardUid": "atlas-network", "keepTime": False},
{"title": "Atlas GPU", "type": "dashboard", "dashboardUid": "atlas-gpu", "keepTime": False},
],
"links": [],
}
@ -980,6 +1163,91 @@ def build_pods_dashboard():
],
)
)
panels.append(
pie_panel(
8,
"Node Pod Share",
'(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100',
{"h": 8, "w": 12, "x": 12, "y": 34},
)
)
panels.append(
bargauge_panel(
9,
"Top Nodes by Pod Count",
'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))',
{"h": 8, "w": 12, "x": 0, "y": 34},
unit="none",
limit=12,
decimals=0,
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 50},
{"color": "orange", "value": 75},
{"color": "red", "value": 100},
],
},
instant=True,
)
)
share_expr = (
'(sum by (namespace,node) (kube_pod_info{pod!="" , node!=""}) '
'/ on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=""}), 1) * 100)'
)
rank_terms = [
f"(sum by (node) (kube_node_info{{node=\"{node}\"}}) * 0 + {idx * 1e-3})"
for idx, node in enumerate(CONTROL_ALL + WORKER_NODES, start=1)
]
rank_expr = " or ".join(rank_terms)
score_expr = f"{share_expr} + on(node) group_left() ({rank_expr})"
mask_expr = (
f"{score_expr} == bool on(namespace) group_left() "
f"(max by (namespace) ({score_expr}))"
)
panels.append(
table_panel(
10,
"Namespace Plurality by Node v27",
(
f"{share_expr} * on(namespace,node) group_left() "
f"({mask_expr})"
),
{"h": 8, "w": 24, "x": 0, "y": 42},
unit="percent",
transformations=[
{"id": "labelsToFields", "options": {}},
{"id": "organize", "options": {"excludeByName": {"Time": True}}},
{"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 0}},
{
"id": "sortBy",
"options": {"fields": ["Value"], "order": "desc"},
},
{
"id": "groupBy",
"options": {
"fields": {
"namespace": {
"aggregations": [
{"field": "Value", "operation": "max"},
{"field": "node", "operation": "first"},
]
}
},
"rowBy": ["namespace"],
},
},
],
instant=True,
options={"showColumnFilters": False},
filterable=False,
footer={"show": False, "fields": "", "calcs": []},
format="table",
)
)
return {
"uid": "atlas-pods",
"title": "Atlas Pods",
@ -1022,12 +1290,69 @@ def build_nodes_dashboard():
{"h": 4, "w": 8, "x": 16, "y": 0},
)
)
panels.append(
stat_panel(
9,
"API Server 5xx rate",
APISERVER_5XX_RATE,
{"h": 4, "w": 8, "x": 0, "y": 4},
unit="req/s",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 0.05},
{"color": "orange", "value": 0.2},
{"color": "red", "value": 0.5},
],
},
decimals=3,
)
)
panels.append(
stat_panel(
10,
"API Server P99 latency",
APISERVER_P99_LATENCY_MS,
{"h": 4, "w": 8, "x": 8, "y": 4},
unit="ms",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 250},
{"color": "orange", "value": 400},
{"color": "red", "value": 600},
],
},
decimals=1,
)
)
panels.append(
stat_panel(
11,
"etcd P99 latency",
ETCD_P99_LATENCY_MS,
{"h": 4, "w": 8, "x": 16, "y": 4},
unit="ms",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 50},
{"color": "orange", "value": 100},
{"color": "red", "value": 200},
],
},
decimals=1,
)
)
panels.append(
timeseries_panel(
4,
"Node CPU",
node_cpu_expr(),
{"h": 9, "w": 24, "x": 0, "y": 4},
{"h": 9, "w": 24, "x": 0, "y": 8},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
@ -1040,7 +1365,7 @@ def build_nodes_dashboard():
5,
"Node RAM",
node_mem_expr(),
{"h": 9, "w": 24, "x": 0, "y": 13},
{"h": 9, "w": 24, "x": 0, "y": 17},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
@ -1053,7 +1378,7 @@ def build_nodes_dashboard():
6,
"Control Plane (incl. titan-db) CPU",
node_cpu_expr(CONTROL_ALL_REGEX),
{"h": 9, "w": 12, "x": 0, "y": 22},
{"h": 9, "w": 12, "x": 0, "y": 26},
unit="percent",
legend="{{node}}",
legend_display="table",
@ -1065,7 +1390,7 @@ def build_nodes_dashboard():
7,
"Control Plane (incl. titan-db) RAM",
node_mem_expr(CONTROL_ALL_REGEX),
{"h": 9, "w": 12, "x": 12, "y": 22},
{"h": 9, "w": 12, "x": 12, "y": 26},
unit="percent",
legend="{{node}}",
legend_display="table",
@ -1077,7 +1402,7 @@ def build_nodes_dashboard():
8,
"Root Filesystem Usage",
root_usage_expr(),
{"h": 9, "w": 24, "x": 0, "y": 31},
{"h": 9, "w": 24, "x": 0, "y": 35},
unit="percent",
legend="{{node}}",
legend_display="table",
@ -1204,43 +1529,107 @@ def build_network_dashboard():
panels.append(
stat_panel(
1,
"Ingress Traffic",
NET_INGRESS_EXPR,
{"h": 4, "w": 8, "x": 0, "y": 0},
unit="Bps",
"Ingress Success Rate (5m)",
TRAEFIK_SLI_5M,
{"h": 4, "w": 6, "x": 0, "y": 0},
unit="percentunit",
decimals=2,
thresholds={
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": 0.995},
{"color": "yellow", "value": 0.999},
{"color": "green", "value": 0.9995},
],
},
)
)
panels.append(
stat_panel(
2,
"Egress Traffic",
NET_EGRESS_EXPR,
{"h": 4, "w": 8, "x": 8, "y": 0},
unit="Bps",
"Error Budget Burn (1h)",
traefik_burn("1h"),
{"h": 4, "w": 6, "x": 6, "y": 0},
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 4},
],
},
decimals=2,
)
)
panels.append(
stat_panel(
3,
"Intra-Cluster Traffic",
NET_INTERNAL_EXPR,
{"h": 4, "w": 8, "x": 16, "y": 0},
unit="Bps",
"Error Budget Burn (6h)",
traefik_burn("6h"),
{"h": 4, "w": 6, "x": 12, "y": 0},
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 4},
],
},
decimals=2,
)
)
panels.append(
stat_panel(
4,
"Top Router req/s",
f"topk(1, {TRAEFIK_ROUTER_EXPR})",
"Edge P99 Latency (ms)",
TRAEFIK_P99_LATENCY_MS,
{"h": 4, "w": 6, "x": 18, "y": 0},
unit="ms",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 200},
{"color": "orange", "value": 350},
{"color": "red", "value": 500},
],
},
decimals=1,
)
)
panels.append(
stat_panel(
5,
"Ingress Traffic",
NET_INGRESS_EXPR,
{"h": 4, "w": 8, "x": 0, "y": 4},
unit="req/s",
legend="{{router}}",
unit="Bps",
)
)
panels.append(
stat_panel(
6,
"Egress Traffic",
NET_EGRESS_EXPR,
{"h": 4, "w": 8, "x": 8, "y": 4},
unit="Bps",
)
)
panels.append(
stat_panel(
7,
"Intra-Cluster Traffic",
NET_INTERNAL_EXPR,
{"h": 4, "w": 8, "x": 16, "y": 4},
unit="Bps",
)
)
panels.append(
timeseries_panel(
5,
8,
"Per-Node Throughput",
f'avg by (node) (({NET_NODE_TX_PHYS} + {NET_NODE_RX_PHYS}) * on(instance) group_left(node) {NODE_INFO})',
{"h": 8, "w": 24, "x": 0, "y": 8},
@ -1252,7 +1641,7 @@ def build_network_dashboard():
)
panels.append(
table_panel(
6,
9,
"Top Namespaces",
'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) '
'+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))',
@ -1263,7 +1652,7 @@ def build_network_dashboard():
)
panels.append(
table_panel(
7,
10,
"Top Pods",
'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) '
'+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))',
@ -1274,7 +1663,7 @@ def build_network_dashboard():
)
panels.append(
timeseries_panel(
8,
11,
"Traefik Routers (req/s)",
f"topk(10, {TRAEFIK_ROUTER_EXPR})",
{"h": 9, "w": 12, "x": 0, "y": 25},
@ -1286,7 +1675,7 @@ def build_network_dashboard():
)
panels.append(
timeseries_panel(
9,
12,
"Traefik Entrypoints (req/s)",
'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))',
{"h": 9, "w": 12, "x": 12, "y": 25},

204
scripts/mailu_sync.py Normal file
View File

@ -0,0 +1,204 @@
#!/usr/bin/env python3
"""
Sync Keycloak users to Mailu mailboxes.
- Generates/stores a mailu_app_password attribute in Keycloak (admin-only)
- Upserts the mailbox in Mailu Postgres using that password
"""
import os
import sys
import json
import time
import secrets
import string
import datetime
import requests
import psycopg2
from psycopg2.extras import RealDictCursor
from passlib.hash import bcrypt_sha256
KC_BASE = os.environ["KEYCLOAK_BASE_URL"].rstrip("/")
KC_REALM = os.environ["KEYCLOAK_REALM"]
KC_CLIENT_ID = os.environ["KEYCLOAK_CLIENT_ID"]
KC_CLIENT_SECRET = os.environ["KEYCLOAK_CLIENT_SECRET"]
MAILU_DOMAIN = os.environ["MAILU_DOMAIN"]
MAILU_DEFAULT_QUOTA = int(os.environ.get("MAILU_DEFAULT_QUOTA", "20000000000"))
DB_CONFIG = {
"host": os.environ["MAILU_DB_HOST"],
"port": int(os.environ.get("MAILU_DB_PORT", "5432")),
"dbname": os.environ["MAILU_DB_NAME"],
"user": os.environ["MAILU_DB_USER"],
"password": os.environ["MAILU_DB_PASSWORD"],
}
SESSION = requests.Session()
def log(msg):
sys.stdout.write(f"{msg}\n")
sys.stdout.flush()
def get_kc_token():
resp = SESSION.post(
f"{KC_BASE}/realms/{KC_REALM}/protocol/openid-connect/token",
data={
"grant_type": "client_credentials",
"client_id": KC_CLIENT_ID,
"client_secret": KC_CLIENT_SECRET,
},
timeout=15,
)
resp.raise_for_status()
return resp.json()["access_token"]
def kc_get_users(token):
users = []
first = 0
max_results = 200
headers = {"Authorization": f"Bearer {token}"}
while True:
resp = SESSION.get(
f"{KC_BASE}/admin/realms/{KC_REALM}/users",
params={"first": first, "max": max_results, "enabled": "true"},
headers=headers,
timeout=20,
)
resp.raise_for_status()
batch = resp.json()
users.extend(batch)
if len(batch) < max_results:
break
first += max_results
return users
def kc_update_attributes(token, user, attributes):
headers = {
"Authorization": f"Bearer {token}",
"Content-Type": "application/json",
}
payload = {
"firstName": user.get("firstName"),
"lastName": user.get("lastName"),
"email": user.get("email"),
"enabled": user.get("enabled", True),
"username": user["username"],
"emailVerified": user.get("emailVerified", False),
"attributes": attributes,
}
user_url = f"{KC_BASE}/admin/realms/{KC_REALM}/users/{user['id']}"
resp = SESSION.put(user_url, headers=headers, json=payload, timeout=20)
resp.raise_for_status()
verify = SESSION.get(
user_url,
headers={"Authorization": f"Bearer {token}"},
params={"briefRepresentation": "false"},
timeout=15,
)
verify.raise_for_status()
attrs = verify.json().get("attributes") or {}
if not attrs.get("mailu_app_password"):
raise Exception(f"attribute not persisted for {user.get('email') or user['username']}")
def random_password():
alphabet = string.ascii_letters + string.digits
return "".join(secrets.choice(alphabet) for _ in range(24))
def ensure_mailu_user(cursor, email, password, display_name):
localpart, domain = email.split("@", 1)
if domain.lower() != MAILU_DOMAIN.lower():
return
hashed = bcrypt_sha256.hash(password)
now = datetime.datetime.utcnow()
cursor.execute(
"""
INSERT INTO "user" (
email, localpart, domain_name, password,
quota_bytes, quota_bytes_used,
global_admin, enabled, enable_imap, enable_pop, allow_spoofing,
forward_enabled, forward_destination, forward_keep,
reply_enabled, reply_subject, reply_body, reply_startdate, reply_enddate,
displayed_name, spam_enabled, spam_mark_as_read, spam_threshold,
change_pw_next_login, created_at, updated_at, comment
)
VALUES (
%(email)s, %(localpart)s, %(domain)s, %(password)s,
%(quota)s, 0,
false, true, true, true, false,
false, '', true,
false, NULL, NULL, DATE '1900-01-01', DATE '2999-12-31',
%(display)s, true, true, 80,
false, CURRENT_DATE, %(now)s, ''
)
ON CONFLICT (email) DO UPDATE
SET password = EXCLUDED.password,
enabled = true,
updated_at = EXCLUDED.updated_at
""",
{
"email": email,
"localpart": localpart,
"domain": domain,
"password": hashed,
"quota": MAILU_DEFAULT_QUOTA,
"display": display_name or localpart,
"now": now,
},
)
def main():
token = get_kc_token()
users = kc_get_users(token)
if not users:
log("No users found; exiting.")
return
conn = psycopg2.connect(**DB_CONFIG)
conn.autocommit = True
cursor = conn.cursor(cursor_factory=RealDictCursor)
for user in users:
attrs = user.get("attributes", {}) or {}
app_pw_value = attrs.get("mailu_app_password")
if isinstance(app_pw_value, list):
app_pw = app_pw_value[0] if app_pw_value else None
elif isinstance(app_pw_value, str):
app_pw = app_pw_value
else:
app_pw = None
email = user.get("email")
if not email:
email = f"{user['username']}@{MAILU_DOMAIN}"
if not app_pw:
app_pw = random_password()
attrs["mailu_app_password"] = app_pw
kc_update_attributes(token, user, attrs)
log(f"Set mailu_app_password for {email}")
display_name = " ".join(
part for part in [user.get("firstName"), user.get("lastName")] if part
).strip()
ensure_mailu_user(cursor, email, app_pw, display_name)
log(f"Synced mailbox for {email}")
cursor.close()
conn.close()
if __name__ == "__main__":
try:
main()
except Exception as exc:
log(f"ERROR: {exc}")
sys.exit(1)

49
scripts/nextcloud-mail-sync.sh Executable file
View File

@ -0,0 +1,49 @@
#!/bin/bash
set -euo pipefail
KC_BASE="${KC_BASE:?}"
KC_REALM="${KC_REALM:?}"
KC_ADMIN_USER="${KC_ADMIN_USER:?}"
KC_ADMIN_PASS="${KC_ADMIN_PASS:?}"
if ! command -v jq >/dev/null 2>&1; then
apt-get update && apt-get install -y jq curl >/dev/null
fi
account_exists() {
# Skip if the account email is already present in the mail app.
runuser -u www-data -- php occ mail:account:list 2>/dev/null | grep -Fq " ${1}" || \
runuser -u www-data -- php occ mail:account:list 2>/dev/null | grep -Fq "${1} "
}
token=$(
curl -s -d "grant_type=password" \
-d "client_id=admin-cli" \
-d "username=${KC_ADMIN_USER}" \
-d "password=${KC_ADMIN_PASS}" \
"${KC_BASE}/realms/master/protocol/openid-connect/token" | jq -r '.access_token'
)
if [[ -z "${token}" || "${token}" == "null" ]]; then
echo "Failed to obtain admin token"
exit 1
fi
users=$(curl -s -H "Authorization: Bearer ${token}" \
"${KC_BASE}/admin/realms/${KC_REALM}/users?max=2000")
echo "${users}" | jq -c '.[]' | while read -r user; do
username=$(echo "${user}" | jq -r '.username')
email=$(echo "${user}" | jq -r '.email // empty')
app_pw=$(echo "${user}" | jq -r '.attributes.mailu_app_password[0] // empty')
[[ -z "${email}" || -z "${app_pw}" ]] && continue
if account_exists "${email}"; then
echo "Skipping ${email}, already exists"
continue
fi
echo "Syncing ${email}"
runuser -u www-data -- php occ mail:account:create \
"${username}" "${username}" "${email}" \
mail.bstein.dev 993 ssl "${email}" "${app_pw}" \
mail.bstein.dev 587 tls "${email}" "${app_pw}" login || true
done

View File

@ -0,0 +1,65 @@
#!/bin/bash
set -euo pipefail
NC_URL="${NC_URL:-https://cloud.bstein.dev}"
ADMIN_USER="${ADMIN_USER:?}"
ADMIN_PASS="${ADMIN_PASS:?}"
export DEBIAN_FRONTEND=noninteractive
apt-get update -qq
apt-get install -y -qq curl jq >/dev/null
run_occ() {
runuser -u www-data -- php occ "$@"
}
log() { echo "[$(date -Is)] $*"; }
log "Applying Atlas theming"
run_occ theming:config name "Atlas Cloud"
run_occ theming:config slogan "Unified access to Atlas services"
run_occ theming:config url "https://cloud.bstein.dev"
run_occ theming:config color "#0f172a"
run_occ theming:config disable-user-theming yes
log "Setting default quota to 200 GB"
run_occ config:app:set files default_quota --value "200 GB"
API_BASE="${NC_URL}/ocs/v2.php/apps/external/api/v1"
AUTH=(-u "${ADMIN_USER}:${ADMIN_PASS}" -H "OCS-APIRequest: true")
log "Removing existing external links"
existing=$(curl -sf "${AUTH[@]}" "${API_BASE}?format=json" | jq -r '.ocs.data[].id // empty')
for id in ${existing}; do
curl -sf "${AUTH[@]}" -X DELETE "${API_BASE}/sites/${id}?format=json" >/dev/null || true
done
SITES=(
"Vaultwarden|https://vault.bstein.dev"
"Jellyfin|https://stream.bstein.dev"
"Gitea|https://scm.bstein.dev"
"Jenkins|https://ci.bstein.dev"
"Zot|https://registry.bstein.dev"
"Vault|https://secret.bstein.dev"
"Jitsi|https://meet.bstein.dev"
"Grafana|https://metrics.bstein.dev"
"Chat LLM|https://chat.ai.bstein.dev"
"Vision|https://draw.ai.bstein.dev"
"STT/TTS|https://talk.ai.bstein.dev"
)
log "Seeding external links"
for entry in "${SITES[@]}"; do
IFS="|" read -r name url <<<"${entry}"
curl -sf "${AUTH[@]}" -X POST "${API_BASE}/sites?format=json" \
-d "name=${name}" \
-d "url=${url}" \
-d "lang=" \
-d "type=link" \
-d "device=" \
-d "icon=" \
-d "groups[]=" \
-d "redirect=1" >/dev/null
done
log "Maintenance run completed"

View File

@ -0,0 +1,58 @@
import importlib.util
import pathlib
def load_module():
path = pathlib.Path(__file__).resolve().parents[1] / "dashboards_render_atlas.py"
spec = importlib.util.spec_from_file_location("dashboards_render_atlas", path)
module = importlib.util.module_from_spec(spec)
assert spec.loader is not None
spec.loader.exec_module(module)
return module
def test_table_panel_options_and_filterable():
mod = load_module()
panel = mod.table_panel(
1,
"test",
"metric",
{"h": 1, "w": 1, "x": 0, "y": 0},
unit="percent",
transformations=[{"id": "labelsToFields", "options": {}}],
instant=True,
options={"showColumnFilters": False},
filterable=False,
footer={"show": False, "fields": "", "calcs": []},
format="table",
)
assert panel["fieldConfig"]["defaults"]["unit"] == "percent"
assert panel["fieldConfig"]["defaults"]["custom"]["filterable"] is False
assert panel["options"]["showHeader"] is True
assert panel["targets"][0]["format"] == "table"
def test_node_filter_and_expr_helpers():
mod = load_module()
expr = mod.node_filter("titan-.*")
assert "label_replace" in expr
cpu_expr = mod.node_cpu_expr("titan-.*")
mem_expr = mod.node_mem_expr("titan-.*")
assert "node_cpu_seconds_total" in cpu_expr
assert "node_memory_MemAvailable_bytes" in mem_expr
def test_render_configmap_writes(tmp_path):
mod = load_module()
mod.DASHBOARD_DIR = tmp_path / "dash"
mod.ROOT = tmp_path
uid = "atlas-test"
info = {"configmap": tmp_path / "cm.yaml"}
data = {"title": "Atlas Test"}
mod.write_json(uid, data)
mod.render_configmap(uid, info)
json_path = mod.DASHBOARD_DIR / f"{uid}.json"
assert json_path.exists()
content = (tmp_path / "cm.yaml").read_text()
assert "kind: ConfigMap" in content
assert f"{uid}.json" in content

View File

@ -0,0 +1,181 @@
import importlib.util
import pathlib
import pytest
def load_sync_module(monkeypatch):
# Minimal env required by module import
env = {
"KEYCLOAK_BASE_URL": "http://keycloak",
"KEYCLOAK_REALM": "atlas",
"KEYCLOAK_CLIENT_ID": "mailu-sync",
"KEYCLOAK_CLIENT_SECRET": "secret",
"MAILU_DOMAIN": "example.com",
"MAILU_DB_HOST": "localhost",
"MAILU_DB_PORT": "5432",
"MAILU_DB_NAME": "mailu",
"MAILU_DB_USER": "mailu",
"MAILU_DB_PASSWORD": "pw",
}
for k, v in env.items():
monkeypatch.setenv(k, v)
module_path = pathlib.Path(__file__).resolve().parents[1] / "mailu_sync.py"
spec = importlib.util.spec_from_file_location("mailu_sync_testmod", module_path)
module = importlib.util.module_from_spec(spec)
assert spec.loader is not None
spec.loader.exec_module(module)
return module
def test_random_password_length_and_charset(monkeypatch):
sync = load_sync_module(monkeypatch)
pw = sync.random_password()
assert len(pw) == 24
assert all(ch.isalnum() for ch in pw)
class _FakeResponse:
def __init__(self, json_data=None, status=200):
self._json_data = json_data or {}
self.status_code = status
def raise_for_status(self):
if self.status_code >= 400:
raise AssertionError(f"status {self.status_code}")
def json(self):
return self._json_data
class _FakeSession:
def __init__(self, put_resp, get_resp):
self.put_resp = put_resp
self.get_resp = get_resp
self.put_called = False
self.get_called = False
def post(self, *args, **kwargs):
return _FakeResponse({"access_token": "dummy"})
def put(self, *args, **kwargs):
self.put_called = True
return self.put_resp
def get(self, *args, **kwargs):
self.get_called = True
return self.get_resp
def test_kc_update_attributes_succeeds(monkeypatch):
sync = load_sync_module(monkeypatch)
ok_resp = _FakeResponse({"attributes": {"mailu_app_password": ["abc"]}})
sync.SESSION = _FakeSession(_FakeResponse({}), ok_resp)
sync.kc_update_attributes("token", {"id": "u1", "username": "u1"}, {"mailu_app_password": "abc"})
assert sync.SESSION.put_called and sync.SESSION.get_called
def test_kc_update_attributes_raises_without_attribute(monkeypatch):
sync = load_sync_module(monkeypatch)
missing_attr_resp = _FakeResponse({"attributes": {}}, status=200)
sync.SESSION = _FakeSession(_FakeResponse({}), missing_attr_resp)
with pytest.raises(Exception):
sync.kc_update_attributes("token", {"id": "u1", "username": "u1"}, {"mailu_app_password": "abc"})
def test_kc_get_users_paginates(monkeypatch):
sync = load_sync_module(monkeypatch)
class _PagedSession:
def __init__(self):
self.calls = 0
def post(self, *_, **__):
return _FakeResponse({"access_token": "tok"})
def get(self, *_, **__):
self.calls += 1
if self.calls == 1:
return _FakeResponse([{"id": "u1"}, {"id": "u2"}])
return _FakeResponse([]) # stop pagination
sync.SESSION = _PagedSession()
users = sync.kc_get_users("tok")
assert [u["id"] for u in users] == ["u1", "u2"]
assert sync.SESSION.calls == 2
def test_ensure_mailu_user_skips_foreign_domain(monkeypatch):
sync = load_sync_module(monkeypatch)
executed = []
class _Cursor:
def execute(self, sql, params):
executed.append((sql, params))
sync.ensure_mailu_user(_Cursor(), "user@other.com", "pw", "User")
assert not executed
def test_ensure_mailu_user_upserts(monkeypatch):
sync = load_sync_module(monkeypatch)
captured = {}
class _Cursor:
def execute(self, sql, params):
captured.update(params)
sync.ensure_mailu_user(_Cursor(), "user@example.com", "pw", "User Example")
assert captured["email"] == "user@example.com"
assert captured["localpart"] == "user"
# password should be hashed, not the raw string
assert captured["password"] != "pw"
def test_main_generates_password_and_upserts(monkeypatch):
sync = load_sync_module(monkeypatch)
users = [
{"id": "u1", "username": "user1", "email": "user1@example.com", "attributes": {}},
{"id": "u2", "username": "user2", "email": "user2@example.com", "attributes": {"mailu_app_password": ["keepme"]}},
{"id": "u3", "username": "user3", "email": "user3@other.com", "attributes": {}},
]
updated = []
class _Cursor:
def __init__(self):
self.executions = []
def execute(self, sql, params):
self.executions.append(params)
def close(self):
return None
class _Conn:
def __init__(self):
self.autocommit = False
self._cursor = _Cursor()
def cursor(self, cursor_factory=None):
return self._cursor
def close(self):
return None
monkeypatch.setattr(sync, "get_kc_token", lambda: "tok")
monkeypatch.setattr(sync, "kc_get_users", lambda token: users)
monkeypatch.setattr(sync, "kc_update_attributes", lambda token, user, attrs: updated.append((user["id"], attrs["mailu_app_password"])))
conns = []
def _connect(**kwargs):
conn = _Conn()
conns.append(conn)
return conn
monkeypatch.setattr(sync.psycopg2, "connect", _connect)
sync.main()
# Should attempt two inserts (third user skipped due to domain mismatch)
assert len(updated) == 1 # only one missing attr was backfilled
assert conns and len(conns[0]._cursor.executions) == 2

View File

@ -5,7 +5,7 @@ metadata:
name: gitea-ingress
namespace: gitea
annotations:
cert-manager.io/cluster-issuer: "letsencrypt-prod"
cert-manager.io/cluster-issuer: letsencrypt
nginx.ingress.kubernetes.io/ssl-redirect: "true"
spec:
tls:

View File

@ -0,0 +1,49 @@
# services/gitops-ui/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: weave-gitops
namespace: flux-system
spec:
interval: 30m
chart:
spec:
chart: ./charts/gitops-server
sourceRef:
kind: GitRepository
name: weave-gitops-upstream
namespace: flux-system
# track upstream tag; see source object for version pin
install:
remediation:
retries: 3
upgrade:
remediation:
retries: 3
remediateLastFailure: true
cleanupOnFail: true
values:
adminUser:
create: true
createClusterRole: true
createSecret: true
username: admin
# bcrypt hash for temporary password "G1tOps!2025" (rotate after login)
passwordHash: "$2y$12$wDEOzR1Gc2dbvNSJ3ZXNdOBVFEjC6YASIxnZmHIbO.W1m0fie/QVi"
ingress:
enabled: true
className: traefik
annotations:
cert-manager.io/cluster-issuer: letsencrypt-prod
traefik.ingress.kubernetes.io/router.entrypoints: websecure
hosts:
- host: cd.bstein.dev
paths:
- path: /
pathType: Prefix
tls:
- secretName: gitops-ui-tls
hosts:
- cd.bstein.dev
metrics:
enabled: true

View File

@ -0,0 +1,7 @@
# services/gitops-ui/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: flux-system
resources:
- source.yaml
- helmrelease.yaml

View File

@ -0,0 +1,11 @@
# services/gitops-ui/source.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
name: weave-gitops-upstream
namespace: flux-system
spec:
interval: 1h
url: https://github.com/weaveworks/weave-gitops.git
ref:
tag: v0.38.0

View File

@ -5,7 +5,7 @@ metadata:
name: jitsi
namespace: jitsi
annotations:
cert-manager.io/cluster-issuer: "letsencrypt-prod"
cert-manager.io/cluster-issuer: letsencrypt
spec:
ingressClassName: traefik
tls:

View File

@ -0,0 +1,27 @@
# services/keycloak
Keycloak is deployed via raw manifests and backed by the shared Postgres (`postgres-service.postgres.svc.cluster.local:5432`). Create these secrets before applying:
```bash
# DB creds (per-service DB/user in shared Postgres)
kubectl -n sso create secret generic keycloak-db \
--from-literal=username=keycloak \
--from-literal=password='<DB_PASSWORD>' \
--from-literal=database=keycloak
# Admin console creds (maps to KC admin user)
kubectl -n sso create secret generic keycloak-admin \
--from-literal=username=brad@bstein.dev \
--from-literal=password='<ADMIN_PASSWORD>'
```
Apply:
```bash
kubectl apply -k services/keycloak
```
Notes
- Service: `keycloak.sso.svc:80` (Ingress `sso.bstein.dev`, TLS via cert-manager).
- Uses Postgres schema `public`; DB/user should be provisioned in the shared Postgres instance.
- Health endpoints on :9000 are wired for probes.

View File

@ -0,0 +1,154 @@
# services/keycloak/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: keycloak
namespace: sso
labels:
app: keycloak
spec:
replicas: 1
selector:
matchLabels:
app: keycloak
template:
metadata:
labels:
app: keycloak
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values: ["rpi5","rpi4"]
- key: node-role.kubernetes.io/worker
operator: Exists
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values: ["titan-24"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 70
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
securityContext:
runAsUser: 1000
runAsGroup: 0
fsGroup: 1000
fsGroupChangePolicy: OnRootMismatch
imagePullSecrets:
- name: zot-regcred
initContainers:
- name: mailu-http-listener
image: registry.bstein.dev/sso/mailu-http-listener:0.1.0
imagePullPolicy: IfNotPresent
command: ["/bin/sh", "-c"]
args:
- |
cp /plugin/mailu-http-listener-0.1.0.jar /providers/
cp -r /plugin/src /providers/src
volumeMounts:
- name: providers
mountPath: /providers
containers:
- name: keycloak
image: quay.io/keycloak/keycloak:26.0.7
imagePullPolicy: IfNotPresent
args:
- start
env:
- name: KC_DB
value: postgres
- name: KC_DB_URL_HOST
value: postgres-service.postgres.svc.cluster.local
- name: KC_DB_URL_DATABASE
valueFrom:
secretKeyRef:
name: keycloak-db
key: database
- name: KC_DB_USERNAME
valueFrom:
secretKeyRef:
name: keycloak-db
key: username
- name: KC_DB_PASSWORD
valueFrom:
secretKeyRef:
name: keycloak-db
key: password
- name: KC_DB_SCHEMA
value: public
- name: KC_HOSTNAME
value: sso.bstein.dev
- name: KC_HOSTNAME_URL
value: https://sso.bstein.dev
- name: KC_PROXY
value: edge
- name: KC_PROXY_HEADERS
value: xforwarded
- name: KC_HTTP_ENABLED
value: "true"
- name: KC_HTTP_MANAGEMENT_PORT
value: "9000"
- name: KC_HTTP_MANAGEMENT_BIND_ADDRESS
value: 0.0.0.0
- name: KC_HEALTH_ENABLED
value: "true"
- name: KC_METRICS_ENABLED
value: "true"
- name: KEYCLOAK_ADMIN
valueFrom:
secretKeyRef:
name: keycloak-admin
key: username
- name: KEYCLOAK_ADMIN_PASSWORD
valueFrom:
secretKeyRef:
name: keycloak-admin
key: password
- name: KC_EVENTS_LISTENERS
value: jboss-logging,mailu-http
- name: KC_SPI_EVENTS_LISTENER_MAILU-HTTP_ENDPOINT
value: http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events
ports:
- containerPort: 8080
name: http
- containerPort: 9000
name: metrics
readinessProbe:
httpGet:
path: /health/ready
port: 9000
initialDelaySeconds: 15
periodSeconds: 10
failureThreshold: 6
livenessProbe:
httpGet:
path: /health/live
port: 9000
initialDelaySeconds: 60
periodSeconds: 15
failureThreshold: 6
volumeMounts:
- name: data
mountPath: /opt/keycloak/data
- name: providers
mountPath: /opt/keycloak/providers
volumes:
- name: data
persistentVolumeClaim:
claimName: keycloak-data
- name: providers
emptyDir: {}

View File

@ -0,0 +1,24 @@
# services/keycloak/ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: keycloak
namespace: sso
annotations:
cert-manager.io/cluster-issuer: letsencrypt
spec:
ingressClassName: traefik
rules:
- host: sso.bstein.dev
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: keycloak
port:
number: 80
tls:
- hosts: [sso.bstein.dev]
secretName: keycloak-tls

View File

@ -0,0 +1,10 @@
# services/keycloak/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: sso
resources:
- namespace.yaml
- pvc.yaml
- deployment.yaml
- service.yaml
- ingress.yaml

View File

@ -0,0 +1,5 @@
# services/keycloak/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: sso

View File

@ -0,0 +1,12 @@
# services/keycloak/pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: keycloak-data
namespace: sso
spec:
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 10Gi
storageClassName: astreae

View File

@ -0,0 +1,15 @@
# services/keycloak/service.yaml
apiVersion: v1
kind: Service
metadata:
name: keycloak
namespace: sso
labels:
app: keycloak
spec:
selector:
app: keycloak
ports:
- name: http
port: 80
targetPort: http

View File

@ -0,0 +1,13 @@
# services/mailu/certificate.yaml
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: mailu-tls
namespace: mailu-mailserver
spec:
secretName: mailu-certificates
issuerRef:
kind: ClusterIssuer
name: letsencrypt-prod
dnsNames:
- mail.bstein.dev

View File

@ -0,0 +1,287 @@
# services/mailu/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: mailu
namespace: mailu-mailserver
spec:
interval: 30m
chart:
spec:
chart: mailu
version: 2.1.2
sourceRef:
kind: HelmRepository
name: mailu
namespace: flux-system
install:
remediation: { retries: 3 }
timeout: 10m
upgrade:
remediation:
retries: 3
remediateLastFailure: true
cleanupOnFail: true
timeout: 10m
values:
mailuVersion: "2024.06"
domain: bstein.dev
hostnames: [mail.bstein.dev]
domains:
- name: bstein.dev
enabled: true
dkim:
enabled: true
externalRelay:
host: "[email-smtp.us-east-2.amazonaws.com]:587"
existingSecret: mailu-ses-relay
usernameKey: relay-username
passwordKey: relay-password
timezone: Etc/UTC
subnet: 10.42.0.0/16
existingSecret: mailu-secret
tls:
outboundLevel: encrypt
externalDatabase:
enabled: true
type: postgresql
host: postgres-service.postgres.svc.cluster.local
port: 5432
database: mailu
username: mailu
existingSecret: mailu-db-secret
existingSecretUsernameKey: username
existingSecretPasswordKey: password
existingSecretDatabaseKey: database
initialAccount:
enabled: true
username: test
domain: bstein.dev
existingSecret: mailu-initial-account-secret
existingSecretPasswordKey: password
persistence:
accessModes: [ReadWriteMany]
size: 100Gi
storageClass: astreae
single_pvc: true
front:
hostnames: [mail.bstein.dev]
proxied: true
hostPort:
enabled: false
https:
enabled: false
external: false
forceHttps: false
externalService:
enabled: true
type: LoadBalancer
externalTrafficPolicy: Cluster
ports:
submission: true
nodePorts:
pop3: 30010
pop3s: 30011
imap: 30143
imaps: 30993
manageSieve: 30419
smtp: 30025
smtps: 30465
submission: 30587
logLevel: DEBUG
nodeSelector:
hardware: rpi4
admin:
logLevel: DEBUG
nodeSelector:
hardware: rpi4
podLivenessProbe:
enabled: true
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 6
successThreshold: 1
podReadinessProbe:
enabled: true
initialDelaySeconds: 20
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 6
successThreshold: 1
extraEnvVars:
- name: FLASK_DEBUG
value: "1"
- name: ACCESSLOG
value: /dev/stdout
- name: ERRORLOG
value: /dev/stderr
- name: WEBROOT_REDIRECT
value: ""
- name: FORWARDED_ALLOW_IPS
value: 127.0.0.1,10.42.0.0/16
- name: DNS_RESOLVERS
value: 1.1.1.1,9.9.9.9
extraVolumes:
- name: unbound-config
configMap:
name: mailu-unbound
- name: unbound-run
emptyDir: {}
extraVolumeMounts:
- name: unbound-run
mountPath: /var/lib/unbound
extraContainers:
- name: unbound
image: docker.io/alpine:3.20
command: ["/bin/sh", "-c"]
args:
- |
while :; do
printf "nameserver 10.43.0.10\n" > /etc/resolv.conf
if apk add --no-cache unbound bind-tools; then
break
fi
echo "apk failed, retrying" >&2
sleep 10
done
cat >/etc/resolv.conf <<'EOF'
search mailu-mailserver.svc.cluster.local svc.cluster.local cluster.local
nameserver 127.0.0.1
EOF
unbound-anchor -a /var/lib/unbound/root.key || true
exec unbound -d -c /opt/unbound/etc/unbound/unbound.conf
ports:
- containerPort: 53
protocol: UDP
- containerPort: 53
protocol: TCP
volumeMounts:
- name: unbound-config
mountPath: /opt/unbound/etc/unbound
- name: unbound-run
mountPath: /var/lib/unbound
dnsPolicy: None
dnsConfig:
nameservers:
- 127.0.0.1
searches:
- mailu-mailserver.svc.cluster.local
- svc.cluster.local
- cluster.local
clamav:
image:
repository: clamav/clamav-debian
tag: "1.4"
logLevel: DEBUG
nodeSelector:
hardware: rpi5
resources:
requests:
cpu: 200m
memory: 1Gi
limits:
cpu: 500m
memory: 3Gi
livenessProbe:
enabled: false
initialDelaySeconds: 300
periodSeconds: 30
timeoutSeconds: 5
failureThreshold: 6
successThreshold: 1
startupProbe:
enabled: false
initialDelaySeconds: 60
periodSeconds: 30
timeoutSeconds: 5
failureThreshold: 20
successThreshold: 1
readinessProbe:
enabled: false
initialDelaySeconds: 300
periodSeconds: 30
timeoutSeconds: 5
failureThreshold: 6
successThreshold: 1
dovecot:
logLevel: DEBUG
nodeSelector:
hardware: rpi4
oletools:
logLevel: DEBUG
nodeSelector:
hardware: rpi4
postfix:
logLevel: DEBUG
nodeSelector:
hardware: rpi4
overrides:
smtp_use_tls: "yes"
smtp_tls_security_level: "encrypt"
smtp_sasl_security_options: "noanonymous"
redis:
enabled: true
architecture: standalone
logLevel: DEBUG
image:
repository: bitnamilegacy/redis
tag: 8.0.3-debian-12-r3
master:
nodeSelector:
hardware: rpi4
persistence:
enabled: true
accessModes: [ReadWriteMany]
size: 8Gi
storageClass: astreae
rspamd:
logLevel: DEBUG
nodeSelector:
hardware: rpi4
persistence:
accessModes: [ReadWriteOnce]
size: 8Gi
storageClass: astreae
tika:
logLevel: DEBUG
nodeSelector:
hardware: rpi4
global:
logLevel: DEBUG
storageClass: astreae
webmail:
enabled: false
nodeSelector:
hardware: rpi4
ingress:
enabled: false
ingressClassName: traefik
tls: true
existingSecret: mailu-certificates
annotations:
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/service.serversscheme: https
traefik.ingress.kubernetes.io/service.serverstransport: mailu-transport@kubernetescrd
extraRules:
- host: mail.bstein.dev
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: mailu-front
port:
number: 443
service:
ports:
smtp:
port: 25
targetPort: 25
smtps:
port: 465
targetPort: 465
submission:
port: 587
targetPort: 587

View File

@ -0,0 +1,19 @@
# services/mailu/ingressroute.yaml
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
name: mailu
namespace: mailu-mailserver
spec:
entryPoints:
- websecure
routes:
- match: Host(`mail.bstein.dev`)
kind: Rule
services:
- name: mailu-front
port: 443
scheme: https
serversTransport: mailu-transport
tls:
secretName: mailu-certificates

View File

@ -0,0 +1,23 @@
# services/mailu/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: mailu-mailserver
resources:
- namespace.yaml
- helmrelease.yaml
- certificate.yaml
- vip-controller.yaml
- unbound-configmap.yaml
- serverstransport.yaml
- ingressroute.yaml
- mailu-sync-job.yaml
- mailu-sync-cronjob.yaml
- mailu-sync-listener.yaml
configMapGenerator:
- name: mailu-sync-script
namespace: mailu-mailserver
files:
- sync.py=../../scripts/mailu_sync.py
options:
disableNameSuffixHash: true

View File

@ -0,0 +1,77 @@
# services/mailu/mailu-sync-cronjob.yaml
apiVersion: batch/v1
kind: CronJob
metadata:
name: mailu-sync-nightly
namespace: mailu-mailserver
spec:
schedule: "30 4 * * *"
concurrencyPolicy: Forbid
jobTemplate:
spec:
template:
spec:
restartPolicy: OnFailure
containers:
- name: mailu-sync
image: python:3.11-alpine
imagePullPolicy: IfNotPresent
command: ["/bin/sh", "-c"]
args:
- |
pip install --no-cache-dir requests psycopg2-binary passlib >/tmp/pip.log \
&& python /app/sync.py
env:
- name: KEYCLOAK_BASE_URL
value: http://keycloak.sso.svc.cluster.local
- name: KEYCLOAK_REALM
value: atlas
- name: MAILU_DOMAIN
value: bstein.dev
- name: MAILU_DEFAULT_QUOTA
value: "20000000000"
- name: MAILU_DB_HOST
value: postgres-service.postgres.svc.cluster.local
- name: MAILU_DB_PORT
value: "5432"
- name: MAILU_DB_NAME
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: database
- name: MAILU_DB_USER
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: username
- name: MAILU_DB_PASSWORD
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: password
- name: KEYCLOAK_CLIENT_ID
valueFrom:
secretKeyRef:
name: mailu-sync-credentials
key: client-id
- name: KEYCLOAK_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: mailu-sync-credentials
key: client-secret
volumeMounts:
- name: sync-script
mountPath: /app/sync.py
subPath: sync.py
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
volumes:
- name: sync-script
configMap:
name: mailu-sync-script
defaultMode: 0444

View File

@ -0,0 +1,73 @@
# services/mailu/mailu-sync-job.yaml
apiVersion: batch/v1
kind: Job
metadata:
name: mailu-sync
namespace: mailu-mailserver
spec:
template:
spec:
restartPolicy: OnFailure
containers:
- name: mailu-sync
image: python:3.11-alpine
imagePullPolicy: IfNotPresent
command: ["/bin/sh", "-c"]
args:
- |
pip install --no-cache-dir requests psycopg2-binary passlib >/tmp/pip.log \
&& python /app/sync.py
env:
- name: KEYCLOAK_BASE_URL
value: http://keycloak.sso.svc.cluster.local
- name: KEYCLOAK_REALM
value: atlas
- name: MAILU_DOMAIN
value: bstein.dev
- name: MAILU_DEFAULT_QUOTA
value: "20000000000"
- name: MAILU_DB_HOST
value: postgres-service.postgres.svc.cluster.local
- name: MAILU_DB_PORT
value: "5432"
- name: MAILU_DB_NAME
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: database
- name: MAILU_DB_USER
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: username
- name: MAILU_DB_PASSWORD
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: password
- name: KEYCLOAK_CLIENT_ID
valueFrom:
secretKeyRef:
name: mailu-sync-credentials
key: client-id
- name: KEYCLOAK_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: mailu-sync-credentials
key: client-secret
volumeMounts:
- name: sync-script
mountPath: /app/sync.py
subPath: sync.py
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
volumes:
- name: sync-script
configMap:
name: mailu-sync-script
defaultMode: 0444

View File

@ -0,0 +1,154 @@
# services/mailu/mailu-sync-listener.yaml
apiVersion: v1
kind: Service
metadata:
name: mailu-sync-listener
namespace: mailu-mailserver
spec:
selector:
app: mailu-sync-listener
ports:
- name: http
port: 8080
targetPort: 8080
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: mailu-sync-listener
namespace: mailu-mailserver
labels:
app: mailu-sync-listener
spec:
replicas: 1
selector:
matchLabels:
app: mailu-sync-listener
template:
metadata:
labels:
app: mailu-sync-listener
spec:
restartPolicy: Always
containers:
- name: listener
image: python:3.11-alpine
imagePullPolicy: IfNotPresent
command: ["/bin/sh", "-c"]
args:
- |
pip install --no-cache-dir requests psycopg2-binary passlib >/tmp/pip.log \
&& python /app/listener.py
env:
- name: KEYCLOAK_BASE_URL
value: http://keycloak.sso.svc.cluster.local
- name: KEYCLOAK_REALM
value: atlas
- name: MAILU_DOMAIN
value: bstein.dev
- name: MAILU_DEFAULT_QUOTA
value: "20000000000"
- name: MAILU_DB_HOST
value: postgres-service.postgres.svc.cluster.local
- name: MAILU_DB_PORT
value: "5432"
- name: MAILU_DB_NAME
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: database
- name: MAILU_DB_USER
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: username
- name: MAILU_DB_PASSWORD
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: password
- name: KEYCLOAK_CLIENT_ID
valueFrom:
secretKeyRef:
name: mailu-sync-credentials
key: client-id
- name: KEYCLOAK_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: mailu-sync-credentials
key: client-secret
volumeMounts:
- name: sync-script
mountPath: /app/sync.py
subPath: sync.py
- name: listener-script
mountPath: /app/listener.py
subPath: listener.py
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
volumes:
- name: sync-script
configMap:
name: mailu-sync-script
defaultMode: 0444
- name: listener-script
configMap:
name: mailu-sync-listener
defaultMode: 0444
---
apiVersion: v1
kind: ConfigMap
metadata:
name: mailu-sync-listener
namespace: mailu-mailserver
data:
listener.py: |
import http.server
import json
import os
import subprocess
import threading
from time import time
# Simple debounce to avoid hammering on bursts
MIN_INTERVAL_SECONDS = 10
last_run = 0.0
lock = threading.Lock()
def trigger_sync():
global last_run
with lock:
now = time()
if now - last_run < MIN_INTERVAL_SECONDS:
return
last_run = now
# Fire and forget; output to stdout
subprocess.Popen(["python", "/app/sync.py"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
class Handler(http.server.BaseHTTPRequestHandler):
def do_POST(self):
length = int(self.headers.get("Content-Length", 0))
body = self.rfile.read(length) if length else b""
try:
json.loads(body or b"{}")
except json.JSONDecodeError:
self.send_response(400)
self.end_headers()
return
trigger_sync()
self.send_response(202)
self.end_headers()
def log_message(self, fmt, *args):
# Quiet logging
return
if __name__ == "__main__":
server = http.server.ThreadingHTTPServer(("", 8080), Handler)
server.serve_forever()

View File

@ -0,0 +1,5 @@
# services/mailu/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: mailu-mailserver

View File

@ -0,0 +1,10 @@
# services/mailu/serverstransport.yaml
apiVersion: traefik.io/v1alpha1
kind: ServersTransport
metadata:
name: mailu-transport
namespace: mailu-mailserver
spec:
# Force SNI to mail.bstein.dev and skip backend cert verification (backend cert is for the host, not the pod IP).
serverName: mail.bstein.dev
insecureSkipVerify: true

View File

@ -0,0 +1,49 @@
# services/mailu/unbound-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: mailu-unbound
namespace: mailu-mailserver
data:
unbound.conf: |
server:
verbosity: 1
interface: 0.0.0.0
do-ip4: yes
do-ip6: no
do-udp: yes
do-tcp: yes
auto-trust-anchor-file: "/var/lib/unbound/root.key"
prefetch: yes
qname-minimisation: yes
harden-dnssec-stripped: yes
val-clean-additional: yes
domain-insecure: "mailu-mailserver.svc.cluster.local."
domain-insecure: "svc.cluster.local."
domain-insecure: "cluster.local."
cache-min-ttl: 120
cache-max-ttl: 86400
access-control: 0.0.0.0/0 allow
forward-zone:
name: "mailu-mailserver.svc.cluster.local."
forward-addr: 10.43.0.10
forward-no-cache: yes
forward-first: yes
forward-zone:
name: "svc.cluster.local."
forward-addr: 10.43.0.10
forward-no-cache: yes
forward-first: yes
forward-zone:
name: "cluster.local."
forward-addr: 10.43.0.10
forward-no-cache: yes
forward-first: yes
forward-zone:
name: "."
forward-addr: 9.9.9.9
forward-addr: 1.1.1.1

View File

@ -0,0 +1,71 @@
# services/mailu/vip-controller.yaml
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: vip-controller
namespace: mailu-mailserver
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: vip-controller-role
namespace: mailu-mailserver
rules:
- apiGroups: ["apps"]
resources: ["deployments"]
verbs: ["get", "list", "patch", "update"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: vip-controller-binding
namespace: mailu-mailserver
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: vip-controller-role
subjects:
- kind: ServiceAccount
name: vip-controller
namespace: mailu-mailserver
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: vip-controller
namespace: mailu-mailserver
spec:
selector:
matchLabels:
app: vip-controller
template:
metadata:
labels:
app: vip-controller
spec:
serviceAccountName: vip-controller
hostNetwork: true
nodeSelector:
mailu.bstein.dev/vip: "true"
containers:
- name: vip-controller
image: lachlanevenson/k8s-kubectl:latest
imagePullPolicy: IfNotPresent
command:
- /bin/sh
- -c
args:
- |
set -e
while true; do
if ip addr show end0 | grep -q 'inet 192\.168\.22\.9/32'; then
NODE=$(hostname)
echo "VIP found on node ${NODE}."
kubectl patch deployment mailu-front -n mailu-mailserver --type='merge' \
-p "{\"spec\":{\"template\":{\"spec\":{\"nodeSelector\":{\"kubernetes.io/hostname\":\"${NODE}\"}}}}}"
else
echo "No VIP on node ${HOSTNAME}."
fi
sleep 60
done

View File

@ -1,28 +0,0 @@
# services/monitoring
## Grafana admin secret
The Grafana Helm release expects a pre-existing secret named `grafana-admin`
in the `monitoring` namespace. Create or rotate it with:
```bash
kubectl create secret generic grafana-admin \
--namespace monitoring \
--from-literal=admin-user=admin \
--from-literal=admin-password='REPLACE_ME'
```
Update the password whenever you rotate credentials.
## DCGM exporter image
The NVIDIA GPU metrics DaemonSet expects `registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04`, mirrored from `docker.io/nvidia/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04`. Refresh it in Zot when bumping versions:
```bash
skopeo copy \
--all \
docker://docker.io/nvidia/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04 \
docker://registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04
```
When finished mirroring from the control-plane, you can remove temporary tooling with `sudo apt-get purge -y skopeo && sudo apt-get autoremove -y` and clear `~/.config/containers/auth.json`.

View File

@ -20,7 +20,7 @@
},
"targets": [
{
"expr": "100 * ( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -40,9 +40,7 @@
"placement": "right"
},
"pieType": "pie",
"displayLabels": [
"percent"
],
"displayLabels": [],
"tooltip": {
"mode": "single"
},
@ -153,12 +151,16 @@
],
"fieldConfig": {
"defaults": {
"unit": "percent"
"unit": "percent",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true
"showHeader": true,
"columnFilters": false
},
"transformations": [
{

View File

@ -7,46 +7,55 @@
{
"id": 1,
"type": "stat",
"title": "Ingress Traffic",
"title": "Ingress Success Rate (5m)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"w": 6,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
"expr": "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"color": "red",
"value": null
},
{
"color": "orange",
"value": 0.995
},
{
"color": "yellow",
"value": 0.999
},
{
"color": "green",
"value": 1
"value": 0.9995
}
]
},
"unit": "Bps",
"unit": "percentunit",
"custom": {
"displayMode": "auto"
}
},
"decimals": 2
},
"overrides": []
},
@ -67,46 +76,55 @@
{
"id": 2,
"type": "stat",
"title": "Egress Traffic",
"title": "Error Budget Burn (1h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 8,
"w": 6,
"x": 6,
"y": 0
},
"targets": [
{
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
"expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"color": "green",
"value": null
},
{
"color": "green",
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 4
}
]
},
"unit": "Bps",
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"decimals": 2
},
"overrides": []
},
@ -127,7 +145,145 @@
{
"id": 3,
"type": "stat",
"title": "Intra-Cluster Traffic",
"title": "Error Budget Burn (6h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 12,
"y": 0
},
"targets": [
{
"expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 4
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
},
"decimals": 2
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "stat",
"title": "Edge P99 Latency (ms)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 18,
"y": 0
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 200
},
{
"color": "orange",
"value": 350
},
{
"color": "red",
"value": 500
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 5,
"type": "stat",
"title": "Ingress Traffic",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -135,19 +291,19 @@
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 0
"x": 0,
"y": 4
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)",
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -185,9 +341,9 @@
}
},
{
"id": 4,
"id": 6,
"type": "stat",
"title": "Top Router req/s",
"title": "Egress Traffic",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -195,20 +351,19 @@
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"x": 8,
"y": 4
},
"targets": [
{
"expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
"refId": "A",
"legendFormat": "{{router}}"
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -224,7 +379,7 @@
}
]
},
"unit": "req/s",
"unit": "Bps",
"custom": {
"displayMode": "auto"
}
@ -246,7 +401,67 @@
}
},
{
"id": 5,
"id": 7,
"type": "stat",
"title": "Intra-Cluster Traffic",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 4
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "Bps",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 8,
"type": "timeseries",
"title": "Per-Node Throughput",
"datasource": {
@ -283,7 +498,7 @@
}
},
{
"id": 6,
"id": 9,
"type": "table",
"title": "Top Namespaces",
"datasource": {
@ -304,12 +519,16 @@
],
"fieldConfig": {
"defaults": {
"unit": "Bps"
"unit": "Bps",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true
"showHeader": true,
"columnFilters": false
},
"transformations": [
{
@ -319,7 +538,7 @@
]
},
{
"id": 7,
"id": 10,
"type": "table",
"title": "Top Pods",
"datasource": {
@ -340,12 +559,16 @@
],
"fieldConfig": {
"defaults": {
"unit": "Bps"
"unit": "Bps",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true
"showHeader": true,
"columnFilters": false
},
"transformations": [
{
@ -355,7 +578,7 @@
]
},
{
"id": 8,
"id": 11,
"type": "timeseries",
"title": "Traefik Routers (req/s)",
"datasource": {
@ -392,7 +615,7 @@
}
},
{
"id": 9,
"id": 12,
"type": "timeseries",
"title": "Traefik Entrypoints (req/s)",
"datasource": {

View File

@ -27,7 +27,7 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -88,7 +88,7 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -149,7 +149,7 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -186,6 +186,213 @@
"textMode": "value"
}
},
{
"id": 9,
"type": "stat",
"title": "API Server 5xx rate",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"y": 4
},
"targets": [
{
"expr": "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 0.05
},
{
"color": "orange",
"value": 0.2
},
{
"color": "red",
"value": 0.5
}
]
},
"unit": "req/s",
"custom": {
"displayMode": "auto"
},
"decimals": 3
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 10,
"type": "stat",
"title": "API Server P99 latency",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 8,
"y": 4
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 250
},
{
"color": "orange",
"value": 400
},
{
"color": "red",
"value": 600
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 11,
"type": "stat",
"title": "etcd P99 latency",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 4
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "orange",
"value": 100
},
{
"color": "red",
"value": 200
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "timeseries",
@ -198,7 +405,7 @@
"h": 9,
"w": 24,
"x": 0,
"y": 4
"y": 8
},
"targets": [
{
@ -238,7 +445,7 @@
"h": 9,
"w": 24,
"x": 0,
"y": 13
"y": 17
},
"targets": [
{
@ -278,7 +485,7 @@
"h": 9,
"w": 12,
"x": 0,
"y": 22
"y": 26
},
"targets": [
{
@ -315,7 +522,7 @@
"h": 9,
"w": 12,
"x": 12,
"y": 22
"y": 26
},
"targets": [
{
@ -352,7 +559,7 @@
"h": 9,
"w": 24,
"x": 0,
"y": 31
"y": 35
},
"targets": [
{

View File

@ -7,67 +7,6 @@
"list": []
},
"panels": [
{
"id": 1,
"type": "gauge",
"title": "Workers Ready",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 5,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"min": 0,
"max": 18,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 16
},
{
"color": "yellow",
"value": 17
},
{
"color": "green",
"value": 18
}
]
}
},
"overrides": []
},
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
}
},
{
"id": 2,
"type": "gauge",
@ -78,8 +17,8 @@
},
"gridPos": {
"h": 5,
"w": 5,
"x": 5,
"w": 4,
"x": 0,
"y": 0
},
"targets": [
@ -131,8 +70,8 @@
},
"gridPos": {
"h": 5,
"w": 5,
"x": 10,
"w": 3,
"x": 4,
"y": 0
},
"targets": [
@ -144,82 +83,7 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-pods dashboard",
"url": "/d/atlas-pods",
"targetBlank": true
}
]
},
{
"id": 4,
"type": "stat",
"title": "Problem Pods",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 5,
"x": 15,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -281,20 +145,20 @@
},
"gridPos": {
"h": 5,
"w": 4,
"x": 20,
"w": 3,
"x": 7,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))",
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -346,6 +210,286 @@
}
]
},
{
"id": 27,
"type": "stat",
"title": "Atlas Availability (30d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 4,
"x": 10,
"y": 0
},
"targets": [
{
"expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:5m])",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 0.999
},
{
"color": "yellow",
"value": 0.9999
},
{
"color": "green",
"value": 0.99999
}
]
},
"unit": "percentunit",
"custom": {
"displayMode": "auto"
},
"decimals": 3
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "stat",
"title": "Problem Pods",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 3,
"x": 14,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-pods dashboard",
"url": "/d/atlas-pods",
"targetBlank": true
}
]
},
{
"id": 6,
"type": "stat",
"title": "CrashLoop / ImagePull",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 3,
"x": 17,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-pods dashboard",
"url": "/d/atlas-pods",
"targetBlank": true
}
]
},
{
"id": 1,
"type": "gauge",
"title": "Workers Ready",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 4,
"x": 20,
"y": 0
},
"targets": [
{
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"min": 0,
"max": 18,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 16
},
{
"color": "yellow",
"value": 17
},
{
"color": "green",
"value": 18
}
]
}
},
"overrides": []
},
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
}
},
{
"id": 7,
"type": "stat",
@ -371,11 +515,11 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"mode": "absolute",
"steps": [
{
"color": "green",
@ -383,11 +527,15 @@
},
{
"color": "yellow",
"value": 70
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
},
@ -444,11 +592,11 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"mode": "absolute",
"steps": [
{
"color": "green",
@ -456,11 +604,15 @@
},
{
"color": "yellow",
"value": 70
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
},
@ -517,7 +669,7 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -586,7 +738,7 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -653,11 +805,11 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"mode": "absolute",
"steps": [
{
"color": "green",
@ -665,11 +817,15 @@
},
{
"color": "yellow",
"value": 70
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
},
@ -724,11 +880,11 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"mode": "absolute",
"steps": [
{
"color": "green",
@ -736,11 +892,15 @@
},
{
"color": "yellow",
"value": 70
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
},
@ -795,7 +955,7 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -862,7 +1022,7 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -942,9 +1102,7 @@
"placement": "right"
},
"pieType": "pie",
"displayLabels": [
"percent"
],
"displayLabels": [],
"tooltip": {
"mode": "single"
},
@ -975,7 +1133,7 @@
},
"targets": [
{
"expr": "100 * ( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -995,9 +1153,7 @@
"placement": "right"
},
"pieType": "pie",
"displayLabels": [
"percent"
],
"displayLabels": [],
"tooltip": {
"mode": "single"
},
@ -1048,9 +1204,7 @@
"placement": "right"
},
"pieType": "pie",
"displayLabels": [
"percent"
],
"displayLabels": [],
"tooltip": {
"mode": "single"
},
@ -1175,7 +1329,7 @@
},
"targets": [
{
"expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}"
}
@ -1212,7 +1366,7 @@
},
"targets": [
{
"expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}"
}
@ -1233,6 +1387,138 @@
}
}
},
{
"id": 28,
"type": "piechart",
"title": "Node Pod Share",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 54
},
"targets": [
{
"expr": "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100",
"refId": "A",
"legendFormat": "{{namespace}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"displayLabels": [],
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
}
},
{
"id": 29,
"type": "bargauge",
"title": "Top Nodes by Pod Count",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 54
},
"targets": [
{
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
"refId": "A",
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "none",
"min": 0,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 100
}
]
},
"decimals": 0
},
"overrides": []
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
},
{
"id": "limit",
"options": {
"limit": 12
}
}
]
},
{
"id": 18,
"type": "timeseries",
@ -1377,7 +1663,7 @@
"h": 16,
"w": 12,
"x": 0,
"y": 54
"y": 64
},
"targets": [
{
@ -1425,7 +1711,7 @@
"h": 16,
"w": 12,
"x": 12,
"y": 54
"y": 64
},
"targets": [
{
@ -1452,11 +1738,11 @@
},
{
"color": "orange",
"value": 70
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
}
@ -1480,6 +1766,17 @@
"url": "/d/atlas-storage",
"targetBlank": true
}
],
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
}
]
}
],
@ -1497,36 +1794,5 @@
"to": "now"
},
"refresh": "1m",
"links": [
{
"title": "Atlas Pods",
"type": "dashboard",
"dashboardUid": "atlas-pods",
"keepTime": false
},
{
"title": "Atlas Nodes",
"type": "dashboard",
"dashboardUid": "atlas-nodes",
"keepTime": false
},
{
"title": "Atlas Storage",
"type": "dashboard",
"dashboardUid": "atlas-storage",
"keepTime": false
},
{
"title": "Atlas Network",
"type": "dashboard",
"dashboardUid": "atlas-network",
"keepTime": false
},
{
"title": "Atlas GPU",
"type": "dashboard",
"dashboardUid": "atlas-gpu",
"keepTime": false
}
]
"links": []
}

View File

@ -20,14 +20,14 @@
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))",
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -80,14 +80,14 @@
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))",
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -140,14 +140,14 @@
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))",
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -207,7 +207,7 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -266,12 +266,16 @@
],
"fieldConfig": {
"defaults": {
"unit": "s"
"unit": "s",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true
"showHeader": true,
"columnFilters": false
},
"transformations": [
{
@ -302,12 +306,16 @@
],
"fieldConfig": {
"defaults": {
"unit": "s"
"unit": "s",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true
"showHeader": true,
"columnFilters": false
},
"transformations": [
{
@ -338,12 +346,16 @@
],
"fieldConfig": {
"defaults": {
"unit": "s"
"unit": "s",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true
"showHeader": true,
"columnFilters": false
},
"transformations": [
{
@ -359,6 +371,233 @@
}
}
]
},
{
"id": 8,
"type": "piechart",
"title": "Node Pod Share",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 34
},
"targets": [
{
"expr": "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100",
"refId": "A",
"legendFormat": "{{namespace}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"displayLabels": [],
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
}
},
{
"id": 9,
"type": "bargauge",
"title": "Top Nodes by Pod Count",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 34
},
"targets": [
{
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
"refId": "A",
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "none",
"min": 0,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 100
}
]
},
"decimals": 0
},
"overrides": []
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
},
{
"id": "limit",
"options": {
"limit": 12
}
}
]
},
{
"id": 10,
"type": "table",
"title": "Namespace Plurality by Node v27",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 42
},
"targets": [
{
"expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.022)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.022)))))",
"refId": "A",
"instant": true,
"format": "table"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"custom": {
"filterable": false
}
},
"overrides": []
},
"options": {
"showHeader": true,
"columnFilters": false,
"showColumnFilters": false,
"footer": {
"show": false,
"fields": "",
"calcs": []
}
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true
}
}
},
{
"id": "filterByValue",
"options": {
"match": "Value",
"operator": "gt",
"value": 0
}
},
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
},
{
"id": "groupBy",
"options": {
"fields": {
"namespace": {
"aggregations": [
{
"field": "Value",
"operation": "max"
},
{
"field": "node",
"operation": "first"
}
]
}
},
"rowBy": [
"namespace"
]
}
}
]
}
],
"time": {

View File

@ -27,11 +27,11 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"mode": "absolute",
"steps": [
{
"color": "green",
@ -39,11 +39,15 @@
},
{
"color": "yellow",
"value": 70
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
},
@ -91,11 +95,11 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"mode": "absolute",
"steps": [
{
"color": "green",
@ -103,11 +107,15 @@
},
{
"color": "yellow",
"value": 70
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
},
@ -155,7 +163,7 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -215,7 +223,7 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {

View File

@ -29,7 +29,7 @@ data:
},
"targets": [
{
"expr": "100 * ( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -49,9 +49,7 @@ data:
"placement": "right"
},
"pieType": "pie",
"displayLabels": [
"percent"
],
"displayLabels": [],
"tooltip": {
"mode": "single"
},
@ -162,12 +160,16 @@ data:
],
"fieldConfig": {
"defaults": {
"unit": "percent"
"unit": "percent",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true
"showHeader": true,
"columnFilters": false
},
"transformations": [
{

View File

@ -16,46 +16,55 @@ data:
{
"id": 1,
"type": "stat",
"title": "Ingress Traffic",
"title": "Ingress Success Rate (5m)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"w": 6,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
"expr": "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"color": "red",
"value": null
},
{
"color": "orange",
"value": 0.995
},
{
"color": "yellow",
"value": 0.999
},
{
"color": "green",
"value": 1
"value": 0.9995
}
]
},
"unit": "Bps",
"unit": "percentunit",
"custom": {
"displayMode": "auto"
}
},
"decimals": 2
},
"overrides": []
},
@ -76,46 +85,55 @@ data:
{
"id": 2,
"type": "stat",
"title": "Egress Traffic",
"title": "Error Budget Burn (1h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 8,
"w": 6,
"x": 6,
"y": 0
},
"targets": [
{
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
"expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"color": "green",
"value": null
},
{
"color": "green",
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 4
}
]
},
"unit": "Bps",
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"decimals": 2
},
"overrides": []
},
@ -136,7 +154,145 @@ data:
{
"id": 3,
"type": "stat",
"title": "Intra-Cluster Traffic",
"title": "Error Budget Burn (6h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 12,
"y": 0
},
"targets": [
{
"expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 4
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
},
"decimals": 2
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "stat",
"title": "Edge P99 Latency (ms)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 18,
"y": 0
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 200
},
{
"color": "orange",
"value": 350
},
{
"color": "red",
"value": 500
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 5,
"type": "stat",
"title": "Ingress Traffic",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -144,19 +300,19 @@ data:
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 0
"x": 0,
"y": 4
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)",
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -194,9 +350,9 @@ data:
}
},
{
"id": 4,
"id": 6,
"type": "stat",
"title": "Top Router req/s",
"title": "Egress Traffic",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -204,20 +360,19 @@ data:
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"x": 8,
"y": 4
},
"targets": [
{
"expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
"refId": "A",
"legendFormat": "{{router}}"
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -233,7 +388,7 @@ data:
}
]
},
"unit": "req/s",
"unit": "Bps",
"custom": {
"displayMode": "auto"
}
@ -255,7 +410,67 @@ data:
}
},
{
"id": 5,
"id": 7,
"type": "stat",
"title": "Intra-Cluster Traffic",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 4
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "Bps",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 8,
"type": "timeseries",
"title": "Per-Node Throughput",
"datasource": {
@ -292,7 +507,7 @@ data:
}
},
{
"id": 6,
"id": 9,
"type": "table",
"title": "Top Namespaces",
"datasource": {
@ -313,12 +528,16 @@ data:
],
"fieldConfig": {
"defaults": {
"unit": "Bps"
"unit": "Bps",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true
"showHeader": true,
"columnFilters": false
},
"transformations": [
{
@ -328,7 +547,7 @@ data:
]
},
{
"id": 7,
"id": 10,
"type": "table",
"title": "Top Pods",
"datasource": {
@ -349,12 +568,16 @@ data:
],
"fieldConfig": {
"defaults": {
"unit": "Bps"
"unit": "Bps",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true
"showHeader": true,
"columnFilters": false
},
"transformations": [
{
@ -364,7 +587,7 @@ data:
]
},
{
"id": 8,
"id": 11,
"type": "timeseries",
"title": "Traefik Routers (req/s)",
"datasource": {
@ -401,7 +624,7 @@ data:
}
},
{
"id": 9,
"id": 12,
"type": "timeseries",
"title": "Traefik Entrypoints (req/s)",
"datasource": {

View File

@ -36,7 +36,7 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -97,7 +97,7 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -158,7 +158,7 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -195,6 +195,213 @@ data:
"textMode": "value"
}
},
{
"id": 9,
"type": "stat",
"title": "API Server 5xx rate",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"y": 4
},
"targets": [
{
"expr": "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 0.05
},
{
"color": "orange",
"value": 0.2
},
{
"color": "red",
"value": 0.5
}
]
},
"unit": "req/s",
"custom": {
"displayMode": "auto"
},
"decimals": 3
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 10,
"type": "stat",
"title": "API Server P99 latency",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 8,
"y": 4
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 250
},
{
"color": "orange",
"value": 400
},
{
"color": "red",
"value": 600
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 11,
"type": "stat",
"title": "etcd P99 latency",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 4
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "orange",
"value": 100
},
{
"color": "red",
"value": 200
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "timeseries",
@ -207,7 +414,7 @@ data:
"h": 9,
"w": 24,
"x": 0,
"y": 4
"y": 8
},
"targets": [
{
@ -247,7 +454,7 @@ data:
"h": 9,
"w": 24,
"x": 0,
"y": 13
"y": 17
},
"targets": [
{
@ -287,7 +494,7 @@ data:
"h": 9,
"w": 12,
"x": 0,
"y": 22
"y": 26
},
"targets": [
{
@ -324,7 +531,7 @@ data:
"h": 9,
"w": 12,
"x": 12,
"y": 22
"y": 26
},
"targets": [
{
@ -361,7 +568,7 @@ data:
"h": 9,
"w": 24,
"x": 0,
"y": 31
"y": 35
},
"targets": [
{

View File

@ -16,67 +16,6 @@ data:
"list": []
},
"panels": [
{
"id": 1,
"type": "gauge",
"title": "Workers Ready",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 5,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"min": 0,
"max": 18,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 16
},
{
"color": "yellow",
"value": 17
},
{
"color": "green",
"value": 18
}
]
}
},
"overrides": []
},
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
}
},
{
"id": 2,
"type": "gauge",
@ -87,8 +26,8 @@ data:
},
"gridPos": {
"h": 5,
"w": 5,
"x": 5,
"w": 4,
"x": 0,
"y": 0
},
"targets": [
@ -140,8 +79,8 @@ data:
},
"gridPos": {
"h": 5,
"w": 5,
"x": 10,
"w": 3,
"x": 4,
"y": 0
},
"targets": [
@ -153,82 +92,7 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-pods dashboard",
"url": "/d/atlas-pods",
"targetBlank": true
}
]
},
{
"id": 4,
"type": "stat",
"title": "Problem Pods",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 5,
"x": 15,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -290,20 +154,20 @@ data:
},
"gridPos": {
"h": 5,
"w": 4,
"x": 20,
"w": 3,
"x": 7,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))",
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -355,6 +219,286 @@ data:
}
]
},
{
"id": 27,
"type": "stat",
"title": "Atlas Availability (30d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 4,
"x": 10,
"y": 0
},
"targets": [
{
"expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:5m])",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 0.999
},
{
"color": "yellow",
"value": 0.9999
},
{
"color": "green",
"value": 0.99999
}
]
},
"unit": "percentunit",
"custom": {
"displayMode": "auto"
},
"decimals": 3
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "stat",
"title": "Problem Pods",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 3,
"x": 14,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-pods dashboard",
"url": "/d/atlas-pods",
"targetBlank": true
}
]
},
{
"id": 6,
"type": "stat",
"title": "CrashLoop / ImagePull",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 3,
"x": 17,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-pods dashboard",
"url": "/d/atlas-pods",
"targetBlank": true
}
]
},
{
"id": 1,
"type": "gauge",
"title": "Workers Ready",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 4,
"x": 20,
"y": 0
},
"targets": [
{
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"min": 0,
"max": 18,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 16
},
{
"color": "yellow",
"value": 17
},
{
"color": "green",
"value": 18
}
]
}
},
"overrides": []
},
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
}
},
{
"id": 7,
"type": "stat",
@ -380,11 +524,11 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"mode": "absolute",
"steps": [
{
"color": "green",
@ -392,11 +536,15 @@ data:
},
{
"color": "yellow",
"value": 70
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
},
@ -453,11 +601,11 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"mode": "absolute",
"steps": [
{
"color": "green",
@ -465,11 +613,15 @@ data:
},
{
"color": "yellow",
"value": 70
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
},
@ -526,7 +678,7 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -595,7 +747,7 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -662,11 +814,11 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"mode": "absolute",
"steps": [
{
"color": "green",
@ -674,11 +826,15 @@ data:
},
{
"color": "yellow",
"value": 70
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
},
@ -733,11 +889,11 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"mode": "absolute",
"steps": [
{
"color": "green",
@ -745,11 +901,15 @@ data:
},
{
"color": "yellow",
"value": 70
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
},
@ -804,7 +964,7 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -871,7 +1031,7 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -951,9 +1111,7 @@ data:
"placement": "right"
},
"pieType": "pie",
"displayLabels": [
"percent"
],
"displayLabels": [],
"tooltip": {
"mode": "single"
},
@ -984,7 +1142,7 @@ data:
},
"targets": [
{
"expr": "100 * ( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -1004,9 +1162,7 @@ data:
"placement": "right"
},
"pieType": "pie",
"displayLabels": [
"percent"
],
"displayLabels": [],
"tooltip": {
"mode": "single"
},
@ -1057,9 +1213,7 @@ data:
"placement": "right"
},
"pieType": "pie",
"displayLabels": [
"percent"
],
"displayLabels": [],
"tooltip": {
"mode": "single"
},
@ -1184,7 +1338,7 @@ data:
},
"targets": [
{
"expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}"
}
@ -1221,7 +1375,7 @@ data:
},
"targets": [
{
"expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}"
}
@ -1242,6 +1396,138 @@ data:
}
}
},
{
"id": 28,
"type": "piechart",
"title": "Node Pod Share",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 54
},
"targets": [
{
"expr": "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100",
"refId": "A",
"legendFormat": "{{namespace}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"displayLabels": [],
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
}
},
{
"id": 29,
"type": "bargauge",
"title": "Top Nodes by Pod Count",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 54
},
"targets": [
{
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
"refId": "A",
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "none",
"min": 0,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 100
}
]
},
"decimals": 0
},
"overrides": []
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
},
{
"id": "limit",
"options": {
"limit": 12
}
}
]
},
{
"id": 18,
"type": "timeseries",
@ -1386,7 +1672,7 @@ data:
"h": 16,
"w": 12,
"x": 0,
"y": 54
"y": 64
},
"targets": [
{
@ -1434,7 +1720,7 @@ data:
"h": 16,
"w": 12,
"x": 12,
"y": 54
"y": 64
},
"targets": [
{
@ -1461,11 +1747,11 @@ data:
},
{
"color": "orange",
"value": 70
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
}
@ -1489,6 +1775,17 @@ data:
"url": "/d/atlas-storage",
"targetBlank": true
}
],
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
}
]
}
],
@ -1506,36 +1803,5 @@ data:
"to": "now"
},
"refresh": "1m",
"links": [
{
"title": "Atlas Pods",
"type": "dashboard",
"dashboardUid": "atlas-pods",
"keepTime": false
},
{
"title": "Atlas Nodes",
"type": "dashboard",
"dashboardUid": "atlas-nodes",
"keepTime": false
},
{
"title": "Atlas Storage",
"type": "dashboard",
"dashboardUid": "atlas-storage",
"keepTime": false
},
{
"title": "Atlas Network",
"type": "dashboard",
"dashboardUid": "atlas-network",
"keepTime": false
},
{
"title": "Atlas GPU",
"type": "dashboard",
"dashboardUid": "atlas-gpu",
"keepTime": false
}
]
"links": []
}

View File

@ -29,14 +29,14 @@ data:
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))",
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -89,14 +89,14 @@ data:
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))",
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -149,14 +149,14 @@ data:
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))",
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -216,7 +216,7 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -275,12 +275,16 @@ data:
],
"fieldConfig": {
"defaults": {
"unit": "s"
"unit": "s",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true
"showHeader": true,
"columnFilters": false
},
"transformations": [
{
@ -311,12 +315,16 @@ data:
],
"fieldConfig": {
"defaults": {
"unit": "s"
"unit": "s",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true
"showHeader": true,
"columnFilters": false
},
"transformations": [
{
@ -347,12 +355,16 @@ data:
],
"fieldConfig": {
"defaults": {
"unit": "s"
"unit": "s",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true
"showHeader": true,
"columnFilters": false
},
"transformations": [
{
@ -368,6 +380,233 @@ data:
}
}
]
},
{
"id": 8,
"type": "piechart",
"title": "Node Pod Share",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 34
},
"targets": [
{
"expr": "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100",
"refId": "A",
"legendFormat": "{{namespace}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"displayLabels": [],
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
}
},
{
"id": 9,
"type": "bargauge",
"title": "Top Nodes by Pod Count",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 34
},
"targets": [
{
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
"refId": "A",
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "none",
"min": 0,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 100
}
]
},
"decimals": 0
},
"overrides": []
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
},
{
"id": "limit",
"options": {
"limit": 12
}
}
]
},
{
"id": 10,
"type": "table",
"title": "Namespace Plurality by Node v27",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 42
},
"targets": [
{
"expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.022)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.022)))))",
"refId": "A",
"instant": true,
"format": "table"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"custom": {
"filterable": false
}
},
"overrides": []
},
"options": {
"showHeader": true,
"columnFilters": false,
"showColumnFilters": false,
"footer": {
"show": false,
"fields": "",
"calcs": []
}
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true
}
}
},
{
"id": "filterByValue",
"options": {
"match": "Value",
"operator": "gt",
"value": 0
}
},
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
},
{
"id": "groupBy",
"options": {
"fields": {
"namespace": {
"aggregations": [
{
"field": "Value",
"operation": "max"
},
{
"field": "node",
"operation": "first"
}
]
}
},
"rowBy": [
"namespace"
]
}
}
]
}
],
"time": {

View File

@ -36,11 +36,11 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"mode": "absolute",
"steps": [
{
"color": "green",
@ -48,11 +48,15 @@ data:
},
{
"color": "yellow",
"value": 70
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
},
@ -100,11 +104,11 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"mode": "absolute",
"steps": [
{
"color": "green",
@ -112,11 +116,15 @@ data:
},
{
"color": "yellow",
"value": 70
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
},
@ -164,7 +172,7 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -224,7 +232,7 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {

View File

@ -65,13 +65,13 @@ spec:
namespace: flux-system
values:
server:
# keep ~3 months; change as you like (supports "d", "y")
# keep 1 year; supports "d", "y"
extraArgs:
retentionPeriod: "90d" # VM flag -retentionPeriod=90d. :contentReference[oaicite:11]{index=11}
retentionPeriod: "1y" # VM flag -retentionPeriod=1y. :contentReference[oaicite:11]{index=11}
persistentVolume:
enabled: true
size: 100Gi
size: 250Gi
# Enable built-in Kubernetes scraping
scrape:
@ -186,6 +186,15 @@ spec:
- targets: ["longhorn-backend.longhorn-system.svc:9500"]
metrics_path: /metrics
# --- titan-db node_exporter (external control-plane DB host) ---
- job_name: "titan-db"
static_configs:
- targets: ["192.168.22.10:9100"]
relabel_configs:
- source_labels: [__address__]
target_label: instance
replacement: titan-db
# --- cert-manager (pods expose on 9402) ---
- job_name: "cert-manager"
kubernetes_sd_configs: [{ role: pod }]
@ -209,16 +218,6 @@ spec:
- action: keep
source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app_kubernetes_io_part_of]
regex: flux-system;flux
- job_name: "titan-db"
static_configs:
- targets: ["titan-db:9100"]
relabel_configs:
- source_labels: [__address__]
target_label: instance
metric_relabel_configs:
- source_labels: [instance]
target_label: node
replacement: titan-db
---
@ -249,9 +248,27 @@ spec:
service:
type: ClusterIP
env:
GF_AUTH_ANONYMOUS_ENABLED: "true"
GF_AUTH_ANONYMOUS_ORG_ROLE: Viewer
GF_AUTH_ANONYMOUS_ENABLED: "false"
GF_SECURITY_ALLOW_EMBEDDING: "true"
GF_AUTH_GENERIC_OAUTH_ENABLED: "true"
GF_AUTH_GENERIC_OAUTH_NAME: "Keycloak"
GF_AUTH_GENERIC_OAUTH_ALLOW_SIGN_UP: "true"
GF_AUTH_GENERIC_OAUTH_SCOPES: "openid profile email groups"
GF_AUTH_GENERIC_OAUTH_AUTH_URL: "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/auth"
GF_AUTH_GENERIC_OAUTH_TOKEN_URL: "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/token"
GF_AUTH_GENERIC_OAUTH_API_URL: "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/userinfo"
GF_AUTH_GENERIC_OAUTH_ROLE_ATTRIBUTE_PATH: "contains(groups, 'admin') && 'Admin' || 'Viewer'"
GF_AUTH_GENERIC_OAUTH_TLS_SKIP_VERIFY_INSECURE: "false"
GF_AUTH_SIGNOUT_REDIRECT_URL: "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/logout?redirect_uri=https://metrics.bstein.dev/"
envValueFrom:
GF_AUTH_GENERIC_OAUTH_CLIENT_ID:
secretKeyRef:
name: grafana-oidc
key: client_id
GF_AUTH_GENERIC_OAUTH_CLIENT_SECRET:
secretKeyRef:
name: grafana-oidc
key: client_secret
grafana.ini:
server:
domain: metrics.bstein.dev

View File

@ -0,0 +1,48 @@
# services/nextcloud/configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: nextcloud-config
namespace: nextcloud
data:
extra.config.php: |
<?php
$CONFIG = array (
'trusted_domains' =>
array (
0 => 'cloud.bstein.dev',
),
'overwritehost' => 'cloud.bstein.dev',
'overwriteprotocol' => 'https',
'overwrite.cli.url' => 'https://cloud.bstein.dev',
'default_phone_region' => 'US',
'mail_smtpmode' => 'smtp',
'mail_sendmailmode' => 'smtp',
'mail_smtphost' => 'mail.bstein.dev',
'mail_smtpport' => '587',
'mail_smtpsecure' => 'tls',
'mail_smtpauth' => true,
'mail_smtpauthtype' => 'LOGIN',
'mail_domain' => 'bstein.dev',
'mail_from_address' => 'no-reply',
'oidc_login_provider_url' => 'https://sso.bstein.dev/realms/atlas',
'oidc_login_client_id' => getenv('OIDC_CLIENT_ID'),
'oidc_login_client_secret' => getenv('OIDC_CLIENT_SECRET'),
'oidc_login_auto_redirect' => false,
'oidc_login_end_session_redirect' => true,
'oidc_login_button_text' => 'Login with Keycloak',
'oidc_login_hide_password_form' => false,
'oidc_login_attributes' =>
array (
'id' => 'preferred_username',
'mail' => 'email',
'name' => 'name',
),
'oidc_login_scope' => 'openid profile email',
'oidc_login_unique_id' => 'preferred_username',
'oidc_login_use_pkce' => true,
'oidc_login_disable_registration' => false,
'oidc_login_create_groups' => false,
# External storage for user data should be configured to Asteria via the External Storage app (admin UI),
# keeping the astreae PVC for app internals only.
);

View File

@ -0,0 +1,32 @@
# services/nextcloud/cronjob.yaml
apiVersion: batch/v1
kind: CronJob
metadata:
name: nextcloud-cron
namespace: nextcloud
spec:
schedule: "*/5 * * * *"
concurrencyPolicy: Forbid
jobTemplate:
spec:
template:
spec:
securityContext:
runAsUser: 33
runAsGroup: 33
fsGroup: 33
restartPolicy: OnFailure
containers:
- name: nextcloud-cron
image: nextcloud:29-apache
imagePullPolicy: IfNotPresent
command: ["/bin/sh", "-c"]
args:
- "cd /var/www/html && php -f cron.php"
volumeMounts:
- name: nextcloud-data
mountPath: /var/www/html
volumes:
- name: nextcloud-data
persistentVolumeClaim:
claimName: nextcloud-data

View File

@ -0,0 +1,143 @@
# services/nextcloud/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: nextcloud
namespace: nextcloud
labels:
app: nextcloud
spec:
replicas: 1
selector:
matchLabels:
app: nextcloud
template:
metadata:
labels:
app: nextcloud
spec:
nodeSelector:
hardware: rpi5
securityContext:
fsGroup: 33
runAsUser: 33
runAsGroup: 33
initContainers:
- name: fix-perms
image: alpine:3.20
command: ["/bin/sh", "-c"]
args:
- |
chown -R 33:33 /var/www/html/config || true
chown -R 33:33 /var/www/html/data || true
securityContext:
runAsUser: 0
runAsGroup: 0
volumeMounts:
- name: nextcloud-data
mountPath: /var/www/html
- name: nextcloud-config
mountPath: /var/www/html/config/extra.config.php
subPath: extra.config.php
containers:
- name: nextcloud
image: nextcloud:29-apache
imagePullPolicy: IfNotPresent
env:
# DB (external secret required: nextcloud-db with keys username,password,database)
- name: POSTGRES_HOST
value: postgres-service.postgres.svc.cluster.local
- name: POSTGRES_DB
valueFrom:
secretKeyRef:
name: nextcloud-db
key: database
- name: POSTGRES_USER
valueFrom:
secretKeyRef:
name: nextcloud-db
key: db-username
- name: POSTGRES_PASSWORD
valueFrom:
secretKeyRef:
name: nextcloud-db
key: db-password
# Admin bootstrap (external secret: nextcloud-admin with keys admin-user, admin-password)
- name: NEXTCLOUD_ADMIN_USER
valueFrom:
secretKeyRef:
name: nextcloud-admin
key: admin-user
- name: NEXTCLOUD_ADMIN_PASSWORD
valueFrom:
secretKeyRef:
name: nextcloud-admin
key: admin-password
- name: NEXTCLOUD_TRUSTED_DOMAINS
value: cloud.bstein.dev
- name: OVERWRITEHOST
value: cloud.bstein.dev
- name: OVERWRITEPROTOCOL
value: https
- name: OVERWRITECLIURL
value: https://cloud.bstein.dev
# SMTP (external secret: nextcloud-smtp with keys username, password)
- name: SMTP_HOST
value: mail.bstein.dev
- name: SMTP_PORT
value: "587"
- name: SMTP_SECURE
value: tls
- name: SMTP_NAME
valueFrom:
secretKeyRef:
name: nextcloud-smtp
key: smtp-username
- name: SMTP_PASSWORD
valueFrom:
secretKeyRef:
name: nextcloud-smtp
key: smtp-password
- name: MAIL_FROM_ADDRESS
value: no-reply
- name: MAIL_DOMAIN
value: bstein.dev
# OIDC (external secret: nextcloud-oidc with keys client-id, client-secret)
- name: OIDC_CLIENT_ID
valueFrom:
secretKeyRef:
name: nextcloud-oidc
key: client-id
- name: OIDC_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: nextcloud-oidc
key: client-secret
- name: NEXTCLOUD_UPDATE
value: "1"
- name: APP_INSTALL
value: "mail,oidc_login,external"
ports:
- containerPort: 80
name: http
volumeMounts:
- name: nextcloud-data
mountPath: /var/www/html
- name: nextcloud-config
mountPath: /var/www/html/config/extra.config.php
subPath: extra.config.php
resources:
requests:
cpu: 250m
memory: 1Gi
limits:
cpu: 1
memory: 3Gi
volumes:
- name: nextcloud-data
persistentVolumeClaim:
claimName: nextcloud-data
- name: nextcloud-config
configMap:
name: nextcloud-config
defaultMode: 0444

View File

@ -0,0 +1,25 @@
# services/nextcloud/ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: nextcloud
namespace: nextcloud
annotations:
cert-manager.io/cluster-issuer: letsencrypt-prod
traefik.ingress.kubernetes.io/router.entrypoints: websecure
spec:
tls:
- hosts:
- cloud.bstein.dev
secretName: nextcloud-tls
rules:
- host: cloud.bstein.dev
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: nextcloud
port:
number: 80

View File

@ -0,0 +1,25 @@
# services/nextcloud/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: nextcloud
resources:
- namespace.yaml
- configmap.yaml
- pvc.yaml
- deployment.yaml
- service.yaml
- ingress.yaml
- cronjob.yaml
- mail-sync-cronjob.yaml
- maintenance-cronjob.yaml
configMapGenerator:
- name: nextcloud-maintenance-script
files:
- maintenance.sh=../../scripts/nextcloud-maintenance.sh
options:
disableNameSuffixHash: true
- name: nextcloud-mail-sync-script
files:
- sync.sh=../../scripts/nextcloud-mail-sync.sh
options:
disableNameSuffixHash: true

View File

@ -0,0 +1,58 @@
# services/nextcloud/mail-sync-cronjob.yaml
apiVersion: batch/v1
kind: CronJob
metadata:
name: nextcloud-mail-sync
namespace: nextcloud
spec:
schedule: "0 5 * * *"
concurrencyPolicy: Forbid
jobTemplate:
spec:
template:
spec:
restartPolicy: OnFailure
securityContext:
runAsUser: 0
runAsGroup: 0
containers:
- name: mail-sync
image: nextcloud:29-apache
imagePullPolicy: IfNotPresent
command: ["/bin/bash", "/sync/sync.sh"]
env:
- name: KC_BASE
value: https://sso.bstein.dev
- name: KC_REALM
value: atlas
- name: KC_ADMIN_USER
valueFrom:
secretKeyRef:
name: nextcloud-keycloak-admin
key: username
- name: KC_ADMIN_PASS
valueFrom:
secretKeyRef:
name: nextcloud-keycloak-admin
key: password
volumeMounts:
- name: nextcloud-data
mountPath: /var/www/html
- name: sync-script
mountPath: /sync/sync.sh
subPath: sync.sh
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
volumes:
- name: nextcloud-data
persistentVolumeClaim:
claimName: nextcloud-data
- name: sync-script
configMap:
name: nextcloud-mail-sync-script
defaultMode: 0755

View File

@ -0,0 +1,56 @@
# services/nextcloud/maintenance-cronjob.yaml
apiVersion: batch/v1
kind: CronJob
metadata:
name: nextcloud-maintenance
namespace: nextcloud
spec:
schedule: "30 4 * * *"
concurrencyPolicy: Forbid
jobTemplate:
spec:
template:
spec:
restartPolicy: OnFailure
securityContext:
runAsUser: 0
runAsGroup: 0
containers:
- name: maintenance
image: nextcloud:29-apache
imagePullPolicy: IfNotPresent
command: ["/bin/bash", "/maintenance/maintenance.sh"]
env:
- name: NC_URL
value: https://cloud.bstein.dev
- name: ADMIN_USER
valueFrom:
secretKeyRef:
name: nextcloud-admin
key: admin-user
- name: ADMIN_PASS
valueFrom:
secretKeyRef:
name: nextcloud-admin
key: admin-password
volumeMounts:
- name: nextcloud-data
mountPath: /var/www/html
- name: maintenance-script
mountPath: /maintenance/maintenance.sh
subPath: maintenance.sh
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
volumes:
- name: nextcloud-data
persistentVolumeClaim:
claimName: nextcloud-data
- name: maintenance-script
configMap:
name: nextcloud-maintenance-script
defaultMode: 0755

View File

@ -0,0 +1,5 @@
# services/nextcloud/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: nextcloud

View File

@ -0,0 +1,13 @@
# services/nextcloud/pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: nextcloud-data
namespace: nextcloud
spec:
accessModes:
- ReadWriteMany
resources:
requests:
storage: 200Gi
storageClassName: astreae

View File

@ -0,0 +1,13 @@
# services/nextcloud/service.yaml
apiVersion: v1
kind: Service
metadata:
name: nextcloud
namespace: nextcloud
spec:
selector:
app: nextcloud
ports:
- name: http
port: 80
targetPort: http

View File

@ -0,0 +1,82 @@
# services/oauth2-proxy/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: oauth2-proxy
namespace: sso
labels:
app: oauth2-proxy
spec:
replicas: 2
selector:
matchLabels:
app: oauth2-proxy
template:
metadata:
labels:
app: oauth2-proxy
spec:
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5","rpi4"]
containers:
- name: oauth2-proxy
image: quay.io/oauth2-proxy/oauth2-proxy:v7.6.0
imagePullPolicy: IfNotPresent
args:
- --provider=oidc
- --redirect-url=https://auth.bstein.dev/oauth2/callback
- --oidc-issuer-url=https://sso.bstein.dev/realms/atlas
- --scope=openid profile email groups
- --email-domain=*
- --set-xauthrequest=true
- --pass-access-token=true
- --set-authorization-header=true
- --cookie-secure=true
- --cookie-samesite=lax
- --cookie-refresh=20m
- --cookie-expire=168h
- --upstream=static://200
- --http-address=0.0.0.0:4180
- --skip-provider-button=true
- --skip-jwt-bearer-tokens=true
- --oidc-groups-claim=groups
env:
- name: OAUTH2_PROXY_CLIENT_ID
valueFrom:
secretKeyRef:
name: oauth2-proxy-oidc
key: client_id
- name: OAUTH2_PROXY_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: oauth2-proxy-oidc
key: client_secret
- name: OAUTH2_PROXY_COOKIE_SECRET
valueFrom:
secretKeyRef:
name: oauth2-proxy-oidc
key: cookie_secret
ports:
- containerPort: 4180
name: http
readinessProbe:
httpGet:
path: /ping
port: 4180
initialDelaySeconds: 5
periodSeconds: 10
livenessProbe:
httpGet:
path: /ping
port: 4180
initialDelaySeconds: 20
periodSeconds: 20

View File

@ -0,0 +1,25 @@
# services/oauth2-proxy/ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: oauth2-proxy
namespace: sso
annotations:
cert-manager.io/cluster-issuer: letsencrypt
traefik.ingress.kubernetes.io/router.middlewares: sso-oauth2-proxy-errors@kubernetescrd
spec:
ingressClassName: traefik
rules:
- host: auth.bstein.dev
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: oauth2-proxy
port:
number: 80
tls:
- hosts: [auth.bstein.dev]
secretName: auth-tls

View File

@ -0,0 +1,10 @@
# services/oauth2-proxy/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: sso
resources:
- deployment.yaml
- service.yaml
- ingress.yaml
- middleware.yaml
- middleware-errors.yaml

View File

@ -0,0 +1,15 @@
# services/oauth2-proxy/middleware-errors.yaml
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
name: oauth2-proxy-errors
namespace: sso
spec:
errors:
status:
- "401"
- "403"
service:
name: oauth2-proxy
port: 80
query: /oauth2/start?rd={url}

View File

@ -0,0 +1,15 @@
# services/oauth2-proxy/middleware.yaml
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
name: oauth2-proxy-forward-auth
namespace: sso
spec:
forwardAuth:
address: http://oauth2-proxy.sso.svc.cluster.local:4180/oauth2/auth
trustForwardHeader: true
authResponseHeaders:
- Authorization
- X-Auth-Request-Email
- X-Auth-Request-User
- X-Auth-Request-Groups

View File

@ -0,0 +1,15 @@
# services/oauth2-proxy/service.yaml
apiVersion: v1
kind: Service
metadata:
name: oauth2-proxy
namespace: sso
labels:
app: oauth2-proxy
spec:
selector:
app: oauth2-proxy
ports:
- name: http
port: 80
targetPort: 4180

View File

@ -8,7 +8,7 @@ metadata:
kubernetes.io/ingress.class: traefik
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.tls: "true"
cert-manager.io/cluster-issuer: letsencrypt-prod
cert-manager.io/cluster-issuer: letsencrypt
spec:
tls:
- hosts: [ "pegasus.bstein.dev" ]

View File

@ -8,7 +8,7 @@ spec:
secretName: vault-server-tls
issuerRef:
kind: ClusterIssuer
name: letsencrypt-prod
name: letsencrypt
commonName: secret.bstein.dev
dnsNames:
- secret.bstein.dev

View File

@ -7,7 +7,6 @@ metadata:
annotations:
kubernetes.io/ingress.class: traefik
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.middlewares: vault-vault-basicauth@kubernetescrd
traefik.ingress.kubernetes.io/service.serversscheme: https
traefik.ingress.kubernetes.io/service.serversTransport: vault-vault-to-https@kubernetescrd
spec:

View File

@ -7,5 +7,4 @@ resources:
- helmrelease.yaml
- certificate.yaml
- ingress.yaml
- middleware.yaml
- serverstransport.yaml

View File

@ -1,9 +0,0 @@
# services/vault/middleware.yaml
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
name: vault-basicauth
namespace: vault
spec:
basicAuth:
secret: vault-basic-auth

View File

@ -5,7 +5,7 @@ metadata:
name: zot
namespace: zot
annotations:
cert-manager.io/cluster-issuer: letsencrypt-prod
cert-manager.io/cluster-issuer: letsencrypt
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.tls: "true"
traefik.ingress.kubernetes.io/router.middlewares: zot-zot-resp-headers@kubernetescrd