Compare commits

..

72 Commits

Author SHA1 Message Date
055ce7d18c Merge pull request 'feature/mailu' (#5) from feature/mailu into main
Reviewed-on: #5
2025-12-14 17:48:02 +00:00
1a161b4d3c monitoring: longer data history 2025-12-14 14:47:20 -03:00
f7bf990d62 flux: bump gitops-ui kustomization 2025-12-14 14:41:52 -03:00
63bf153c8b flux: add weave gitops ui 2025-12-14 14:38:08 -03:00
8fceebd7a7 nextcloud: integration with mailu & gitops-ui: initial install 2025-12-14 14:21:40 -03:00
0d0216c8f5 Add tests and dedupe nextcloud mail sync 2025-12-14 14:15:19 -03:00
c8b49560b6 Keep nextcloud scripts single-sourced under scripts/ 2025-12-14 14:05:01 -03:00
327a7bed57 Extract nextcloud scripts to files 2025-12-14 13:59:16 -03:00
aae09c5074 Normalize doc layout and README guidance 2025-12-14 13:47:59 -03:00
56bb4e91b9 Group namespace plurality rows to one per namespace 2025-12-13 22:17:47 -03:00
18f3a2cefe Fix namespace plurality mask and bump v26 2025-12-13 20:53:11 -03:00
1ec3ca29a4 Use OR-joined node ranks for plurality tie-break 2025-12-13 19:04:22 -03:00
4812958e82 Deduplicate namespace plurality rows with ranked tie-break 2025-12-13 18:39:31 -03:00
9ad5f7f405 Restore namespace plurality panel data 2025-12-13 18:25:03 -03:00
57ea397027 Use table format for namespace plurality panel 2025-12-13 18:23:19 -03:00
be0ac48b33 Simplify namespace plurality table rendering 2025-12-13 18:07:56 -03:00
2156b6f6aa Hide table footer on namespace plurality table 2025-12-13 18:03:51 -03:00
4fcc7c84f2 Make namespace plurality table non-filterable 2025-12-13 17:55:52 -03:00
a4b3273bab Remove filter bar from namespace plurality table 2025-12-13 17:38:57 -03:00
c536a13d55 Disable column filters on namespace plurality table 2025-12-13 17:35:52 -03:00
13eb02c19b Hide filters on namespace plurality table 2025-12-13 17:32:19 -03:00
134a4ad001 Fix namespace plurality table query 2025-12-13 17:29:55 -03:00
3e0a84b074 atlas pods: plurality table v11 (deterministic top node) 2025-12-13 17:19:03 -03:00
7f67793ee5 atlas pods: plurality table v10 2025-12-13 16:36:25 -03:00
e87d54f19d atlas pods: per-namespace top node via topk 2025-12-13 15:51:45 -03:00
6ac01e5879 atlas pods: simplify plurality table (no filter) 2025-12-13 15:29:08 -03:00
d0ed188179 monitoring: drop README per convention 2025-12-13 15:25:21 -03:00
b703e66b98 monitoring: restore README 2025-12-13 15:11:50 -03:00
68d4f43903 atlas pods: stabilize plurality query to avoid 422 2025-12-13 15:11:21 -03:00
cf9dacd4ea atlas pods: show per-namespace top node without vars 2025-12-13 15:02:52 -03:00
6eee7b8853 atlas pods: drop non-leading nodes in plurality table 2025-12-13 13:39:06 -03:00
03a4ca4d84 atlas pods: simplify plurality table query 2025-12-13 12:06:18 -03:00
c7adb0c8cb atlas pods: fix plurality table query 2025-12-13 12:00:31 -03:00
9d1163f580 atlas pods: use prom share() for plurality table 2025-12-13 11:53:27 -03:00
001f0f95a6 atlas pods: fix plurality query with bool max match 2025-12-13 11:51:18 -03:00
2177a8009e atlas pods: robust per-namespace top-node share 2025-12-13 11:48:44 -03:00
6a3d1311b9 atlas pods: select per-namespace top node via max match 2025-12-13 04:15:03 -03:00
d916e5a7f1 atlas pods: sort plurality table by node then share 2025-12-13 04:10:10 -03:00
5d6d34c274 atlas pods: simplify namespace plurality query 2025-12-13 04:06:46 -03:00
53423c7a46 atlas pods: fix namespace plurality query 2025-12-13 04:00:57 -03:00
d274738e9e restore readmes removed in last commit 2025-12-13 03:57:44 -03:00
f0265d6b94 atlas pods: add namespace plurality by node table 2025-12-13 03:57:20 -03:00
8a755e0c42 mailu: forcing version 1.4 clamav over 1.2 2025-12-13 00:11:40 -03:00
e22293db3e forcing 12-r3 over 12-r6 for redis 2025-12-12 22:09:04 -03:00
6f8a70fd58 atlas overview: include titan-db in control plane panels 2025-12-12 21:55:53 -03:00
580d1731f9 monitoring: drop duplicate titan-db scrape job 2025-12-12 21:48:03 -03:00
4def298b83 monitoring: scrape titan-db node_exporter 2025-12-12 21:38:10 -03:00
1166069640 atlas dashboards: align percent thresholds and disk bars 2025-12-12 21:13:31 -03:00
e56bed284e atlas overview: refine alert thresholds and availability colors 2025-12-12 20:50:41 -03:00
24376594ff atlas dashboards: use threshold colors for stats 2025-12-12 20:44:20 -03:00
5277c98385 atlas dashboards: fix pod share display and zero/red stat thresholds 2025-12-12 20:40:32 -03:00
056b7b7770 atlas dashboards: show pod counts (not %) and make zero-friendly stats 2025-12-12 20:30:00 -03:00
b770575b42 atlas dashboards: show pod counts with top12 bars 2025-12-12 20:20:13 -03:00
9e76277c22 atlas dashboards: drop empty nodes and enforce top12 pod bars 2025-12-12 19:09:51 -03:00
93b3c6d2ec atlas dashboards: cap pod count bars at top12 2025-12-12 18:56:13 -03:00
596bf46863 atlas dashboards: sort pod counts and add pod row to overview 2025-12-12 18:51:43 -03:00
8b703f8655 atlas pods: add pod count bar and tidy pie 2025-12-12 18:45:29 -03:00
ec59d25ad8 atlas dashboards: fix overview links and add pods-by-node pie 2025-12-12 18:32:45 -03:00
bf6179f907 atlas internal dashboards: add SLO/burn and api health panels 2025-12-12 18:00:43 -03:00
0a0966db78 atlas overview: fix availability scaling 2025-12-12 16:36:47 -03:00
87fbba0d3e atlas overview: show availability percent with 3 decimals 2025-12-12 16:15:37 -03:00
b200dba5b9 atlas overview: show availability percent and keep uptime centered 2025-12-12 16:11:28 -03:00
697ce3c18f atlas overview: center uptime and reorder top row 2025-12-12 15:56:33 -03:00
8e39c6a28b atlas overview: add uptime and crashloop panels 2025-12-12 15:23:51 -03:00
38ab8e3364 standardize cert issuers to letsencrypt 2025-12-12 15:18:40 -03:00
29d22ba539 mailu: fix unbound sidecar mounts 2025-12-12 01:19:27 -03:00
118032d2c6 mailu: use mvance unbound sidecar and current redis image 2025-12-12 01:12:48 -03:00
4cfe92feb2 mailu: remove force upgrade to avoid pvc replace 2025-12-12 01:09:25 -03:00
ca27cc95b6 mailu: add validating dns sidecar and disable vip hostports 2025-12-12 01:06:38 -03:00
6c77b8e7f8 restore docs after gitignore change 2025-12-12 00:50:02 -03:00
78195c4685 mailu: fix admin dns and tame vip 2025-12-12 00:49:45 -03:00
5ef0b4edf6 mailu: capture helm release and cert 2025-12-11 23:54:43 -03:00
67 changed files with 5029 additions and 775 deletions

6
.gitignore vendored
View File

@ -1 +1,5 @@
AGENTS.md
# Ignore markdown by default, but keep README.md, AGENTS.md, and NOTES.md files (these negations match at any depth)
*.md
!README.md
!AGENTS.md
!**/NOTES.md

81
AGENTS.md Normal file
View File

@ -0,0 +1,81 @@
Repository Guidelines
> Local-only note: apply changes through Flux-tracked manifests, not by manual kubectl edits in-cluster—manual tweaks will be reverted by Flux.
## Project Structure & Module Organization
- `infrastructure/`: cluster-scoped building blocks (core, flux-system, traefik, longhorn). Add new platform features by mirroring this layout.
- `services/`: workload manifests per app (`services/gitea/`, etc.) with `kustomization.yaml` plus one file per kind; keep diffs small and focused.
- `dockerfiles/` hosts bespoke images, while `scripts/` stores operational Fish/Bash helpers—extend these directories instead of relying on ad-hoc commands.
## Build, Test, and Development Commands
- `kustomize build services/<app>` (or `kubectl kustomize ...`) renders manifests exactly as Flux will.
- `kubectl apply --server-side --dry-run=client -k services/<app>` checks schema compatibility without touching the cluster.
- `flux reconcile kustomization <name> --namespace flux-system --with-source` pulls the latest Git state after merges or hotfixes.
- `fish scripts/flux_hammer.fish --help` explains the recovery tool; read it before running against production workloads.
## Coding Style & Naming Conventions
- YAML uses two-space indents; retain the leading path comment (e.g. `# services/gitea/deployment.yaml`) to speed code review.
- Keep resource names lowercase kebab-case, align labels/selectors, and mirror namespaces with directory names.
- List resources in `kustomization.yaml` from namespace/config, through storage, then workloads and networking for predictable diffs.
- Scripts start with `#!/usr/bin/env fish` or bash, stay executable, and follow snake_case names such as `flux_hammer.fish`.
## Testing Guidelines
- Run `kustomize build` and the dry-run apply for every service you touch; capture failures before opening a PR.
- `flux diff kustomization <name> --path services/<app>` previews reconciliations—link notable output when behavior shifts.
- Docker edits: `docker build -f dockerfiles/Dockerfile.monerod .` (swap the file you changed) to verify image builds.
## Commit & Pull Request Guidelines
- Keep commit subjects short, present-tense, and optionally scoped (`gpu(titan-24): add RuntimeClass`); squash fixups before review.
- Describe linked issues, affected services, and required operator steps (e.g. `flux reconcile kustomization services-gitea`) in the PR body.
- Focus each PR on one kustomization or service and update `infrastructure/flux-system` when Flux must track new folders.
- Record the validation you ran (dry-runs, diffs, builds) and add screenshots only when ingress or UI behavior changes.
## Security & Configuration Tips
- Never commit credentials; use Vault workflows (`services/vault/`) or SOPS-encrypted manifests wired through `infrastructure/flux-system`.
- Node selectors and tolerations gate workloads to hardware like `hardware: rpi4`; confirm labels before scaling or renaming nodes.
- Pin external images by digest or rely on Flux image automation to follow approved tags and avoid drift.
## Dashboard roadmap / context (2025-12-02)
- Atlas dashboards are generated via `scripts/dashboards_render_atlas.py --build`, which writes JSON under `services/monitoring/dashboards/` and ConfigMaps under `services/monitoring/`. Keep the Grafana manifests in sync by regenerating after edits.
- Atlas Overview panels are paired with internal dashboards (pods, nodes, storage, network, GPU). A new `atlas-gpu` internal dashboard holds the detailed GPU metrics that feed the overview share pie.
- Old Grafana folders (`Atlas Storage`, `Atlas SRE`, `Atlas Public`, `Atlas Nodes`) should be removed in Grafana UI when convenient; only `Atlas Overview` and `Atlas Internal` should remain provisioned.
- Future work: add a separate generator (e.g., `dashboards_render_oceanus.py`) for SUI/oceanus validation dashboards, mirroring the atlas pattern of internal dashboards feeding a public overview.
## Monitoring state (2025-12-03)
- dcgm-exporter DaemonSet pulls `registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04` with nvidia runtime/imagePullSecret; titan-24 exports metrics, titan-22 remains NotReady.
- Atlas Overview is the Grafana home (1h range, 1m refresh), Overview folder UID `overview`, internal folder `atlas-internal` (oceanus-internal stub).
- Panels standardized via generator; hottest row compressed, worker/control rows taller, root disk row taller and top12 bar gauge with labels. GPU share pie uses 1h avg_over_time to persist idle activity.
- Internal dashboards are provisioned without Viewer role; if anonymous still sees them, restart Grafana and tighten auth if needed.
- GPU share panel updated (feature/sso) to use `max_over_time(…[$__range])`, so longer ranges (e.g., 12h) keep recent activity visible. Flux tracking `feature/sso`.
## Upcoming priorities (SSO/storage/mail)
- Establish SSO (Keycloak or similar) and federate Grafana, Gitea, Zot, Nextcloud, Pegasus/Jellyfin; keep Vaultwarden separate until safe.
- Add Nextcloud (limit to rpi5 workers) with office suite; integrate with SSO; plan storage class and ingress.
- Plan mail: mostly self-hosted, relay through trusted provider for outbound; integrate with services (Nextcloud, Vaultwarden, etc.) for notifications and account flows.
## SSO plan sketch (2025-12-03)
- IdP: use Keycloak (preferred) in a new `sso` namespace, Bitnami or codecentric chart with Postgres backing store (single PVC), ingress `sso.bstein.dev`, admin user bound to brad@bstein.dev; stick with local DB initially (no external IdP).
- Auth flow goals: Grafana (OIDC), Gitea (OAuth2/Keycloak), Zot (via Traefik forward-auth/oauth2-proxy), Jellyfin/Pegasus via Jellyfin OAuth/OpenID plugin (map existing usernames; run migration to pre-create users in Keycloak with same usernames/emails and temporary passwords), Pegasus keeps using Jellyfin tokens.
- Steps to implement:
1) Add service folder `services/keycloak/` (namespace, PVC, HelmRelease, ingress, secret for admin creds). Verify with kustomize + Flux reconcile.
2) Seed realm `atlas` with users (import CSV/realm). Create client for Grafana (public/implicit), Gitea (confidential), and a “jellyfin” client for the OAuth plugin; set email for brad@bstein.dev as admin.
3) Reconfigure Grafana to OIDC (disable anonymous to internal folders, leave Overview public via folder permissions). Reconfigure Gitea to OIDC (app.ini).
4) Add Traefik forward-auth (oauth2-proxy) in front of Zot and any other services needing headers-based auth.
5) Deploy Jellyfin OpenID plugin; map Keycloak users to existing Jellyfin usernames; communicate password reset path.
- Migration caution: do not delete existing local creds until SSO validated; keep Pegasus working via Jellyfin tokens during transition.
## Postgres centralization (2025-12-03)
- Prefer a shared in-cluster Postgres deployment with per-service databases to reduce resource sprawl on Pi nodes. Use it for services that can easily point at an external DB.
- Candidates to migrate to shared Postgres: Keycloak (realm DB), Gitea (git DB), Nextcloud (app DB), possibly Grafana (if persistence needed beyond current provisioner), Jitsi prosody/JVB state (if external DB supported). Keep tightly-coupled or lightweight embedded DBs as-is when migration is painful or not supported.
## SSO integration snapshot (2025-12-08)
- Current blockers: Zot still prompts for basic auth/double-login; Vault still wants the token UI after Keycloak (previously 502/404 when vault-0 sealed). Forward-auth middleware on Zot Ingress likely still causing the 401/Found hop; Vault OIDC mount not completing UI flow unless unsealed and preferred login is set.
- Flux-only changes required: remove zot forward-auth middleware from Ingress (let oauth2-proxy handle redirect), ensure Vault OIDC mount is preferred UI login and bound to admin group; keep all edits in repo so Flux enforces them.
- Secrets present (per user): `zot-oidc-client` (client_secret only), `oauth2-proxy-zot-oidc`, `oauth2-proxy-vault-oidc`, `vault-oidc-admin-token`. Zot needs its regcred in the zot namespace if image pulls fail.
- Cluster validation blocked here: `kubectl get nodes` fails (403/permission) and DNS to `*.bstein.dev` fails in this session, so no live curl verification could be run. Re-test on a host with cluster/DNS access after Flux applies fixes.
## Docs hygiene
- Do not add per-service `README.md` files; use `NOTES.md` if documentation is needed inside service folders. Keep only the top-level repo README.
- Keep comments succinct and in a human voice—no AI-sounding notes. Use `NOTES.md` for scratch notes instead of sprinkling reminders into code or extra READMEs.

3
NOTES.md Normal file
View File

@ -0,0 +1,3 @@
# Rotation reminders (temporary secrets set by automation)
- Weave GitOps UI (`cd.bstein.dev`) admin: `admin` / `G1tOps!2025` — rotate immediately after first login.

3
README.md Normal file
View File

@ -0,0 +1,3 @@
# titan-iac
Flux-managed Kubernetes cluster for bstein.dev services.

View File

@ -15,3 +15,4 @@ resources:
- sui-metrics/kustomization.yaml
- keycloak/kustomization.yaml
- oauth2-proxy/kustomization.yaml
- mailu/kustomization.yaml

View File

@ -0,0 +1,18 @@
# clusters/atlas/flux-system/applications/mailu/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: mailu
namespace: flux-system
spec:
interval: 10m
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
path: ./services/mailu
targetNamespace: mailu-mailserver
prune: true
wait: true
dependsOn:
- name: helm

View File

@ -8,7 +8,7 @@ metadata:
spec:
interval: 1m0s
ref:
branch: feature/sso
branch: feature/mailu
secretRef:
name: flux-system-gitea
url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git

View File

@ -0,0 +1,20 @@
# clusters/atlas/flux-system/platform/gitops-ui/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: gitops-ui
namespace: flux-system
spec:
interval: 10m
timeout: 10m
path: ./services/gitops-ui
prune: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
targetNamespace: flux-system
dependsOn:
- name: helm
- name: traefik
wait: true

View File

@ -5,5 +5,6 @@ resources:
- core/kustomization.yaml
- helm/kustomization.yaml
- traefik/kustomization.yaml
- gitops-ui/kustomization.yaml
- monitoring/kustomization.yaml
- longhorn-ui/kustomization.yaml

View File

@ -1,5 +0,0 @@
# Oceanus Cluster Scaffold
This directory prepares the Flux and Kustomize layout for a future Oceanus-managed cluster.
Populate `flux-system/` with `gotk-components.yaml` and related manifests after running `flux bootstrap`.
Define node-specific resources under `infrastructure/modules/profiles/oceanus-validator/` and reference workloads in `applications/` as they come online.

View File

@ -2,15 +2,14 @@
| Hostname | Role / Function | Managed By | Notes |
|------------|--------------------------------|---------------------|-------|
| titan-db | HA control plane database | Ansible | PostgreSQL / etcd backing services |
| titan-0a | Kubernetes control-plane | Flux (atlas cluster)| HA leader, tainted for control only |
| titan-0b | Kubernetes control-plane | Flux (atlas cluster)| Standby control node |
| titan-0c | Kubernetes control-plane | Flux (atlas cluster)| Standby control node |
| titan-04-19| Raspberry Pi workers | Flux (atlas cluster)| Workload nodes, labelled per hardware |
| titan-20&21| NVIDIA Jetson workers | Flux (atlas cluster)| Workload nodes, labelled per hardware |
| titan-22 | GPU mini-PC (Jellyfin) | Flux + Ansible | NVIDIA runtime managed via `modules/profiles/atlas-ha` |
| titan-23 | Dedicated SUI validator Oceanus| Manual + Ansible | Baremetal validator workloads, exposes metrics to atlas |
| titan-24 | Tethys hybrid node | Flux + Ansible | Runs SUI metrics via K8s, validator via Ansible |
| titan-db | HA control plane database | Ansible | PostgreSQL / etcd backing services |
| titan-jh | Jumphost & bastion | Ansible | Entry point / future KVM services |
| oceanus | Dedicated SUI validator host | Ansible / Flux prep | Baremetal validator workloads, exposes metrics to atlas; Kustomize scaffold under `clusters/oceanus/` |
| titan-jh | Jumphost & bastion & lesavka | Ansible | Entry point / future KVM services / custom KVM - lesavka |
| styx | Air-gapped workstation | Manual / Scripts | Remains isolated, scripts tracked in `hosts/styx` |
Use the `clusters/` directory for cluster-scoped state and the `hosts/` directory for baremetal orchestration.

View File

@ -5,3 +5,4 @@ resources:
- ../modules/base
- ../modules/profiles/atlas-ha
- ../sources/cert-manager/letsencrypt.yaml
- ../sources/cert-manager/letsencrypt-prod.yaml

View File

@ -0,0 +1,14 @@
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
name: letsencrypt-prod
spec:
acme:
email: brad.stein@gmail.com
server: https://acme-v02.api.letsencrypt.org/directory
privateKeySecretRef:
name: letsencrypt-prod-account-key
solvers:
- http01:
ingress:
class: traefik

View File

@ -0,0 +1,10 @@
# infrastructure/sources/helm/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- grafana.yaml
- hashicorp.yaml
- jetstack.yaml
- mailu.yaml
- prometheus.yaml
- victoria-metrics.yaml

View File

@ -0,0 +1,9 @@
# infrastructure/sources/helm/mailu.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: mailu
namespace: flux-system
spec:
interval: 1h
url: https://mailu.github.io/helm-charts

View File

@ -36,11 +36,12 @@ PUBLIC_FOLDER = "overview"
PRIVATE_FOLDER = "atlas-internal"
PERCENT_THRESHOLDS = {
"mode": "percentage",
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 70},
{"color": "red", "value": 85},
{"color": "yellow", "value": 50},
{"color": "orange", "value": 75},
{"color": "red", "value": 91.5},
],
}
@ -81,7 +82,7 @@ CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
WORKER_SUFFIX = f"/{WORKER_TOTAL}"
CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system"
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
GAUGE_WIDTHS = [5, 5, 5, 5, 4]
GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4]
CONTROL_WORKLOADS_EXPR = (
f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}}) or on() vector(0)'
)
@ -187,17 +188,64 @@ def namespace_gpu_share_expr():
return namespace_share_expr(NAMESPACE_GPU_RAW)
PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
PROBLEM_PODS_EXPR = (
'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"})) '
"or on() vector(0)"
)
CRASHLOOP_EXPR = (
'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
'{reason=~"CrashLoopBackOff|ImagePullBackOff"}))'
'{reason=~"CrashLoopBackOff|ImagePullBackOff"})) '
"or on() vector(0)"
)
STUCK_TERMINATING_EXPR = (
'sum(max by (namespace,pod) ('
'((time() - kube_pod_deletion_timestamp{pod!=""}) > bool 600)'
' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)'
'))'
')) '
"or on() vector(0)"
)
UPTIME_WINDOW = "30d"
TRAEFIK_READY_EXPR = (
"("
'sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})'
" / clamp_min("
'sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)'
")"
)
CONTROL_READY_FRACTION_EXPR = (
f"(sum(kube_node_status_condition{{condition=\"Ready\",status=\"true\",node=~\"{CONTROL_REGEX}\"}})"
f" / {CONTROL_TOTAL})"
)
UPTIME_AVAIL_EXPR = (
f"min(({CONTROL_READY_FRACTION_EXPR}), ({TRAEFIK_READY_EXPR}))"
)
# Tie-breaker to deterministically pick one node per namespace when shares tie.
NODE_TIEBREAKER = " + ".join(
f"({node_filter(node)}) * 1e-6 * {idx}"
for idx, node in enumerate(CONTROL_ALL + WORKER_NODES, start=1)
)
UPTIME_AVG_EXPR = f"avg_over_time(({UPTIME_AVAIL_EXPR})[{UPTIME_WINDOW}:5m])"
UPTIME_PERCENT_EXPR = UPTIME_AVG_EXPR
UPTIME_NINES_EXPR = f"-log10(1 - clamp_max({UPTIME_AVG_EXPR}, 0.999999999))"
UPTIME_THRESHOLDS = {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": 2},
{"color": "yellow", "value": 3},
{"color": "green", "value": 3.5},
],
}
UPTIME_PERCENT_THRESHOLDS = {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": 0.999},
{"color": "yellow", "value": 0.9999},
{"color": "green", "value": 0.99999},
],
}
PROBLEM_TABLE_EXPR = (
"(time() - kube_pod_created{pod!=\"\"}) "
"* on(namespace,pod) group_left(node) kube_pod_info "
@ -291,6 +339,34 @@ NET_INTERNAL_EXPR = (
'+ rate(container_network_transmit_bytes_total{namespace!="traefik",pod!=""}[5m]))'
' or on() vector(0)'
)
APISERVER_5XX_RATE = 'sum(rate(apiserver_request_total{code=~"5.."}[5m]))'
APISERVER_P99_LATENCY_MS = (
"histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000"
)
ETCD_P99_LATENCY_MS = (
"histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000"
)
def _success_ratio(success, total):
    """Return a PromQL expression for ``success / total`` that is safe at low traffic.

    ``rate()`` yields requests per second, so clamping the denominator to 1
    would distort the ratio whenever traffic is below 1 req/s. Instead:
    - ``(total) > 0`` drops the sample when there is no traffic, avoiding 0/0;
    - ``or on() vector(0)`` on the numerator keeps a value when every request
      failed (the success selector then matches no series);
    - the trailing ``or on() vector(1)`` reports a perfect SLI when there were
      no requests at all (no traffic means no failures).
    """
    return f"((({success}) or on() vector(0)) / (({total}) > 0)) or on() vector(1)"


TRAEFIK_TOTAL_5M = "sum(rate(traefik_entrypoint_requests_total[5m]))"
TRAEFIK_SUCCESS_5M = 'sum(rate(traefik_entrypoint_requests_total{code!~"5.."}[5m]))'
# Fraction of non-5xx requests over the last 5 minutes.
TRAEFIK_SLI_5M = _success_ratio(TRAEFIK_SUCCESS_5M, TRAEFIK_TOTAL_5M)
TRAEFIK_P99_LATENCY_MS = (
    "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000"
)
TRAEFIK_P95_LATENCY_MS = (
    "histogram_quantile(0.95, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000"
)
# Availability objective used to derive the error budget for burn-rate panels.
SLO_AVAILABILITY = 0.999


def traefik_sli(window):
    """Return a PromQL expression for the Traefik success ratio over ``window``.

    ``window`` is a PromQL range such as ``"5m"`` or ``"1h"``. The result is
    the share of entrypoint requests whose status code is not 5xx.
    """
    total = f"sum(rate(traefik_entrypoint_requests_total[{window}]))"
    success = f'sum(rate(traefik_entrypoint_requests_total{{code!~"5.."}}[{window}]))'
    return _success_ratio(success, total)


def traefik_burn(window):
    """Return a PromQL expression for the error-budget burn rate over ``window``.

    Burn rate is the observed error ratio divided by the error budget
    (``1 - SLO_AVAILABILITY``); a value of 1 consumes the budget exactly on pace.
    """
    sli = traefik_sli(window)
    # Format the budget compactly so float noise (0.0010000000000000009)
    # does not leak into the generated query.
    error_budget = f"{1 - SLO_AVAILABILITY:.6g}"
    return f"(1 - ({sli})) / {error_budget}"
# ---------------------------------------------------------------------------
# Panel factories
@ -304,6 +380,7 @@ def stat_panel(
grid,
*,
unit="none",
decimals=None,
thresholds=None,
text_mode="value",
legend=None,
@ -313,7 +390,7 @@ def stat_panel(
):
"""Return a Grafana stat panel definition."""
defaults = {
"color": {"mode": "palette-classic"},
"color": {"mode": "thresholds"},
"mappings": [],
"thresholds": thresholds
or {
@ -328,6 +405,8 @@ def stat_panel(
}
if value_suffix:
defaults["custom"]["valueSuffix"] = value_suffix
if decimals is not None:
defaults["decimals"] = decimals
panel = {
"id": panel_id,
"type": "stat",
@ -446,17 +525,32 @@ def table_panel(
*,
unit="none",
transformations=None,
instant=False,
options=None,
filterable=True,
footer=None,
format=None,
):
"""Return a Grafana table panel definition."""
# Optional PromQL subquery helpers in expr: share(), etc.
panel_options = {"showHeader": True, "columnFilters": False}
if options:
panel_options.update(options)
if footer is not None:
panel_options["footer"] = footer
field_defaults = {"unit": unit, "custom": {"filterable": filterable}}
target = {"expr": expr, "refId": "A", **({"instant": True} if instant else {})}
if format:
target["format"] = format
panel = {
"id": panel_id,
"type": "table",
"title": title,
"datasource": PROM_DS,
"gridPos": grid,
"targets": [{"expr": expr, "refId": "A"}],
"fieldConfig": {"defaults": {"unit": unit}, "overrides": []},
"options": {"showHeader": True},
"targets": [target],
"fieldConfig": {"defaults": field_defaults, "overrides": []},
"options": panel_options,
}
if transformations:
panel["transformations"] = transformations
@ -482,7 +576,7 @@ def pie_panel(panel_id, title, expr, grid):
"options": {
"legend": {"displayMode": "list", "placement": "right"},
"pieType": "pie",
"displayLabels": ["percent"],
"displayLabels": [],
"tooltip": {"mode": "single"},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
@ -491,7 +585,19 @@ def pie_panel(panel_id, title, expr, grid):
}
def bargauge_panel(panel_id, title, expr, grid, *, unit="none", links=None):
def bargauge_panel(
panel_id,
title,
expr,
grid,
*,
unit="none",
links=None,
limit=None,
thresholds=None,
decimals=None,
instant=False,
):
"""Return a bar gauge panel with label-aware reduction."""
panel = {
"id": panel_id,
@ -499,13 +605,16 @@ def bargauge_panel(panel_id, title, expr, grid, *, unit="none", links=None):
"title": title,
"datasource": PROM_DS,
"gridPos": grid,
"targets": [{"expr": expr, "refId": "A", "legendFormat": "{{node}}"}],
"targets": [
{"expr": expr, "refId": "A", "legendFormat": "{{node}}", **({"instant": True} if instant else {})}
],
"fieldConfig": {
"defaults": {
"unit": unit,
"min": 0,
"max": 100 if unit == "percent" else None,
"thresholds": {
"thresholds": thresholds
or {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
@ -527,8 +636,19 @@ def bargauge_panel(panel_id, title, expr, grid, *, unit="none", links=None):
},
},
}
if decimals is not None:
panel["fieldConfig"]["defaults"]["decimals"] = decimals
if links:
panel["links"] = links
# Keep bars ordered by value descending for readability.
panel["transformations"] = [
{
"id": "sortBy",
"options": {"fields": ["Value"], "order": "desc"},
}
]
if limit:
panel["transformations"].append({"id": "limit", "options": {"limit": limit}})
return panel
@ -555,81 +675,37 @@ def link_to(uid):
def build_overview():
panels = []
count_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 3},
],
}
row1_stats = [
(
1,
"Workers Ready",
f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
WORKER_SUFFIX,
WORKER_TOTAL,
None,
),
(
2,
"Control Plane Ready",
f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
CONTROL_SUFFIX,
CONTROL_TOTAL,
None,
),
(
3,
"Control Plane Workloads",
CONTROL_WORKLOADS_EXPR,
None,
4,
link_to("atlas-pods"),
),
(
4,
"Problem Pods",
PROBLEM_PODS_EXPR,
None,
1,
link_to("atlas-pods"),
),
(
5,
"Stuck Terminating",
STUCK_TERMINATING_EXPR,
None,
1,
link_to("atlas-pods"),
),
]
def gauge_grid(idx):
width = GAUGE_WIDTHS[idx] if idx < len(GAUGE_WIDTHS) else 4
x = sum(GAUGE_WIDTHS[:idx])
return width, x
for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats):
thresholds = None
min_value = 0
max_value = ok_value or 5
if panel_id == 1:
max_value = WORKER_TOTAL
thresholds = {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": WORKER_TOTAL - 2},
{"color": "yellow", "value": WORKER_TOTAL - 1},
{"color": "green", "value": WORKER_TOTAL},
],
}
elif panel_id == 2:
max_value = CONTROL_TOTAL
thresholds = {
{
"id": 2,
"title": "Control Plane Ready",
"expr": f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
"kind": "gauge",
"max_value": CONTROL_TOTAL,
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "green", "value": CONTROL_TOTAL},
],
}
elif panel_id in (3, 4, 5):
max_value = 4
thresholds = {
},
},
{
"id": 3,
"title": "Control Plane Workloads",
"expr": CONTROL_WORKLOADS_EXPR,
"kind": "stat",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
@ -637,40 +713,122 @@ def build_overview():
{"color": "orange", "value": 2},
{"color": "red", "value": 3},
],
}
else:
thresholds = {
},
"links": link_to("atlas-pods"),
},
{
"id": 5,
"title": "Stuck Terminating",
"expr": STUCK_TERMINATING_EXPR,
"kind": "stat",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "red", "value": max_value},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 3},
],
}
},
"links": link_to("atlas-pods"),
},
{
"id": 27,
"title": "Atlas Availability (30d)",
"expr": UPTIME_PERCENT_EXPR,
"kind": "stat",
"thresholds": UPTIME_PERCENT_THRESHOLDS,
"unit": "percentunit",
"decimals": 3,
"text_mode": "value",
},
{
"id": 4,
"title": "Problem Pods",
"expr": PROBLEM_PODS_EXPR,
"kind": "stat",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 3},
],
},
"links": link_to("atlas-pods"),
},
{
"id": 6,
"title": "CrashLoop / ImagePull",
"expr": CRASHLOOP_EXPR,
"kind": "stat",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 3},
],
},
"links": link_to("atlas-pods"),
},
{
"id": 1,
"title": "Workers Ready",
"expr": f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
"kind": "gauge",
"max_value": WORKER_TOTAL,
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": WORKER_TOTAL - 2},
{"color": "yellow", "value": WORKER_TOTAL - 1},
{"color": "green", "value": WORKER_TOTAL},
],
},
},
]
def gauge_grid(idx):
width = GAUGE_WIDTHS[idx] if idx < len(GAUGE_WIDTHS) else 4
x = sum(GAUGE_WIDTHS[:idx])
return width, x
for idx, item in enumerate(row1_stats):
panel_id = item["id"]
width, x = gauge_grid(idx)
if panel_id in (3, 4, 5):
grid = {"h": 5, "w": width, "x": x, "y": 0}
kind = item.get("kind", "gauge")
if kind == "stat":
panels.append(
stat_panel(
panel_id,
title,
expr,
{"h": 5, "w": width, "x": x, "y": 0},
thresholds=thresholds,
legend=None,
links=links,
text_mode="value",
)
)
item["title"],
item["expr"],
grid,
thresholds=item.get("thresholds"),
legend=None,
links=item.get("links"),
text_mode=item.get("text_mode", "value"),
value_suffix=item.get("value_suffix"),
unit=item.get("unit", "none"),
decimals=item.get("decimals"),
)
)
else:
panels.append(
gauge_panel(
panel_id,
title,
expr,
{"h": 5, "w": width, "x": x, "y": 0},
min_value=min_value,
max_value=max_value,
thresholds=thresholds,
links=links,
item["title"],
item["expr"],
grid,
min_value=0,
max_value=item.get("max_value", 5),
thresholds=item.get("thresholds"),
links=item.get("links"),
)
)
@ -774,7 +932,7 @@ def build_overview():
timeseries_panel(
16,
"Control plane CPU",
node_cpu_expr(CONTROL_REGEX),
node_cpu_expr(CONTROL_ALL_REGEX),
{"h": 10, "w": 12, "x": 0, "y": 44},
unit="percent",
legend="{{node}}",
@ -786,7 +944,7 @@ def build_overview():
timeseries_panel(
17,
"Control plane RAM",
node_mem_expr(CONTROL_REGEX),
node_mem_expr(CONTROL_ALL_REGEX),
{"h": 10, "w": 12, "x": 12, "y": 44},
unit="percent",
legend="{{node}}",
@ -795,6 +953,36 @@ def build_overview():
)
)
panels.append(
pie_panel(
28,
"Node Pod Share",
'(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100',
{"h": 10, "w": 12, "x": 0, "y": 54},
)
)
panels.append(
bargauge_panel(
29,
"Top Nodes by Pod Count",
'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))',
{"h": 10, "w": 12, "x": 12, "y": 54},
unit="none",
limit=12,
decimals=0,
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 50},
{"color": "orange", "value": 75},
{"color": "red", "value": 100},
],
},
instant=True,
)
)
panels.append(
timeseries_panel(
18,
@ -840,7 +1028,7 @@ def build_overview():
21,
"Root Filesystem Usage",
root_usage_expr(),
{"h": 16, "w": 12, "x": 0, "y": 54},
{"h": 16, "w": 12, "x": 0, "y": 64},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
@ -855,8 +1043,9 @@ def build_overview():
22,
"Nodes Closest to Full Root Disks",
f"topk(12, {root_usage_expr()})",
{"h": 16, "w": 12, "x": 12, "y": 54},
{"h": 16, "w": 12, "x": 12, "y": 64},
unit="percent",
thresholds=PERCENT_THRESHOLDS,
links=link_to("atlas-storage"),
)
)
@ -874,13 +1063,7 @@ def build_overview():
"templating": {"list": []},
"time": {"from": "now-1h", "to": "now"},
"refresh": "1m",
"links": [
{"title": "Atlas Pods", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": False},
{"title": "Atlas Nodes", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False},
{"title": "Atlas Storage", "type": "dashboard", "dashboardUid": "atlas-storage", "keepTime": False},
{"title": "Atlas Network", "type": "dashboard", "dashboardUid": "atlas-network", "keepTime": False},
{"title": "Atlas GPU", "type": "dashboard", "dashboardUid": "atlas-gpu", "keepTime": False},
],
"links": [],
}
@ -980,6 +1163,91 @@ def build_pods_dashboard():
],
)
)
panels.append(
pie_panel(
8,
"Node Pod Share",
'(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100',
{"h": 8, "w": 12, "x": 12, "y": 34},
)
)
panels.append(
bargauge_panel(
9,
"Top Nodes by Pod Count",
'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))',
{"h": 8, "w": 12, "x": 0, "y": 34},
unit="none",
limit=12,
decimals=0,
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 50},
{"color": "orange", "value": 75},
{"color": "red", "value": 100},
],
},
instant=True,
)
)
share_expr = (
'(sum by (namespace,node) (kube_pod_info{pod!="" , node!=""}) '
'/ on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=""}), 1) * 100)'
)
rank_terms = [
f"(sum by (node) (kube_node_info{{node=\"{node}\"}}) * 0 + {idx * 1e-3})"
for idx, node in enumerate(CONTROL_ALL + WORKER_NODES, start=1)
]
rank_expr = " or ".join(rank_terms)
score_expr = f"{share_expr} + on(node) group_left() ({rank_expr})"
mask_expr = (
f"{score_expr} == bool on(namespace) group_left() "
f"(max by (namespace) ({score_expr}))"
)
panels.append(
table_panel(
10,
"Namespace Plurality by Node v27",
(
f"{share_expr} * on(namespace,node) group_left() "
f"({mask_expr})"
),
{"h": 8, "w": 24, "x": 0, "y": 42},
unit="percent",
transformations=[
{"id": "labelsToFields", "options": {}},
{"id": "organize", "options": {"excludeByName": {"Time": True}}},
{"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 0}},
{
"id": "sortBy",
"options": {"fields": ["Value"], "order": "desc"},
},
{
"id": "groupBy",
"options": {
"fields": {
"namespace": {
"aggregations": [
{"field": "Value", "operation": "max"},
{"field": "node", "operation": "first"},
]
}
},
"rowBy": ["namespace"],
},
},
],
instant=True,
options={"showColumnFilters": False},
filterable=False,
footer={"show": False, "fields": "", "calcs": []},
format="table",
)
)
return {
"uid": "atlas-pods",
"title": "Atlas Pods",
@ -1022,12 +1290,69 @@ def build_nodes_dashboard():
{"h": 4, "w": 8, "x": 16, "y": 0},
)
)
panels.append(
stat_panel(
9,
"API Server 5xx rate",
APISERVER_5XX_RATE,
{"h": 4, "w": 8, "x": 0, "y": 4},
unit="req/s",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 0.05},
{"color": "orange", "value": 0.2},
{"color": "red", "value": 0.5},
],
},
decimals=3,
)
)
panels.append(
stat_panel(
10,
"API Server P99 latency",
APISERVER_P99_LATENCY_MS,
{"h": 4, "w": 8, "x": 8, "y": 4},
unit="ms",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 250},
{"color": "orange", "value": 400},
{"color": "red", "value": 600},
],
},
decimals=1,
)
)
panels.append(
stat_panel(
11,
"etcd P99 latency",
ETCD_P99_LATENCY_MS,
{"h": 4, "w": 8, "x": 16, "y": 4},
unit="ms",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 50},
{"color": "orange", "value": 100},
{"color": "red", "value": 200},
],
},
decimals=1,
)
)
panels.append(
timeseries_panel(
4,
"Node CPU",
node_cpu_expr(),
{"h": 9, "w": 24, "x": 0, "y": 4},
{"h": 9, "w": 24, "x": 0, "y": 8},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
@ -1040,7 +1365,7 @@ def build_nodes_dashboard():
5,
"Node RAM",
node_mem_expr(),
{"h": 9, "w": 24, "x": 0, "y": 13},
{"h": 9, "w": 24, "x": 0, "y": 17},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
@ -1053,7 +1378,7 @@ def build_nodes_dashboard():
6,
"Control Plane (incl. titan-db) CPU",
node_cpu_expr(CONTROL_ALL_REGEX),
{"h": 9, "w": 12, "x": 0, "y": 22},
{"h": 9, "w": 12, "x": 0, "y": 26},
unit="percent",
legend="{{node}}",
legend_display="table",
@ -1065,7 +1390,7 @@ def build_nodes_dashboard():
7,
"Control Plane (incl. titan-db) RAM",
node_mem_expr(CONTROL_ALL_REGEX),
{"h": 9, "w": 12, "x": 12, "y": 22},
{"h": 9, "w": 12, "x": 12, "y": 26},
unit="percent",
legend="{{node}}",
legend_display="table",
@ -1077,7 +1402,7 @@ def build_nodes_dashboard():
8,
"Root Filesystem Usage",
root_usage_expr(),
{"h": 9, "w": 24, "x": 0, "y": 31},
{"h": 9, "w": 24, "x": 0, "y": 35},
unit="percent",
legend="{{node}}",
legend_display="table",
@ -1204,43 +1529,107 @@ def build_network_dashboard():
panels.append(
stat_panel(
1,
"Ingress Traffic",
NET_INGRESS_EXPR,
{"h": 4, "w": 8, "x": 0, "y": 0},
unit="Bps",
"Ingress Success Rate (5m)",
TRAEFIK_SLI_5M,
{"h": 4, "w": 6, "x": 0, "y": 0},
unit="percentunit",
decimals=2,
thresholds={
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": 0.995},
{"color": "yellow", "value": 0.999},
{"color": "green", "value": 0.9995},
],
},
)
)
panels.append(
stat_panel(
2,
"Egress Traffic",
NET_EGRESS_EXPR,
{"h": 4, "w": 8, "x": 8, "y": 0},
unit="Bps",
"Error Budget Burn (1h)",
traefik_burn("1h"),
{"h": 4, "w": 6, "x": 6, "y": 0},
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 4},
],
},
decimals=2,
)
)
panels.append(
stat_panel(
3,
"Intra-Cluster Traffic",
NET_INTERNAL_EXPR,
{"h": 4, "w": 8, "x": 16, "y": 0},
unit="Bps",
"Error Budget Burn (6h)",
traefik_burn("6h"),
{"h": 4, "w": 6, "x": 12, "y": 0},
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 4},
],
},
decimals=2,
)
)
panels.append(
stat_panel(
4,
"Top Router req/s",
f"topk(1, {TRAEFIK_ROUTER_EXPR})",
"Edge P99 Latency (ms)",
TRAEFIK_P99_LATENCY_MS,
{"h": 4, "w": 6, "x": 18, "y": 0},
unit="ms",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 200},
{"color": "orange", "value": 350},
{"color": "red", "value": 500},
],
},
decimals=1,
)
)
panels.append(
stat_panel(
5,
"Ingress Traffic",
NET_INGRESS_EXPR,
{"h": 4, "w": 8, "x": 0, "y": 4},
unit="req/s",
legend="{{router}}",
unit="Bps",
)
)
panels.append(
stat_panel(
6,
"Egress Traffic",
NET_EGRESS_EXPR,
{"h": 4, "w": 8, "x": 8, "y": 4},
unit="Bps",
)
)
panels.append(
stat_panel(
7,
"Intra-Cluster Traffic",
NET_INTERNAL_EXPR,
{"h": 4, "w": 8, "x": 16, "y": 4},
unit="Bps",
)
)
panels.append(
timeseries_panel(
5,
8,
"Per-Node Throughput",
f'avg by (node) (({NET_NODE_TX_PHYS} + {NET_NODE_RX_PHYS}) * on(instance) group_left(node) {NODE_INFO})',
{"h": 8, "w": 24, "x": 0, "y": 8},
@ -1252,7 +1641,7 @@ def build_network_dashboard():
)
panels.append(
table_panel(
6,
9,
"Top Namespaces",
'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) '
'+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))',
@ -1263,7 +1652,7 @@ def build_network_dashboard():
)
panels.append(
table_panel(
7,
10,
"Top Pods",
'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) '
'+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))',
@ -1274,7 +1663,7 @@ def build_network_dashboard():
)
panels.append(
timeseries_panel(
8,
11,
"Traefik Routers (req/s)",
f"topk(10, {TRAEFIK_ROUTER_EXPR})",
{"h": 9, "w": 12, "x": 0, "y": 25},
@ -1286,7 +1675,7 @@ def build_network_dashboard():
)
panels.append(
timeseries_panel(
9,
12,
"Traefik Entrypoints (req/s)",
'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))',
{"h": 9, "w": 12, "x": 12, "y": 25},

204
scripts/mailu_sync.py Normal file
View File

@ -0,0 +1,204 @@
#!/usr/bin/env python3
"""
Sync Keycloak users to Mailu mailboxes.
- Generates/stores a mailu_app_password attribute in Keycloak (admin-only)
- Upserts the mailbox in Mailu Postgres using that password
"""
import os
import sys
import json
import time
import secrets
import string
import datetime
import requests
import psycopg2
from psycopg2.extras import RealDictCursor
from passlib.hash import bcrypt_sha256
# Keycloak connection settings. Each one is read with os.environ[...], so a
# missing variable raises KeyError as soon as this module is imported.
KC_BASE = os.environ["KEYCLOAK_BASE_URL"].rstrip("/")  # no trailing "/" so URLs can be built as f"{KC_BASE}/..."
KC_REALM = os.environ["KEYCLOAK_REALM"]
KC_CLIENT_ID = os.environ["KEYCLOAK_CLIENT_ID"]
KC_CLIENT_SECRET = os.environ["KEYCLOAK_CLIENT_SECRET"]

# Only addresses in this domain get a mailbox (see ensure_mailu_user).
MAILU_DOMAIN = os.environ["MAILU_DOMAIN"]
# Written to the quota_bytes column of newly inserted mailboxes.
MAILU_DEFAULT_QUOTA = int(os.environ.get("MAILU_DEFAULT_QUOTA", "20000000000"))

# Keyword arguments handed to psycopg2.connect() in main().
DB_CONFIG = {
    "host": os.environ["MAILU_DB_HOST"],
    "port": int(os.environ.get("MAILU_DB_PORT", "5432")),
    "dbname": os.environ["MAILU_DB_NAME"],
    "user": os.environ["MAILU_DB_USER"],
    "password": os.environ["MAILU_DB_PASSWORD"],
}

# Shared HTTP session used for every Keycloak request in this module.
SESSION = requests.Session()
def log(msg):
    """Print ``msg`` followed by a newline on stdout and flush right away."""
    print(msg, file=sys.stdout, flush=True)
def get_kc_token():
    """Obtain an access token from Keycloak via the client-credentials grant.

    Returns:
        The ``access_token`` string from the token endpoint's JSON response.

    Raises:
        Whatever ``raise_for_status()`` raises when the token request fails.
    """
    token_url = f"{KC_BASE}/realms/{KC_REALM}/protocol/openid-connect/token"
    form = {
        "grant_type": "client_credentials",
        "client_id": KC_CLIENT_ID,
        "client_secret": KC_CLIENT_SECRET,
    }
    response = SESSION.post(token_url, data=form, timeout=15)
    response.raise_for_status()
    return response.json()["access_token"]
def kc_get_users(token):
    """Return every enabled user of the realm, fetched page by page.

    Pages of 200 users are requested until a page comes back with fewer
    entries than the page size, which marks the end of the listing.

    Args:
        token: Bearer token sent in the Authorization header.

    Returns:
        A list with the user representations of all pages, in request order.
    """
    page_size = 200
    endpoint = f"{KC_BASE}/admin/realms/{KC_REALM}/users"
    auth = {"Authorization": f"Bearer {token}"}

    collected = []
    offset = 0
    while True:
        response = SESSION.get(
            endpoint,
            params={"first": offset, "max": page_size, "enabled": "true"},
            headers=auth,
            timeout=20,
        )
        response.raise_for_status()
        page = response.json()
        collected.extend(page)
        if len(page) < page_size:
            # Short page: nothing left to fetch.
            return collected
        offset += page_size
def kc_update_attributes(token, user, attributes):
    """Store ``attributes`` on a Keycloak user and verify the write stuck.

    The user is updated with a PUT that repeats the profile fields taken from
    ``user`` (presumably so the update does not blank them; confirm against
    the Keycloak admin API) and is then re-read to check that
    ``mailu_app_password`` is present.

    Args:
        token: Bearer token sent in the Authorization header.
        user: User representation; must contain ``id`` and ``username``.
        attributes: Attribute mapping sent as the user's ``attributes``.

    Raises:
        RuntimeError: If the re-read user has no ``mailu_app_password``
            attribute. RuntimeError is an Exception subclass, so callers
            that catch Exception keep working.
        Whatever ``raise_for_status()`` raises when either request fails.
    """
    auth = {"Authorization": f"Bearer {token}"}
    payload = {
        "firstName": user.get("firstName"),
        "lastName": user.get("lastName"),
        "email": user.get("email"),
        "enabled": user.get("enabled", True),
        "username": user["username"],
        "emailVerified": user.get("emailVerified", False),
        "attributes": attributes,
    }
    user_url = f"{KC_BASE}/admin/realms/{KC_REALM}/users/{user['id']}"

    resp = SESSION.put(
        user_url,
        headers={**auth, "Content-Type": "application/json"},
        json=payload,
        timeout=20,
    )
    resp.raise_for_status()

    # Read the user back: a successful PUT alone does not prove the attribute
    # was persisted.
    verify = SESSION.get(
        user_url,
        headers=auth,
        params={"briefRepresentation": "false"},
        timeout=15,
    )
    verify.raise_for_status()
    stored = verify.json().get("attributes") or {}
    if not stored.get("mailu_app_password"):
        raise RuntimeError(
            f"attribute not persisted for {user.get('email') or user['username']}"
        )
def random_password():
    """Return a 24-character password of ASCII letters and digits.

    Characters are drawn with ``secrets.choice``.
    """
    pool = string.ascii_letters + string.digits
    picked = [secrets.choice(pool) for _ in range(24)]
    return "".join(picked)
def ensure_mailu_user(cursor, email, password, display_name):
    """Insert the Mailu mailbox row for ``email`` or refresh its password.

    Nothing is executed when the address is malformed (no "@" or an empty
    local part) or when its domain differs from MAILU_DOMAIN, compared
    case-insensitively. On conflict with an existing ``email`` only the
    password, the enabled flag and ``updated_at`` are changed.

    Args:
        cursor: DB cursor exposing ``execute(sql, params)``.
        email: Full mailbox address.
        password: Plain-text password; stored as a bcrypt_sha256 hash.
        display_name: Name for ``displayed_name``; the local part is used
            when this is empty.
    """
    # partition() never raises on a missing "@", unlike unpacking split().
    localpart, separator, domain = email.partition("@")
    if not separator or not localpart:
        return
    if domain.lower() != MAILU_DOMAIN.lower():
        return

    hashed = bcrypt_sha256.hash(password)
    # Naive UTC timestamp, the same value the deprecated utcnow() produced.
    now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    cursor.execute(
        """
        INSERT INTO "user" (
            email, localpart, domain_name, password,
            quota_bytes, quota_bytes_used,
            global_admin, enabled, enable_imap, enable_pop, allow_spoofing,
            forward_enabled, forward_destination, forward_keep,
            reply_enabled, reply_subject, reply_body, reply_startdate, reply_enddate,
            displayed_name, spam_enabled, spam_mark_as_read, spam_threshold,
            change_pw_next_login, created_at, updated_at, comment
        )
        VALUES (
            %(email)s, %(localpart)s, %(domain)s, %(password)s,
            %(quota)s, 0,
            false, true, true, true, false,
            false, '', true,
            false, NULL, NULL, DATE '1900-01-01', DATE '2999-12-31',
            %(display)s, true, true, 80,
            false, CURRENT_DATE, %(now)s, ''
        )
        ON CONFLICT (email) DO UPDATE
        SET password = EXCLUDED.password,
            enabled = true,
            updated_at = EXCLUDED.updated_at
        """,
        {
            "email": email,
            "localpart": localpart,
            "domain": domain,
            "password": hashed,
            "quota": MAILU_DEFAULT_QUOTA,
            "display": display_name or localpart,
            "now": now,
        },
    )
def _first_attribute_value(value):
    """Return the usable string from a Keycloak attribute value, or None.

    Accepts a list (first element, if any) or a plain string; anything else
    yields None.
    """
    if isinstance(value, list):
        return value[0] if value else None
    if isinstance(value, str):
        return value
    return None


def _sync_user(token, cursor, user):
    """Backfill the app password for one user and upsert their mailbox."""
    attrs = user.get("attributes", {}) or {}
    app_pw = _first_attribute_value(attrs.get("mailu_app_password"))

    # Users without an e-mail get <username>@MAILU_DOMAIN.
    email = user.get("email") or f"{user['username']}@{MAILU_DOMAIN}"

    # Decide on the domain first: a user outside the mail domain never gets a
    # mailbox, so no password must be generated or written to Keycloak.
    if email.split("@", 1)[-1].lower() != MAILU_DOMAIN.lower():
        log(f"Skipping {email}: not in mail domain {MAILU_DOMAIN}")
        return

    if not app_pw:
        app_pw = random_password()
        attrs["mailu_app_password"] = app_pw
        kc_update_attributes(token, user, attrs)
        log(f"Set mailu_app_password for {email}")

    display_name = " ".join(
        part for part in [user.get("firstName"), user.get("lastName")] if part
    ).strip()
    ensure_mailu_user(cursor, email, app_pw, display_name)
    log(f"Synced mailbox for {email}")


def main():
    """Sync every enabled Keycloak user in MAILU_DOMAIN to a Mailu mailbox.

    Fetches the users, makes sure each one in the mail domain has a
    ``mailu_app_password`` attribute (generating and storing one when it is
    missing) and upserts the mailbox with that password. Any exception stops
    the run and propagates; the cursor and connection are closed either way.
    """
    token = get_kc_token()
    users = kc_get_users(token)
    if not users:
        log("No users found; exiting.")
        return

    conn = psycopg2.connect(**DB_CONFIG)
    try:
        conn.autocommit = True
        cursor = conn.cursor(cursor_factory=RealDictCursor)
        try:
            for user in users:
                _sync_user(token, cursor, user)
        finally:
            cursor.close()
    finally:
        # Close the connection even when a user fails mid-run.
        conn.close()
if __name__ == "__main__":
    # Any failure is logged as a single line and turned into exit status 1.
    try:
        main()
    except Exception as err:
        log(f"ERROR: {err}")
        raise SystemExit(1)

49
scripts/nextcloud-mail-sync.sh Executable file
View File

@ -0,0 +1,49 @@
#!/bin/bash
# Create Nextcloud Mail accounts for Keycloak users that carry a
# mailu_app_password attribute. Accounts that already exist are skipped.
#
# Required environment: KC_BASE, KC_REALM, KC_ADMIN_USER, KC_ADMIN_PASS
# Optional environment: MAIL_HOST (IMAP/SMTP host, default mail.bstein.dev)
set -euo pipefail

KC_BASE="${KC_BASE:?}"
KC_REALM="${KC_REALM:?}"
KC_ADMIN_USER="${KC_ADMIN_USER:?}"
KC_ADMIN_PASS="${KC_ADMIN_PASS:?}"
MAIL_HOST="${MAIL_HOST:-mail.bstein.dev}"

# Both tools are needed below, so install when either one is missing.
if ! command -v jq >/dev/null 2>&1 || ! command -v curl >/dev/null 2>&1; then
  apt-get update && apt-get install -y jq curl >/dev/null
fi

run_occ() {
  # stdin comes from /dev/null so occ can never consume the user list that
  # feeds the `while read` loop further down.
  runuser -u www-data -- php occ "$@" </dev/null
}

account_exists() {
  # Succeeds when the e-mail ($1) already appears in the mail app's account
  # list. The list is captured first: piping straight into `grep -q` under
  # pipefail can fail with SIGPIPE even though a match was found.
  local accounts
  accounts=$(run_occ mail:account:list 2>/dev/null) || return 1
  grep -Fq -e " ${1}" -e "${1} " <<<"${accounts}"
}

# --data-urlencode keeps credentials containing &, +, % or = intact.
token=$(
  curl -s \
    --data-urlencode "grant_type=password" \
    --data-urlencode "client_id=admin-cli" \
    --data-urlencode "username=${KC_ADMIN_USER}" \
    --data-urlencode "password=${KC_ADMIN_PASS}" \
    "${KC_BASE}/realms/master/protocol/openid-connect/token" | jq -r '.access_token'
)

if [[ -z "${token}" || "${token}" == "null" ]]; then
  echo "Failed to obtain admin token" >&2
  exit 1
fi

# -f makes an HTTP error abort the script instead of feeding an error
# document to jq as if it were the user list.
users=$(curl -sf -H "Authorization: Bearer ${token}" \
  "${KC_BASE}/admin/realms/${KC_REALM}/users?max=2000")

echo "${users}" | jq -c '.[]' | while read -r user; do
  username=$(echo "${user}" | jq -r '.username')
  email=$(echo "${user}" | jq -r '.email // empty')
  app_pw=$(echo "${user}" | jq -r '.attributes.mailu_app_password[0] // empty')
  [[ -z "${email}" || -z "${app_pw}" ]] && continue

  if account_exists "${email}"; then
    echo "Skipping ${email}, already exists"
    continue
  fi

  echo "Syncing ${email}"
  # Best effort: one failing account must not stop the remaining users.
  run_occ mail:account:create \
    "${username}" "${username}" "${email}" \
    "${MAIL_HOST}" 993 ssl "${email}" "${app_pw}" \
    "${MAIL_HOST}" 587 tls "${email}" "${app_pw}" login \
    || echo "WARN: failed to create mail account for ${email}" >&2
done

View File

@ -0,0 +1,65 @@
#!/bin/bash
# Nextcloud maintenance run: applies the Atlas theming, sets the default
# quota and replaces all "external sites" links with the SITES list below.
#
# Required environment: ADMIN_USER, ADMIN_PASS
# Optional environment: NC_URL (default https://cloud.bstein.dev)
set -euo pipefail

NC_URL="${NC_URL:-https://cloud.bstein.dev}"
ADMIN_USER="${ADMIN_USER:?}"
ADMIN_PASS="${ADMIN_PASS:?}"

export DEBIAN_FRONTEND=noninteractive
apt-get update -qq
apt-get install -y -qq curl jq >/dev/null

run_occ() {
  runuser -u www-data -- php occ "$@"
}

log() { echo "[$(date -Is)] $*"; }

log "Applying Atlas theming"
run_occ theming:config name "Atlas Cloud"
run_occ theming:config slogan "Unified access to Atlas services"
# Follows NC_URL so an overridden instance URL is themed consistently.
run_occ theming:config url "${NC_URL}"
run_occ theming:config color "#0f172a"
run_occ theming:config disable-user-theming yes

log "Setting default quota to 200 GB"
run_occ config:app:set files default_quota --value "200 GB"

API_BASE="${NC_URL}/ocs/v2.php/apps/external/api/v1"
AUTH=(-u "${ADMIN_USER}:${ADMIN_PASS}" -H "OCS-APIRequest: true")

log "Removing existing external links"
existing=$(curl -sf "${AUTH[@]}" "${API_BASE}?format=json" | jq -r '.ocs.data[].id // empty')
for id in ${existing}; do
  # Best effort: a link that cannot be deleted does not abort the run.
  curl -sf "${AUTH[@]}" -X DELETE "${API_BASE}/sites/${id}?format=json" >/dev/null || true
done

# One "name|url" entry per external link.
SITES=(
  "Vaultwarden|https://vault.bstein.dev"
  "Jellyfin|https://stream.bstein.dev"
  "Gitea|https://scm.bstein.dev"
  "Jenkins|https://ci.bstein.dev"
  "Zot|https://registry.bstein.dev"
  "Vault|https://secret.bstein.dev"
  "Jitsi|https://meet.bstein.dev"
  "Grafana|https://metrics.bstein.dev"
  "Chat LLM|https://chat.ai.bstein.dev"
  "Vision|https://draw.ai.bstein.dev"
  "STT/TTS|https://talk.ai.bstein.dev"
)

log "Seeding external links"
for entry in "${SITES[@]}"; do
  IFS="|" read -r name url <<<"${entry}"
  # --data-urlencode: names such as "Chat LLM" and "STT/TTS" and the URLs
  # contain characters that must be escaped in a form body.
  curl -sf "${AUTH[@]}" -X POST "${API_BASE}/sites?format=json" \
    --data-urlencode "name=${name}" \
    --data-urlencode "url=${url}" \
    -d "lang=" \
    -d "type=link" \
    -d "device=" \
    -d "icon=" \
    -d "groups[]=" \
    -d "redirect=1" >/dev/null
done

log "Maintenance run completed"

View File

@ -0,0 +1,58 @@
import importlib.util
import pathlib
def load_module():
    """Load dashboards_render_atlas.py as a fresh module object.

    The script is looked up one directory above the directory that holds
    this test file.
    """
    script = pathlib.Path(__file__).resolve().parents[1] / "dashboards_render_atlas.py"
    spec = importlib.util.spec_from_file_location("dashboards_render_atlas", script)
    module = importlib.util.module_from_spec(spec)
    loader = spec.loader
    assert loader is not None
    loader.exec_module(module)
    return module
def test_table_panel_options_and_filterable():
    """table_panel forwards unit, filterable and target format to the panel."""
    mod = load_module()
    grid = {"h": 1, "w": 1, "x": 0, "y": 0}
    extras = {
        "unit": "percent",
        "transformations": [{"id": "labelsToFields", "options": {}}],
        "instant": True,
        "options": {"showColumnFilters": False},
        "filterable": False,
        "footer": {"show": False, "fields": "", "calcs": []},
        "format": "table",
    }

    panel = mod.table_panel(1, "test", "metric", grid, **extras)

    defaults = panel["fieldConfig"]["defaults"]
    assert defaults["unit"] == "percent"
    assert defaults["custom"]["filterable"] is False
    assert panel["options"]["showHeader"] is True
    assert panel["targets"][0]["format"] == "table"
def test_node_filter_and_expr_helpers():
    """The node helper expressions contain the PromQL pieces they rely on."""
    mod = load_module()
    pattern = "titan-.*"

    assert "label_replace" in mod.node_filter(pattern)
    cpu = mod.node_cpu_expr(pattern)
    mem = mod.node_mem_expr(pattern)
    assert "node_cpu_seconds_total" in cpu
    assert "node_memory_MemAvailable_bytes" in mem
def test_render_configmap_writes(tmp_path):
    """write_json + render_configmap produce the JSON file and a ConfigMap."""
    mod = load_module()
    # Redirect the module's output locations into the pytest temp directory.
    mod.DASHBOARD_DIR = tmp_path / "dash"
    mod.ROOT = tmp_path

    uid = "atlas-test"
    configmap_path = tmp_path / "cm.yaml"
    info = {"configmap": configmap_path}
    data = {"title": "Atlas Test"}

    mod.write_json(uid, data)
    mod.render_configmap(uid, info)

    assert (mod.DASHBOARD_DIR / f"{uid}.json").exists()
    rendered = configmap_path.read_text()
    assert "kind: ConfigMap" in rendered
    assert f"{uid}.json" in rendered

View File

@ -0,0 +1,181 @@
import importlib.util
import pathlib
import pytest
def load_sync_module(monkeypatch):
    """Load mailu_sync.py as a fresh module with the env vars it needs.

    mailu_sync reads its configuration from the environment at import time,
    so the variables are set through ``monkeypatch`` before the module runs.
    """
    required_env = {
        "KEYCLOAK_BASE_URL": "http://keycloak",
        "KEYCLOAK_REALM": "atlas",
        "KEYCLOAK_CLIENT_ID": "mailu-sync",
        "KEYCLOAK_CLIENT_SECRET": "secret",
        "MAILU_DOMAIN": "example.com",
        "MAILU_DB_HOST": "localhost",
        "MAILU_DB_PORT": "5432",
        "MAILU_DB_NAME": "mailu",
        "MAILU_DB_USER": "mailu",
        "MAILU_DB_PASSWORD": "pw",
    }
    for name in required_env:
        monkeypatch.setenv(name, required_env[name])

    script = pathlib.Path(__file__).resolve().parents[1] / "mailu_sync.py"
    spec = importlib.util.spec_from_file_location("mailu_sync_testmod", script)
    module = importlib.util.module_from_spec(spec)
    loader = spec.loader
    assert loader is not None
    loader.exec_module(module)
    return module
def test_random_password_length_and_charset(monkeypatch):
    """random_password yields 24 alphanumeric characters."""
    password = load_sync_module(monkeypatch).random_password()
    assert len(password) == 24
    for ch in password:
        assert ch.isalnum()
class _FakeResponse:
def __init__(self, json_data=None, status=200):
self._json_data = json_data or {}
self.status_code = status
def raise_for_status(self):
if self.status_code >= 400:
raise AssertionError(f"status {self.status_code}")
def json(self):
return self._json_data
class _FakeSession:
def __init__(self, put_resp, get_resp):
self.put_resp = put_resp
self.get_resp = get_resp
self.put_called = False
self.get_called = False
def post(self, *args, **kwargs):
return _FakeResponse({"access_token": "dummy"})
def put(self, *args, **kwargs):
self.put_called = True
return self.put_resp
def get(self, *args, **kwargs):
self.get_called = True
return self.get_resp
def test_kc_update_attributes_succeeds(monkeypatch):
    """A verified write performs both the PUT and the follow-up GET."""
    sync = load_sync_module(monkeypatch)
    verified = _FakeResponse({"attributes": {"mailu_app_password": ["abc"]}})
    session = _FakeSession(_FakeResponse({}), verified)
    sync.SESSION = session

    user = {"id": "u1", "username": "u1"}
    sync.kc_update_attributes("token", user, {"mailu_app_password": "abc"})

    assert session.put_called
    assert session.get_called
def test_kc_update_attributes_raises_without_attribute(monkeypatch):
    """The update fails when the re-read user lacks the attribute."""
    sync = load_sync_module(monkeypatch)
    unverified = _FakeResponse({"attributes": {}}, status=200)
    sync.SESSION = _FakeSession(_FakeResponse({}), unverified)

    user = {"id": "u1", "username": "u1"}
    with pytest.raises(Exception):
        sync.kc_update_attributes("token", user, {"mailu_app_password": "abc"})
def test_kc_get_users_paginates(monkeypatch):
    """kc_get_users requests a second page after a completely full first one.

    The fake session sizes the first page from the ``max`` query parameter it
    receives, so the test does not depend on the page size used internally.
    A first page shorter than ``max`` would end the listing after one call.
    """
    sync = load_sync_module(monkeypatch)

    class _PagedSession:
        def __init__(self):
            self.calls = 0
            self.offsets = []

        def post(self, *_, **__):
            return _FakeResponse({"access_token": "tok"})

        def get(self, *_, **kwargs):
            self.calls += 1
            params = kwargs["params"]
            self.offsets.append(params["first"])
            if self.calls == 1:
                # Exactly `max` users: the caller must ask for more.
                return _FakeResponse([{"id": f"u{i}"} for i in range(params["max"])])
            # Short, non-empty page: ends pagination.
            return _FakeResponse([{"id": "last"}])

    sync.SESSION = _PagedSession()
    users = sync.kc_get_users("tok")

    page_size = len(users) - 1
    assert page_size > 0
    assert users[0]["id"] == "u0"
    assert users[-1]["id"] == "last"
    assert sync.SESSION.calls == 2
    # The second request starts right after the first page.
    assert sync.SESSION.offsets == [0, page_size]
def test_ensure_mailu_user_skips_foreign_domain(monkeypatch):
    """No SQL is executed for an address outside MAILU_DOMAIN."""
    sync = load_sync_module(monkeypatch)
    statements = []

    class _Cursor:
        def execute(self, sql, params):
            statements.append((sql, params))

    sync.ensure_mailu_user(_Cursor(), "user@other.com", "pw", "User")
    assert len(statements) == 0
def test_ensure_mailu_user_upserts(monkeypatch):
    """An in-domain address is upserted with a hashed password."""
    sync = load_sync_module(monkeypatch)
    seen = {}

    class _Cursor:
        def execute(self, sql, params):
            seen.update(params)

    sync.ensure_mailu_user(_Cursor(), "user@example.com", "pw", "User Example")

    assert seen["email"] == "user@example.com"
    assert seen["localpart"] == "user"
    # The stored value must be the hash, never the raw password.
    assert seen["password"] != "pw"
def test_main_generates_password_and_upserts(monkeypatch):
    """main() backfills missing passwords and upserts in-domain users only.

    Fixture users: user1 is in the mail domain without a password, user2 is
    in the mail domain with an existing password, user3 has an address in a
    different domain and no password.
    """
    sync = load_sync_module(monkeypatch)
    users = [
        {"id": "u1", "username": "user1", "email": "user1@example.com", "attributes": {}},
        {"id": "u2", "username": "user2", "email": "user2@example.com", "attributes": {"mailu_app_password": ["keepme"]}},
        {"id": "u3", "username": "user3", "email": "user3@other.com", "attributes": {}},
    ]
    # (user id, password) for every kc_update_attributes call made by main().
    updated = []

    class _Cursor:
        # Records the parameters of every executed statement.
        def __init__(self):
            self.executions = []

        def execute(self, sql, params):
            self.executions.append(params)

        def close(self):
            return None

    class _Conn:
        # Connection double that always hands out the same recording cursor.
        def __init__(self):
            self.autocommit = False
            self._cursor = _Cursor()

        def cursor(self, cursor_factory=None):
            return self._cursor

        def close(self):
            return None

    # Replace all network and database access with in-memory fakes.
    monkeypatch.setattr(sync, "get_kc_token", lambda: "tok")
    monkeypatch.setattr(sync, "kc_get_users", lambda token: users)
    monkeypatch.setattr(sync, "kc_update_attributes", lambda token, user, attrs: updated.append((user["id"], attrs["mailu_app_password"])))
    conns = []

    def _connect(**kwargs):
        conn = _Conn()
        conns.append(conn)
        return conn

    monkeypatch.setattr(sync.psycopg2, "connect", _connect)

    sync.main()

    # Exactly one Keycloak backfill is expected.
    assert len(updated) == 1  # only one missing attr was backfilled
    # Two statements are expected: the other-domain user is not upserted.
    assert conns and len(conns[0]._cursor.executions) == 2

View File

@ -5,7 +5,7 @@ metadata:
name: gitea-ingress
namespace: gitea
annotations:
cert-manager.io/cluster-issuer: "letsencrypt-prod"
cert-manager.io/cluster-issuer: letsencrypt
nginx.ingress.kubernetes.io/ssl-redirect: "true"
spec:
tls:

View File

@ -0,0 +1,49 @@
# services/gitops-ui/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: weave-gitops
namespace: flux-system
spec:
interval: 30m
chart:
spec:
chart: ./charts/gitops-server
sourceRef:
kind: GitRepository
name: weave-gitops-upstream
namespace: flux-system
# track upstream tag; see source object for version pin
install:
remediation:
retries: 3
upgrade:
remediation:
retries: 3
remediateLastFailure: true
cleanupOnFail: true
values:
adminUser:
create: true
createClusterRole: true
createSecret: true
username: admin
# bcrypt hash of a temporary bootstrap password; rotate it right after the first login and keep the plaintext out of this repository
passwordHash: "$2y$12$wDEOzR1Gc2dbvNSJ3ZXNdOBVFEjC6YASIxnZmHIbO.W1m0fie/QVi"
ingress:
enabled: true
className: traefik
annotations:
cert-manager.io/cluster-issuer: letsencrypt-prod
traefik.ingress.kubernetes.io/router.entrypoints: websecure
hosts:
- host: cd.bstein.dev
paths:
- path: /
pathType: Prefix
tls:
- secretName: gitops-ui-tls
hosts:
- cd.bstein.dev
metrics:
enabled: true

View File

@ -0,0 +1,7 @@
# services/gitops-ui/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: flux-system
resources:
- source.yaml
- helmrelease.yaml

View File

@ -0,0 +1,11 @@
# services/gitops-ui/source.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
name: weave-gitops-upstream
namespace: flux-system
spec:
interval: 1h
url: https://github.com/weaveworks/weave-gitops.git
ref:
tag: v0.38.0

View File

@ -5,7 +5,7 @@ metadata:
name: jitsi
namespace: jitsi
annotations:
cert-manager.io/cluster-issuer: "letsencrypt-prod"
cert-manager.io/cluster-issuer: letsencrypt
spec:
ingressClassName: traefik
tls:

View File

@ -48,6 +48,20 @@ spec:
runAsGroup: 0
fsGroup: 1000
fsGroupChangePolicy: OnRootMismatch
imagePullSecrets:
- name: zot-regcred
initContainers:
- name: mailu-http-listener
image: registry.bstein.dev/sso/mailu-http-listener:0.1.0
imagePullPolicy: IfNotPresent
command: ["/bin/sh", "-c"]
args:
- |
cp /plugin/mailu-http-listener-0.1.0.jar /providers/
cp -r /plugin/src /providers/src
volumeMounts:
- name: providers
mountPath: /providers
containers:
- name: keycloak
image: quay.io/keycloak/keycloak:26.0.7
@ -104,6 +118,10 @@ spec:
secretKeyRef:
name: keycloak-admin
key: password
- name: KC_EVENTS_LISTENERS
value: jboss-logging,mailu-http
- name: KC_SPI_EVENTS_LISTENER_MAILU-HTTP_ENDPOINT
value: http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events
ports:
- containerPort: 8080
name: http
@ -126,7 +144,11 @@ spec:
volumeMounts:
- name: data
mountPath: /opt/keycloak/data
- name: providers
mountPath: /opt/keycloak/providers
volumes:
- name: data
persistentVolumeClaim:
claimName: keycloak-data
- name: providers
emptyDir: {}

View File

@ -0,0 +1,13 @@
# services/mailu/certificate.yaml
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: mailu-tls
namespace: mailu-mailserver
spec:
secretName: mailu-certificates
issuerRef:
kind: ClusterIssuer
name: letsencrypt-prod
dnsNames:
- mail.bstein.dev

View File

@ -0,0 +1,287 @@
# services/mailu/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: mailu
namespace: mailu-mailserver
spec:
interval: 30m
chart:
spec:
chart: mailu
version: 2.1.2
sourceRef:
kind: HelmRepository
name: mailu
namespace: flux-system
install:
remediation: { retries: 3 }
timeout: 10m
upgrade:
remediation:
retries: 3
remediateLastFailure: true
cleanupOnFail: true
timeout: 10m
values:
mailuVersion: "2024.06"
domain: bstein.dev
hostnames: [mail.bstein.dev]
domains:
- name: bstein.dev
enabled: true
dkim:
enabled: true
externalRelay:
host: "[email-smtp.us-east-2.amazonaws.com]:587"
existingSecret: mailu-ses-relay
usernameKey: relay-username
passwordKey: relay-password
timezone: Etc/UTC
subnet: 10.42.0.0/16
existingSecret: mailu-secret
tls:
outboundLevel: encrypt
externalDatabase:
enabled: true
type: postgresql
host: postgres-service.postgres.svc.cluster.local
port: 5432
database: mailu
username: mailu
existingSecret: mailu-db-secret
existingSecretUsernameKey: username
existingSecretPasswordKey: password
existingSecretDatabaseKey: database
initialAccount:
enabled: true
username: test
domain: bstein.dev
existingSecret: mailu-initial-account-secret
existingSecretPasswordKey: password
persistence:
accessModes: [ReadWriteMany]
size: 100Gi
storageClass: astreae
single_pvc: true
front:
hostnames: [mail.bstein.dev]
proxied: true
hostPort:
enabled: false
https:
enabled: false
external: false
forceHttps: false
externalService:
enabled: true
type: LoadBalancer
externalTrafficPolicy: Cluster
ports:
submission: true
nodePorts:
pop3: 30010
pop3s: 30011
imap: 30143
imaps: 30993
manageSieve: 30419
smtp: 30025
smtps: 30465
submission: 30587
logLevel: DEBUG
nodeSelector:
hardware: rpi4
admin:
logLevel: DEBUG
nodeSelector:
hardware: rpi4
podLivenessProbe:
enabled: true
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 6
successThreshold: 1
podReadinessProbe:
enabled: true
initialDelaySeconds: 20
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 6
successThreshold: 1
# Extra environment variables (this list follows the `admin:` settings above).
# NOTE(review): FLASK_DEBUG is set to "1", which switches on Flask's debug
# mode. Presumably a temporary troubleshooting aid, since logLevel is DEBUG
# throughout this release. Confirm it is meant to stay enabled.
extraEnvVars:
- name: FLASK_DEBUG
value: "1"
- name: ACCESSLOG
value: /dev/stdout
- name: ERRORLOG
value: /dev/stderr
- name: WEBROOT_REDIRECT
value: ""
- name: FORWARDED_ALLOW_IPS
value: 127.0.0.1,10.42.0.0/16
- name: DNS_RESOLVERS
value: 1.1.1.1,9.9.9.9
extraVolumes:
- name: unbound-config
configMap:
name: mailu-unbound
- name: unbound-run
emptyDir: {}
extraVolumeMounts:
- name: unbound-run
mountPath: /var/lib/unbound
extraContainers:
- name: unbound
image: docker.io/alpine:3.20
command: ["/bin/sh", "-c"]
args:
- |
while :; do
printf "nameserver 10.43.0.10\n" > /etc/resolv.conf
if apk add --no-cache unbound bind-tools; then
break
fi
echo "apk failed, retrying" >&2
sleep 10
done
cat >/etc/resolv.conf <<'EOF'
search mailu-mailserver.svc.cluster.local svc.cluster.local cluster.local
nameserver 127.0.0.1
EOF
unbound-anchor -a /var/lib/unbound/root.key || true
exec unbound -d -c /opt/unbound/etc/unbound/unbound.conf
ports:
- containerPort: 53
protocol: UDP
- containerPort: 53
protocol: TCP
volumeMounts:
- name: unbound-config
mountPath: /opt/unbound/etc/unbound
- name: unbound-run
mountPath: /var/lib/unbound
dnsPolicy: None
dnsConfig:
nameservers:
- 127.0.0.1
searches:
- mailu-mailserver.svc.cluster.local
- svc.cluster.local
- cluster.local
clamav:
image:
repository: clamav/clamav-debian
tag: "1.4"
logLevel: DEBUG
nodeSelector:
hardware: rpi5
resources:
requests:
cpu: 200m
memory: 1Gi
limits:
cpu: 500m
memory: 3Gi
livenessProbe:
enabled: false
initialDelaySeconds: 300
periodSeconds: 30
timeoutSeconds: 5
failureThreshold: 6
successThreshold: 1
startupProbe:
enabled: false
initialDelaySeconds: 60
periodSeconds: 30
timeoutSeconds: 5
failureThreshold: 20
successThreshold: 1
readinessProbe:
enabled: false
initialDelaySeconds: 300
periodSeconds: 30
timeoutSeconds: 5
failureThreshold: 6
successThreshold: 1
dovecot:
logLevel: DEBUG
nodeSelector:
hardware: rpi4
oletools:
logLevel: DEBUG
nodeSelector:
hardware: rpi4
postfix:
logLevel: DEBUG
nodeSelector:
hardware: rpi4
overrides:
smtp_use_tls: "yes"
smtp_tls_security_level: "encrypt"
smtp_sasl_security_options: "noanonymous"
redis:
enabled: true
architecture: standalone
logLevel: DEBUG
image:
repository: bitnamilegacy/redis
tag: 8.0.3-debian-12-r3
master:
nodeSelector:
hardware: rpi4
persistence:
enabled: true
accessModes: [ReadWriteMany]
size: 8Gi
storageClass: astreae
rspamd:
logLevel: DEBUG
nodeSelector:
hardware: rpi4
persistence:
accessModes: [ReadWriteOnce]
size: 8Gi
storageClass: astreae
tika:
logLevel: DEBUG
nodeSelector:
hardware: rpi4
global:
logLevel: DEBUG
storageClass: astreae
webmail:
enabled: false
nodeSelector:
hardware: rpi4
ingress:
enabled: false
ingressClassName: traefik
tls: true
existingSecret: mailu-certificates
annotations:
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/service.serversscheme: https
traefik.ingress.kubernetes.io/service.serverstransport: mailu-transport@kubernetescrd
extraRules:
- host: mail.bstein.dev
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: mailu-front
port:
number: 443
service:
ports:
smtp:
port: 25
targetPort: 25
smtps:
port: 465
targetPort: 465
submission:
port: 587
targetPort: 587

View File

@ -0,0 +1,19 @@
# services/mailu/ingressroute.yaml
# Traefik IngressRoute that exposes the mailu-front service for the host
# mail.bstein.dev on the websecure entrypoint.
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
name: mailu
namespace: mailu-mailserver
spec:
entryPoints:
- websecure
routes:
- match: Host(`mail.bstein.dev`)
kind: Rule
services:
# Requests are forwarded to mailu-front on port 443 over HTTPS, using the
# ServersTransport named mailu-transport for the backend connection.
- name: mailu-front
port: 443
scheme: https
serversTransport: mailu-transport
tls:
# The certificate served for this route is read from this Secret.
secretName: mailu-certificates

View File

@ -0,0 +1,23 @@
# services/mailu/kustomization.yaml
# Kustomization for the Mailu stack: lists the manifests to apply and
# generates the ConfigMap that carries the sync script.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
# Namespace applied to the resources listed below.
namespace: mailu-mailserver
resources:
- namespace.yaml
- helmrelease.yaml
- certificate.yaml
- vip-controller.yaml
- unbound-configmap.yaml
- serverstransport.yaml
- ingressroute.yaml
- mailu-sync-job.yaml
- mailu-sync-cronjob.yaml
- mailu-sync-listener.yaml
# Builds ConfigMap "mailu-sync-script" with a single key, sync.py, whose
# content is read from ../../scripts/mailu_sync.py.
configMapGenerator:
- name: mailu-sync-script
namespace: mailu-mailserver
files:
- sync.py=../../scripts/mailu_sync.py
options:
# Keep the generated name free of the content-hash suffix so other
# manifests can refer to it by the literal name "mailu-sync-script".
disableNameSuffixHash: true

View File

@ -0,0 +1,77 @@
# services/mailu/mailu-sync-cronjob.yaml
# CronJob that runs /app/sync.py (mounted from the mailu-sync-script
# ConfigMap) on a schedule, with Keycloak and Mailu database settings passed
# in through environment variables.
apiVersion: batch/v1
kind: CronJob
metadata:
name: mailu-sync-nightly
namespace: mailu-mailserver
spec:
# Daily at 04:30; no timeZone field is set on this CronJob.
schedule: "30 4 * * *"
# Do not start a new run while a previous one is still active.
concurrencyPolicy: Forbid
jobTemplate:
spec:
template:
spec:
restartPolicy: OnFailure
containers:
- name: mailu-sync
image: python:3.11-alpine
imagePullPolicy: IfNotPresent
# The container installs its Python dependencies with pip at start-up
# (pip's stdout goes to /tmp/pip.log) and runs sync.py only if the install
# succeeded.
command: ["/bin/sh", "-c"]
args:
- |
pip install --no-cache-dir requests psycopg2-binary passlib >/tmp/pip.log \
&& python /app/sync.py
env:
- name: KEYCLOAK_BASE_URL
value: http://keycloak.sso.svc.cluster.local
- name: KEYCLOAK_REALM
value: atlas
- name: MAILU_DOMAIN
value: bstein.dev
- name: MAILU_DEFAULT_QUOTA
value: "20000000000"
- name: MAILU_DB_HOST
value: postgres-service.postgres.svc.cluster.local
- name: MAILU_DB_PORT
value: "5432"
# Database name, user and password are read from Secret mailu-db-secret.
- name: MAILU_DB_NAME
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: database
- name: MAILU_DB_USER
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: username
- name: MAILU_DB_PASSWORD
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: password
# Keycloak client id and secret are read from Secret mailu-sync-credentials.
- name: KEYCLOAK_CLIENT_ID
valueFrom:
secretKeyRef:
name: mailu-sync-credentials
key: client-id
- name: KEYCLOAK_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: mailu-sync-credentials
key: client-secret
# subPath mounts only the sync.py key, as the single file /app/sync.py.
volumeMounts:
- name: sync-script
mountPath: /app/sync.py
subPath: sync.py
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
volumes:
- name: sync-script
configMap:
name: mailu-sync-script
# File mode for the projected script (read-only).
defaultMode: 0444

View File

@ -0,0 +1,73 @@
# services/mailu/mailu-sync-job.yaml
# Job that runs /app/sync.py (mounted from the mailu-sync-script ConfigMap)
# once, with Keycloak and Mailu database settings passed in through
# environment variables.
apiVersion: batch/v1
kind: Job
metadata:
name: mailu-sync
namespace: mailu-mailserver
spec:
template:
spec:
restartPolicy: OnFailure
containers:
- name: mailu-sync
image: python:3.11-alpine
imagePullPolicy: IfNotPresent
# The container installs its Python dependencies with pip at start-up
# (pip's stdout goes to /tmp/pip.log) and runs sync.py only if the install
# succeeded.
command: ["/bin/sh", "-c"]
args:
- |
pip install --no-cache-dir requests psycopg2-binary passlib >/tmp/pip.log \
&& python /app/sync.py
env:
- name: KEYCLOAK_BASE_URL
value: http://keycloak.sso.svc.cluster.local
- name: KEYCLOAK_REALM
value: atlas
- name: MAILU_DOMAIN
value: bstein.dev
- name: MAILU_DEFAULT_QUOTA
value: "20000000000"
- name: MAILU_DB_HOST
value: postgres-service.postgres.svc.cluster.local
- name: MAILU_DB_PORT
value: "5432"
# Database name, user and password are read from Secret mailu-db-secret.
- name: MAILU_DB_NAME
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: database
- name: MAILU_DB_USER
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: username
- name: MAILU_DB_PASSWORD
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: password
# Keycloak client id and secret are read from Secret mailu-sync-credentials.
- name: KEYCLOAK_CLIENT_ID
valueFrom:
secretKeyRef:
name: mailu-sync-credentials
key: client-id
- name: KEYCLOAK_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: mailu-sync-credentials
key: client-secret
# subPath mounts only the sync.py key, as the single file /app/sync.py.
volumeMounts:
- name: sync-script
mountPath: /app/sync.py
subPath: sync.py
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
volumes:
- name: sync-script
configMap:
name: mailu-sync-script
# File mode for the projected script (read-only).
defaultMode: 0444

View File

@ -0,0 +1,154 @@
# services/mailu/mailu-sync-listener.yaml
# Service in front of the pods labelled app=mailu-sync-listener; exposes
# their HTTP port 8080 under the same port number.
apiVersion: v1
kind: Service
metadata:
name: mailu-sync-listener
namespace: mailu-mailserver
spec:
selector:
app: mailu-sync-listener
ports:
- name: http
port: 8080
targetPort: 8080
---
# Single-replica Deployment that runs /app/listener.py. Both listener.py and
# sync.py are mounted from ConfigMaps, and the container receives the same
# Keycloak / Mailu database environment that sync.py reads.
apiVersion: apps/v1
kind: Deployment
metadata:
name: mailu-sync-listener
namespace: mailu-mailserver
labels:
app: mailu-sync-listener
spec:
replicas: 1
selector:
matchLabels:
app: mailu-sync-listener
template:
metadata:
labels:
app: mailu-sync-listener
spec:
restartPolicy: Always
containers:
- name: listener
image: python:3.11-alpine
imagePullPolicy: IfNotPresent
# The container installs its Python dependencies with pip at start-up
# (pip's stdout goes to /tmp/pip.log) and starts listener.py only if the
# install succeeded.
command: ["/bin/sh", "-c"]
args:
- |
pip install --no-cache-dir requests psycopg2-binary passlib >/tmp/pip.log \
&& python /app/listener.py
env:
- name: KEYCLOAK_BASE_URL
value: http://keycloak.sso.svc.cluster.local
- name: KEYCLOAK_REALM
value: atlas
- name: MAILU_DOMAIN
value: bstein.dev
- name: MAILU_DEFAULT_QUOTA
value: "20000000000"
- name: MAILU_DB_HOST
value: postgres-service.postgres.svc.cluster.local
- name: MAILU_DB_PORT
value: "5432"
# Database name, user and password are read from Secret mailu-db-secret.
- name: MAILU_DB_NAME
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: database
- name: MAILU_DB_USER
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: username
- name: MAILU_DB_PASSWORD
valueFrom:
secretKeyRef:
name: mailu-db-secret
key: password
# Keycloak client id and secret are read from Secret mailu-sync-credentials.
- name: KEYCLOAK_CLIENT_ID
valueFrom:
secretKeyRef:
name: mailu-sync-credentials
key: client-id
- name: KEYCLOAK_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: mailu-sync-credentials
key: client-secret
# Each script is mounted as a single file via subPath.
# NOTE(review): subPath-mounted ConfigMap files are not refreshed inside a
# running pod when the ConfigMap changes - confirm a rollout is triggered
# when either script is updated.
volumeMounts:
- name: sync-script
mountPath: /app/sync.py
subPath: sync.py
- name: listener-script
mountPath: /app/listener.py
subPath: listener.py
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
volumes:
- name: sync-script
configMap:
name: mailu-sync-script
# File mode for the projected script (read-only).
defaultMode: 0444
- name: listener-script
configMap:
name: mailu-sync-listener
# File mode for the projected script (read-only).
defaultMode: 0444
---
apiVersion: v1
kind: ConfigMap
metadata:
name: mailu-sync-listener
namespace: mailu-mailserver
data:
listener.py: |
"""HTTP listener that triggers a debounced run of /app/sync.py.

Every POST whose body is empty or valid JSON is answered with 202. Unless a
run was started less than MIN_INTERVAL_SECONDS ago, the POST also spawns
``python /app/sync.py``. Malformed requests are answered with 400 and
oversized bodies with 413.
"""
import http.server
import json
import os
import subprocess
import threading
from time import monotonic

# Simple debounce to avoid hammering on bursts
MIN_INTERVAL_SECONDS = 10
# Largest request body that is read into memory before answering 413.
MAX_BODY_BYTES = 1024 * 1024
# Monotonic timestamp of the last started run; -inf lets the first request through.
last_run = float("-inf")
lock = threading.Lock()


def trigger_sync():
    """Start sync.py unless a run was started within MIN_INTERVAL_SECONDS.

    The monotonic clock is used so wall-clock adjustments cannot stretch or
    shrink the debounce window.
    """
    global last_run
    with lock:
        now = monotonic()
        if now - last_run < MIN_INTERVAL_SECONDS:
            return
        last_run = now
        # The child inherits this process's stdout/stderr, so its output shows
        # up in the container log. A PIPE that nobody reads would block the
        # child as soon as the pipe buffer filled up.
        proc = subprocess.Popen(["python", "/app/sync.py"])
    # Reap the child in the background so finished runs do not linger as zombies.
    threading.Thread(target=proc.wait, daemon=True).start()


class Handler(http.server.BaseHTTPRequestHandler):
    """Accepts POST requests and turns each valid one into a sync trigger."""

    def _reply(self, status):
        """Send a body-less response with the given HTTP status code."""
        self.send_response(status)
        self.end_headers()

    def do_POST(self):
        """Validate the request body, then trigger a sync and answer 202."""
        try:
            length = int(self.headers.get("Content-Length", 0))
        except ValueError:
            length = -1
        if length < 0:
            # Non-numeric or negative Content-Length; a negative read() size
            # would block until the client closes the connection.
            self._reply(400)
            return
        if length > MAX_BODY_BYTES:
            self._reply(413)
            return
        body = self.rfile.read(length) if length else b""
        try:
            json.loads(body or b"{}")
        except ValueError:
            # Covers json.JSONDecodeError and the UnicodeDecodeError raised
            # for bodies that are not valid UTF-8/16/32.
            self._reply(400)
            return
        trigger_sync()
        self._reply(202)

    def log_message(self, fmt, *args):
        # Quiet logging
        return


if __name__ == "__main__":
    server = http.server.ThreadingHTTPServer(("", 8080), Handler)
    server.serve_forever()

View File

@ -0,0 +1,5 @@
# services/mailu/namespace.yaml
# Declares the mailu-mailserver namespace.
apiVersion: v1
kind: Namespace
metadata:
name: mailu-mailserver

View File

@ -0,0 +1,10 @@
# services/mailu/serverstransport.yaml
# Traefik ServersTransport named mailu-transport: sets the TLS server name
# and certificate-verification behaviour for connections that use it.
apiVersion: traefik.io/v1alpha1
kind: ServersTransport
metadata:
name: mailu-transport
namespace: mailu-mailserver
spec:
# Force SNI to mail.bstein.dev and skip backend cert verification (backend cert is for the host, not the pod IP).
serverName: mail.bstein.dev
insecureSkipVerify: true

View File

@ -0,0 +1,49 @@
# services/mailu/unbound-configmap.yaml
# ConfigMap carrying an unbound.conf. The configuration listens on IPv4 only
# (UDP and TCP), uses a DNSSEC trust anchor, marks the cluster-local zones as
# domain-insecure, forwards those zones to 10.43.0.10 without caching, and
# forwards everything else to 9.9.9.9 and 1.1.1.1.
# NOTE(review): 10.43.0.10 is presumably the cluster DNS service address -
# confirm against the cluster configuration.
# NOTE(review): "access-control: 0.0.0.0/0 allow" accepts queries from any
# IPv4 client that can reach the resolver - confirm it is not exposed
# outside the cluster.
apiVersion: v1
kind: ConfigMap
metadata:
name: mailu-unbound
namespace: mailu-mailserver
data:
unbound.conf: |
server:
verbosity: 1
interface: 0.0.0.0
do-ip4: yes
do-ip6: no
do-udp: yes
do-tcp: yes
auto-trust-anchor-file: "/var/lib/unbound/root.key"
prefetch: yes
qname-minimisation: yes
harden-dnssec-stripped: yes
val-clean-additional: yes
domain-insecure: "mailu-mailserver.svc.cluster.local."
domain-insecure: "svc.cluster.local."
domain-insecure: "cluster.local."
cache-min-ttl: 120
cache-max-ttl: 86400
access-control: 0.0.0.0/0 allow
forward-zone:
name: "mailu-mailserver.svc.cluster.local."
forward-addr: 10.43.0.10
forward-no-cache: yes
forward-first: yes
forward-zone:
name: "svc.cluster.local."
forward-addr: 10.43.0.10
forward-no-cache: yes
forward-first: yes
forward-zone:
name: "cluster.local."
forward-addr: 10.43.0.10
forward-no-cache: yes
forward-first: yes
forward-zone:
name: "."
forward-addr: 9.9.9.9
forward-addr: 1.1.1.1

View File

@ -0,0 +1,71 @@
# services/mailu/vip-controller.yaml
---
# Identity the vip-controller pods run under (see serviceAccountName on the
# vip-controller DaemonSet in this file).
apiVersion: v1
kind: ServiceAccount
metadata:
name: vip-controller
namespace: mailu-mailserver
---
# Permissions for the vip-controller ServiceAccount. The controller only runs
# `kubectl patch deployment mailu-front`, which reads the object and then
# patches it, so access is limited to get/patch on that single Deployment
# instead of every Deployment in the namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: vip-controller-role
  namespace: mailu-mailserver
rules:
  - apiGroups: ["apps"]
    resources: ["deployments"]
    resourceNames: ["mailu-front"]
    verbs: ["get", "patch"]
---
# Grants the vip-controller-role Role to the vip-controller ServiceAccount
# within the mailu-mailserver namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: vip-controller-binding
namespace: mailu-mailserver
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: vip-controller-role
subjects:
- kind: ServiceAccount
name: vip-controller
namespace: mailu-mailserver
---
# DaemonSet that keeps the mailu-front Deployment pinned to whichever
# labelled node currently holds the mail VIP (192.168.22.9/32 on interface
# end0). Once a minute each pod checks its node's interface and, when the VIP
# is present, patches mailu-front's nodeSelector to that node.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: vip-controller
  namespace: mailu-mailserver
spec:
  selector:
    matchLabels:
      app: vip-controller
  template:
    metadata:
      labels:
        app: vip-controller
    spec:
      serviceAccountName: vip-controller
      # Host networking lets the container inspect the node's own interfaces.
      hostNetwork: true
      # Only nodes labelled as VIP candidates run the controller.
      nodeSelector:
        mailu.bstein.dev/vip: "true"
      containers:
        - name: vip-controller
          # NOTE(review): ":latest" combined with IfNotPresent means nodes can
          # run different kubectl builds; consider pinning a version tag.
          image: lachlanevenson/k8s-kubectl:latest
          imagePullPolicy: IfNotPresent
          env:
            # Node name from the downward API, so the nodeSelector value does
            # not depend on what `hostname` prints inside the container.
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          command:
            - /bin/sh
            - -c
          args:
            - |
              # Prefer the downward-API node name; fall back to the hostname.
              NODE="${NODE_NAME:-$(hostname)}"
              while true; do
                if ip addr show end0 | grep -q 'inet 192\.168\.22\.9/32'; then
                  echo "VIP found on node ${NODE}."
                  # A failed patch (e.g. API server briefly unreachable) must
                  # not end the loop; log it and try again on the next pass.
                  if ! kubectl patch deployment mailu-front -n mailu-mailserver --type='merge' \
                    -p "{\"spec\":{\"template\":{\"spec\":{\"nodeSelector\":{\"kubernetes.io/hostname\":\"${NODE}\"}}}}}"; then
                    echo "Failed to patch mailu-front; will retry." >&2
                  fi
                else
                  echo "No VIP on node ${NODE}."
                fi
                sleep 60
              done

View File

@ -1,28 +0,0 @@
# services/monitoring
## Grafana admin secret
The Grafana Helm release expects a pre-existing secret named `grafana-admin`
in the `monitoring` namespace. Create or rotate it with:
```bash
kubectl create secret generic grafana-admin \
--namespace monitoring \
--from-literal=admin-user=admin \
--from-literal=admin-password='REPLACE_ME'
```
Update the password whenever you rotate credentials.
## DCGM exporter image
The NVIDIA GPU metrics DaemonSet expects `registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04`, mirrored from `docker.io/nvidia/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04`. Refresh it in Zot when bumping versions:
```bash
skopeo copy \
--all \
docker://docker.io/nvidia/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04 \
docker://registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04
```
When finished mirroring from the control-plane, you can remove temporary tooling with `sudo apt-get purge -y skopeo && sudo apt-get autoremove -y` and clear `~/.config/containers/auth.json`.

View File

@ -40,9 +40,7 @@
"placement": "right"
},
"pieType": "pie",
"displayLabels": [
"percent"
],
"displayLabels": [],
"tooltip": {
"mode": "single"
},
@ -153,12 +151,16 @@
],
"fieldConfig": {
"defaults": {
"unit": "percent"
"unit": "percent",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true
"showHeader": true,
"columnFilters": false
},
"transformations": [
{

View File

@ -7,46 +7,55 @@
{
"id": 1,
"type": "stat",
"title": "Ingress Traffic",
"title": "Ingress Success Rate (5m)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"w": 6,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
"expr": "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"color": "red",
"value": null
},
{
"color": "orange",
"value": 0.995
},
{
"color": "yellow",
"value": 0.999
},
{
"color": "green",
"value": 1
"value": 0.9995
}
]
},
"unit": "Bps",
"unit": "percentunit",
"custom": {
"displayMode": "auto"
}
},
"decimals": 2
},
"overrides": []
},
@ -67,46 +76,55 @@
{
"id": 2,
"type": "stat",
"title": "Egress Traffic",
"title": "Error Budget Burn (1h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 8,
"w": 6,
"x": 6,
"y": 0
},
"targets": [
{
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
"expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"color": "green",
"value": null
},
{
"color": "green",
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 4
}
]
},
"unit": "Bps",
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"decimals": 2
},
"overrides": []
},
@ -127,7 +145,145 @@
{
"id": 3,
"type": "stat",
"title": "Intra-Cluster Traffic",
"title": "Error Budget Burn (6h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 12,
"y": 0
},
"targets": [
{
"expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 4
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
},
"decimals": 2
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "stat",
"title": "Edge P99 Latency (ms)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 18,
"y": 0
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 200
},
{
"color": "orange",
"value": 350
},
{
"color": "red",
"value": 500
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 5,
"type": "stat",
"title": "Ingress Traffic",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -135,19 +291,19 @@
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 0
"x": 0,
"y": 4
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)",
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -185,9 +341,9 @@
}
},
{
"id": 4,
"id": 6,
"type": "stat",
"title": "Top Router req/s",
"title": "Egress Traffic",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -195,20 +351,19 @@
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"x": 8,
"y": 4
},
"targets": [
{
"expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
"refId": "A",
"legendFormat": "{{router}}"
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -224,7 +379,7 @@
}
]
},
"unit": "req/s",
"unit": "Bps",
"custom": {
"displayMode": "auto"
}
@ -246,7 +401,67 @@
}
},
{
"id": 5,
"id": 7,
"type": "stat",
"title": "Intra-Cluster Traffic",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 4
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "Bps",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 8,
"type": "timeseries",
"title": "Per-Node Throughput",
"datasource": {
@ -283,7 +498,7 @@
}
},
{
"id": 6,
"id": 9,
"type": "table",
"title": "Top Namespaces",
"datasource": {
@ -304,12 +519,16 @@
],
"fieldConfig": {
"defaults": {
"unit": "Bps"
"unit": "Bps",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true
"showHeader": true,
"columnFilters": false
},
"transformations": [
{
@ -319,7 +538,7 @@
]
},
{
"id": 7,
"id": 10,
"type": "table",
"title": "Top Pods",
"datasource": {
@ -340,12 +559,16 @@
],
"fieldConfig": {
"defaults": {
"unit": "Bps"
"unit": "Bps",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true
"showHeader": true,
"columnFilters": false
},
"transformations": [
{
@ -355,7 +578,7 @@
]
},
{
"id": 8,
"id": 11,
"type": "timeseries",
"title": "Traefik Routers (req/s)",
"datasource": {
@ -392,7 +615,7 @@
}
},
{
"id": 9,
"id": 12,
"type": "timeseries",
"title": "Traefik Entrypoints (req/s)",
"datasource": {

View File

@ -27,7 +27,7 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -88,7 +88,7 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -149,7 +149,7 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -186,6 +186,213 @@
"textMode": "value"
}
},
{
"id": 9,
"type": "stat",
"title": "API Server 5xx rate",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"y": 4
},
"targets": [
{
"expr": "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 0.05
},
{
"color": "orange",
"value": 0.2
},
{
"color": "red",
"value": 0.5
}
]
},
"unit": "req/s",
"custom": {
"displayMode": "auto"
},
"decimals": 3
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 10,
"type": "stat",
"title": "API Server P99 latency",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 8,
"y": 4
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 250
},
{
"color": "orange",
"value": 400
},
{
"color": "red",
"value": 600
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 11,
"type": "stat",
"title": "etcd P99 latency",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 4
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "orange",
"value": 100
},
{
"color": "red",
"value": 200
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "timeseries",
@ -198,7 +405,7 @@
"h": 9,
"w": 24,
"x": 0,
"y": 4
"y": 8
},
"targets": [
{
@ -238,7 +445,7 @@
"h": 9,
"w": 24,
"x": 0,
"y": 13
"y": 17
},
"targets": [
{
@ -278,7 +485,7 @@
"h": 9,
"w": 12,
"x": 0,
"y": 22
"y": 26
},
"targets": [
{
@ -315,7 +522,7 @@
"h": 9,
"w": 12,
"x": 12,
"y": 22
"y": 26
},
"targets": [
{
@ -352,7 +559,7 @@
"h": 9,
"w": 24,
"x": 0,
"y": 31
"y": 35
},
"targets": [
{

View File

@ -7,67 +7,6 @@
"list": []
},
"panels": [
{
"id": 1,
"type": "gauge",
"title": "Workers Ready",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 5,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"min": 0,
"max": 18,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 16
},
{
"color": "yellow",
"value": 17
},
{
"color": "green",
"value": 18
}
]
}
},
"overrides": []
},
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
}
},
{
"id": 2,
"type": "gauge",
@ -78,8 +17,8 @@
},
"gridPos": {
"h": 5,
"w": 5,
"x": 5,
"w": 4,
"x": 0,
"y": 0
},
"targets": [
@ -131,8 +70,8 @@
},
"gridPos": {
"h": 5,
"w": 5,
"x": 10,
"w": 3,
"x": 4,
"y": 0
},
"targets": [
@ -144,82 +83,7 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-pods dashboard",
"url": "/d/atlas-pods",
"targetBlank": true
}
]
},
{
"id": 4,
"type": "stat",
"title": "Problem Pods",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 5,
"x": 15,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -281,20 +145,20 @@
},
"gridPos": {
"h": 5,
"w": 4,
"x": 20,
"w": 3,
"x": 7,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))",
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -346,6 +210,286 @@
}
]
},
{
"id": 27,
"type": "stat",
"title": "Atlas Availability (30d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 4,
"x": 10,
"y": 0
},
"targets": [
{
"expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:5m])",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 0.999
},
{
"color": "yellow",
"value": 0.9999
},
{
"color": "green",
"value": 0.99999
}
]
},
"unit": "percentunit",
"custom": {
"displayMode": "auto"
},
"decimals": 3
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "stat",
"title": "Problem Pods",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 3,
"x": 14,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-pods dashboard",
"url": "/d/atlas-pods",
"targetBlank": true
}
]
},
{
"id": 6,
"type": "stat",
"title": "CrashLoop / ImagePull",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 3,
"x": 17,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-pods dashboard",
"url": "/d/atlas-pods",
"targetBlank": true
}
]
},
{
"id": 1,
"type": "gauge",
"title": "Workers Ready",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 4,
"x": 20,
"y": 0
},
"targets": [
{
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"min": 0,
"max": 18,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 16
},
{
"color": "yellow",
"value": 17
},
{
"color": "green",
"value": 18
}
]
}
},
"overrides": []
},
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
}
},
{
"id": 7,
"type": "stat",
@ -371,11 +515,11 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"mode": "absolute",
"steps": [
{
"color": "green",
@ -383,11 +527,15 @@
},
{
"color": "yellow",
"value": 70
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
},
@ -444,11 +592,11 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"mode": "absolute",
"steps": [
{
"color": "green",
@ -456,11 +604,15 @@
},
{
"color": "yellow",
"value": 70
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
},
@ -517,7 +669,7 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -586,7 +738,7 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -653,11 +805,11 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"mode": "absolute",
"steps": [
{
"color": "green",
@ -665,11 +817,15 @@
},
{
"color": "yellow",
"value": 70
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
},
@ -724,11 +880,11 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"mode": "absolute",
"steps": [
{
"color": "green",
@ -736,11 +892,15 @@
},
{
"color": "yellow",
"value": 70
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
},
@ -795,7 +955,7 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -862,7 +1022,7 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -942,9 +1102,7 @@
"placement": "right"
},
"pieType": "pie",
"displayLabels": [
"percent"
],
"displayLabels": [],
"tooltip": {
"mode": "single"
},
@ -995,9 +1153,7 @@
"placement": "right"
},
"pieType": "pie",
"displayLabels": [
"percent"
],
"displayLabels": [],
"tooltip": {
"mode": "single"
},
@ -1048,9 +1204,7 @@
"placement": "right"
},
"pieType": "pie",
"displayLabels": [
"percent"
],
"displayLabels": [],
"tooltip": {
"mode": "single"
},
@ -1175,7 +1329,7 @@
},
"targets": [
{
"expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}"
}
@ -1212,7 +1366,7 @@
},
"targets": [
{
"expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}"
}
@ -1233,6 +1387,138 @@
}
}
},
{
"id": 28,
"type": "piechart",
"title": "Node Pod Share",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 54
},
"targets": [
{
"expr": "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"displayLabels": [],
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
}
},
{
"id": 29,
"type": "bargauge",
"title": "Top Nodes by Pod Count",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 54
},
"targets": [
{
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
"refId": "A",
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "none",
"min": 0,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 100
}
]
},
"decimals": 0
},
"overrides": []
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
},
{
"id": "limit",
"options": {
"limit": 12
}
}
]
},
{
"id": 18,
"type": "timeseries",
@ -1377,7 +1663,7 @@
"h": 16,
"w": 12,
"x": 0,
"y": 54
"y": 64
},
"targets": [
{
@ -1425,7 +1711,7 @@
"h": 16,
"w": 12,
"x": 12,
"y": 54
"y": 64
},
"targets": [
{
@ -1452,11 +1738,11 @@
},
{
"color": "orange",
"value": 70
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
}
@ -1480,6 +1766,17 @@
"url": "/d/atlas-storage",
"targetBlank": true
}
],
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
}
]
}
],
@ -1497,36 +1794,5 @@
"to": "now"
},
"refresh": "1m",
"links": [
{
"title": "Atlas Pods",
"type": "dashboard",
"dashboardUid": "atlas-pods",
"keepTime": false
},
{
"title": "Atlas Nodes",
"type": "dashboard",
"dashboardUid": "atlas-nodes",
"keepTime": false
},
{
"title": "Atlas Storage",
"type": "dashboard",
"dashboardUid": "atlas-storage",
"keepTime": false
},
{
"title": "Atlas Network",
"type": "dashboard",
"dashboardUid": "atlas-network",
"keepTime": false
},
{
"title": "Atlas GPU",
"type": "dashboard",
"dashboardUid": "atlas-gpu",
"keepTime": false
}
]
"links": []
}

View File

@ -20,14 +20,14 @@
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))",
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -80,14 +80,14 @@
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))",
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -140,14 +140,14 @@
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))",
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -207,7 +207,7 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -266,12 +266,16 @@
],
"fieldConfig": {
"defaults": {
"unit": "s"
"unit": "s",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true
"showHeader": true,
"columnFilters": false
},
"transformations": [
{
@ -302,12 +306,16 @@
],
"fieldConfig": {
"defaults": {
"unit": "s"
"unit": "s",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true
"showHeader": true,
"columnFilters": false
},
"transformations": [
{
@ -338,12 +346,16 @@
],
"fieldConfig": {
"defaults": {
"unit": "s"
"unit": "s",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true
"showHeader": true,
"columnFilters": false
},
"transformations": [
{
@ -359,6 +371,233 @@
}
}
]
},
{
"id": 8,
"type": "piechart",
"title": "Node Pod Share",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 34
},
"targets": [
{
"expr": "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"displayLabels": [],
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
}
},
{
"id": 9,
"type": "bargauge",
"title": "Top Nodes by Pod Count",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 34
},
"targets": [
{
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
"refId": "A",
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "none",
"min": 0,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 100
}
]
},
"decimals": 0
},
"overrides": []
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
},
{
"id": "limit",
"options": {
"limit": 12
}
}
]
},
{
"id": 10,
"type": "table",
"title": "Namespace Plurality by Node v27",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 42
},
"targets": [
{
"expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.022)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) 
(kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.022)))))",
"refId": "A",
"instant": true,
"format": "table"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"custom": {
"filterable": false
}
},
"overrides": []
},
"options": {
"showHeader": true,
"columnFilters": false,
"showColumnFilters": false,
"footer": {
"show": false,
"fields": "",
"calcs": []
}
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true
}
}
},
{
"id": "filterByValue",
"options": {
"match": "Value",
"operator": "gt",
"value": 0
}
},
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
},
{
"id": "groupBy",
"options": {
"fields": {
"namespace": {
"aggregations": [
{
"field": "Value",
"operation": "max"
},
{
"field": "node",
"operation": "first"
}
]
}
},
"rowBy": [
"namespace"
]
}
}
]
}
],
"time": {

View File

@ -27,11 +27,11 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"mode": "absolute",
"steps": [
{
"color": "green",
@ -39,11 +39,15 @@
},
{
"color": "yellow",
"value": 70
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
},
@ -91,11 +95,11 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"mode": "absolute",
"steps": [
{
"color": "green",
@ -103,11 +107,15 @@
},
{
"color": "yellow",
"value": 70
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
},
@ -155,7 +163,7 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -215,7 +223,7 @@
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {

View File

@ -49,9 +49,7 @@ data:
"placement": "right"
},
"pieType": "pie",
"displayLabels": [
"percent"
],
"displayLabels": [],
"tooltip": {
"mode": "single"
},
@ -162,12 +160,16 @@ data:
],
"fieldConfig": {
"defaults": {
"unit": "percent"
"unit": "percent",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true
"showHeader": true,
"columnFilters": false
},
"transformations": [
{

View File

@ -16,46 +16,55 @@ data:
{
"id": 1,
"type": "stat",
"title": "Ingress Traffic",
"title": "Ingress Success Rate (5m)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"w": 6,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
"expr": "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"color": "red",
"value": null
},
{
"color": "orange",
"value": 0.995
},
{
"color": "yellow",
"value": 0.999
},
{
"color": "green",
"value": 1
"value": 0.9995
}
]
},
"unit": "Bps",
"unit": "percentunit",
"custom": {
"displayMode": "auto"
}
},
"decimals": 2
},
"overrides": []
},
@ -76,46 +85,55 @@ data:
{
"id": 2,
"type": "stat",
"title": "Egress Traffic",
"title": "Error Budget Burn (1h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 8,
"w": 6,
"x": 6,
"y": 0
},
"targets": [
{
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
"expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.001",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"color": "green",
"value": null
},
{
"color": "green",
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 4
}
]
},
"unit": "Bps",
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"decimals": 2
},
"overrides": []
},
@ -136,7 +154,145 @@ data:
{
"id": 3,
"type": "stat",
"title": "Intra-Cluster Traffic",
"title": "Error Budget Burn (6h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 12,
"y": 0
},
"targets": [
{
"expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.001",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 4
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
},
"decimals": 2
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "stat",
"title": "Edge P99 Latency (ms)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 18,
"y": 0
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 200
},
{
"color": "orange",
"value": 350
},
{
"color": "red",
"value": 500
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 5,
"type": "stat",
"title": "Ingress Traffic",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -144,19 +300,19 @@ data:
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 0
"x": 0,
"y": 4
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)",
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -194,9 +350,9 @@ data:
}
},
{
"id": 4,
"id": 6,
"type": "stat",
"title": "Top Router req/s",
"title": "Egress Traffic",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -204,20 +360,19 @@ data:
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"x": 8,
"y": 4
},
"targets": [
{
"expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
"refId": "A",
"legendFormat": "{{router}}"
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -233,7 +388,7 @@ data:
}
]
},
"unit": "req/s",
"unit": "Bps",
"custom": {
"displayMode": "auto"
}
@ -255,7 +410,67 @@ data:
}
},
{
"id": 5,
"id": 7,
"type": "stat",
"title": "Intra-Cluster Traffic",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 4
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "Bps",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 8,
"type": "timeseries",
"title": "Per-Node Throughput",
"datasource": {
@ -292,7 +507,7 @@ data:
}
},
{
"id": 6,
"id": 9,
"type": "table",
"title": "Top Namespaces",
"datasource": {
@ -313,12 +528,16 @@ data:
],
"fieldConfig": {
"defaults": {
"unit": "Bps"
"unit": "Bps",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true
"showHeader": true,
"columnFilters": false
},
"transformations": [
{
@ -328,7 +547,7 @@ data:
]
},
{
"id": 7,
"id": 10,
"type": "table",
"title": "Top Pods",
"datasource": {
@ -349,12 +568,16 @@ data:
],
"fieldConfig": {
"defaults": {
"unit": "Bps"
"unit": "Bps",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true
"showHeader": true,
"columnFilters": false
},
"transformations": [
{
@ -364,7 +587,7 @@ data:
]
},
{
"id": 8,
"id": 11,
"type": "timeseries",
"title": "Traefik Routers (req/s)",
"datasource": {
@ -401,7 +624,7 @@ data:
}
},
{
"id": 9,
"id": 12,
"type": "timeseries",
"title": "Traefik Entrypoints (req/s)",
"datasource": {

View File

@ -36,7 +36,7 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -97,7 +97,7 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -158,7 +158,7 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -195,6 +195,213 @@ data:
"textMode": "value"
}
},
{
"id": 9,
"type": "stat",
"title": "API Server 5xx rate",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"y": 4
},
"targets": [
{
"expr": "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 0.05
},
{
"color": "orange",
"value": 0.2
},
{
"color": "red",
"value": 0.5
}
]
},
"unit": "req/s",
"custom": {
"displayMode": "auto"
},
"decimals": 3
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 10,
"type": "stat",
"title": "API Server P99 latency",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 8,
"y": 4
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 250
},
{
"color": "orange",
"value": 400
},
{
"color": "red",
"value": 600
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 11,
"type": "stat",
"title": "etcd P99 latency",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 4
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "orange",
"value": 100
},
{
"color": "red",
"value": 200
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "timeseries",
@ -207,7 +414,7 @@ data:
"h": 9,
"w": 24,
"x": 0,
"y": 4
"y": 8
},
"targets": [
{
@ -247,7 +454,7 @@ data:
"h": 9,
"w": 24,
"x": 0,
"y": 13
"y": 17
},
"targets": [
{
@ -287,7 +494,7 @@ data:
"h": 9,
"w": 12,
"x": 0,
"y": 22
"y": 26
},
"targets": [
{
@ -324,7 +531,7 @@ data:
"h": 9,
"w": 12,
"x": 12,
"y": 22
"y": 26
},
"targets": [
{
@ -361,7 +568,7 @@ data:
"h": 9,
"w": 24,
"x": 0,
"y": 31
"y": 35
},
"targets": [
{

View File

@ -16,67 +16,6 @@ data:
"list": []
},
"panels": [
{
"id": 1,
"type": "gauge",
"title": "Workers Ready",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 5,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"min": 0,
"max": 18,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 16
},
{
"color": "yellow",
"value": 17
},
{
"color": "green",
"value": 18
}
]
}
},
"overrides": []
},
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
}
},
{
"id": 2,
"type": "gauge",
@ -87,8 +26,8 @@ data:
},
"gridPos": {
"h": 5,
"w": 5,
"x": 5,
"w": 4,
"x": 0,
"y": 0
},
"targets": [
@ -140,8 +79,8 @@ data:
},
"gridPos": {
"h": 5,
"w": 5,
"x": 10,
"w": 3,
"x": 4,
"y": 0
},
"targets": [
@ -153,82 +92,7 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-pods dashboard",
"url": "/d/atlas-pods",
"targetBlank": true
}
]
},
{
"id": 4,
"type": "stat",
"title": "Problem Pods",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 5,
"x": 15,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -290,20 +154,20 @@ data:
},
"gridPos": {
"h": 5,
"w": 4,
"x": 20,
"w": 3,
"x": 7,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))",
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -355,6 +219,286 @@ data:
}
]
},
{
"id": 27,
"type": "stat",
"title": "Atlas Availability (30d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 4,
"x": 10,
"y": 0
},
"targets": [
{
"expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:5m])",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 0.999
},
{
"color": "yellow",
"value": 0.9999
},
{
"color": "green",
"value": 0.99999
}
]
},
"unit": "percentunit",
"custom": {
"displayMode": "auto"
},
"decimals": 3
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "stat",
"title": "Problem Pods",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 3,
"x": 14,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-pods dashboard",
"url": "/d/atlas-pods",
"targetBlank": true
}
]
},
{
"id": 6,
"type": "stat",
"title": "CrashLoop / ImagePull",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 3,
"x": 17,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-pods dashboard",
"url": "/d/atlas-pods",
"targetBlank": true
}
]
},
{
"id": 1,
"type": "gauge",
"title": "Workers Ready",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 4,
"x": 20,
"y": 0
},
"targets": [
{
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"min": 0,
"max": 18,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 16
},
{
"color": "yellow",
"value": 17
},
{
"color": "green",
"value": 18
}
]
}
},
"overrides": []
},
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
}
},
{
"id": 7,
"type": "stat",
@ -380,11 +524,11 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"mode": "absolute",
"steps": [
{
"color": "green",
@ -392,11 +536,15 @@ data:
},
{
"color": "yellow",
"value": 70
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
},
@ -453,11 +601,11 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"mode": "absolute",
"steps": [
{
"color": "green",
@ -465,11 +613,15 @@ data:
},
{
"color": "yellow",
"value": 70
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
},
@ -526,7 +678,7 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -595,7 +747,7 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -662,11 +814,11 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"mode": "absolute",
"steps": [
{
"color": "green",
@ -674,11 +826,15 @@ data:
},
{
"color": "yellow",
"value": 70
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
},
@ -733,11 +889,11 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"mode": "absolute",
"steps": [
{
"color": "green",
@ -745,11 +901,15 @@ data:
},
{
"color": "yellow",
"value": 70
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
},
@ -804,7 +964,7 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -871,7 +1031,7 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -951,9 +1111,7 @@ data:
"placement": "right"
},
"pieType": "pie",
"displayLabels": [
"percent"
],
"displayLabels": [],
"tooltip": {
"mode": "single"
},
@ -1004,9 +1162,7 @@ data:
"placement": "right"
},
"pieType": "pie",
"displayLabels": [
"percent"
],
"displayLabels": [],
"tooltip": {
"mode": "single"
},
@ -1057,9 +1213,7 @@ data:
"placement": "right"
},
"pieType": "pie",
"displayLabels": [
"percent"
],
"displayLabels": [],
"tooltip": {
"mode": "single"
},
@ -1184,7 +1338,7 @@ data:
},
"targets": [
{
"expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}"
}
@ -1221,7 +1375,7 @@ data:
},
"targets": [
{
"expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}"
}
@ -1242,6 +1396,138 @@ data:
}
}
},
{
"id": 28,
"type": "piechart",
"title": "Node Pod Share",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 54
},
"targets": [
{
  "expr": "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100",
  "refId": "A",
  "legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"displayLabels": [],
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
}
},
{
"id": 29,
"type": "bargauge",
"title": "Top Nodes by Pod Count",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 54
},
"targets": [
{
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
"refId": "A",
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "none",
"min": 0,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 100
}
]
},
"decimals": 0
},
"overrides": []
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"transformations": [
{
  "id": "sortBy",
  "options": {
    "sort": [
      {
        "field": "Value",
        "desc": true
      }
    ]
  }
},
{
  "id": "limit",
  "options": {
    "limitField": 12
  }
}
]
},
{
"id": 18,
"type": "timeseries",
@ -1386,7 +1672,7 @@ data:
"h": 16,
"w": 12,
"x": 0,
"y": 54
"y": 64
},
"targets": [
{
@ -1434,7 +1720,7 @@ data:
"h": 16,
"w": 12,
"x": 12,
"y": 54
"y": 64
},
"targets": [
{
@ -1461,11 +1747,11 @@ data:
},
{
"color": "orange",
"value": 70
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
}
@ -1489,6 +1775,17 @@ data:
"url": "/d/atlas-storage",
"targetBlank": true
}
],
"transformations": [
{
  "id": "sortBy",
  "options": {
    "sort": [
      {
        "field": "Value",
        "desc": true
      }
    ]
  }
}
]
}
],
@ -1506,36 +1803,5 @@ data:
"to": "now"
},
"refresh": "1m",
"links": [
{
"title": "Atlas Pods",
"type": "dashboard",
"dashboardUid": "atlas-pods",
"keepTime": false
},
{
"title": "Atlas Nodes",
"type": "dashboard",
"dashboardUid": "atlas-nodes",
"keepTime": false
},
{
"title": "Atlas Storage",
"type": "dashboard",
"dashboardUid": "atlas-storage",
"keepTime": false
},
{
"title": "Atlas Network",
"type": "dashboard",
"dashboardUid": "atlas-network",
"keepTime": false
},
{
"title": "Atlas GPU",
"type": "dashboard",
"dashboardUid": "atlas-gpu",
"keepTime": false
}
]
"links": []
}

View File

@ -29,14 +29,14 @@ data:
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))",
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -89,14 +89,14 @@ data:
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))",
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -149,14 +149,14 @@ data:
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))",
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -216,7 +216,7 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -275,12 +275,16 @@ data:
],
"fieldConfig": {
"defaults": {
"unit": "s"
"unit": "s",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true
"showHeader": true,
"columnFilters": false
},
"transformations": [
{
@ -311,12 +315,16 @@ data:
],
"fieldConfig": {
"defaults": {
"unit": "s"
"unit": "s",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true
"showHeader": true,
"columnFilters": false
},
"transformations": [
{
@ -347,12 +355,16 @@ data:
],
"fieldConfig": {
"defaults": {
"unit": "s"
"unit": "s",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true
"showHeader": true,
"columnFilters": false
},
"transformations": [
{
@ -368,6 +380,233 @@ data:
}
}
]
},
{
"id": 8,
"type": "piechart",
"title": "Node Pod Share",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 34
},
"targets": [
{
  "expr": "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100",
  "refId": "A",
  "legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"displayLabels": [],
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
}
},
{
"id": 9,
"type": "bargauge",
"title": "Top Nodes by Pod Count",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 34
},
"targets": [
{
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
"refId": "A",
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "none",
"min": 0,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 100
}
]
},
"decimals": 0
},
"overrides": []
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"transformations": [
{
  "id": "sortBy",
  "options": {
    "sort": [
      {
        "field": "Value",
        "desc": true
      }
    ]
  }
},
{
  "id": "limit",
  "options": {
    "limitField": 12
  }
}
]
},
{
"id": 10,
"type": "table",
"title": "Namespace Plurality by Node v27",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 42
},
"targets": [
{
"expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.022)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) 
(kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.022)))))",
"refId": "A",
"instant": true,
"format": "table"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"custom": {
"filterable": false
}
},
"overrides": []
},
"options": {
"showHeader": true,
"columnFilters": false,
"showColumnFilters": false,
"footer": {
"show": false,
"fields": "",
"calcs": []
}
},
"transformations": [
{
  "id": "labelsToFields",
  "options": {}
},
{
  "id": "organize",
  "options": {
    "excludeByName": {
      "Time": true
    }
  }
},
{
  "id": "filterByValue",
  "options": {
    "filters": [
      {
        "fieldName": "Value",
        "config": {
          "id": "greater",
          "options": {
            "value": 0
          }
        }
      }
    ],
    "type": "include",
    "match": "any"
  }
},
{
  "id": "sortBy",
  "options": {
    "sort": [
      {
        "field": "Value",
        "desc": true
      }
    ]
  }
},
{
  "id": "groupBy",
  "options": {
    "fields": {
      "namespace": {
        "aggregations": [],
        "operation": "groupby"
      },
      "Value": {
        "aggregations": [
          "max"
        ],
        "operation": "aggregate"
      },
      "node": {
        "aggregations": [
          "first"
        ],
        "operation": "aggregate"
      }
    }
  }
}
]
}
],
"time": {

View File

@ -36,11 +36,11 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"mode": "absolute",
"steps": [
{
"color": "green",
@ -48,11 +48,15 @@ data:
},
{
"color": "yellow",
"value": 70
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
},
@ -100,11 +104,11 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"mode": "absolute",
"steps": [
{
"color": "green",
@ -112,11 +116,15 @@ data:
},
{
"color": "yellow",
"value": 70
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
"value": 91.5
}
]
},
@ -164,7 +172,7 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
@ -224,7 +232,7 @@ data:
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
"mode": "thresholds"
},
"mappings": [],
"thresholds": {

View File

@ -65,13 +65,13 @@ spec:
namespace: flux-system
values:
server:
# keep ~3 months; change as you like (supports "d", "y")
# keep 1 year; supports "d", "y"
extraArgs:
retentionPeriod: "90d" # VM flag -retentionPeriod=90d. :contentReference[oaicite:11]{index=11}
retentionPeriod: "1y"  # VM flag -retentionPeriod=1y
persistentVolume:
enabled: true
size: 100Gi
size: 250Gi
# Enable built-in Kubernetes scraping
scrape:
@ -186,6 +186,15 @@ spec:
- targets: ["longhorn-backend.longhorn-system.svc:9500"]
metrics_path: /metrics
# --- titan-db node_exporter (external control-plane DB host) ---
- job_name: "titan-db"
static_configs:
- targets: ["192.168.22.10:9100"]
relabel_configs:
- source_labels: [__address__]
target_label: instance
replacement: titan-db
# --- cert-manager (pods expose on 9402) ---
- job_name: "cert-manager"
kubernetes_sd_configs: [{ role: pod }]
@ -209,16 +218,6 @@ spec:
- action: keep
source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app_kubernetes_io_part_of]
regex: flux-system;flux
- job_name: "titan-db"
static_configs:
- targets: ["titan-db:9100"]
relabel_configs:
- source_labels: [__address__]
target_label: instance
metric_relabel_configs:
- source_labels: [instance]
target_label: node
replacement: titan-db
---

View File

@ -0,0 +1,48 @@
# services/nextcloud/configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: nextcloud-config
namespace: nextcloud
data:
extra.config.php: |
<?php
$CONFIG = array (
'trusted_domains' =>
array (
0 => 'cloud.bstein.dev',
),
'overwritehost' => 'cloud.bstein.dev',
'overwriteprotocol' => 'https',
'overwrite.cli.url' => 'https://cloud.bstein.dev',
'default_phone_region' => 'US',
'mail_smtpmode' => 'smtp',
'mail_sendmailmode' => 'smtp',
'mail_smtphost' => 'mail.bstein.dev',
'mail_smtpport' => '587',
'mail_smtpsecure' => 'tls',
'mail_smtpauth' => true,
'mail_smtpauthtype' => 'LOGIN',
'mail_domain' => 'bstein.dev',
'mail_from_address' => 'no-reply',
'oidc_login_provider_url' => 'https://sso.bstein.dev/realms/atlas',
'oidc_login_client_id' => getenv('OIDC_CLIENT_ID'),
'oidc_login_client_secret' => getenv('OIDC_CLIENT_SECRET'),
'oidc_login_auto_redirect' => false,
'oidc_login_end_session_redirect' => true,
'oidc_login_button_text' => 'Login with Keycloak',
'oidc_login_hide_password_form' => false,
'oidc_login_attributes' =>
array (
'id' => 'preferred_username',
'mail' => 'email',
'name' => 'name',
),
'oidc_login_scope' => 'openid profile email',
'oidc_login_unique_id' => 'preferred_username',
'oidc_login_use_pkce' => true,
'oidc_login_disable_registration' => false,
'oidc_login_create_groups' => false,
# External storage for user data should be configured to Asteria via the External Storage app (admin UI),
# keeping the astreae PVC for app internals only.
);

View File

@ -0,0 +1,32 @@
# services/nextcloud/cronjob.yaml
apiVersion: batch/v1
kind: CronJob
metadata:
name: nextcloud-cron
namespace: nextcloud
spec:
schedule: "*/5 * * * *"
concurrencyPolicy: Forbid
jobTemplate:
spec:
template:
spec:
securityContext:
runAsUser: 33
runAsGroup: 33
fsGroup: 33
restartPolicy: OnFailure
containers:
- name: nextcloud-cron
image: nextcloud:29-apache
imagePullPolicy: IfNotPresent
command: ["/bin/sh", "-c"]
args:
- "cd /var/www/html && php -f cron.php"
volumeMounts:
- name: nextcloud-data
mountPath: /var/www/html
volumes:
- name: nextcloud-data
persistentVolumeClaim:
claimName: nextcloud-data

View File

@ -0,0 +1,143 @@
# services/nextcloud/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: nextcloud
namespace: nextcloud
labels:
app: nextcloud
spec:
replicas: 1
selector:
matchLabels:
app: nextcloud
template:
metadata:
labels:
app: nextcloud
spec:
nodeSelector:
hardware: rpi5
securityContext:
fsGroup: 33
runAsUser: 33
runAsGroup: 33
initContainers:
- name: fix-perms
image: alpine:3.20
command: ["/bin/sh", "-c"]
args:
- |
chown -R 33:33 /var/www/html/config || true
chown -R 33:33 /var/www/html/data || true
securityContext:
runAsUser: 0
runAsGroup: 0
volumeMounts:
- name: nextcloud-data
mountPath: /var/www/html
- name: nextcloud-config
mountPath: /var/www/html/config/extra.config.php
subPath: extra.config.php
containers:
- name: nextcloud
image: nextcloud:29-apache
imagePullPolicy: IfNotPresent
env:
# DB (external secret required: nextcloud-db with keys db-username, db-password, database)
- name: POSTGRES_HOST
value: postgres-service.postgres.svc.cluster.local
- name: POSTGRES_DB
valueFrom:
secretKeyRef:
name: nextcloud-db
key: database
- name: POSTGRES_USER
valueFrom:
secretKeyRef:
name: nextcloud-db
key: db-username
- name: POSTGRES_PASSWORD
valueFrom:
secretKeyRef:
name: nextcloud-db
key: db-password
# Admin bootstrap (external secret: nextcloud-admin with keys admin-user, admin-password)
- name: NEXTCLOUD_ADMIN_USER
valueFrom:
secretKeyRef:
name: nextcloud-admin
key: admin-user
- name: NEXTCLOUD_ADMIN_PASSWORD
valueFrom:
secretKeyRef:
name: nextcloud-admin
key: admin-password
- name: NEXTCLOUD_TRUSTED_DOMAINS
value: cloud.bstein.dev
- name: OVERWRITEHOST
value: cloud.bstein.dev
- name: OVERWRITEPROTOCOL
value: https
- name: OVERWRITECLIURL
value: https://cloud.bstein.dev
# SMTP (external secret: nextcloud-smtp with keys smtp-username, smtp-password)
- name: SMTP_HOST
value: mail.bstein.dev
- name: SMTP_PORT
value: "587"
- name: SMTP_SECURE
value: tls
- name: SMTP_NAME
valueFrom:
secretKeyRef:
name: nextcloud-smtp
key: smtp-username
- name: SMTP_PASSWORD
valueFrom:
secretKeyRef:
name: nextcloud-smtp
key: smtp-password
- name: MAIL_FROM_ADDRESS
value: no-reply
- name: MAIL_DOMAIN
value: bstein.dev
# OIDC (external secret: nextcloud-oidc with keys client-id, client-secret)
- name: OIDC_CLIENT_ID
valueFrom:
secretKeyRef:
name: nextcloud-oidc
key: client-id
- name: OIDC_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: nextcloud-oidc
key: client-secret
- name: NEXTCLOUD_UPDATE
value: "1"
- name: APP_INSTALL
value: "mail,oidc_login,external"
ports:
- containerPort: 80
name: http
volumeMounts:
- name: nextcloud-data
mountPath: /var/www/html
- name: nextcloud-config
mountPath: /var/www/html/config/extra.config.php
subPath: extra.config.php
resources:
requests:
cpu: 250m
memory: 1Gi
limits:
cpu: 1
memory: 3Gi
volumes:
- name: nextcloud-data
persistentVolumeClaim:
claimName: nextcloud-data
- name: nextcloud-config
configMap:
name: nextcloud-config
defaultMode: 0444

View File

@ -0,0 +1,25 @@
# services/nextcloud/ingress.yaml
# Exposes the nextcloud Service (port 80) at cloud.bstein.dev through the
# Traefik "websecure" entrypoint, with a cert-manager-issued TLS certificate
# stored in the nextcloud-tls Secret.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: nextcloud
  namespace: nextcloud
  annotations:
    # NOTE(review): the pegasus, vault and zot manifests in this same change
    # switch their issuer from "letsencrypt-prod" to "letsencrypt"; this new
    # Ingress is aligned with them. Confirm the ClusterIssuer name in-cluster.
    cert-manager.io/cluster-issuer: letsencrypt
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
spec:
  tls:
    - hosts:
        - cloud.bstein.dev
      secretName: nextcloud-tls
  rules:
    - host: cloud.bstein.dev
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: nextcloud
                port:
                  number: 80

View File

@ -0,0 +1,25 @@
# services/nextcloud/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: nextcloud
resources:
- namespace.yaml
- configmap.yaml
- pvc.yaml
- deployment.yaml
- service.yaml
- ingress.yaml
- cronjob.yaml
- mail-sync-cronjob.yaml
- maintenance-cronjob.yaml
configMapGenerator:
- name: nextcloud-maintenance-script
files:
- maintenance.sh=../../scripts/nextcloud-maintenance.sh
options:
disableNameSuffixHash: true
- name: nextcloud-mail-sync-script
files:
- sync.sh=../../scripts/nextcloud-mail-sync.sh
options:
disableNameSuffixHash: true

View File

@ -0,0 +1,58 @@
# services/nextcloud/mail-sync-cronjob.yaml
apiVersion: batch/v1
kind: CronJob
metadata:
name: nextcloud-mail-sync
namespace: nextcloud
spec:
schedule: "0 5 * * *"
concurrencyPolicy: Forbid
jobTemplate:
spec:
template:
spec:
restartPolicy: OnFailure
securityContext:
runAsUser: 0
runAsGroup: 0
containers:
- name: mail-sync
image: nextcloud:29-apache
imagePullPolicy: IfNotPresent
command: ["/bin/bash", "/sync/sync.sh"]
env:
- name: KC_BASE
value: https://sso.bstein.dev
- name: KC_REALM
value: atlas
- name: KC_ADMIN_USER
valueFrom:
secretKeyRef:
name: nextcloud-keycloak-admin
key: username
- name: KC_ADMIN_PASS
valueFrom:
secretKeyRef:
name: nextcloud-keycloak-admin
key: password
volumeMounts:
- name: nextcloud-data
mountPath: /var/www/html
- name: sync-script
mountPath: /sync/sync.sh
subPath: sync.sh
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
volumes:
- name: nextcloud-data
persistentVolumeClaim:
claimName: nextcloud-data
- name: sync-script
configMap:
name: nextcloud-mail-sync-script
defaultMode: 0755

View File

@ -0,0 +1,56 @@
# services/nextcloud/maintenance-cronjob.yaml
apiVersion: batch/v1
kind: CronJob
metadata:
name: nextcloud-maintenance
namespace: nextcloud
spec:
schedule: "30 4 * * *"
concurrencyPolicy: Forbid
jobTemplate:
spec:
template:
spec:
restartPolicy: OnFailure
securityContext:
runAsUser: 0
runAsGroup: 0
containers:
- name: maintenance
image: nextcloud:29-apache
imagePullPolicy: IfNotPresent
command: ["/bin/bash", "/maintenance/maintenance.sh"]
env:
- name: NC_URL
value: https://cloud.bstein.dev
- name: ADMIN_USER
valueFrom:
secretKeyRef:
name: nextcloud-admin
key: admin-user
- name: ADMIN_PASS
valueFrom:
secretKeyRef:
name: nextcloud-admin
key: admin-password
volumeMounts:
- name: nextcloud-data
mountPath: /var/www/html
- name: maintenance-script
mountPath: /maintenance/maintenance.sh
subPath: maintenance.sh
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
volumes:
- name: nextcloud-data
persistentVolumeClaim:
claimName: nextcloud-data
- name: maintenance-script
configMap:
name: nextcloud-maintenance-script
defaultMode: 0755

View File

@ -0,0 +1,5 @@
# services/nextcloud/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: nextcloud

View File

@ -0,0 +1,13 @@
# services/nextcloud/pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: nextcloud-data
namespace: nextcloud
spec:
accessModes:
- ReadWriteMany
resources:
requests:
storage: 200Gi
storageClassName: astreae

View File

@ -0,0 +1,13 @@
# services/nextcloud/service.yaml
apiVersion: v1
kind: Service
metadata:
name: nextcloud
namespace: nextcloud
spec:
selector:
app: nextcloud
ports:
- name: http
port: 80
targetPort: http

View File

@ -8,7 +8,7 @@ metadata:
kubernetes.io/ingress.class: traefik
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.tls: "true"
cert-manager.io/cluster-issuer: letsencrypt-prod
cert-manager.io/cluster-issuer: letsencrypt
spec:
tls:
- hosts: [ "pegasus.bstein.dev" ]

View File

@ -8,7 +8,7 @@ spec:
secretName: vault-server-tls
issuerRef:
kind: ClusterIssuer
name: letsencrypt-prod
name: letsencrypt
commonName: secret.bstein.dev
dnsNames:
- secret.bstein.dev

View File

@ -5,7 +5,7 @@ metadata:
name: zot
namespace: zot
annotations:
cert-manager.io/cluster-issuer: letsencrypt-prod
cert-manager.io/cluster-issuer: letsencrypt
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.tls: "true"
traefik.ingress.kubernetes.io/router.middlewares: zot-zot-resp-headers@kubernetescrd