diff --git a/clusters/atlas/flux-system/applications/atlasbot/image-automation.yaml b/clusters/atlas/flux-system/applications/atlasbot/image-automation.yaml new file mode 100644 index 0000000..ec8672c --- /dev/null +++ b/clusters/atlas/flux-system/applications/atlasbot/image-automation.yaml @@ -0,0 +1,26 @@ +# clusters/atlas/flux-system/applications/atlasbot/image-automation.yaml +apiVersion: image.toolkit.fluxcd.io/v1 +kind: ImageUpdateAutomation +metadata: + name: atlasbot + namespace: ai +spec: + interval: 1m0s + sourceRef: + kind: GitRepository + name: flux-system + namespace: flux-system + git: + checkout: + ref: + branch: feature/atlasbot + commit: + author: + email: ops@bstein.dev + name: flux-bot + messageTemplate: "chore(atlasbot): automated image update" + push: + branch: feature/atlasbot + update: + strategy: Setters + path: services/atlasbot diff --git a/clusters/atlas/flux-system/applications/atlasbot/kustomization.yaml b/clusters/atlas/flux-system/applications/atlasbot/kustomization.yaml new file mode 100644 index 0000000..9e49331 --- /dev/null +++ b/clusters/atlas/flux-system/applications/atlasbot/kustomization.yaml @@ -0,0 +1,17 @@ +# clusters/atlas/flux-system/applications/atlasbot/kustomization.yaml +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: atlasbot + namespace: flux-system +spec: + interval: 10m + prune: true + sourceRef: + kind: GitRepository + name: flux-system + path: ./services/atlasbot + targetNamespace: ai + timeout: 2m + dependsOn: + - name: ai-llm diff --git a/clusters/atlas/flux-system/applications/kustomization.yaml b/clusters/atlas/flux-system/applications/kustomization.yaml index 6e765d4..32ccc74 100644 --- a/clusters/atlas/flux-system/applications/kustomization.yaml +++ b/clusters/atlas/flux-system/applications/kustomization.yaml @@ -7,6 +7,8 @@ resources: - vaultwarden/kustomization.yaml - comms/kustomization.yaml - comms/image-automation.yaml + - atlasbot/kustomization.yaml + - atlasbot/image-automation.yaml - crypto/kustomization.yaml - monerod/kustomization.yaml - pegasus/kustomization.yaml diff --git a/scripts/knowledge_render_atlas.py b/scripts/knowledge_render_atlas.py index 1e305cb..82da0d8 100644 --- a/scripts/knowledge_render_atlas.py +++ b/scripts/knowledge_render_atlas.py @@ -539,9 +539,9 @@ def main() -> int: help="Write generated files (otherwise just print a summary).", ) ap.add_argument( - "--sync-comms", + "--sync-atlasbot", action="store_true", - help="Mirror rendered knowledge into services/comms/knowledge for atlasbot.", + help="Mirror rendered knowledge into services/atlasbot/knowledge for atlasbot.", ) args = ap.parse_args() @@ -632,10 +632,10 @@ def main() -> int: print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}") print(f"Wrote {metrics_json_path.relative_to(REPO_ROOT)}") - if args.sync_comms: - comms_dir = REPO_ROOT / "services" / "comms" / "knowledge" - _sync_tree(out_dir, comms_dir) - print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {comms_dir.relative_to(REPO_ROOT)}") + if args.sync_atlasbot: + atlasbot_dir = REPO_ROOT / "services" / "atlasbot" / "knowledge" + _sync_tree(out_dir, atlasbot_dir) + print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {atlasbot_dir.relative_to(REPO_ROOT)}") return 0 diff --git a/services/comms/atlasbot-deployment.yaml b/services/atlasbot/atlasbot-deployment.yaml similarity index 95% rename from services/comms/atlasbot-deployment.yaml rename to services/atlasbot/atlasbot-deployment.yaml index a9d2fd7..5406db5 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/atlasbot/atlasbot-deployment.yaml @@ -3,7 +3,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: atlasbot - namespace: comms + namespace: ai labels: app: atlasbot spec: @@ -18,7 +18,7 @@ spec: annotations: checksum/atlasbot-configmap: manual-atlasbot-101 vault.hashicorp.com/agent-inject: "true" - vault.hashicorp.com/role: "comms" + vault.hashicorp.com/role: "ai" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" vault.hashicorp.com/agent-inject-template-turn-secret: | {{- with secret "kv/data/atlas/comms/turn-shared-secret" -}}{{ .Data.data.TURN_STATIC_AUTH_SECRET }}{{- end -}} @@ -71,13 +71,13 @@ spec: command: ["/bin/sh","-c"] args: - | - . /vault/scripts/comms_vault_env.sh + . /vault/scripts/atlasbot_vault_env.sh exec python -m atlasbot.main env: - name: MATRIX_BASE - value: http://othrys-synapse-matrix-synapse:8008 + value: http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008 - name: AUTH_BASE - value: http://matrix-authentication-service:8080 + value: http://matrix-authentication-service.comms.svc.cluster.local:8080 - name: KB_DIR value: /kb - name: VM_URL @@ -115,7 +115,7 @@ spec: - name: ATLASBOT_STATE_DB value: /data/atlasbot_state.db - name: ATLASBOT_QUEUE_ENABLED - value: "false" + value: "true" - name: ATLASBOT_DEBUG_PIPELINE value: "true" - name: ATLASBOT_NATS_URL @@ -170,7 +170,7 @@ spec: path: diagrams/atlas-http.mmd - name: vault-scripts configMap: - name: comms-vault-env + name: atlasbot-vault-env defaultMode: 0555 - name: atlasbot-state emptyDir: {} diff --git a/services/comms/atlasbot-rbac.yaml b/services/atlasbot/atlasbot-rbac.yaml similarity index 97% rename from services/comms/atlasbot-rbac.yaml rename to services/atlasbot/atlasbot-rbac.yaml index ed6206d..3a83acc 100644 --- a/services/comms/atlasbot-rbac.yaml +++ b/services/atlasbot/atlasbot-rbac.yaml @@ -3,7 +3,7 @@ apiVersion: v1 kind: ServiceAccount metadata: name: atlasbot - namespace: comms + namespace: ai imagePullSecrets: - name: harbor-regcred --- @@ -45,4 +45,4 @@ roleRef: subjects: - kind: ServiceAccount name: atlasbot - namespace: comms + namespace: ai diff --git a/services/comms/atlasbot-service.yaml b/services/atlasbot/atlasbot-service.yaml similarity index 91% rename from services/comms/atlasbot-service.yaml rename to services/atlasbot/atlasbot-service.yaml index c8b3570..d47617d 100644 --- a/services/comms/atlasbot-service.yaml +++ b/services/atlasbot/atlasbot-service.yaml @@ -2,7 +2,7 @@ apiVersion: v1 kind: Service metadata: name: atlasbot - namespace: comms + namespace: ai labels: app: atlasbot spec: diff --git a/services/comms/image.yaml b/services/atlasbot/image.yaml similarity index 91% rename from services/comms/image.yaml rename to services/atlasbot/image.yaml index 8b1d699..b461a70 100644 --- a/services/comms/image.yaml +++ b/services/atlasbot/image.yaml @@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1beta2 kind: ImageRepository metadata: name: atlasbot - namespace: comms + namespace: ai spec: image: registry.bstein.dev/bstein/atlasbot interval: 1m0s @@ -14,7 +14,7 @@ apiVersion: image.toolkit.fluxcd.io/v1beta2 kind: ImagePolicy metadata: name: atlasbot - namespace: comms + namespace: ai spec: imageRepositoryRef: name: atlasbot diff --git a/services/atlasbot/knowledge/catalog/atlas-summary.json b/services/atlasbot/knowledge/catalog/atlas-summary.json new file mode 100644 index 0000000..ea825ce --- /dev/null +++ b/services/atlasbot/knowledge/catalog/atlas-summary.json @@ -0,0 +1,8 @@ +{ + "counts": { + "helmrelease_host_hints": 19, + "http_endpoints": 45, + "services": 47, + "workloads": 74 + } +} diff --git a/services/atlasbot/knowledge/catalog/atlas.json b/services/atlasbot/knowledge/catalog/atlas.json new file mode 100644 index 0000000..951c807 --- /dev/null +++ b/services/atlasbot/knowledge/catalog/atlas.json @@ -0,0 +1,3445 @@ +{ + "cluster": "atlas", + "sources": [ + { + "name": "ai-llm", + "path": "services/ai-llm", + "targetNamespace": "ai" + }, + { + "name": "bstein-dev-home", + "path": "services/bstein-dev-home", + "targetNamespace": "bstein-dev-home" + }, + { + "name": "bstein-dev-home-migrations", + "path": "services/bstein-dev-home/migrations", + "targetNamespace": "bstein-dev-home" + }, + { + "name": "cert-manager", + "path": "infrastructure/cert-manager", + "targetNamespace": "cert-manager" + }, + { + "name": "cert-manager-cleanup", + "path": "infrastructure/cert-manager/cleanup", + "targetNamespace": "cert-manager" + }, + { + "name": "comms", + "path": "services/comms", + "targetNamespace": "comms" + }, + { + "name": "core", + "path": "infrastructure/core", + "targetNamespace": null + }, + { + "name": "crypto", + "path": "services/crypto", + "targetNamespace": "crypto" + }, + { + "name": "finance", + "path": "services/finance", + "targetNamespace": "finance" + }, + { + "name": "flux-system", + "path": "clusters/atlas/flux-system", + "targetNamespace": null + }, + { + "name": "gitea", + "path": "services/gitea", + "targetNamespace": "gitea" + }, + { + "name": "gitops-ui", + "path": "services/gitops-ui", + "targetNamespace": "flux-system" + }, + { + "name": "harbor", + "path": "services/harbor", + "targetNamespace": "harbor" + }, + { + "name": "health", + "path": "services/health", + "targetNamespace": "health" + }, + { + "name": "helm", + "path": "infrastructure/sources/helm", + "targetNamespace": "flux-system" + }, + { + "name": "jellyfin", + "path": "services/jellyfin", + "targetNamespace": "jellyfin" + }, + { + "name": "jenkins", + "path": "services/jenkins", + "targetNamespace": "jenkins" + }, + { + "name": "keycloak", + "path": "services/keycloak", + "targetNamespace": "sso" + }, + { + "name": "logging", + "path": "services/logging", + "targetNamespace": null + }, + { + "name": "longhorn", + "path": "infrastructure/longhorn/core", + "targetNamespace": "longhorn-system" + }, + { + "name": "longhorn-adopt", + "path": "infrastructure/longhorn/adopt", + "targetNamespace": "longhorn-system" + }, + { + "name": "longhorn-ui", + "path": "infrastructure/longhorn/ui-ingress", + "targetNamespace": "longhorn-system" + }, + { + "name": "mailu", + "path": "services/mailu", + "targetNamespace": "mailu-mailserver" + }, + { + "name": "maintenance", + "path": "services/maintenance", + "targetNamespace": null + }, + { + "name": "metallb", + "path": "infrastructure/metallb", + "targetNamespace": "metallb-system" + }, + { + "name": "monerod", + "path": "services/crypto/monerod", + "targetNamespace": "crypto" + }, + { + "name": "monitoring", + "path": "services/monitoring", + "targetNamespace": null + }, + { + "name": "nextcloud", + "path": "services/nextcloud", + "targetNamespace": "nextcloud" + }, + { + "name": "nextcloud-mail-sync", + "path": "services/nextcloud-mail-sync", + "targetNamespace": "nextcloud" + }, + { + "name": "oauth2-proxy", + "path": "services/oauth2-proxy", + "targetNamespace": "sso" + }, + { + "name": "openldap", + "path": "services/openldap", + "targetNamespace": "sso" + }, + { + "name": "outline", + "path": "services/outline", + "targetNamespace": "outline" + }, + { + "name": "pegasus", + "path": "services/pegasus", + "targetNamespace": "jellyfin" + }, + { + "name": "planka", + "path": "services/planka", + "targetNamespace": "planka" + }, + { + "name": "postgres", + "path": "infrastructure/postgres", + "targetNamespace": "postgres" + }, + { + "name": "sui-metrics", + "path": "services/sui-metrics/overlays/atlas", + "targetNamespace": "sui-metrics" + }, + { + "name": "traefik", + "path": "infrastructure/traefik", + "targetNamespace": "traefik" + }, + { + "name": "vault", + "path": "services/vault", + "targetNamespace": "vault" + }, + { + "name": "vault-csi", + "path": "infrastructure/vault-csi", + "targetNamespace": "kube-system" + }, + { + "name": "vault-injector", + "path": "infrastructure/vault-injector", + "targetNamespace": "vault" + }, + { + "name": "vaultwarden", + "path": "services/vaultwarden", + "targetNamespace": "vaultwarden" + }, + { + "name": "wallet-monero-temp", + "path": "services/crypto/wallet-monero-temp", + "targetNamespace": "crypto" + }, + { + "name": "xmr-miner", + "path": "services/crypto/xmr-miner", + "targetNamespace": "crypto" + } + ], + "workloads": [ + { + "kind": "Deployment", + "namespace": "ai", + "name": "ollama", + "labels": { + "app": "ollama" + }, + "serviceAccountName": null, + "nodeSelector": {}, + "images": [ + "ollama/ollama@sha256:2c9595c555fd70a28363489ac03bd5bf9e7c5bdf2890373c3a830ffd7252ce6d" + ] + }, + { + "kind": "Deployment", + "namespace": "bstein-dev-home", + "name": "bstein-dev-home-backend", + "labels": { + "app": "bstein-dev-home-backend" + }, + "serviceAccountName": "bstein-dev-home", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-157" + ] + }, + { + "kind": "Deployment", + "namespace": "bstein-dev-home", + "name": "bstein-dev-home-frontend", + "labels": { + "app": "bstein-dev-home-frontend" + }, + "serviceAccountName": null, + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-157" + ] + }, + { + "kind": "Deployment", + "namespace": "bstein-dev-home", + "name": "bstein-dev-home-vault-sync", + "labels": { + "app": "bstein-dev-home-vault-sync" + }, + "serviceAccountName": "bstein-dev-home-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "Deployment", + "namespace": "bstein-dev-home", + "name": "chat-ai-gateway", + "labels": { + "app": "chat-ai-gateway" + }, + "serviceAccountName": "bstein-dev-home", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "python:3.11-slim" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "atlasbot", + "labels": { + "app": "atlasbot" + }, + "serviceAccountName": "atlasbot", + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "python:3.11-slim" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "comms-vault-sync", + "labels": { + "app": "comms-vault-sync" + }, + "serviceAccountName": "comms-vault", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "coturn", + "labels": { + "app": "coturn" + }, + "serviceAccountName": "comms-vault", + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "ghcr.io/coturn/coturn:4.6.2" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "element-call", + "labels": { + "app": "element-call" + }, + "serviceAccountName": null, + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "ghcr.io/element-hq/element-call@sha256:e6897c7818331714eae19d83ef8ea94a8b41115f0d8d3f62c2fed2d02c65c9bc" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "livekit", + "labels": { + "app": "livekit" + }, + "serviceAccountName": "comms-vault", + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "livekit/livekit-server:v1.9.0" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "livekit-token-service", + "labels": { + "app": "livekit-token-service" + }, + "serviceAccountName": "comms-vault", + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "registry.bstein.dev/tools/lk-jwt-service-vault:0.3.0" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "matrix-authentication-service", + "labels": { + "app": "matrix-authentication-service" + }, + "serviceAccountName": "comms-vault", + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "ghcr.io/element-hq/matrix-authentication-service:1.8.0" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "matrix-guest-register", + "labels": { + "app.kubernetes.io/name": "matrix-guest-register" + }, + "serviceAccountName": "comms-vault", + "nodeSelector": {}, + "images": [ + "python:3.11-slim" + ] + }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "matrix-wellknown", + "labels": { + "app": "matrix-wellknown" + }, + "serviceAccountName": null, + "nodeSelector": {}, + "images": [ + "nginx:1.27-alpine" + ] + }, + { + "kind": "DaemonSet", + "namespace": "crypto", + "name": "monero-xmrig", + "labels": { + "app": "monero-xmrig" + }, + "serviceAccountName": null, + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9" + ] + }, + { + "kind": "Deployment", + "namespace": "crypto", + "name": "crypto-vault-sync", + "labels": { + "app": "crypto-vault-sync" + }, + "serviceAccountName": "crypto-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "Deployment", + "namespace": "crypto", + "name": "monero-p2pool", + "labels": { + "app": "monero-p2pool" + }, + "serviceAccountName": "crypto-vault-sync", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "debian:bookworm-slim" + ] + }, + { + "kind": "Deployment", + "namespace": "crypto", + "name": "monerod", + "labels": { + "app": "monerod" + }, + "serviceAccountName": null, + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/crypto/monerod:0.18.4.1" + ] + }, + { + "kind": "Deployment", + "namespace": "crypto", + "name": "wallet-monero-temp", + "labels": { + "app": "wallet-monero-temp" + }, + "serviceAccountName": "crypto-vault-sync", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1" + ] + }, + { + "kind": "Deployment", + "namespace": "finance", + "name": "actual-budget", + "labels": { + "app": "actual-budget" + }, + "serviceAccountName": "finance-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "actualbudget/actual-server:26.1.0-alpine@sha256:34aae5813fdfee12af2a50c4d0667df68029f1d61b90f45f282473273eb70d0d" + ] + }, + { + "kind": "Deployment", + "namespace": "finance", + "name": "firefly", + "labels": { + "app": "firefly" + }, + "serviceAccountName": "finance-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "fireflyiii/core:version-6.4.15" + ] + }, + { + "kind": "Deployment", + "namespace": "flux-system", + "name": "helm-controller", + "labels": { + "app": "helm-controller", + "app.kubernetes.io/component": "helm-controller", + "app.kubernetes.io/instance": "flux-system", + "app.kubernetes.io/part-of": "flux", + "app.kubernetes.io/version": "v2.7.5" + }, + "serviceAccountName": "helm-controller", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "ghcr.io/fluxcd/helm-controller:v1.4.5" + ] + }, + { + "kind": "Deployment", + "namespace": "flux-system", + "name": "image-automation-controller", + "labels": { + "app": "image-automation-controller", + "app.kubernetes.io/component": "image-automation-controller", + "app.kubernetes.io/instance": "flux-system", + "app.kubernetes.io/part-of": "flux", + "app.kubernetes.io/version": "v2.7.5" + }, + "serviceAccountName": "image-automation-controller", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "ghcr.io/fluxcd/image-automation-controller:v1.0.4" + ] + }, + { + "kind": "Deployment", + "namespace": "flux-system", + "name": "image-reflector-controller", + "labels": { + "app": "image-reflector-controller", + "app.kubernetes.io/component": "image-reflector-controller", + "app.kubernetes.io/instance": "flux-system", + "app.kubernetes.io/part-of": "flux", + "app.kubernetes.io/version": "v2.7.5" + }, + "serviceAccountName": "image-reflector-controller", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "ghcr.io/fluxcd/image-reflector-controller:v1.0.4" + ] + }, + { + "kind": "Deployment", + "namespace": "flux-system", + "name": "kustomize-controller", + "labels": { + "app": "kustomize-controller", + "app.kubernetes.io/component": "kustomize-controller", + "app.kubernetes.io/instance": "flux-system", + "app.kubernetes.io/part-of": "flux", + "app.kubernetes.io/version": "v2.7.5" + }, + "serviceAccountName": "kustomize-controller", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "ghcr.io/fluxcd/kustomize-controller:v1.7.3" + ] + }, + { + "kind": "Deployment", + "namespace": "flux-system", + "name": "notification-controller", + "labels": { + "app": "notification-controller", + "app.kubernetes.io/component": "notification-controller", + "app.kubernetes.io/instance": "flux-system", + "app.kubernetes.io/part-of": "flux", + "app.kubernetes.io/version": "v2.7.5" + }, + "serviceAccountName": "notification-controller", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "ghcr.io/fluxcd/notification-controller:v1.7.5" + ] + }, + { + "kind": "Deployment", + "namespace": "flux-system", + "name": "source-controller", + "labels": { + "app": "source-controller", + "app.kubernetes.io/component": "source-controller", + "app.kubernetes.io/instance": "flux-system", + "app.kubernetes.io/part-of": "flux", + "app.kubernetes.io/version": "v2.7.5" + }, + "serviceAccountName": "source-controller", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "ghcr.io/fluxcd/source-controller:v1.7.4" + ] + }, + { + "kind": "Deployment", + "namespace": "gitea", + "name": "gitea", + "labels": { + "app": "gitea" + }, + "serviceAccountName": "gitea-vault", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "gitea/gitea:1.23" + ] + }, + { + "kind": "Deployment", + "namespace": "harbor", + "name": "harbor-vault-sync", + "labels": { + "app": "harbor-vault-sync" + }, + "serviceAccountName": "harbor-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "Deployment", + "namespace": "health", + "name": "wger", + "labels": { + "app": "wger" + }, + "serviceAccountName": "health-vault-sync", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "nginx:1.27.5-alpine@sha256:65645c7bb6a0661892a8b03b89d0743208a18dd2f3f17a54ef4b76fb8e2f2a10", + "wger/server@sha256:710588b78af4e0aa0b4d8a8061e4563e16eae80eeaccfe7f9e0d9cbdd7f0cbc5" + ] + }, + { + "kind": "Deployment", + "namespace": "jellyfin", + "name": "jellyfin", + "labels": { + "app": "jellyfin" + }, + "serviceAccountName": "pegasus-vault-sync", + "nodeSelector": {}, + "images": [ + "docker.io/jellyfin/jellyfin:10.11.5" + ] + }, + { + "kind": "Deployment", + "namespace": "jellyfin", + "name": "pegasus", + "labels": { + "app": "pegasus" + }, + "serviceAccountName": "pegasus-vault-sync", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "alpine:3.20", + "registry.bstein.dev/streaming/pegasus-vault:1.2.32" + ] + }, + { + "kind": "Deployment", + "namespace": "jellyfin", + "name": "pegasus-vault-sync", + "labels": { + "app": "pegasus-vault-sync" + }, + "serviceAccountName": "pegasus-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "Deployment", + "namespace": "jenkins", + "name": "jenkins", + "labels": { + "app": "jenkins" + }, + "serviceAccountName": "jenkins", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "jenkins/jenkins:2.528.3-jdk21" + ] + }, + { + "kind": "Deployment", + "namespace": "jenkins", + "name": "jenkins-vault-sync", + "labels": { + "app": "jenkins-vault-sync" + }, + "serviceAccountName": "jenkins-vault-sync", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "DaemonSet", + "namespace": "kube-system", + "name": "ntp-sync", + "labels": { + "app": "ntp-sync" + }, + "serviceAccountName": null, + "nodeSelector": {}, + "images": [ + "public.ecr.aws/docker/library/busybox:1.36.1" + ] + }, + { + "kind": "DaemonSet", + "namespace": "kube-system", + "name": "nvidia-device-plugin-jetson", + "labels": { + "app.kubernetes.io/instance": "jetson", + "app.kubernetes.io/name": "nvidia-device-plugin" + }, + "serviceAccountName": null, + "nodeSelector": { + "jetson": "true", + "kubernetes.io/arch": "arm64" + }, + "images": [ + "nvcr.io/nvidia/k8s-device-plugin:v0.16.2" + ] + }, + { + "kind": "DaemonSet", + "namespace": "kube-system", + "name": "nvidia-device-plugin-minipc", + "labels": { + "app.kubernetes.io/instance": "titan22", + "app.kubernetes.io/name": "nvidia-device-plugin" + }, + "serviceAccountName": null, + "nodeSelector": { + "kubernetes.io/arch": "amd64", + "kubernetes.io/hostname": "titan-22" + }, + "images": [ + "nvcr.io/nvidia/k8s-device-plugin:v0.16.2" + ] + }, + { + "kind": "DaemonSet", + "namespace": "kube-system", + "name": "nvidia-device-plugin-tethys", + "labels": { + "app.kubernetes.io/instance": "titan24", + "app.kubernetes.io/name": "nvidia-device-plugin" + }, + "serviceAccountName": null, + "nodeSelector": { + "kubernetes.io/arch": "amd64", + "kubernetes.io/hostname": "titan-24" + }, + "images": [ + "nvcr.io/nvidia/k8s-device-plugin:v0.16.2" + ] + }, + { + "kind": "DaemonSet", + "namespace": "kube-system", + "name": "vault-csi-provider", + "labels": { + "app.kubernetes.io/name": "vault-csi-provider" + }, + "serviceAccountName": "vault-csi-provider", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "hashicorp/vault-csi-provider:1.7.0" + ] + }, + { + "kind": "Deployment", + "namespace": "kube-system", + "name": "coredns", + "labels": { + "k8s-app": "kube-dns" + }, + "serviceAccountName": "coredns", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "registry.bstein.dev/infra/coredns:1.12.1" + ] + }, + { + "kind": "DaemonSet", + "namespace": "logging", + "name": "node-image-gc-rpi4", + "labels": { + "app": "node-image-gc-rpi4" + }, + "serviceAccountName": "node-image-gc-rpi4", + "nodeSelector": { + "hardware": "rpi4" + }, + "images": [ + "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" + ] + }, + { + "kind": "DaemonSet", + "namespace": "logging", + "name": "node-image-prune-rpi5", + "labels": { + "app": "node-image-prune-rpi5" + }, + "serviceAccountName": "node-image-prune-rpi5", + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" + ] + }, + { + "kind": "DaemonSet", + "namespace": "logging", + "name": "node-log-rotation", + "labels": { + "app": "node-log-rotation" + }, + "serviceAccountName": "node-log-rotation", + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" + ] + }, + { + "kind": "Deployment", + "namespace": "logging", + "name": "logging-vault-sync", + "labels": { + "app": "logging-vault-sync" + }, + "serviceAccountName": "logging-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "Deployment", + "namespace": "logging", + "name": "oauth2-proxy-logs", + "labels": { + "app": "oauth2-proxy-logs" + }, + "serviceAccountName": "logging-vault-sync", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0" + ] + }, + { + "kind": "Deployment", + "namespace": "longhorn-system", + "name": "longhorn-vault-sync", + "labels": { + "app": "longhorn-vault-sync" + }, + "serviceAccountName": "longhorn-vault-sync", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "Deployment", + "namespace": "longhorn-system", + "name": "oauth2-proxy-longhorn", + "labels": { + "app": "oauth2-proxy-longhorn" + }, + "serviceAccountName": "longhorn-vault", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "quay.io/oauth2-proxy/oauth2-proxy:v7.6.0" + ] + }, + { + "kind": "DaemonSet", + "namespace": "mailu-mailserver", + "name": "vip-controller", + "labels": { + "app": "vip-controller" + }, + "serviceAccountName": "vip-controller", + "nodeSelector": { + "mailu.bstein.dev/vip": "true" + }, + "images": [ + "registry.bstein.dev/bstein/kubectl:1.35.0" + ] + }, + { + "kind": "Deployment", + "namespace": "mailu-mailserver", + "name": "mailu-vault-sync", + "labels": { + "app": "mailu-vault-sync" + }, + "serviceAccountName": "mailu-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "DaemonSet", + "namespace": "maintenance", + "name": "disable-k3s-traefik", + "labels": { + "app": "disable-k3s-traefik" + }, + "serviceAccountName": "disable-k3s-traefik", + "nodeSelector": { + "node-role.kubernetes.io/control-plane": "true" + }, + "images": [ + "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" + ] + }, + { + "kind": "DaemonSet", + "namespace": "maintenance", + "name": "k3s-agent-restart", + "labels": { + "app": "k3s-agent-restart" + }, + "serviceAccountName": "node-nofile", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" + ] + }, + { + "kind": "DaemonSet", + "namespace": "maintenance", + "name": "node-image-sweeper", + "labels": { + "app": "node-image-sweeper" + }, + "serviceAccountName": "node-image-sweeper", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "python:3.12.9-alpine3.20" + ] + }, + { + "kind": "DaemonSet", + "namespace": "maintenance", + "name": "node-nofile", + "labels": { + "app": "node-nofile" + }, + "serviceAccountName": "node-nofile", + "nodeSelector": {}, + "images": [ + "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" + ] + }, + { + "kind": "Deployment", + "namespace": "maintenance", + "name": "ariadne", + "labels": { + "app": "ariadne" + }, + "serviceAccountName": "ariadne", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/bstein/ariadne:0.1.0-49" + ] + }, + { + "kind": "Deployment", + "namespace": "maintenance", + "name": "maintenance-vault-sync", + "labels": { + "app": "maintenance-vault-sync" + }, + "serviceAccountName": "maintenance-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "DaemonSet", + "namespace": "monitoring", + "name": "dcgm-exporter", + "labels": { + "app": "dcgm-exporter" + }, + "serviceAccountName": "default", + "nodeSelector": {}, + "images": [ + "registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04" + ] + }, + { + "kind": "DaemonSet", + "namespace": "monitoring", + "name": "jetson-tegrastats-exporter", + "labels": { + "app": "jetson-tegrastats-exporter" + }, + "serviceAccountName": "default", + "nodeSelector": { + "jetson": "true" + }, + "images": [ + "python:3.10-slim" + ] + }, + { + "kind": "Deployment", + "namespace": "monitoring", + "name": "monitoring-vault-sync", + "labels": { + "app": "monitoring-vault-sync" + }, + "serviceAccountName": "monitoring-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "Deployment", + "namespace": "monitoring", + "name": "postmark-exporter", + "labels": { + "app": "postmark-exporter" + }, + "serviceAccountName": "monitoring-vault-sync", + "nodeSelector": {}, + "images": [ + "python:3.12-alpine" + ] + }, + { + "kind": "Deployment", + "namespace": "nextcloud", + "name": "collabora", + "labels": { + "app": "collabora" + }, + "serviceAccountName": null, + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "collabora/code@sha256:3c58d0e9bae75e4647467d0c7d91cb66f261d3e814709aed590b5c334a04db26" + ] + }, + { + "kind": "Deployment", + "namespace": "nextcloud", + "name": "nextcloud", + "labels": { + "app": "nextcloud" + }, + "serviceAccountName": "nextcloud-vault", + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "nextcloud:29-apache" + ] + }, + { + "kind": "Deployment", + "namespace": "outline", + "name": "outline", + "labels": { + "app": "outline" + }, + "serviceAccountName": "outline-vault", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "outlinewiki/outline:1.2.0" + ] + }, + { + "kind": "Deployment", + "namespace": "outline", + "name": "outline-redis", + "labels": { + "app": "outline-redis" + }, + "serviceAccountName": null, + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "redis:7.4.1-alpine" + ] + }, + { + "kind": "Deployment", + "namespace": "planka", + "name": "planka", + "labels": { + "app": "planka" + }, + "serviceAccountName": "planka-vault", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "ghcr.io/plankanban/planka:2.0.0-rc.4" + ] + }, + { + "kind": "StatefulSet", + "namespace": "postgres", + "name": "postgres", + "labels": { + "app": "postgres" + }, + "serviceAccountName": "postgres-vault", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "postgres:15", + "quay.io/prometheuscommunity/postgres-exporter:v0.15.0" + ] + }, + { + "kind": "Deployment", + "namespace": "sso", + "name": "keycloak", + "labels": { + "app": "keycloak" + }, + "serviceAccountName": "sso-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "quay.io/keycloak/keycloak:26.0.7" + ] + }, + { + "kind": "Deployment", + "namespace": "sso", + "name": "oauth2-proxy", + "labels": { + "app": "oauth2-proxy" + }, + "serviceAccountName": "sso-vault", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0" + ] + }, + { + "kind": "Deployment", + "namespace": "sso", + "name": "sso-vault-sync", + "labels": { + "app": "sso-vault-sync" + }, + "serviceAccountName": "sso-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "StatefulSet", + "namespace": "sso", + "name": "openldap", + "labels": { + "app": "openldap" + }, + "serviceAccountName": "sso-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "docker.io/osixia/openldap:1.5.0" + ] + }, + { + "kind": "Deployment", + "namespace": "sui-metrics", + "name": "sui-metrics", + "labels": { + "app": "sui-metrics" + }, + "serviceAccountName": "sui-metrics", + "nodeSelector": { + "hardware": "rpi5" + }, + "images": [ + "victoriametrics/vmagent:v1.103.0" + ] + }, + { + "kind": "Deployment", + "namespace": "traefik", + "name": "traefik", + "labels": { + "app": "traefik", + "app.kubernetes.io/instance": "traefik-kube-system", + "app.kubernetes.io/name": "traefik" + }, + "serviceAccountName": "traefik-ingress-controller", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "traefik:v3.3.3" + ] + }, + { + "kind": "StatefulSet", + "namespace": "vault", + "name": "vault", + "labels": { + "app": "vault" + }, + "serviceAccountName": "vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "hashicorp/vault:1.17.6" + ] + }, + { + "kind": "Deployment", + "namespace": "vaultwarden", + "name": "vaultwarden", + "labels": { + "app": "vaultwarden" + }, + "serviceAccountName": "vaultwarden-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "vaultwarden/server:1.35.2" + ] + } + ], + "services": [ + { + "namespace": "ai", + "name": "ollama", + "type": "ClusterIP", + "selector": { + "app": "ollama" + }, + "ports": [ + { + "name": "http", + "port": 11434, + "targetPort": 11434, + "protocol": "TCP" + } + ] + }, + { + "namespace": "bstein-dev-home", + "name": "bstein-dev-home-backend", + "type": "ClusterIP", + "selector": { + "app": "bstein-dev-home-backend" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 8080, + "protocol": "TCP" + } + ] + }, + { + "namespace": "bstein-dev-home", + "name": "bstein-dev-home-frontend", + "type": "ClusterIP", + "selector": { + "app": "bstein-dev-home-frontend" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 80, + "protocol": "TCP" + } + ] + }, + { + "namespace": "bstein-dev-home", + "name": "chat-ai-gateway", + "type": "ClusterIP", + "selector": { + "app": "chat-ai-gateway" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 8080, + "protocol": "TCP" + } + ] + }, + { + "namespace": "comms", + "name": "coturn", + "type": "LoadBalancer", + "selector": { + "app": "coturn" + }, + "ports": [ + { + "name": "turn-udp", + "port": 3478, + "targetPort": 3478, + "protocol": "UDP" + }, + { + "name": "turn-tcp", + "port": 3478, + "targetPort": 3478, + "protocol": "TCP" + }, + { + "name": "turn-tls", + "port": 5349, + "targetPort": 5349, + "protocol": "TCP" + }, + { + "name": "relay-50000", + "port": 50000, + "targetPort": 50000, + "protocol": "UDP" + }, + { + "name": "relay-50001", + "port": 50001, + "targetPort": 50001, + "protocol": "UDP" + }, + { + "name": "relay-50002", + "port": 50002, + "targetPort": 50002, + "protocol": "UDP" + }, + { + "name": "relay-50003", + "port": 50003, + "targetPort": 50003, + "protocol": "UDP" + }, + { + "name": "relay-50004", + "port": 50004, + "targetPort": 50004, + "protocol": "UDP" + }, + { + "name": "relay-50005", + "port": 50005, + "targetPort": 50005, + "protocol": "UDP" + }, + { + "name": "relay-50006", + "port": 50006, + "targetPort": 50006, + "protocol": "UDP" + }, + { + "name": "relay-50007", + "port": 50007, + "targetPort": 50007, + "protocol": "UDP" + }, + { + "name": "relay-50008", + "port": 50008, + "targetPort": 50008, + "protocol": "UDP" + }, + { + "name": "relay-50009", + "port": 50009, + "targetPort": 50009, + "protocol": "UDP" + }, + { + "name": "relay-50010", + "port": 50010, + "targetPort": 50010, + "protocol": "UDP" + }, + { + "name": "relay-50011", + "port": 50011, + "targetPort": 50011, + "protocol": "UDP" + }, + { + "name": "relay-50012", + "port": 50012, + "targetPort": 50012, + "protocol": "UDP" + }, + { + "name": "relay-50013", + "port": 50013, + "targetPort": 50013, + "protocol": "UDP" + }, + { + "name": "relay-50014", + "port": 50014, + "targetPort": 50014, + "protocol": "UDP" + }, + { + "name": "relay-50015", + "port": 50015, + "targetPort": 50015, + "protocol": "UDP" + }, + { + "name": "relay-50016", + "port": 50016, + "targetPort": 50016, + "protocol": "UDP" + }, + { + "name": "relay-50017", + "port": 50017, + "targetPort": 50017, + "protocol": "UDP" + }, + { + "name": "relay-50018", + "port": 50018, + "targetPort": 50018, + "protocol": "UDP" + }, + { + "name": "relay-50019", + "port": 50019, + "targetPort": 50019, + "protocol": "UDP" + }, + { + "name": "relay-50020", + "port": 50020, + "targetPort": 50020, + "protocol": "UDP" + }, + { + "name": "relay-50021", + "port": 50021, + "targetPort": 50021, + "protocol": "UDP" + }, + { + "name": "relay-50022", + "port": 50022, + "targetPort": 50022, + "protocol": "UDP" + }, + { + "name": "relay-50023", + "port": 50023, + "targetPort": 50023, + "protocol": "UDP" + }, + { + "name": "relay-50024", + "port": 50024, + "targetPort": 50024, + "protocol": "UDP" + }, + { + "name": "relay-50025", + "port": 50025, + "targetPort": 50025, + "protocol": "UDP" + }, + { + "name": "relay-50026", + "port": 50026, + "targetPort": 50026, + "protocol": "UDP" + }, + { + "name": "relay-50027", + "port": 50027, + "targetPort": 50027, + "protocol": "UDP" + }, + { + "name": "relay-50028", + "port": 50028, + "targetPort": 50028, + "protocol": "UDP" + }, + { + "name": "relay-50029", + "port": 50029, + "targetPort": 50029, + "protocol": "UDP" + }, + { + "name": "relay-50030", + "port": 50030, + "targetPort": 50030, + "protocol": "UDP" + }, + { + "name": "relay-50031", + "port": 50031, + "targetPort": 50031, + "protocol": "UDP" + }, + { + "name": "relay-50032", + "port": 50032, + "targetPort": 50032, + "protocol": "UDP" + }, + { + "name": "relay-50033", + "port": 50033, + "targetPort": 50033, + "protocol": "UDP" + }, + { + "name": "relay-50034", + "port": 50034, + "targetPort": 50034, + "protocol": "UDP" + }, + { + "name": "relay-50035", + "port": 50035, + "targetPort": 50035, + "protocol": "UDP" + }, + { + "name": "relay-50036", + "port": 50036, + "targetPort": 50036, + "protocol": "UDP" + }, + { + "name": "relay-50037", + "port": 50037, + "targetPort": 50037, + "protocol": "UDP" + }, + { + "name": "relay-50038", + "port": 50038, + "targetPort": 50038, + "protocol": "UDP" + }, + { + "name": "relay-50039", + "port": 50039, + "targetPort": 50039, + "protocol": "UDP" + }, + { + "name": "relay-50040", + "port": 50040, + "targetPort": 50040, + "protocol": "UDP" + }, + { + "name": "relay-50041", + "port": 50041, + "targetPort": 50041, + "protocol": "UDP" + }, + { + "name": "relay-50042", + "port": 50042, + "targetPort": 50042, + "protocol": "UDP" + }, + { + "name": "relay-50043", + "port": 50043, + "targetPort": 50043, + "protocol": "UDP" + }, + { + "name": "relay-50044", + "port": 50044, + "targetPort": 50044, + "protocol": "UDP" + }, + { + "name": "relay-50045", + "port": 50045, + "targetPort": 50045, + "protocol": "UDP" + }, + { + "name": "relay-50046", + "port": 50046, + "targetPort": 50046, + "protocol": "UDP" + }, + { + "name": "relay-50047", + "port": 50047, + "targetPort": 50047, + "protocol": "UDP" + }, + { + "name": "relay-50048", + "port": 50048, + "targetPort": 50048, + "protocol": "UDP" + }, + { + "name": "relay-50049", + "port": 50049, + "targetPort": 50049, + "protocol": "UDP" + }, + { + "name": "relay-50050", + "port": 50050, + "targetPort": 50050, + "protocol": "UDP" + } + ] + }, + { + "namespace": "comms", + "name": "element-call", + "type": "ClusterIP", + "selector": { + "app": "element-call" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 8080, + "protocol": "TCP" + } + ] + }, + { + "namespace": "comms", + "name": "livekit", + "type": "LoadBalancer", + "selector": { + "app": "livekit" + }, + "ports": [ + { + "name": "http", + "port": 7880, + "targetPort": 7880, + "protocol": "TCP" + }, + { + "name": "rtc-tcp", + "port": 7881, + "targetPort": 7881, + "protocol": "TCP" + }, + { + "name": "rtc-udp-7882", + "port": 7882, + "targetPort": 7882, + "protocol": "UDP" + }, + { + "name": "rtc-udp-7883", + "port": 7883, + "targetPort": 7883, + "protocol": "UDP" + } + ] + }, + { + "namespace": "comms", + "name": "livekit-token-service", + "type": "ClusterIP", + "selector": { + "app": "livekit-token-service" + }, + "ports": [ + { + "name": "http", + "port": 8080, + "targetPort": 8080, + "protocol": "TCP" + } + ] + }, + { + "namespace": "comms", + "name": "matrix-authentication-service", + "type": "ClusterIP", + "selector": { + "app": "matrix-authentication-service" + }, + "ports": [ + { + "name": "http", + "port": 8080, + "targetPort": "http", + "protocol": "TCP" + }, + { + "name": "internal", + "port": 8081, + "targetPort": "internal", + "protocol": "TCP" + } + ] + }, + { + "namespace": "comms", + "name": "matrix-guest-register", + "type": "ClusterIP", + "selector": { + "app.kubernetes.io/name": "matrix-guest-register" + }, + "ports": [ + { + "name": "http", + "port": 8080, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "comms", + "name": "matrix-wellknown", + "type": "ClusterIP", + "selector": { + "app": "matrix-wellknown" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 80, + "protocol": "TCP" + } + ] + }, + { + "namespace": "crypto", + "name": "monerod", + "type": "ClusterIP", + "selector": { + "app": "monerod" + }, + "ports": [ + { + "name": "rpc", + "port": 18081, + "targetPort": 18081, + "protocol": "TCP" + }, + { + "name": "p2p", + "port": 18080, + "targetPort": 18080, + "protocol": "TCP" + }, + { + "name": "zmq", + "port": 18083, + "targetPort": 18083, + "protocol": "TCP" + } + ] + }, + { + "namespace": "crypto", + "name": "p2pool", + "type": "ClusterIP", + "selector": { + "app": "p2pool" + }, + "ports": [ + { + "name": "stratum", + "port": 3333, + "targetPort": 3333, + "protocol": "TCP" + } + ] + }, + { + "namespace": "crypto", + "name": "wallet-monero-temp", + "type": "ClusterIP", + "selector": { + "app": "wallet-monero-temp" + }, + "ports": [ + { + "name": "rpc", + "port": 18083, + "targetPort": 18083, + "protocol": "TCP" + } + ] + }, + { + "namespace": "finance", + "name": "actual-budget", + "type": "ClusterIP", + "selector": { + "app": "actual-budget" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 5006, + "protocol": "TCP" + } + ] + }, + { + "namespace": "finance", + "name": "firefly", + "type": "ClusterIP", + "selector": { + "app": "firefly" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 8080, + "protocol": "TCP" + } + ] + }, + { + "namespace": "flux-system", + "name": "notification-controller", + "type": "ClusterIP", + "selector": { + "app": "notification-controller" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "flux-system", + "name": "source-controller", + "type": "ClusterIP", + "selector": { + "app": "source-controller" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "flux-system", + "name": "webhook-receiver", + "type": "ClusterIP", + "selector": { + "app": "notification-controller" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http-webhook", + "protocol": "TCP" + } + ] + }, + { + "namespace": "gitea", + "name": "gitea", + "type": "ClusterIP", + "selector": { + "app": "gitea" + }, + "ports": [ + { + "name": "http", + "port": 3000, + "targetPort": 3000, + "protocol": "TCP" + } + ] + }, + { + "namespace": "gitea", + "name": "gitea-ssh", + "type": "LoadBalancer", + "selector": { + "app": "gitea" + }, + "ports": [ + { + "name": "ssh", + "port": 2242, + "targetPort": 2242, + "protocol": "TCP" + } + ] + }, + { + "namespace": "health", + "name": "wger", + "type": "ClusterIP", + "selector": { + "app": "wger" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "jellyfin", + "name": "jellyfin", + "type": "ClusterIP", + "selector": { + "app": "jellyfin" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 8096, + "protocol": "TCP" + } + ] + }, + { + "namespace": "jellyfin", + "name": "pegasus", + "type": "ClusterIP", + "selector": { + "app": "pegasus" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "jenkins", + "name": "jenkins", + "type": "ClusterIP", + "selector": { + "app": "jenkins" + }, + "ports": [ + { + "name": "http", + "port": 8080, + "targetPort": 8080, + "protocol": "TCP" + }, + { + "name": "agent-listener", + "port": 50000, + "targetPort": 50000, + "protocol": "TCP" + } + ] + }, + { + "namespace": "logging", + "name": "oauth2-proxy-logs", + "type": "ClusterIP", + "selector": { + "app": "oauth2-proxy-logs" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 4180, + "protocol": "TCP" + } + ] + }, + { + "namespace": "longhorn-system", + "name": "oauth2-proxy-longhorn", + "type": "ClusterIP", + "selector": { + "app": "oauth2-proxy-longhorn" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 4180, + "protocol": "TCP" + } + ] + }, + { + "namespace": "mailu-mailserver", + "name": "mailu-front-lb", + "type": "LoadBalancer", + "selector": { + "app.kubernetes.io/component": "front", + "app.kubernetes.io/instance": "mailu", + "app.kubernetes.io/name": "mailu" + }, + "ports": [ + { + "name": "smtp", + "port": 25, + "targetPort": 25, + "protocol": "TCP" + }, + { + "name": "smtps", + "port": 465, + "targetPort": 465, + "protocol": "TCP" + }, + { + "name": "submission", + "port": 587, + "targetPort": 587, + "protocol": "TCP" + }, + { + "name": "imaps", + "port": 993, + "targetPort": 993, + "protocol": "TCP" + }, + { + "name": "pop3s", + "port": 995, + "targetPort": 995, + "protocol": "TCP" + }, + { + "name": "sieve", + "port": 4190, + "targetPort": 4190, + "protocol": "TCP" + } + ] + }, + { + "namespace": "maintenance", + "name": "ariadne", + "type": "ClusterIP", + "selector": { + "app": "ariadne" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "monitoring", + "name": "dcgm-exporter", + "type": "ClusterIP", + "selector": { + "app": "dcgm-exporter" + }, + "ports": [ + { + "name": "metrics", + "port": 9400, + "targetPort": "metrics", + "protocol": "TCP" + } + ] + }, + { + "namespace": "monitoring", + "name": "jetson-tegrastats-exporter", + "type": "ClusterIP", + "selector": { + "app": "jetson-tegrastats-exporter" + }, + "ports": [ + { + "name": "metrics", + "port": 9100, + "targetPort": "metrics", + "protocol": "TCP" + } + ] + }, + { + "namespace": "monitoring", + "name": "postmark-exporter", + "type": "ClusterIP", + "selector": { + "app": "postmark-exporter" + }, + "ports": [ + { + "name": "http", + "port": 8000, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "nextcloud", + "name": "collabora", + "type": "ClusterIP", + "selector": { + "app": "collabora" + }, + "ports": [ + { + "name": "http", + "port": 9980, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "nextcloud", + "name": "nextcloud", + "type": "ClusterIP", + "selector": { + "app": "nextcloud" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "outline", + "name": "outline", + "type": "ClusterIP", + "selector": { + "app": "outline" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "outline", + "name": "outline-redis", + "type": "ClusterIP", + "selector": { + "app": "outline-redis" + }, + "ports": [ + { + "name": "redis", + "port": 6379, + "targetPort": "redis", + "protocol": "TCP" + } + ] + }, + { + "namespace": "planka", + "name": "planka", + "type": "ClusterIP", + "selector": { + "app": "planka" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "postgres", + "name": "postgres-service", + "type": "ClusterIP", + "selector": { + "app": "postgres" + }, + "ports": [ + { + "name": "postgres", + "port": 5432, + "targetPort": 5432, + "protocol": "TCP" + }, + { + "name": "metrics", + "port": 9187, + "targetPort": 9187, + "protocol": "TCP" + } + ] + }, + { + "namespace": "sso", + "name": "keycloak", + "type": "ClusterIP", + "selector": { + "app": "keycloak" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, + { + "namespace": "sso", + "name": "oauth2-proxy", + "type": "ClusterIP", + "selector": { + "app": "oauth2-proxy" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 4180, + "protocol": "TCP" + } + ] + }, + { + "namespace": "sso", + "name": "openldap", + "type": "ClusterIP", + "selector": { + "app": "openldap" + }, + "ports": [ + { + "name": "ldap", + "port": 389, + "targetPort": "ldap", + "protocol": "TCP" + }, + { + "name": "ldaps", + "port": 636, + "targetPort": "ldaps", + "protocol": "TCP" + } + ] + }, + { + "namespace": "sui-metrics", + "name": "sui-metrics", + "type": "ClusterIP", + "selector": { + "app": "sui-metrics" + }, + "ports": [ + { + "name": "http", + "port": 8429, + "targetPort": 8429, + "protocol": "TCP" + } + ] + }, + { + "namespace": "traefik", + "name": "traefik", + "type": "LoadBalancer", + "selector": { + "app": "traefik" + }, + "ports": [ + { + "name": "web", + "port": 80, + "targetPort": "web", + "protocol": "TCP" + }, + { + "name": "websecure", + "port": 443, + "targetPort": "websecure", + "protocol": "TCP" + } + ] + }, + { + "namespace": "traefik", + "name": "traefik-metrics", + "type": "ClusterIP", + "selector": { + "app": "traefik" + }, + "ports": [ + { + "name": "metrics", + "port": 9100, + "targetPort": "metrics", + "protocol": "TCP" + } + ] + }, + { + "namespace": "vault", + "name": "vault", + "type": "ClusterIP", + "selector": { + "app": "vault" + }, + "ports": [ + { + "name": "api", + "port": 8200, + "targetPort": 8200, + "protocol": "TCP" + }, + { + "name": "cluster", + "port": 8201, + "targetPort": 8201, + "protocol": "TCP" + } + ] + }, + { + "namespace": "vault", + "name": "vault-internal", + "type": "ClusterIP", + "selector": { + "app": "vault" + }, + "ports": [ + { + "name": "api", + "port": 8200, + "targetPort": 8200, + "protocol": "TCP" + }, + { + "name": "cluster", + "port": 8201, + "targetPort": 8201, + "protocol": "TCP" + } + ] + }, + { + "namespace": "vaultwarden", + "name": "vaultwarden-service", + "type": "ClusterIP", + "selector": { + "app": "vaultwarden" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + } + ], + "http_endpoints": [ + { + "host": "auth.bstein.dev", + "path": "/", + "backend": { + "namespace": "sso", + "service": "oauth2-proxy", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "oauth2-proxy" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "oauth2-proxy", + "source": "oauth2-proxy" + } + }, + { + "host": "bstein.dev", + "path": "/", + "backend": { + "namespace": "bstein-dev-home", + "service": "bstein-dev-home-frontend", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "bstein-dev-home-frontend" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "bstein-dev-home", + "source": "bstein-dev-home" + } + }, + { + "host": "bstein.dev", + "path": "/.well-known/matrix/client", + "backend": { + "namespace": "comms", + "service": "matrix-wellknown", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-wellknown" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-wellknown-bstein-dev", + "source": "comms" + } + }, + { + "host": "bstein.dev", + "path": "/.well-known/matrix/server", + "backend": { + "namespace": "comms", + "service": "matrix-wellknown", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-wellknown" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-wellknown-bstein-dev", + "source": "comms" + } + }, + { + "host": "bstein.dev", + "path": "/api", + "backend": { + "namespace": "bstein-dev-home", + "service": "bstein-dev-home-backend", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "bstein-dev-home-backend" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "bstein-dev-home", + "source": "bstein-dev-home" + } + }, + { + "host": "budget.bstein.dev", + "path": "/", + "backend": { + "namespace": "finance", + "service": "actual-budget", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "actual-budget" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "actual-budget", + "source": "finance" + } + }, + { + "host": "call.live.bstein.dev", + "path": "/", + "backend": { + "namespace": "comms", + "service": "element-call", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "element-call" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "element-call", + "source": "comms" + } + }, + { + "host": "chat.ai.bstein.dev", + "path": "/", + "backend": { + "namespace": "bstein-dev-home", + "service": "chat-ai-gateway", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "chat-ai-gateway" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "bstein-dev-home", + "source": "bstein-dev-home" + } + }, + { + "host": "ci.bstein.dev", + "path": "/", + "backend": { + "namespace": "jenkins", + "service": "jenkins", + "port": "http", + "workloads": [ + { + "kind": "Deployment", + "name": "jenkins" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "jenkins", + "source": "jenkins" + } + }, + { + "host": "cloud.bstein.dev", + "path": "/", + "backend": { + "namespace": "nextcloud", + "service": "nextcloud", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "nextcloud" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "nextcloud", + "source": "nextcloud" + } + }, + { + "host": "health.bstein.dev", + "path": "/", + "backend": { + "namespace": "health", + "service": "wger", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "wger" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "wger", + "source": "health" + } + }, + { + "host": "kit.live.bstein.dev", + "path": "/livekit/jwt", + "backend": { + "namespace": "comms", + "service": "livekit-token-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "livekit-token-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "livekit-jwt-ingress", + "source": "comms" + } + }, + { + "host": "kit.live.bstein.dev", + "path": "/livekit/sfu", + "backend": { + "namespace": "comms", + "service": "livekit", + "port": 7880, + "workloads": [ + { + "kind": "Deployment", + "name": "livekit" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "livekit-ingress", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/.well-known/matrix/client", + "backend": { + "namespace": "comms", + "service": "matrix-wellknown", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-wellknown" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-wellknown", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/.well-known/matrix/server", + "backend": { + "namespace": "comms", + "service": "matrix-wellknown", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-wellknown" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-wellknown", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix", + "backend": { + "namespace": "comms", + "service": "othrys-synapse-matrix-synapse", + "port": 8008, + "workloads": [] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/r0/register", + "backend": { + "namespace": "comms", + "service": "matrix-guest-register", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-guest-register" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/login", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/logout", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/refresh", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/register", + "backend": { + "namespace": "comms", + "service": "matrix-guest-register", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-guest-register" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "logs.bstein.dev", + "path": "/", + "backend": { + "namespace": "logging", + "service": "oauth2-proxy-logs", + "port": "http", + "workloads": [ + { + "kind": "Deployment", + "name": "oauth2-proxy-logs" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "logs", + "source": "logging" + } + }, + { + "host": "longhorn.bstein.dev", + "path": "/", + "backend": { + "namespace": "longhorn-system", + "service": "oauth2-proxy-longhorn", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "oauth2-proxy-longhorn" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "longhorn-ingress", + "source": "longhorn-ui" + } + }, + { + "host": "mail.bstein.dev", + "path": "/", + "backend": { + "namespace": "mailu-mailserver", + "service": "mailu-front", + "port": 443, + "workloads": [] + }, + "via": { + "kind": "IngressRoute", + "name": "mailu", + "source": "mailu" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/.well-known/matrix/client", + "backend": { + "namespace": "comms", + "service": "matrix-wellknown", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-wellknown" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-wellknown-matrix-live", + "source": "comms" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/.well-known/matrix/server", + "backend": { + "namespace": "comms", + "service": "matrix-wellknown", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-wellknown" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-wellknown-matrix-live", + "source": "comms" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/_matrix", + "backend": { + "namespace": "comms", + "service": "othrys-synapse-matrix-synapse", + "port": 8008, + "workloads": [] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/_matrix/client/r0/register", + "backend": { + "namespace": "comms", + "service": "matrix-guest-register", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-guest-register" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/_matrix/client/v3/login", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/_matrix/client/v3/logout", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/_matrix/client/v3/refresh", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/_matrix/client/v3/register", + "backend": { + "namespace": "comms", + "service": "matrix-guest-register", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-guest-register" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "matrix.live.bstein.dev", + "path": "/_synapse", + "backend": { + "namespace": "comms", + "service": "othrys-synapse-matrix-synapse", + "port": 8008, + "workloads": [] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "monero.bstein.dev", + "path": "/", + "backend": { + "namespace": "crypto", + "service": "monerod", + "port": 18081, + "workloads": [ + { + "kind": "Deployment", + "name": "monerod" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "monerod", + "source": "monerod" + } + }, + { + "host": "money.bstein.dev", + "path": "/", + "backend": { + "namespace": "finance", + "service": "firefly", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "firefly" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "firefly", + "source": "finance" + } + }, + { + "host": "notes.bstein.dev", + "path": "/", + "backend": { + "namespace": "outline", + "service": "outline", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "outline" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "outline", + "source": "outline" + } + }, + { + "host": "office.bstein.dev", + "path": "/", + "backend": { + "namespace": "nextcloud", + "service": "collabora", + "port": 9980, + "workloads": [ + { + "kind": "Deployment", + "name": "collabora" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "collabora", + "source": "nextcloud" + } + }, + { + "host": "pegasus.bstein.dev", + "path": "/", + "backend": { + "namespace": "jellyfin", + "service": "pegasus", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "pegasus" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "pegasus", + "source": "pegasus" + } + }, + { + "host": "scm.bstein.dev", + "path": "/", + "backend": { + "namespace": "gitea", + "service": "gitea", + "port": 3000, + "workloads": [ + { + "kind": "Deployment", + "name": "gitea" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "gitea-ingress", + "source": "gitea" + } + }, + { + "host": "secret.bstein.dev", + "path": "/", + "backend": { + "namespace": "vault", + "service": "vault", + "port": 8200, + "workloads": [ + { + "kind": "StatefulSet", + "name": "vault" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "vault", + "source": "vault" + } + }, + { + "host": "sso.bstein.dev", + "path": "/", + "backend": { + "namespace": "sso", + "service": "keycloak", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "keycloak" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "keycloak", + "source": "keycloak" + } + }, + { + "host": "stream.bstein.dev", + "path": "/", + "backend": { + "namespace": "jellyfin", + "service": "jellyfin", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "jellyfin" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "jellyfin", + "source": "jellyfin" + } + }, + { + "host": "tasks.bstein.dev", + "path": "/", + "backend": { + "namespace": "planka", + "service": "planka", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "planka" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "planka", + "source": "planka" + } + }, + { + "host": "vault.bstein.dev", + "path": "/", + "backend": { + "namespace": "vaultwarden", + "service": "vaultwarden-service", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "vaultwarden" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "vaultwarden-ingress", + "source": "vaultwarden" + } + } + ], + "helmrelease_host_hints": { + "comms:comms/othrys-element": [ + "call.live.bstein.dev", + "live.bstein.dev", + "matrix.live.bstein.dev" + ], + "comms:comms/othrys-synapse": [ + "kit.live.bstein.dev", + "live.bstein.dev", + "matrix.live.bstein.dev", + "turn.live.bstein.dev" + ], + "gitops-ui:flux-system/weave-gitops": [ + "cd.bstein.dev" + ], + "harbor:harbor/harbor": [ + "registry.bstein.dev" + ], + "logging:logging/data-prepper": [ + "registry.bstein.dev" + ], + "longhorn:longhorn-system/longhorn": [ + "registry.bstein.dev" + ], + "mailu:mailu-mailserver/mailu": [ + "bstein.dev", + "mail.bstein.dev" + ], + "monitoring:monitoring/alertmanager": [ + "alerts.bstein.dev" + ], + "monitoring:monitoring/grafana": [ + "bstein.dev", + "mail.bstein.dev", + "metrics.bstein.dev", + "sso.bstein.dev" + ], + "monitoring:monitoring/kube-state-metrics": [ + "atlas.bstein.dev" + ] + } +} diff --git a/services/atlasbot/knowledge/catalog/metrics.json b/services/atlasbot/knowledge/catalog/metrics.json new file mode 100644 index 0000000..e929db5 --- /dev/null +++ b/services/atlasbot/knowledge/catalog/metrics.json @@ -0,0 +1,1880 @@ +[ + { + "dashboard": "Atlas GPU", + "panel_title": "Namespace GPU Share", + "panel_id": 1, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "GPU Util by Namespace", + "panel_id": 2, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "GPU Util by Node", + "panel_id": 3, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "Top Pods by GPU Util", + "panel_id": 4, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (range)", + "panel_id": 1, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Attempts / Failures", + "panel_id": 2, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[$__interval]))", + "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "One-off Job Pods (age hours)", + "panel_id": 3, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Stale (>36h)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Missing Success", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Suspended", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (1h)", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (24h)", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Runs (1h)", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[1h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Schedule Last Error (hours ago)", + "panel_id": 10, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Schedule Last Success (hours ago)", + "panel_id": 11, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Last Success (hours ago)", + "panel_id": 12, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Last Schedule (hours ago)", + "panel_id": 13, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (1h)", + "panel_id": 14, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (30d)", + "panel_id": 15, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Access Requests", + "panel_id": 16, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(ariadne_access_requests_total)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne CI Coverage (%)", + "panel_id": 17, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "ariadne_ci_coverage_percent{repo=\"ariadne\"}" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne CI Tests (latest)", + "panel_id": 18, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "ariadne_ci_tests_total{repo=\"ariadne\"}" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (1d)", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (7d)", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"7d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Mail Bounces (1d)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "max(postmark_outbound_bounced{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Success Rate (1d)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Limit Used (30d)", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit_used_percent)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Send Limit (30d)", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Last Success", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_last_success_timestamp_seconds)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Exporter Errors", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(postmark_request_errors_total)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Bounce Rate (1d vs 7d)", + "panel_id": 13, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_bounce_rate)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Bounced (1d vs 7d)", + "panel_id": 14, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_bounced)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (1d vs 7d)", + "panel_id": 15, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_sent)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Exporter Errors", + "panel_id": 16, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(postmark_request_errors_total)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Ingress Success Rate (5m)", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Error Budget Burn (1h)", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Error Budget Burn (6h)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Edge P99 Latency (ms)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Ingress Traffic", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Egress Traffic", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Intra-Cluster Traffic", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Per-Node Throughput", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0) + sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Top Namespaces", + "panel_id": 9, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]) + rate(container_network_receive_bytes_total{namespace!=\"\"}[5m])) by (namespace))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Top Pods", + "panel_id": 10, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(rate(container_network_transmit_bytes_total{pod!=\"\"}[5m]) + rate(container_network_receive_bytes_total{pod!=\"\"}[5m])) by (namespace,pod))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Traefik Routers (req/s)", + "panel_id": 11, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum by (router) (rate(traefik_router_requests_total[5m])))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Traefik Entrypoints (req/s)", + "panel_id": 12, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Worker Nodes Ready", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane Ready", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane Workloads", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "API Server 5xx rate", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "API Server P99 latency", + "panel_id": 10, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "etcd P99 latency", + "panel_id": 11, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Node CPU", + "panel_id": 4, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Node RAM", + "panel_id": 5, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane (incl. titan-db) CPU", + "panel_id": 6, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane (incl. titan-db) RAM", + "panel_id": 7, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Root Filesystem Usage", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control Plane Ready", + "panel_id": 2, + "panel_type": "gauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control Plane Workloads", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"}) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Stuck Terminating", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Atlas Availability", + "panel_id": 27, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Problem Pods", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Workers Ready", + "panel_id": 1, + "panel_type": "gauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: CPU", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: RAM", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: NET (rx+tx)", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: I/O (r+w)", + "panel_id": 10, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Sent (1d)", + "panel_id": 30, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Bounces (1d)", + "panel_id": 31, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "max(postmark_outbound_bounced{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Success Rate (1d)", + "panel_id": 32, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Limit Used (30d)", + "panel_id": 33, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit_used_percent)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Postgres Connections Used", + "panel_id": 34, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Postgres Hottest Connections", + "panel_id": 35, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(1, sum by (datname) (pg_stat_activity_count))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Astreae Usage", + "panel_id": 23, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Asteria Usage", + "panel_id": 24, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Astreae Free", + "panel_id": 25, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Asteria Free", + "panel_id": 26, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "One-off Job Pods (age hours)", + "panel_id": 40, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Ariadne Attempts / Failures", + "panel_id": 41, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[$__interval]))", + "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Ariadne Test Success Rate", + "panel_id": 42, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Tests with Failures (24h)", + "panel_id": 43, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace CPU Share", + "panel_id": 11, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace GPU Share", + "panel_id": 12, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace RAM Share", + "panel_id": 13, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Worker Node CPU", + "panel_id": 14, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Worker Node RAM", + "panel_id": 15, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control plane CPU", + "panel_id": 16, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control plane RAM", + "panel_id": 17, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Node Pod Share", + "panel_id": 28, + "panel_type": "piechart", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Top Nodes by Pod Count", + "panel_id": 29, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Cluster Ingress Throughput", + "panel_id": 18, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Cluster Egress Throughput", + "panel_id": 19, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Intra-Cluster Throughput", + "panel_id": 20, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Root Filesystem Usage", + "panel_id": 21, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Nodes Closest to Full Root Disks", + "panel_id": 22, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Problem Pods", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Stuck Terminating (>10m)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Control Plane Workloads", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Pods Not Running", + "panel_id": 5, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 6, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) max by (namespace,pod,container,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Terminating >10m", + "panel_id": 7, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(((time() - kube_pod_deletion_timestamp{pod!=\"\"}) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) * on(namespace,pod) group_left(node) kube_pod_info)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Node Pod Share", + "panel_id": 8, + "panel_type": "piechart", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Top Nodes by Pod Count", + "panel_id": 9, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Namespace Plurality by Node v27", + "panel_id": 10, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Usage", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Usage", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Free", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Free", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Per-Node Usage", + "panel_id": 5, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Per-Node Usage", + "panel_id": 6, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Usage History", + "panel_id": 7, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Usage History", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Maintenance Sweepers Ready", + "panel_id": 30, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Maintenance Cron Freshness (s)", + "panel_id": 31, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=\"image-sweeper\"})" + ] + } +] diff --git a/services/atlasbot/knowledge/catalog/runbooks.json b/services/atlasbot/knowledge/catalog/runbooks.json new file mode 100644 index 0000000..960510d --- /dev/null +++ b/services/atlasbot/knowledge/catalog/runbooks.json @@ -0,0 +1,97 @@ +[ + { + "path": "runbooks/ci-gitea-jenkins.md", + "title": "CI: Gitea \u2192 Jenkins pipeline", + "tags": [ + "atlas", + "ci", + "gitea", + "jenkins" + ], + "entrypoints": [ + "scm.bstein.dev", + "ci.bstein.dev" + ], + "source_paths": [ + "services/gitea", + "services/jenkins", + "scripts/jenkins_cred_sync.sh", + "scripts/gitea_cred_sync.sh" + ], + "body": "# CI: Gitea \u2192 Jenkins pipeline\n\n## What this is\nAtlas uses Gitea for source control and Jenkins for CI. Authentication is via Keycloak (SSO).\n\n## Where it is configured\n- Gitea manifests: `services/gitea/`\n- Jenkins manifests: `services/jenkins/`\n- Credential sync helpers: `scripts/gitea_cred_sync.sh`, `scripts/jenkins_cred_sync.sh`\n\n## What users do (typical flow)\n- Create a repo in Gitea.\n- Create/update a Jenkins job/pipeline that can fetch the repo.\n- Configure a webhook (or SCM polling) so pushes trigger builds.\n\n## Troubleshooting (common)\n- \u201cWebhook not firing\u201d: confirm ingress host, webhook URL, and Jenkins job is reachable.\n- \u201cAuth denied cloning\u201d: confirm Keycloak group membership and that Jenkins has a valid token/credential configured." + }, + { + "path": "runbooks/comms-verify.md", + "title": "Othrys verification checklist", + "tags": [ + "comms", + "matrix", + "element", + "livekit" + ], + "entrypoints": [ + "https://live.bstein.dev", + "https://matrix.live.bstein.dev" + ], + "source_paths": [], + "body": "1) Guest join:\n- Open a private window and visit:\n `https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join`\n- Confirm the guest join flow works and the displayname becomes `-`.\n\n2) Keycloak login:\n- Log in from `https://live.bstein.dev` and confirm MAS -> Keycloak -> Element redirect.\n\n3) Video rooms:\n- Start an Element Call room and confirm audio/video with a second account.\n- Check that guests can read public rooms but cannot start calls.\n\n4) Well-known:\n- `https://live.bstein.dev/.well-known/matrix/client` returns JSON.\n- `https://matrix.live.bstein.dev/.well-known/matrix/client` returns JSON.\n\n5) TURN reachability:\n- Confirm `turn.live.bstein.dev:3478` and `turns:5349` are reachable from WAN." + }, + { + "path": "runbooks/kb-authoring.md", + "title": "KB authoring: what to write (and what not to)", + "tags": [ + "atlas", + "kb", + "runbooks" + ], + "entrypoints": [], + "source_paths": [ + "knowledge/runbooks", + "scripts/knowledge_render_atlas.py" + ], + "body": "# KB authoring: what to write (and what not to)\n\n## The goal\nGive Atlas assistants enough grounded, Atlas-specific context to answer \u201chow do I\u2026?\u201d questions without guessing.\n\n## What to capture (high value)\n- User workflows: \u201cclick here, set X, expected result\u201d\n- Operator workflows: \u201cedit these files, reconcile this kustomization, verify with these commands\u201d\n- Wiring: \u201cthis host routes to this service; this service depends on Postgres/Vault/etc\u201d\n- Failure modes: exact error messages + the 2\u20135 checks that usually resolve them\n- Permissions: Keycloak groups/roles and what they unlock\n\n## What to avoid (low value / fluff)\n- Generic Kubernetes explanations (link to upstream docs instead)\n- Copy-pasting large manifests (prefer file paths + small snippets)\n- Anything that will drift quickly (render it from GitOps instead)\n- Any secret values (reference Secret/Vault locations by name only)\n\n## Document pattern (recommended)\nEach runbook should answer:\n- \u201cWhat is this?\u201d\n- \u201cWhat do users do?\u201d\n- \u201cWhat do operators change (where in Git)?\u201d\n- \u201cHow do we verify it works?\u201d\n- \u201cWhat breaks and how to debug it?\u201d" + }, + { + "path": "runbooks/observability.md", + "title": "Observability: Grafana + VictoriaMetrics (how to query safely)", + "tags": [ + "atlas", + "monitoring", + "grafana", + "victoriametrics" + ], + "entrypoints": [ + "metrics.bstein.dev", + "alerts.bstein.dev" + ], + "source_paths": [ + "services/monitoring" + ], + "body": "# Observability: Grafana + VictoriaMetrics (how to query safely)\n\n## Where it is configured\n- `services/monitoring/helmrelease.yaml` (Grafana + Alertmanager + VM values)\n- `services/monitoring/grafana-dashboard-*.yaml` (dashboards and their PromQL)\n\n## Using metrics as a \u201ctool\u201d for Atlas assistants\nThe safest pattern is: map a small set of intents \u2192 fixed PromQL queries, then summarize results.\n\nExamples (intents)\n- \u201cIs the cluster healthy?\u201d \u2192 node readiness + pod restart rate\n- \u201cWhy is Element Call failing?\u201d \u2192 LiveKit/coturn pod restarts + synapse errors + ingress 5xx\n- \u201cIs Jenkins slow?\u201d \u2192 pod CPU/memory + HTTP latency metrics (if exported)\n\n## Why dashboards are not the KB\nDashboards are great references, but the assistant should query VictoriaMetrics directly for live answers and keep the\nKB focused on wiring, runbooks, and stable conventions." + }, + { + "path": "runbooks/template.md", + "title": "", + "tags": [ + "atlas", + "", + "" + ], + "entrypoints": [ + "" + ], + "source_paths": [ + "services/", + "clusters/atlas/<...>" + ], + "body": "# \n\n## What this is\n\n## For users (how to)\n\n## For operators (where configured)\n\n## Troubleshooting (symptoms \u2192 checks)" + }, + { + "path": "software/metis.md", + "title": "metis", + "tags": [], + "entrypoints": [], + "source_paths": [], + "body": "# Metis (node recovery)\n\n## Node classes (current map)\n- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)\n- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)\n- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks)\n- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent)\n- amd64 agents: titan-22/24 (Debian 13, k3s agent)\n- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, plus any newcomers.\n\n### Jetson nodes (titan-20/21)\n- Ubuntu 20.04.6 (Focal), kernel 5.10.104-tegra, CRI containerd 2.0.5-k3s2, arch arm64.\n- Storage: NVMe 232G at / (ext4); onboard mmc partitions present but root on NVMe; 1.9T sda present (unused).\n- k3s agent with drop-in 99-nofile.conf.\n\n## Longhorn disk UUIDs (critical nodes)\n- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4)\n- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4)\n- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4)\n- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4)\n\n## Metis repo (~/Development/metis)\n- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`).\n- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints).\n- `AGENTS.md` in repo is untracked and holds raw notes.\n\n## Next implementation steps\n- Add per-class golden image refs and checksums (Harbor or file://) when ready.\n- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths.\n- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection.\n- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited.\n\n## Node OS/Kernel/CRI snapshot (Jan 2026)\n- titan-04: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-05: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-06: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-07: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-08: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-09: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0a: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0b: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0c: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-10: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-11: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-12: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-13: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-14: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-15: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-17: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-18: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-19: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-20: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-21: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-22: Debian 13 (trixie), kernel 6.12.41+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64\n- titan-24: Debian 13 (trixie), kernel 6.12.57+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64\n\n\n### External hosts\n- titan-db: Ubuntu 24.10, kernel 6.11.0-1015-raspi, root on /dev/sda2 ext4 (465G), boot vfat /dev/sda1; PostgreSQL service enabled.\n- titan-jh: Arch Linux ARM (rolling), kernel 6.18.4-2-rpi, NVMe root ext4 238G (/), boot vfat 512M; ~495 packages installed (pacman -Q).\n- titan-23/oceanus: TODO audit (future).\n\n\n### Control plane Pis (titan-0a/0b/0c)\n- Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, containerd 2.0.5-k3s2.\n- Storage: 477G SSD root (/dev/sda2 ext4), /boot/firmware vfat (/dev/sda1). fstab uses LABEL=writable and LABEL=system-boot.\n- k3s server (control-plane taint expected); etcd snapshots not yet cataloged (TODO).\n\n\n## k3s versions\n- rpi5 workers/control-plane: k3s v1.33.3+k3s1 (crictl v1.31.0-k3s2)\n- rpi4 nodes: k3s v1.31.5+k3s1 (crictl v1.31.0-k3s2)\n- Jetson titan-20/21: k3s v1.33.3+k3s1 (per node info), crictl v1.31.0-k3s2" + } +] diff --git a/services/atlasbot/knowledge/diagrams/atlas-http.mmd b/services/atlasbot/knowledge/diagrams/atlas-http.mmd new file mode 100644 index 0000000..1aa7ac8 --- /dev/null +++ b/services/atlasbot/knowledge/diagrams/atlas-http.mmd @@ -0,0 +1,234 @@ +flowchart LR + host_auth_bstein_dev["auth.bstein.dev"] + svc_sso_oauth2_proxy["sso/oauth2-proxy (Service)"] + host_auth_bstein_dev --> svc_sso_oauth2_proxy + wl_sso_oauth2_proxy["sso/oauth2-proxy (Deployment)"] + svc_sso_oauth2_proxy --> wl_sso_oauth2_proxy + host_bstein_dev["bstein.dev"] + svc_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Service)"] + host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_frontend + wl_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Deployment)"] + svc_bstein_dev_home_bstein_dev_home_frontend --> wl_bstein_dev_home_bstein_dev_home_frontend + svc_comms_matrix_wellknown["comms/matrix-wellknown (Service)"] + host_bstein_dev --> svc_comms_matrix_wellknown + wl_comms_matrix_wellknown["comms/matrix-wellknown (Deployment)"] + svc_comms_matrix_wellknown --> wl_comms_matrix_wellknown + svc_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Service)"] + host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend + wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"] + svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend + host_budget_bstein_dev["budget.bstein.dev"] + svc_finance_actual_budget["finance/actual-budget (Service)"] + host_budget_bstein_dev --> svc_finance_actual_budget + wl_finance_actual_budget["finance/actual-budget (Deployment)"] + svc_finance_actual_budget --> wl_finance_actual_budget + host_call_live_bstein_dev["call.live.bstein.dev"] + svc_comms_element_call["comms/element-call (Service)"] + host_call_live_bstein_dev --> svc_comms_element_call + wl_comms_element_call["comms/element-call (Deployment)"] + svc_comms_element_call --> wl_comms_element_call + host_chat_ai_bstein_dev["chat.ai.bstein.dev"] + svc_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Service)"] + host_chat_ai_bstein_dev --> svc_bstein_dev_home_chat_ai_gateway + wl_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Deployment)"] + svc_bstein_dev_home_chat_ai_gateway --> wl_bstein_dev_home_chat_ai_gateway + host_ci_bstein_dev["ci.bstein.dev"] + svc_jenkins_jenkins["jenkins/jenkins (Service)"] + host_ci_bstein_dev --> svc_jenkins_jenkins + wl_jenkins_jenkins["jenkins/jenkins (Deployment)"] + svc_jenkins_jenkins --> wl_jenkins_jenkins + host_cloud_bstein_dev["cloud.bstein.dev"] + svc_nextcloud_nextcloud["nextcloud/nextcloud (Service)"] + host_cloud_bstein_dev --> svc_nextcloud_nextcloud + wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"] + svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud + host_health_bstein_dev["health.bstein.dev"] + svc_health_wger["health/wger (Service)"] + host_health_bstein_dev --> svc_health_wger + wl_health_wger["health/wger (Deployment)"] + svc_health_wger --> wl_health_wger + host_kit_live_bstein_dev["kit.live.bstein.dev"] + svc_comms_livekit_token_service["comms/livekit-token-service (Service)"] + host_kit_live_bstein_dev --> svc_comms_livekit_token_service + wl_comms_livekit_token_service["comms/livekit-token-service (Deployment)"] + svc_comms_livekit_token_service --> wl_comms_livekit_token_service + svc_comms_livekit["comms/livekit (Service)"] + host_kit_live_bstein_dev --> svc_comms_livekit + wl_comms_livekit["comms/livekit (Deployment)"] + svc_comms_livekit --> wl_comms_livekit + host_live_bstein_dev["live.bstein.dev"] + host_live_bstein_dev --> svc_comms_matrix_wellknown + svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"] + host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse + svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"] + host_live_bstein_dev --> svc_comms_matrix_guest_register + wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"] + svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register + svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"] + host_live_bstein_dev --> svc_comms_matrix_authentication_service + wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"] + svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service + host_logs_bstein_dev["logs.bstein.dev"] + svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"] + host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs + wl_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Deployment)"] + svc_logging_oauth2_proxy_logs --> wl_logging_oauth2_proxy_logs + host_longhorn_bstein_dev["longhorn.bstein.dev"] + svc_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Service)"] + host_longhorn_bstein_dev --> svc_longhorn_system_oauth2_proxy_longhorn + wl_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Deployment)"] + svc_longhorn_system_oauth2_proxy_longhorn --> wl_longhorn_system_oauth2_proxy_longhorn + host_mail_bstein_dev["mail.bstein.dev"] + svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"] + host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front + host_matrix_live_bstein_dev["matrix.live.bstein.dev"] + host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service + host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown + host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse + host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register + host_monero_bstein_dev["monero.bstein.dev"] + svc_crypto_monerod["crypto/monerod (Service)"] + host_monero_bstein_dev --> svc_crypto_monerod + wl_crypto_monerod["crypto/monerod (Deployment)"] + svc_crypto_monerod --> wl_crypto_monerod + host_money_bstein_dev["money.bstein.dev"] + svc_finance_firefly["finance/firefly (Service)"] + host_money_bstein_dev --> svc_finance_firefly + wl_finance_firefly["finance/firefly (Deployment)"] + svc_finance_firefly --> wl_finance_firefly + host_notes_bstein_dev["notes.bstein.dev"] + svc_outline_outline["outline/outline (Service)"] + host_notes_bstein_dev --> svc_outline_outline + wl_outline_outline["outline/outline (Deployment)"] + svc_outline_outline --> wl_outline_outline + host_office_bstein_dev["office.bstein.dev"] + svc_nextcloud_collabora["nextcloud/collabora (Service)"] + host_office_bstein_dev --> svc_nextcloud_collabora + wl_nextcloud_collabora["nextcloud/collabora (Deployment)"] + svc_nextcloud_collabora --> wl_nextcloud_collabora + host_pegasus_bstein_dev["pegasus.bstein.dev"] + svc_jellyfin_pegasus["jellyfin/pegasus (Service)"] + host_pegasus_bstein_dev --> svc_jellyfin_pegasus + wl_jellyfin_pegasus["jellyfin/pegasus (Deployment)"] + svc_jellyfin_pegasus --> wl_jellyfin_pegasus + host_scm_bstein_dev["scm.bstein.dev"] + svc_gitea_gitea["gitea/gitea (Service)"] + host_scm_bstein_dev --> svc_gitea_gitea + wl_gitea_gitea["gitea/gitea (Deployment)"] + svc_gitea_gitea --> wl_gitea_gitea + host_secret_bstein_dev["secret.bstein.dev"] + svc_vault_vault["vault/vault (Service)"] + host_secret_bstein_dev --> svc_vault_vault + wl_vault_vault["vault/vault (StatefulSet)"] + svc_vault_vault --> wl_vault_vault + host_sso_bstein_dev["sso.bstein.dev"] + svc_sso_keycloak["sso/keycloak (Service)"] + host_sso_bstein_dev --> svc_sso_keycloak + wl_sso_keycloak["sso/keycloak (Deployment)"] + svc_sso_keycloak --> wl_sso_keycloak + host_stream_bstein_dev["stream.bstein.dev"] + svc_jellyfin_jellyfin["jellyfin/jellyfin (Service)"] + host_stream_bstein_dev --> svc_jellyfin_jellyfin + wl_jellyfin_jellyfin["jellyfin/jellyfin (Deployment)"] + svc_jellyfin_jellyfin --> wl_jellyfin_jellyfin + host_tasks_bstein_dev["tasks.bstein.dev"] + svc_planka_planka["planka/planka (Service)"] + host_tasks_bstein_dev --> svc_planka_planka + wl_planka_planka["planka/planka (Deployment)"] + svc_planka_planka --> wl_planka_planka + host_vault_bstein_dev["vault.bstein.dev"] + svc_vaultwarden_vaultwarden_service["vaultwarden/vaultwarden-service (Service)"] + host_vault_bstein_dev --> svc_vaultwarden_vaultwarden_service + wl_vaultwarden_vaultwarden["vaultwarden/vaultwarden (Deployment)"] + svc_vaultwarden_vaultwarden_service --> wl_vaultwarden_vaultwarden + + subgraph bstein_dev_home[bstein-dev-home] + svc_bstein_dev_home_bstein_dev_home_frontend + wl_bstein_dev_home_bstein_dev_home_frontend + svc_bstein_dev_home_bstein_dev_home_backend + wl_bstein_dev_home_bstein_dev_home_backend + svc_bstein_dev_home_chat_ai_gateway + wl_bstein_dev_home_chat_ai_gateway + end + subgraph comms[comms] + svc_comms_matrix_wellknown + wl_comms_matrix_wellknown + svc_comms_element_call + wl_comms_element_call + svc_comms_livekit_token_service + wl_comms_livekit_token_service + svc_comms_livekit + wl_comms_livekit + svc_comms_othrys_synapse_matrix_synapse + svc_comms_matrix_guest_register + wl_comms_matrix_guest_register + svc_comms_matrix_authentication_service + wl_comms_matrix_authentication_service + end + subgraph crypto[crypto] + svc_crypto_monerod + wl_crypto_monerod + end + subgraph finance[finance] + svc_finance_actual_budget + wl_finance_actual_budget + svc_finance_firefly + wl_finance_firefly + end + subgraph gitea[gitea] + svc_gitea_gitea + wl_gitea_gitea + end + subgraph health[health] + svc_health_wger + wl_health_wger + end + subgraph jellyfin[jellyfin] + svc_jellyfin_pegasus + wl_jellyfin_pegasus + svc_jellyfin_jellyfin + wl_jellyfin_jellyfin + end + subgraph jenkins[jenkins] + svc_jenkins_jenkins + wl_jenkins_jenkins + end + subgraph logging[logging] + svc_logging_oauth2_proxy_logs + wl_logging_oauth2_proxy_logs + end + subgraph longhorn_system[longhorn-system] + svc_longhorn_system_oauth2_proxy_longhorn + wl_longhorn_system_oauth2_proxy_longhorn + end + subgraph mailu_mailserver[mailu-mailserver] + svc_mailu_mailserver_mailu_front + end + subgraph nextcloud[nextcloud] + svc_nextcloud_nextcloud + wl_nextcloud_nextcloud + svc_nextcloud_collabora + wl_nextcloud_collabora + end + subgraph outline[outline] + svc_outline_outline + wl_outline_outline + end + subgraph planka[planka] + svc_planka_planka + wl_planka_planka + end + subgraph sso[sso] + svc_sso_oauth2_proxy + wl_sso_oauth2_proxy + svc_sso_keycloak + wl_sso_keycloak + end + subgraph vault[vault] + svc_vault_vault + wl_vault_vault + end + subgraph vaultwarden[vaultwarden] + svc_vaultwarden_vaultwarden_service + wl_vaultwarden_vaultwarden + end diff --git a/services/atlasbot/kustomization.yaml b/services/atlasbot/kustomization.yaml new file mode 100644 index 0000000..a707bd8 --- /dev/null +++ b/services/atlasbot/kustomization.yaml @@ -0,0 +1,26 @@ +# services/atlasbot/kustomization.yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: ai +resources: + - atlasbot-deployment.yaml + - atlasbot-service.yaml + - atlasbot-rbac.yaml + - image.yaml +images: + - name: registry.bstein.dev/bstein/atlasbot + newTag: 0.1.0-104 # {"$imagepolicy": "ai:atlasbot:tag"} +configMapGenerator: + - name: atlasbot-vault-env + files: + - atlasbot_vault_env.sh=scripts/atlasbot_vault_env.sh + options: + disableNameSuffixHash: true + - name: atlas-kb + files: + - INDEX.md=knowledge/INDEX.md + - atlas.json=knowledge/catalog/atlas.json + - atlas-summary.json=knowledge/catalog/atlas-summary.json + - metrics.json=knowledge/catalog/metrics.json + - runbooks.json=knowledge/catalog/runbooks.json + - atlas-http.mmd=knowledge/diagrams/atlas-http.mmd diff --git a/services/atlasbot/scripts/atlasbot_vault_env.sh b/services/atlasbot/scripts/atlasbot_vault_env.sh new file mode 100644 index 0000000..9dd2094 --- /dev/null +++ b/services/atlasbot/scripts/atlasbot_vault_env.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env sh +set -eu + +vault_dir="/vault/secrets" + +read_secret() { + tr -d '\r\n' < "${vault_dir}/$1" +} + +read_optional() { + if [ -f "${vault_dir}/$1" ]; then + tr -d '\r\n' < "${vault_dir}/$1" + else + printf '' + fi +} + +export TURN_STATIC_AUTH_SECRET="$(read_secret turn-secret)" +export TURN_PASSWORD="${TURN_STATIC_AUTH_SECRET}" + +export LIVEKIT_API_SECRET="$(read_secret livekit-primary)" +export LIVEKIT_SECRET="${LIVEKIT_API_SECRET}" + +export BOT_PASS="$(read_secret bot-pass)" +export BOT_PASS_QUICK="$(read_optional bot-quick-pass)" +export BOT_PASS_SMART="$(read_optional bot-smart-pass)" +export BOT_PASS_GENIUS="$(read_optional bot-genius-pass)" +if [ -z "${BOT_PASS_SMART}" ]; then + export BOT_PASS_SMART="${BOT_PASS}" +fi +if [ -z "${BOT_PASS_GENIUS}" ]; then + export BOT_PASS_GENIUS="${BOT_PASS_SMART}" +fi +export SEEDER_PASS="$(read_secret seeder-pass)" + +export CHAT_API_KEY="$(read_secret chat-matrix)" +export CHAT_API_HOMEPAGE="$(read_secret chat-homepage)" + +export MAS_ADMIN_CLIENT_SECRET_FILE="${vault_dir}/mas-admin-secret" +export PGPASSWORD="$(read_secret synapse-db-pass)" + +export MAS_DB_PASSWORD="$(read_secret mas-db-pass)" +export MATRIX_SHARED_SECRET="$(read_secret mas-matrix-shared)" +export KEYCLOAK_CLIENT_SECRET="$(read_secret mas-kc-secret)" diff --git a/services/bstein-dev-home/backend-deployment.yaml b/services/bstein-dev-home/backend-deployment.yaml index c1fd7ee..1403893 100644 --- a/services/bstein-dev-home/backend-deployment.yaml +++ b/services/bstein-dev-home/backend-deployment.yaml @@ -68,7 +68,7 @@ spec: - name: AI_CHAT_TIMEOUT_SEC value: "480" - name: AI_ATLASBOT_ENDPOINT - value: http://atlasbot.comms.svc.cluster.local:8090/v1/answer + value: http://atlasbot.ai.svc.cluster.local:8090/v1/answer - name: AI_ATLASBOT_MODEL_FAST value: qwen2.5:14b-instruct-q4_0 - name: AI_ATLASBOT_MODEL_SMART diff --git a/services/comms/kustomization.yaml b/services/comms/kustomization.yaml index 0ad6b00..a35a538 100644 --- a/services/comms/kustomization.yaml +++ b/services/comms/kustomization.yaml @@ -4,7 +4,6 @@ kind: Kustomization namespace: comms resources: - namespace.yaml - - image.yaml - serviceaccount.yaml - secretproviderclass.yaml - mas-configmap.yaml @@ -14,10 +13,7 @@ resources: - element-call-deployment.yaml - guest-register-deployment.yaml - guest-register-service.yaml - - atlasbot-deployment.yaml - - atlasbot-service.yaml - wellknown.yaml - - atlasbot-rbac.yaml - mas-secrets-ensure-rbac.yaml - comms-secrets-ensure-rbac.yaml - mas-db-ensure-rbac.yaml @@ -44,9 +40,6 @@ resources: - livekit-ingress.yaml - livekit-middlewares.yaml - matrix-ingress.yaml -images: - - name: registry.bstein.dev/bstein/atlasbot - newTag: 0.1.0-104 # {"$imagepolicy": "comms:atlasbot:tag"} configMapGenerator: - name: comms-vault-env files: @@ -68,11 +61,3 @@ configMapGenerator: - 20-host-config.sh=scripts/element-host-config.sh options: disableNameSuffixHash: true - - name: atlas-kb - files: - - INDEX.md=knowledge/INDEX.md - - atlas.json=knowledge/catalog/atlas.json - - atlas-summary.json=knowledge/catalog/atlas-summary.json - - metrics.json=knowledge/catalog/metrics.json - - runbooks.json=knowledge/catalog/runbooks.json - - atlas-http.mmd=knowledge/diagrams/atlas-http.mmd diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh index acd98e8..474d53a 100644 --- a/services/vault/scripts/vault_k8s_auth_configure.sh +++ b/services/vault/scripts/vault_k8s_auth_configure.sh @@ -255,6 +255,8 @@ write_policy_and_role "nextcloud" "nextcloud" "nextcloud-vault" \ "nextcloud/* shared/keycloak-admin shared/postmark-relay" "" write_policy_and_role "comms" "comms" "comms-vault,atlasbot" \ "comms/* shared/chat-ai-keys-runtime shared/harbor-pull" "" +write_policy_and_role "ai" "ai" "atlasbot" \ + "comms/* shared/chat-ai-keys-runtime shared/harbor-pull" "" write_policy_and_role "jenkins" "jenkins" "jenkins,jenkins-vault-sync" \ "jenkins/* shared/harbor-pull" "" write_policy_and_role "monitoring" "monitoring" "monitoring-vault-sync" \