diff --git a/.gitignore b/.gitignore index 8d0ab1e..7543bbf 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ __pycache__/ *.py[cod] .pytest_cache .venv +.venv-ci tmp/ diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 0000000..4d6b23e --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,77 @@ +// Mirror of ci/Jenkinsfile.titan-iac for multibranch discovery. +pipeline { + agent { + kubernetes { + defaultContainer 'python' + yaml """ +apiVersion: v1 +kind: Pod +spec: + nodeSelector: + hardware: rpi5 + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" + containers: + - name: python + image: python:3.12-slim + command: + - cat + tty: true +""" + } + } + environment { + PIP_DISABLE_PIP_VERSION_CHECK = '1' + PYTHONUNBUFFERED = '1' + } + stages { + stage('Checkout') { + steps { + checkout scm + } + } + stage('Install deps') { + steps { + sh 'pip install --no-cache-dir -r ci/requirements.txt' + } + } + stage('Glue tests') { + steps { + sh 'pytest -q ci/tests/glue' + } + } + stage('Resolve Flux branch') { + steps { + script { + env.FLUX_BRANCH = sh( + returnStdout: true, + script: "awk '/branch:/{print \$2; exit}' clusters/atlas/flux-system/gotk-sync.yaml" + ).trim() + if (!env.FLUX_BRANCH) { + error('Flux branch not found in gotk-sync.yaml') + } + echo "Flux branch: ${env.FLUX_BRANCH}" + } + } + } + stage('Promote') { + when { + expression { + def branch = env.BRANCH_NAME ?: (env.GIT_BRANCH ?: '').replaceFirst('origin/', '') + return env.FLUX_BRANCH && branch == env.FLUX_BRANCH + } + } + steps { + withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) { + sh ''' + set +x + git config user.email "jenkins@bstein.dev" + git config user.name "jenkins" + git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git + git push origin HEAD:${FLUX_BRANCH} + ''' + } + } + } + } +} diff --git a/ci/Jenkinsfile.titan-iac b/ci/Jenkinsfile.titan-iac index 3b13eb0..77990d7 100644 --- a/ci/Jenkinsfile.titan-iac +++ b/ci/Jenkinsfile.titan-iac @@ -6,6 +6,10 @@ pipeline { apiVersion: v1 kind: Pod spec: + nodeSelector: + hardware: rpi5 + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" containers: - name: python image: python:3.12-slim @@ -18,7 +22,6 @@ spec: environment { PIP_DISABLE_PIP_VERSION_CHECK = '1' PYTHONUNBUFFERED = '1' - DEPLOY_BRANCH = 'deploy' } stages { stage('Checkout') { @@ -36,7 +39,27 @@ spec: sh 'pytest -q ci/tests/glue' } } + stage('Resolve Flux branch') { + steps { + script { + env.FLUX_BRANCH = sh( + returnStdout: true, + script: "awk '/branch:/{print \$2; exit}' clusters/atlas/flux-system/gotk-sync.yaml" + ).trim() + if (!env.FLUX_BRANCH) { + error('Flux branch not found in gotk-sync.yaml') + } + echo "Flux branch: ${env.FLUX_BRANCH}" + } + } + } stage('Promote') { + when { + expression { + def branch = env.BRANCH_NAME ?: (env.GIT_BRANCH ?: '').replaceFirst('origin/', '') + return env.FLUX_BRANCH && branch == env.FLUX_BRANCH + } + } steps { withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) { sh ''' @@ -44,7 +67,7 @@ spec: git config user.email "jenkins@bstein.dev" git config user.name "jenkins" git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git - git push origin HEAD:${DEPLOY_BRANCH} + git push origin HEAD:${FLUX_BRANCH} ''' } } diff --git a/ci/tests/glue/config.yaml b/ci/tests/glue/config.yaml index 8adf4ca..16b656c 100644 ---
a/ci/tests/glue/config.yaml +++ b/ci/tests/glue/config.yaml @@ -1,7 +1,16 @@ max_success_age_hours: 48 allow_suspended: + - bstein-dev-home/vaultwarden-cred-sync - comms/othrys-room-reset - comms/pin-othrys-invite - comms/seed-othrys-room - finance/firefly-user-sync + - health/wger-admin-ensure - health/wger-user-sync + - mailu-mailserver/mailu-sync-nightly + - nextcloud/nextcloud-mail-sync +ariadne_schedule_tasks: + - schedule.mailu_sync + - schedule.nextcloud_sync + - schedule.vaultwarden_sync + - schedule.wger_admin diff --git a/ci/tests/glue/test_glue_metrics.py b/ci/tests/glue/test_glue_metrics.py index 16b01c7..52ec0be 100644 --- a/ci/tests/glue/test_glue_metrics.py +++ b/ci/tests/glue/test_glue_metrics.py @@ -1,11 +1,19 @@ from __future__ import annotations import os +from pathlib import Path import requests +import yaml VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server:8428").rstrip("/") +CONFIG_PATH = Path(__file__).with_name("config.yaml") + + +def _load_config() -> dict: + with CONFIG_PATH.open("r", encoding="utf-8") as handle: + return yaml.safe_load(handle) or {} def _query(promql: str) -> list[dict]: @@ -27,3 +35,14 @@ def test_glue_metrics_success_join(): ) series = _query(query) assert series, "No glue cronjob last success series found" + + +def test_ariadne_schedule_metrics_present(): + cfg = _load_config() + expected = cfg.get("ariadne_schedule_tasks", []) + if not expected: + return + series = _query("ariadne_schedule_next_run_timestamp_seconds") + tasks = {item.get("metric", {}).get("task") for item in series} + missing = [task for task in expected if task not in tasks] + assert not missing, f"Missing Ariadne schedule metrics for: {', '.join(missing)}" diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml new file mode 100644 index 0000000..ff97f73 --- /dev/null +++ b/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml @@ -0,0 +1,17 @@ +# clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: bstein-dev-home-migrations + namespace: flux-system +spec: + interval: 10m + path: ./services/bstein-dev-home/oneoffs/migrations + prune: true + force: true + sourceRef: + kind: GitRepository + name: flux-system + targetNamespace: bstein-dev-home + wait: false + suspend: true diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml index 88dda40..f1d41be 100644 --- a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml +++ b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml @@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1 kind: ImageUpdateAutomation metadata: name: bstein-dev-home - namespace: flux-system + namespace: bstein-dev-home spec: interval: 1m0s sourceRef: @@ -13,14 +13,14 @@ spec: git: checkout: ref: - branch: feature/vault-consumption + branch: feature/ariadne commit: author: email: ops@bstein.dev name: flux-bot - messageTemplate: "chore(bstein-dev-home): update images to {{range .Updated.Images}}{{.}}{{end}}" + messageTemplate: "chore(bstein-dev-home): automated image update" push: - branch: feature/vault-consumption + branch: feature/ariadne update: strategy: Setters path: services/bstein-dev-home 
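Note on the Setters strategy used by the ImageUpdateAutomation above: Flux only rewrites image fields that carry an image-policy marker comment in the manifests under the configured path. A minimal sketch of such a setter, using a hypothetical ImagePolicy name that is not taken from this repo:

  image: registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-157 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"}

Because the automation now runs in the bstein-dev-home namespace instead of flux-system, markers are expected to reference ImagePolicy objects living in that same namespace.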
diff --git a/clusters/atlas/flux-system/applications/harbor/kustomization.yaml b/clusters/atlas/flux-system/applications/harbor/kustomization.yaml index 06baf26..5eec32f 100644 --- a/clusters/atlas/flux-system/applications/harbor/kustomization.yaml +++ b/clusters/atlas/flux-system/applications/harbor/kustomization.yaml @@ -13,11 +13,6 @@ spec: kind: GitRepository name: flux-system namespace: flux-system - healthChecks: - - apiVersion: helm.toolkit.fluxcd.io/v2 - kind: HelmRelease - name: harbor - namespace: harbor wait: false dependsOn: - name: core diff --git a/clusters/atlas/flux-system/applications/kustomization.yaml b/clusters/atlas/flux-system/applications/kustomization.yaml index 417a3ec..10c203d 100644 --- a/clusters/atlas/flux-system/applications/kustomization.yaml +++ b/clusters/atlas/flux-system/applications/kustomization.yaml @@ -12,6 +12,7 @@ resources: - pegasus/image-automation.yaml - bstein-dev-home/kustomization.yaml - bstein-dev-home/image-automation.yaml + - bstein-dev-home-migrations/kustomization.yaml - harbor/kustomization.yaml - harbor/image-automation.yaml - jellyfin/kustomization.yaml diff --git a/clusters/atlas/flux-system/applications/pegasus/image-automation.yaml b/clusters/atlas/flux-system/applications/pegasus/image-automation.yaml index ec0494e..d11422a 100644 --- a/clusters/atlas/flux-system/applications/pegasus/image-automation.yaml +++ b/clusters/atlas/flux-system/applications/pegasus/image-automation.yaml @@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1 kind: ImageUpdateAutomation metadata: name: pegasus - namespace: flux-system + namespace: jellyfin spec: interval: 1m0s sourceRef: diff --git a/clusters/atlas/flux-system/platform/kustomization.yaml b/clusters/atlas/flux-system/platform/kustomization.yaml index b689cc0..6e75b04 100644 --- a/clusters/atlas/flux-system/platform/kustomization.yaml +++ b/clusters/atlas/flux-system/platform/kustomization.yaml @@ -11,6 +11,7 @@ resources: - monitoring/kustomization.yaml - logging/kustomization.yaml - maintenance/kustomization.yaml + - maintenance/image-automation.yaml - longhorn-adopt/kustomization.yaml - longhorn/kustomization.yaml - longhorn-ui/kustomization.yaml diff --git a/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml new file mode 100644 index 0000000..6e8f612 --- /dev/null +++ b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml @@ -0,0 +1,26 @@ +# clusters/atlas/flux-system/platform/maintenance/image-automation.yaml +apiVersion: image.toolkit.fluxcd.io/v1 +kind: ImageUpdateAutomation +metadata: + name: maintenance + namespace: maintenance +spec: + interval: 1m0s + sourceRef: + kind: GitRepository + name: flux-system + namespace: flux-system + git: + checkout: + ref: + branch: feature/ariadne + commit: + author: + email: ops@bstein.dev + name: flux-bot + messageTemplate: "chore(maintenance): automated image update" + push: + branch: feature/ariadne + update: + strategy: Setters + path: services/maintenance diff --git a/clusters/atlas/flux-system/platform/maintenance/kustomization.yaml b/clusters/atlas/flux-system/platform/maintenance/kustomization.yaml index fc655a4..8477ec9 100644 --- a/clusters/atlas/flux-system/platform/maintenance/kustomization.yaml +++ b/clusters/atlas/flux-system/platform/maintenance/kustomization.yaml @@ -8,6 +8,7 @@ spec: interval: 10m path: ./services/maintenance prune: true + force: true sourceRef: kind: GitRepository name: flux-system diff --git 
a/infrastructure/core/coredns-custom.yaml b/infrastructure/core/coredns-custom.yaml index 8aeff14..6266a22 100644 --- a/infrastructure/core/coredns-custom.yaml +++ b/infrastructure/core/coredns-custom.yaml @@ -32,6 +32,9 @@ data: 192.168.22.9 notes.bstein.dev 192.168.22.9 office.bstein.dev 192.168.22.9 pegasus.bstein.dev + 3.136.224.193 pm-bounces.bstein.dev + 3.150.68.49 pm-bounces.bstein.dev + 18.189.137.81 pm-bounces.bstein.dev 192.168.22.9 registry.bstein.dev 192.168.22.9 scm.bstein.dev 192.168.22.9 secret.bstein.dev diff --git a/infrastructure/core/kustomization.yaml b/infrastructure/core/kustomization.yaml index 6286186..257e1f0 100644 --- a/infrastructure/core/kustomization.yaml +++ b/infrastructure/core/kustomization.yaml @@ -6,5 +6,6 @@ resources: - ../modules/profiles/atlas-ha - coredns-custom.yaml - coredns-deployment.yaml + - ntp-sync-daemonset.yaml - ../sources/cert-manager/letsencrypt.yaml - ../sources/cert-manager/letsencrypt-prod.yaml diff --git a/infrastructure/core/ntp-sync-daemonset.yaml b/infrastructure/core/ntp-sync-daemonset.yaml new file mode 100644 index 0000000..ba97294 --- /dev/null +++ b/infrastructure/core/ntp-sync-daemonset.yaml @@ -0,0 +1,50 @@ +# infrastructure/core/ntp-sync-daemonset.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: ntp-sync + namespace: kube-system + labels: + app: ntp-sync +spec: + selector: + matchLabels: + app: ntp-sync + template: + metadata: + labels: + app: ntp-sync + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: DoesNotExist + - key: node-role.kubernetes.io/master + operator: DoesNotExist + containers: + - name: ntp-sync + image: public.ecr.aws/docker/library/busybox:1.36.1 + imagePullPolicy: IfNotPresent + command: ["/bin/sh", "-c"] + args: + - | + set -eu + while true; do + ntpd -q -p pool.ntp.org || true + sleep 300 + done + securityContext: + capabilities: + add: ["SYS_TIME"] + runAsUser: 0 + runAsGroup: 0 + resources: + requests: + cpu: 10m + memory: 16Mi + limits: + cpu: 50m + memory: 64Mi diff --git a/infrastructure/longhorn/core/secretproviderclass.yaml b/infrastructure/longhorn/core/secretproviderclass.yaml index 031d1d8..e292b86 100644 --- a/infrastructure/longhorn/core/secretproviderclass.yaml +++ b/infrastructure/longhorn/core/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "longhorn" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/longhorn" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: longhorn-registry diff --git a/infrastructure/postgres/service.yaml b/infrastructure/postgres/service.yaml index 3dcab3c..b695045 100644 --- a/infrastructure/postgres/service.yaml +++ b/infrastructure/postgres/service.yaml @@ -4,6 +4,10 @@ kind: Service metadata: name: postgres-service namespace: postgres + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9187" + prometheus.io/path: "/metrics" spec: clusterIP: None ports: @@ -11,5 +15,9 @@ spec: port: 5432 protocol: TCP targetPort: 5432 + - name: metrics + port: 9187 + protocol: TCP + targetPort: 9187 selector: app: postgres diff --git a/infrastructure/postgres/statefulset.yaml b/infrastructure/postgres/statefulset.yaml index e1a1921..2c79248 100644 --- a/infrastructure/postgres/statefulset.yaml +++ b/infrastructure/postgres/statefulset.yaml @@ -58,6 +58,23 @@ spec: - name: vault-secrets mountPath: 
/mnt/vault readOnly: true + - name: postgres-exporter + image: quay.io/prometheuscommunity/postgres-exporter:v0.15.0 + ports: + - name: metrics + containerPort: 9187 + protocol: TCP + env: + - name: DATA_SOURCE_URI + value: "localhost:5432/postgres?sslmode=disable" + - name: DATA_SOURCE_USER + value: postgres + - name: DATA_SOURCE_PASS_FILE + value: /mnt/vault/postgres_password + volumeMounts: + - name: vault-secrets + mountPath: /mnt/vault + readOnly: true volumes: - name: vault-secrets csi: diff --git a/infrastructure/sources/cert-manager/letsencrypt-prod.yaml b/infrastructure/sources/cert-manager/letsencrypt-prod.yaml index 7f90f01..5795b09 100644 --- a/infrastructure/sources/cert-manager/letsencrypt-prod.yaml +++ b/infrastructure/sources/cert-manager/letsencrypt-prod.yaml @@ -5,7 +5,7 @@ metadata: name: letsencrypt-prod spec: acme: - email: brad.stein@gmail.com + email: brad@bstein.dev server: https://acme-v02.api.letsencrypt.org/directory privateKeySecretRef: name: letsencrypt-prod-account-key diff --git a/infrastructure/sources/cert-manager/letsencrypt.yaml b/infrastructure/sources/cert-manager/letsencrypt.yaml index a988312..5fbe4e3 100644 --- a/infrastructure/sources/cert-manager/letsencrypt.yaml +++ b/infrastructure/sources/cert-manager/letsencrypt.yaml @@ -5,7 +5,7 @@ metadata: name: letsencrypt spec: acme: - email: brad.stein@gmail.com + email: brad@bstein.dev server: https://acme-v02.api.letsencrypt.org/directory privateKeySecretRef: name: letsencrypt-account-key diff --git a/infrastructure/vault-csi/secrets-store-csi-driver.yaml b/infrastructure/vault-csi/secrets-store-csi-driver.yaml index 0b249fc..0004c0d 100644 --- a/infrastructure/vault-csi/secrets-store-csi-driver.yaml +++ b/infrastructure/vault-csi/secrets-store-csi-driver.yaml @@ -17,4 +17,5 @@ spec: values: syncSecret: enabled: true - enableSecretRotation: false + enableSecretRotation: true + rotationPollInterval: 2m diff --git a/knowledge/catalog/atlas-summary.json b/knowledge/catalog/atlas-summary.json index fa35051..ea825ce 100644 --- a/knowledge/catalog/atlas-summary.json +++ b/knowledge/catalog/atlas-summary.json @@ -1,8 +1,8 @@ { "counts": { - "helmrelease_host_hints": 17, - "http_endpoints": 37, - "services": 43, - "workloads": 54 + "helmrelease_host_hints": 19, + "http_endpoints": 45, + "services": 47, + "workloads": 74 } } diff --git a/knowledge/catalog/atlas.json b/knowledge/catalog/atlas.json index 0d97bcd..951c807 100644 --- a/knowledge/catalog/atlas.json +++ b/knowledge/catalog/atlas.json @@ -11,6 +11,21 @@ "path": "services/bstein-dev-home", "targetNamespace": "bstein-dev-home" }, + { + "name": "bstein-dev-home-migrations", + "path": "services/bstein-dev-home/migrations", + "targetNamespace": "bstein-dev-home" + }, + { + "name": "cert-manager", + "path": "infrastructure/cert-manager", + "targetNamespace": "cert-manager" + }, + { + "name": "cert-manager-cleanup", + "path": "infrastructure/cert-manager/cleanup", + "targetNamespace": "cert-manager" + }, { "name": "comms", "path": "services/comms", @@ -26,6 +41,11 @@ "path": "services/crypto", "targetNamespace": "crypto" }, + { + "name": "finance", + "path": "services/finance", + "targetNamespace": "finance" + }, { "name": "flux-system", "path": "clusters/atlas/flux-system", @@ -46,6 +66,11 @@ "path": "services/harbor", "targetNamespace": "harbor" }, + { + "name": "health", + "path": "services/health", + "targetNamespace": "health" + }, { "name": "helm", "path": "infrastructure/sources/helm", @@ -71,6 +96,16 @@ "path": "services/logging", "targetNamespace": 
null }, + { + "name": "longhorn", + "path": "infrastructure/longhorn/core", + "targetNamespace": "longhorn-system" + }, + { + "name": "longhorn-adopt", + "path": "infrastructure/longhorn/adopt", + "targetNamespace": "longhorn-system" + }, { "name": "longhorn-ui", "path": "infrastructure/longhorn/ui-ingress", @@ -161,11 +196,21 @@ "path": "infrastructure/vault-csi", "targetNamespace": "kube-system" }, + { + "name": "vault-injector", + "path": "infrastructure/vault-injector", + "targetNamespace": "vault" + }, { "name": "vaultwarden", "path": "services/vaultwarden", "targetNamespace": "vaultwarden" }, + { + "name": "wallet-monero-temp", + "path": "services/crypto/wallet-monero-temp", + "targetNamespace": "crypto" + }, { "name": "xmr-miner", "path": "services/crypto/xmr-miner", @@ -199,7 +244,7 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92" + "registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-157" ] }, { @@ -215,7 +260,20 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92" + "registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-157" + ] + }, + { + "kind": "Deployment", + "namespace": "bstein-dev-home", + "name": "bstein-dev-home-vault-sync", + "labels": { + "app": "bstein-dev-home-vault-sync" + }, + "serviceAccountName": "bstein-dev-home-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" ] }, { @@ -225,7 +283,7 @@ "labels": { "app": "chat-ai-gateway" }, - "serviceAccountName": null, + "serviceAccountName": "bstein-dev-home", "nodeSelector": { "kubernetes.io/arch": "arm64", "node-role.kubernetes.io/worker": "true" @@ -249,6 +307,19 @@ "python:3.11-slim" ] }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "comms-vault-sync", + "labels": { + "app": "comms-vault-sync" + }, + "serviceAccountName": "comms-vault", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "comms", @@ -256,7 +327,7 @@ "labels": { "app": "coturn" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -286,7 +357,7 @@ "labels": { "app": "livekit" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -301,12 +372,12 @@ "labels": { "app": "livekit-token-service" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, "images": [ - "ghcr.io/element-hq/lk-jwt-service:0.3.0" + "registry.bstein.dev/tools/lk-jwt-service-vault:0.3.0" ] }, { @@ -316,7 +387,7 @@ "labels": { "app": "matrix-authentication-service" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -331,7 +402,7 @@ "labels": { "app.kubernetes.io/name": "matrix-guest-register" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": {}, "images": [ "python:3.11-slim" @@ -365,6 +436,19 @@ "ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9" ] }, + { + "kind": "Deployment", + "namespace": "crypto", + "name": "crypto-vault-sync", + "labels": { + "app": "crypto-vault-sync" + }, + "serviceAccountName": "crypto-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "crypto", @@ -372,7 +456,7 @@ "labels": { "app": "monero-p2pool" }, - "serviceAccountName": null, + 
"serviceAccountName": "crypto-vault-sync", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -395,6 +479,53 @@ "registry.bstein.dev/crypto/monerod:0.18.4.1" ] }, + { + "kind": "Deployment", + "namespace": "crypto", + "name": "wallet-monero-temp", + "labels": { + "app": "wallet-monero-temp" + }, + "serviceAccountName": "crypto-vault-sync", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1" + ] + }, + { + "kind": "Deployment", + "namespace": "finance", + "name": "actual-budget", + "labels": { + "app": "actual-budget" + }, + "serviceAccountName": "finance-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "actualbudget/actual-server:26.1.0-alpine@sha256:34aae5813fdfee12af2a50c4d0667df68029f1d61b90f45f282473273eb70d0d" + ] + }, + { + "kind": "Deployment", + "namespace": "finance", + "name": "firefly", + "labels": { + "app": "firefly" + }, + "serviceAccountName": "finance-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "fireflyiii/core:version-6.4.15" + ] + }, { "kind": "Deployment", "namespace": "flux-system", @@ -516,7 +647,7 @@ "labels": { "app": "gitea" }, - "serviceAccountName": null, + "serviceAccountName": "gitea-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -524,6 +655,36 @@ "gitea/gitea:1.23" ] }, + { + "kind": "Deployment", + "namespace": "harbor", + "name": "harbor-vault-sync", + "labels": { + "app": "harbor-vault-sync" + }, + "serviceAccountName": "harbor-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "Deployment", + "namespace": "health", + "name": "wger", + "labels": { + "app": "wger" + }, + "serviceAccountName": "health-vault-sync", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "nginx:1.27.5-alpine@sha256:65645c7bb6a0661892a8b03b89d0743208a18dd2f3f17a54ef4b76fb8e2f2a10", + "wger/server@sha256:710588b78af4e0aa0b4d8a8061e4563e16eae80eeaccfe7f9e0d9cbdd7f0cbc5" + ] + }, { "kind": "Deployment", "namespace": "jellyfin", @@ -531,7 +692,7 @@ "labels": { "app": "jellyfin" }, - "serviceAccountName": null, + "serviceAccountName": "pegasus-vault-sync", "nodeSelector": {}, "images": [ "docker.io/jellyfin/jellyfin:10.11.5" @@ -544,14 +705,27 @@ "labels": { "app": "pegasus" }, - "serviceAccountName": null, + "serviceAccountName": "pegasus-vault-sync", "nodeSelector": { "kubernetes.io/arch": "arm64", "node-role.kubernetes.io/worker": "true" }, "images": [ "alpine:3.20", - "registry.bstein.dev/streaming/pegasus:1.2.32" + "registry.bstein.dev/streaming/pegasus-vault:1.2.32" + ] + }, + { + "kind": "Deployment", + "namespace": "jellyfin", + "name": "pegasus-vault-sync", + "labels": { + "app": "pegasus-vault-sync" + }, + "serviceAccountName": "pegasus-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" ] }, { @@ -570,6 +744,35 @@ "jenkins/jenkins:2.528.3-jdk21" ] }, + { + "kind": "Deployment", + "namespace": "jenkins", + "name": "jenkins-vault-sync", + "labels": { + "app": "jenkins-vault-sync" + }, + "serviceAccountName": "jenkins-vault-sync", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "DaemonSet", + "namespace": "kube-system", + "name": "ntp-sync", + "labels": { + "app": "ntp-sync" + }, + 
"serviceAccountName": null, + "nodeSelector": {}, + "images": [ + "public.ecr.aws/docker/library/busybox:1.36.1" + ] + }, { "kind": "DaemonSet", "namespace": "kube-system", @@ -636,6 +839,21 @@ "hashicorp/vault-csi-provider:1.7.0" ] }, + { + "kind": "Deployment", + "namespace": "kube-system", + "name": "coredns", + "labels": { + "k8s-app": "kube-dns" + }, + "serviceAccountName": "coredns", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "registry.bstein.dev/infra/coredns:1.12.1" + ] + }, { "kind": "DaemonSet", "namespace": "logging", @@ -681,6 +899,19 @@ "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" ] }, + { + "kind": "Deployment", + "namespace": "logging", + "name": "logging-vault-sync", + "labels": { + "app": "logging-vault-sync" + }, + "serviceAccountName": "logging-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "logging", @@ -688,12 +919,27 @@ "labels": { "app": "oauth2-proxy-logs" }, - "serviceAccountName": null, + "serviceAccountName": "logging-vault-sync", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, "images": [ - "quay.io/oauth2-proxy/oauth2-proxy:v7.6.0" + "registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0" + ] + }, + { + "kind": "Deployment", + "namespace": "longhorn-system", + "name": "longhorn-vault-sync", + "labels": { + "app": "longhorn-vault-sync" + }, + "serviceAccountName": "longhorn-vault-sync", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "alpine:3.20" ] }, { @@ -703,7 +949,7 @@ "labels": { "app": "oauth2-proxy-longhorn" }, - "serviceAccountName": null, + "serviceAccountName": "longhorn-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -729,14 +975,45 @@ { "kind": "Deployment", "namespace": "mailu-mailserver", - "name": "mailu-sync-listener", + "name": "mailu-vault-sync", "labels": { - "app": "mailu-sync-listener" + "app": "mailu-vault-sync" }, - "serviceAccountName": null, + "serviceAccountName": "mailu-vault-sync", "nodeSelector": {}, "images": [ - "python:3.11-alpine" + "alpine:3.20" + ] + }, + { + "kind": "DaemonSet", + "namespace": "maintenance", + "name": "disable-k3s-traefik", + "labels": { + "app": "disable-k3s-traefik" + }, + "serviceAccountName": "disable-k3s-traefik", + "nodeSelector": { + "node-role.kubernetes.io/control-plane": "true" + }, + "images": [ + "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" + ] + }, + { + "kind": "DaemonSet", + "namespace": "maintenance", + "name": "k3s-agent-restart", + "labels": { + "app": "k3s-agent-restart" + }, + "serviceAccountName": "node-nofile", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" ] }, { @@ -767,6 +1044,35 @@ "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" ] }, + { + "kind": "Deployment", + "namespace": "maintenance", + "name": "ariadne", + "labels": { + "app": "ariadne" + }, + "serviceAccountName": "ariadne", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/bstein/ariadne:0.1.0-49" + ] + }, + { + "kind": "Deployment", + "namespace": "maintenance", + "name": "maintenance-vault-sync", + "labels": { + "app": "maintenance-vault-sync" + }, + "serviceAccountName": 
"maintenance-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "DaemonSet", "namespace": "monitoring", @@ -795,6 +1101,19 @@ "python:3.10-slim" ] }, + { + "kind": "Deployment", + "namespace": "monitoring", + "name": "monitoring-vault-sync", + "labels": { + "app": "monitoring-vault-sync" + }, + "serviceAccountName": "monitoring-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "monitoring", @@ -802,7 +1121,7 @@ "labels": { "app": "postmark-exporter" }, - "serviceAccountName": null, + "serviceAccountName": "monitoring-vault-sync", "nodeSelector": {}, "images": [ "python:3.12-alpine" @@ -830,7 +1149,7 @@ "labels": { "app": "nextcloud" }, - "serviceAccountName": null, + "serviceAccountName": "nextcloud-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -845,7 +1164,7 @@ "labels": { "app": "outline" }, - "serviceAccountName": null, + "serviceAccountName": "outline-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -875,7 +1194,7 @@ "labels": { "app": "planka" }, - "serviceAccountName": null, + "serviceAccountName": "planka-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -895,7 +1214,8 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "postgres:15" + "postgres:15", + "quay.io/prometheuscommunity/postgres-exporter:v0.15.0" ] }, { @@ -905,8 +1225,11 @@ "labels": { "app": "keycloak" }, - "serviceAccountName": null, - "nodeSelector": {}, + "serviceAccountName": "sso-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, "images": [ "quay.io/keycloak/keycloak:26.0.7" ] @@ -918,12 +1241,25 @@ "labels": { "app": "oauth2-proxy" }, - "serviceAccountName": null, + "serviceAccountName": "sso-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, "images": [ - "quay.io/oauth2-proxy/oauth2-proxy:v7.6.0" + "registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0" + ] + }, + { + "kind": "Deployment", + "namespace": "sso", + "name": "sso-vault-sync", + "labels": { + "app": "sso-vault-sync" + }, + "serviceAccountName": "sso-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" ] }, { @@ -933,7 +1269,7 @@ "labels": { "app": "openldap" }, - "serviceAccountName": null, + "serviceAccountName": "sso-vault", "nodeSelector": { "kubernetes.io/arch": "arm64", "node-role.kubernetes.io/worker": "true" @@ -951,7 +1287,7 @@ }, "serviceAccountName": "sui-metrics", "nodeSelector": { - "kubernetes.io/hostname": "titan-24" + "hardware": "rpi5" }, "images": [ "victoriametrics/vmagent:v1.103.0" @@ -962,7 +1298,9 @@ "namespace": "traefik", "name": "traefik", "labels": { - "app": "traefik" + "app": "traefik", + "app.kubernetes.io/instance": "traefik-kube-system", + "app.kubernetes.io/name": "traefik" }, "serviceAccountName": "traefik-ingress-controller", "nodeSelector": { @@ -995,10 +1333,13 @@ "labels": { "app": "vaultwarden" }, - "serviceAccountName": null, - "nodeSelector": {}, + "serviceAccountName": "vaultwarden-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, "images": [ - "vaultwarden/server:1.33.2" + "vaultwarden/server:1.35.2" ] } ], @@ -1565,6 +1906,54 @@ } ] }, + { + "namespace": "crypto", + "name": "wallet-monero-temp", + "type": "ClusterIP", + "selector": { + "app": "wallet-monero-temp" + }, + "ports": [ + { + "name": "rpc", + "port": 18083, + "targetPort": 18083, + "protocol": "TCP" + } + ] + }, + { + "namespace": "finance", + "name": 
"actual-budget", + "type": "ClusterIP", + "selector": { + "app": "actual-budget" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 5006, + "protocol": "TCP" + } + ] + }, + { + "namespace": "finance", + "name": "firefly", + "type": "ClusterIP", + "selector": { + "app": "firefly" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 8080, + "protocol": "TCP" + } + ] + }, { "namespace": "flux-system", "name": "notification-controller", @@ -1632,7 +2021,7 @@ { "namespace": "gitea", "name": "gitea-ssh", - "type": "NodePort", + "type": "LoadBalancer", "selector": { "app": "gitea" }, @@ -1645,6 +2034,22 @@ } ] }, + { + "namespace": "health", + "name": "wger", + "type": "ClusterIP", + "selector": { + "app": "wger" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, { "namespace": "jellyfin", "name": "jellyfin", @@ -1699,29 +2104,6 @@ } ] }, - { - "namespace": "kube-system", - "name": "traefik", - "type": "LoadBalancer", - "selector": { - "app.kubernetes.io/instance": "traefik-kube-system", - "app.kubernetes.io/name": "traefik" - }, - "ports": [ - { - "name": "web", - "port": 80, - "targetPort": "web", - "protocol": "TCP" - }, - { - "name": "websecure", - "port": 443, - "targetPort": "websecure", - "protocol": "TCP" - } - ] - }, { "namespace": "logging", "name": "oauth2-proxy-logs", @@ -1803,17 +2185,17 @@ ] }, { - "namespace": "mailu-mailserver", - "name": "mailu-sync-listener", + "namespace": "maintenance", + "name": "ariadne", "type": "ClusterIP", "selector": { - "app": "mailu-sync-listener" + "app": "ariadne" }, "ports": [ { "name": "http", - "port": 8080, - "targetPort": 8080, + "port": 80, + "targetPort": "http", "protocol": "TCP" } ] @@ -1959,6 +2341,12 @@ "port": 5432, "targetPort": 5432, "protocol": "TCP" + }, + { + "name": "metrics", + "port": 9187, + "targetPort": 9187, + "protocol": "TCP" } ] }, @@ -2032,6 +2420,28 @@ } ] }, + { + "namespace": "traefik", + "name": "traefik", + "type": "LoadBalancer", + "selector": { + "app": "traefik" + }, + "ports": [ + { + "name": "web", + "port": 80, + "targetPort": "web", + "protocol": "TCP" + }, + { + "name": "websecure", + "port": 443, + "targetPort": "websecure", + "protocol": "TCP" + } + ] + }, { "namespace": "traefik", "name": "traefik-metrics", @@ -2210,6 +2620,26 @@ "source": "bstein-dev-home" } }, + { + "host": "budget.bstein.dev", + "path": "/", + "backend": { + "namespace": "finance", + "service": "actual-budget", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "actual-budget" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "actual-budget", + "source": "finance" + } + }, { "host": "call.live.bstein.dev", "path": "/", @@ -2290,6 +2720,26 @@ "source": "nextcloud" } }, + { + "host": "health.bstein.dev", + "path": "/", + "backend": { + "namespace": "health", + "service": "wger", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "wger" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "wger", + "source": "health" + } + }, { "host": "kit.live.bstein.dev", "path": "/livekit/jwt", @@ -2385,6 +2835,106 @@ "source": "comms" } }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/r0/register", + "backend": { + "namespace": "comms", + "service": "matrix-guest-register", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-guest-register" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + 
"path": "/_matrix/client/v3/login", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/logout", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/refresh", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/register", + "backend": { + "namespace": "comms", + "service": "matrix-guest-register", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-guest-register" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, { "host": "logs.bstein.dev", "path": "/", @@ -2650,6 +3200,26 @@ "source": "monerod" } }, + { + "host": "money.bstein.dev", + "path": "/", + "backend": { + "namespace": "finance", + "service": "firefly", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "firefly" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "firefly", + "source": "finance" + } + }, { "host": "notes.bstein.dev", "path": "/", @@ -2838,7 +3408,6 @@ "matrix.live.bstein.dev" ], "comms:comms/othrys-synapse": [ - "bstein.dev", "kit.live.bstein.dev", "live.bstein.dev", "matrix.live.bstein.dev", @@ -2853,6 +3422,9 @@ "logging:logging/data-prepper": [ "registry.bstein.dev" ], + "longhorn:longhorn-system/longhorn": [ + "registry.bstein.dev" + ], "mailu:mailu-mailserver/mailu": [ "bstein.dev", "mail.bstein.dev" @@ -2862,8 +3434,12 @@ ], "monitoring:monitoring/grafana": [ "bstein.dev", + "mail.bstein.dev", "metrics.bstein.dev", "sso.bstein.dev" + ], + "monitoring:monitoring/kube-state-metrics": [ + "atlas.bstein.dev" ] } } diff --git a/knowledge/catalog/atlas.yaml b/knowledge/catalog/atlas.yaml index f3e04a8..637b5f9 100644 --- a/knowledge/catalog/atlas.yaml +++ b/knowledge/catalog/atlas.yaml @@ -8,6 +8,15 @@ sources: - name: bstein-dev-home path: services/bstein-dev-home targetNamespace: bstein-dev-home +- name: bstein-dev-home-migrations + path: services/bstein-dev-home/migrations + targetNamespace: bstein-dev-home +- name: cert-manager + path: infrastructure/cert-manager + targetNamespace: cert-manager +- name: cert-manager-cleanup + path: infrastructure/cert-manager/cleanup + targetNamespace: cert-manager - name: comms path: services/comms targetNamespace: comms @@ -17,6 +26,9 @@ sources: - name: crypto path: services/crypto targetNamespace: crypto +- name: finance + path: services/finance + targetNamespace: finance - name: flux-system path: clusters/atlas/flux-system targetNamespace: null @@ -29,6 +41,9 @@ sources: - name: harbor path: services/harbor targetNamespace: harbor +- name: health + path: services/health + targetNamespace: health - name: helm path: infrastructure/sources/helm targetNamespace: flux-system @@ -44,6 +59,12 
@@ sources: - name: logging path: services/logging targetNamespace: null +- name: longhorn + path: infrastructure/longhorn/core + targetNamespace: longhorn-system +- name: longhorn-adopt + path: infrastructure/longhorn/adopt + targetNamespace: longhorn-system - name: longhorn-ui path: infrastructure/longhorn/ui-ingress targetNamespace: longhorn-system @@ -98,9 +119,15 @@ sources: - name: vault-csi path: infrastructure/vault-csi targetNamespace: kube-system +- name: vault-injector + path: infrastructure/vault-injector + targetNamespace: vault - name: vaultwarden path: services/vaultwarden targetNamespace: vaultwarden +- name: wallet-monero-temp + path: services/crypto/wallet-monero-temp + targetNamespace: crypto - name: xmr-miner path: services/crypto/xmr-miner targetNamespace: crypto @@ -124,7 +151,7 @@ workloads: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - - registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92 + - registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-157 - kind: Deployment namespace: bstein-dev-home name: bstein-dev-home-frontend @@ -135,13 +162,22 @@ workloads: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - - registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92 + - registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-157 +- kind: Deployment + namespace: bstein-dev-home + name: bstein-dev-home-vault-sync + labels: + app: bstein-dev-home-vault-sync + serviceAccountName: bstein-dev-home-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: bstein-dev-home name: chat-ai-gateway labels: app: chat-ai-gateway - serviceAccountName: null + serviceAccountName: bstein-dev-home nodeSelector: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' @@ -157,12 +193,21 @@ workloads: hardware: rpi5 images: - python:3.11-slim +- kind: Deployment + namespace: comms + name: comms-vault-sync + labels: + app: comms-vault-sync + serviceAccountName: comms-vault + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: comms name: coturn labels: app: coturn - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: @@ -182,7 +227,7 @@ workloads: name: livekit labels: app: livekit - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: @@ -192,17 +237,17 @@ workloads: name: livekit-token-service labels: app: livekit-token-service - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: - - ghcr.io/element-hq/lk-jwt-service:0.3.0 + - registry.bstein.dev/tools/lk-jwt-service-vault:0.3.0 - kind: Deployment namespace: comms name: matrix-authentication-service labels: app: matrix-authentication-service - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: @@ -212,7 +257,7 @@ workloads: name: matrix-guest-register labels: app.kubernetes.io/name: matrix-guest-register - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: {} images: - python:3.11-slim @@ -235,12 +280,21 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9 +- kind: Deployment + namespace: crypto + name: crypto-vault-sync + labels: + app: crypto-vault-sync + serviceAccountName: crypto-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: crypto name: monero-p2pool labels: app: 
monero-p2pool - serviceAccountName: null + serviceAccountName: crypto-vault-sync nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -255,6 +309,38 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - registry.bstein.dev/crypto/monerod:0.18.4.1 +- kind: Deployment + namespace: crypto + name: wallet-monero-temp + labels: + app: wallet-monero-temp + serviceAccountName: crypto-vault-sync + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1 +- kind: Deployment + namespace: finance + name: actual-budget + labels: + app: actual-budget + serviceAccountName: finance-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - actualbudget/actual-server:26.1.0-alpine@sha256:34aae5813fdfee12af2a50c4d0667df68029f1d61b90f45f282473273eb70d0d +- kind: Deployment + namespace: finance + name: firefly + labels: + app: firefly + serviceAccountName: finance-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - fireflyiii/core:version-6.4.15 - kind: Deployment namespace: flux-system name: helm-controller @@ -344,17 +430,38 @@ workloads: name: gitea labels: app: gitea - serviceAccountName: null + serviceAccountName: gitea-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: - gitea/gitea:1.23 +- kind: Deployment + namespace: harbor + name: harbor-vault-sync + labels: + app: harbor-vault-sync + serviceAccountName: harbor-vault-sync + nodeSelector: {} + images: + - alpine:3.20 +- kind: Deployment + namespace: health + name: wger + labels: + app: wger + serviceAccountName: health-vault-sync + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - nginx:1.27.5-alpine@sha256:65645c7bb6a0661892a8b03b89d0743208a18dd2f3f17a54ef4b76fb8e2f2a10 + - wger/server@sha256:710588b78af4e0aa0b4d8a8061e4563e16eae80eeaccfe7f9e0d9cbdd7f0cbc5 - kind: Deployment namespace: jellyfin name: jellyfin labels: app: jellyfin - serviceAccountName: null + serviceAccountName: pegasus-vault-sync nodeSelector: {} images: - docker.io/jellyfin/jellyfin:10.11.5 @@ -363,13 +470,22 @@ workloads: name: pegasus labels: app: pegasus - serviceAccountName: null + serviceAccountName: pegasus-vault-sync nodeSelector: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - alpine:3.20 - - registry.bstein.dev/streaming/pegasus:1.2.32 + - registry.bstein.dev/streaming/pegasus-vault:1.2.32 +- kind: Deployment + namespace: jellyfin + name: pegasus-vault-sync + labels: + app: pegasus-vault-sync + serviceAccountName: pegasus-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: jenkins name: jenkins @@ -381,6 +497,26 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - jenkins/jenkins:2.528.3-jdk21 +- kind: Deployment + namespace: jenkins + name: jenkins-vault-sync + labels: + app: jenkins-vault-sync + serviceAccountName: jenkins-vault-sync + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - alpine:3.20 +- kind: DaemonSet + namespace: kube-system + name: ntp-sync + labels: + app: ntp-sync + serviceAccountName: null + nodeSelector: {} + images: + - public.ecr.aws/docker/library/busybox:1.36.1 - kind: DaemonSet namespace: kube-system name: nvidia-device-plugin-jetson @@ -427,6 +563,16 @@ workloads: kubernetes.io/os: linux images: - hashicorp/vault-csi-provider:1.7.0 +- kind: Deployment + namespace: kube-system + name: 
coredns + labels: + k8s-app: kube-dns + serviceAccountName: coredns + nodeSelector: + kubernetes.io/os: linux + images: + - registry.bstein.dev/infra/coredns:1.12.1 - kind: DaemonSet namespace: logging name: node-image-gc-rpi4 @@ -457,22 +603,41 @@ workloads: hardware: rpi5 images: - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 +- kind: Deployment + namespace: logging + name: logging-vault-sync + labels: + app: logging-vault-sync + serviceAccountName: logging-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: logging name: oauth2-proxy-logs labels: app: oauth2-proxy-logs - serviceAccountName: null + serviceAccountName: logging-vault-sync nodeSelector: node-role.kubernetes.io/worker: 'true' images: - - quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 + - registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0 +- kind: Deployment + namespace: longhorn-system + name: longhorn-vault-sync + labels: + app: longhorn-vault-sync + serviceAccountName: longhorn-vault-sync + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - alpine:3.20 - kind: Deployment namespace: longhorn-system name: oauth2-proxy-longhorn labels: app: oauth2-proxy-longhorn - serviceAccountName: null + serviceAccountName: longhorn-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -489,13 +654,34 @@ workloads: - registry.bstein.dev/bstein/kubectl:1.35.0 - kind: Deployment namespace: mailu-mailserver - name: mailu-sync-listener + name: mailu-vault-sync labels: - app: mailu-sync-listener - serviceAccountName: null + app: mailu-vault-sync + serviceAccountName: mailu-vault-sync nodeSelector: {} images: - - python:3.11-alpine + - alpine:3.20 +- kind: DaemonSet + namespace: maintenance + name: disable-k3s-traefik + labels: + app: disable-k3s-traefik + serviceAccountName: disable-k3s-traefik + nodeSelector: + node-role.kubernetes.io/control-plane: 'true' + images: + - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 +- kind: DaemonSet + namespace: maintenance + name: k3s-agent-restart + labels: + app: k3s-agent-restart + serviceAccountName: node-nofile + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 - kind: DaemonSet namespace: maintenance name: node-image-sweeper @@ -515,6 +701,26 @@ workloads: nodeSelector: {} images: - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 +- kind: Deployment + namespace: maintenance + name: ariadne + labels: + app: ariadne + serviceAccountName: ariadne + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - registry.bstein.dev/bstein/ariadne:0.1.0-49 +- kind: Deployment + namespace: maintenance + name: maintenance-vault-sync + labels: + app: maintenance-vault-sync + serviceAccountName: maintenance-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: DaemonSet namespace: monitoring name: dcgm-exporter @@ -534,12 +740,21 @@ workloads: jetson: 'true' images: - python:3.10-slim +- kind: Deployment + namespace: monitoring + name: monitoring-vault-sync + labels: + app: monitoring-vault-sync + serviceAccountName: monitoring-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: monitoring name: postmark-exporter labels: app: postmark-exporter - serviceAccountName: null + serviceAccountName: monitoring-vault-sync 
nodeSelector: {} images: - python:3.12-alpine @@ -558,7 +773,7 @@ workloads: name: nextcloud labels: app: nextcloud - serviceAccountName: null + serviceAccountName: nextcloud-vault nodeSelector: hardware: rpi5 images: @@ -568,7 +783,7 @@ workloads: name: outline labels: app: outline - serviceAccountName: null + serviceAccountName: outline-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -588,7 +803,7 @@ workloads: name: planka labels: app: planka - serviceAccountName: null + serviceAccountName: planka-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -603,13 +818,16 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - postgres:15 + - quay.io/prometheuscommunity/postgres-exporter:v0.15.0 - kind: Deployment namespace: sso name: keycloak labels: app: keycloak - serviceAccountName: null - nodeSelector: {} + serviceAccountName: sso-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' images: - quay.io/keycloak/keycloak:26.0.7 - kind: Deployment @@ -617,17 +835,26 @@ workloads: name: oauth2-proxy labels: app: oauth2-proxy - serviceAccountName: null + serviceAccountName: sso-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: - - quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 + - registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0 +- kind: Deployment + namespace: sso + name: sso-vault-sync + labels: + app: sso-vault-sync + serviceAccountName: sso-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: StatefulSet namespace: sso name: openldap labels: app: openldap - serviceAccountName: null + serviceAccountName: sso-vault nodeSelector: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' @@ -640,7 +867,7 @@ workloads: app: sui-metrics serviceAccountName: sui-metrics nodeSelector: - kubernetes.io/hostname: titan-24 + hardware: rpi5 images: - victoriametrics/vmagent:v1.103.0 - kind: Deployment @@ -648,6 +875,8 @@ workloads: name: traefik labels: app: traefik + app.kubernetes.io/instance: traefik-kube-system + app.kubernetes.io/name: traefik serviceAccountName: traefik-ingress-controller nodeSelector: node-role.kubernetes.io/worker: 'true' @@ -669,10 +898,12 @@ workloads: name: vaultwarden labels: app: vaultwarden - serviceAccountName: null - nodeSelector: {} + serviceAccountName: vaultwarden-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' images: - - vaultwarden/server:1.33.2 + - vaultwarden/server:1.35.2 services: - namespace: ai name: ollama @@ -1040,6 +1271,36 @@ services: port: 3333 targetPort: 3333 protocol: TCP +- namespace: crypto + name: wallet-monero-temp + type: ClusterIP + selector: + app: wallet-monero-temp + ports: + - name: rpc + port: 18083 + targetPort: 18083 + protocol: TCP +- namespace: finance + name: actual-budget + type: ClusterIP + selector: + app: actual-budget + ports: + - name: http + port: 80 + targetPort: 5006 + protocol: TCP +- namespace: finance + name: firefly + type: ClusterIP + selector: + app: firefly + ports: + - name: http + port: 80 + targetPort: 8080 + protocol: TCP - namespace: flux-system name: notification-controller type: ClusterIP @@ -1082,7 +1343,7 @@ services: protocol: TCP - namespace: gitea name: gitea-ssh - type: NodePort + type: LoadBalancer selector: app: gitea ports: @@ -1090,6 +1351,16 @@ services: port: 2242 targetPort: 2242 protocol: TCP +- namespace: health + name: wger + type: ClusterIP + selector: + app: wger + ports: + - name: http + port: 80 + targetPort: http + protocol: TCP - 
namespace: jellyfin name: jellyfin type: ClusterIP @@ -1124,21 +1395,6 @@ services: port: 50000 targetPort: 50000 protocol: TCP -- namespace: kube-system - name: traefik - type: LoadBalancer - selector: - app.kubernetes.io/instance: traefik-kube-system - app.kubernetes.io/name: traefik - ports: - - name: web - port: 80 - targetPort: web - protocol: TCP - - name: websecure - port: 443 - targetPort: websecure - protocol: TCP - namespace: logging name: oauth2-proxy-logs type: ClusterIP @@ -1191,15 +1447,15 @@ services: port: 4190 targetPort: 4190 protocol: TCP -- namespace: mailu-mailserver - name: mailu-sync-listener +- namespace: maintenance + name: ariadne type: ClusterIP selector: - app: mailu-sync-listener + app: ariadne ports: - name: http - port: 8080 - targetPort: 8080 + port: 80 + targetPort: http protocol: TCP - namespace: monitoring name: dcgm-exporter @@ -1291,6 +1547,10 @@ services: port: 5432 targetPort: 5432 protocol: TCP + - name: metrics + port: 9187 + targetPort: 9187 + protocol: TCP - namespace: sso name: keycloak type: ClusterIP @@ -1335,6 +1595,20 @@ services: port: 8429 targetPort: 8429 protocol: TCP +- namespace: traefik + name: traefik + type: LoadBalancer + selector: + app: traefik + ports: + - name: web + port: 80 + targetPort: web + protocol: TCP + - name: websecure + port: 443 + targetPort: websecure + protocol: TCP - namespace: traefik name: traefik-metrics type: ClusterIP @@ -1447,6 +1721,19 @@ http_endpoints: kind: Ingress name: bstein-dev-home source: bstein-dev-home +- host: budget.bstein.dev + path: / + backend: + namespace: finance + service: actual-budget + port: 80 + workloads: + - kind: Deployment + name: actual-budget + via: + kind: Ingress + name: actual-budget + source: finance - host: call.live.bstein.dev path: / backend: @@ -1499,6 +1786,19 @@ http_endpoints: kind: Ingress name: nextcloud source: nextcloud +- host: health.bstein.dev + path: / + backend: + namespace: health + service: wger + port: 80 + workloads: + - kind: Deployment + name: wger + via: + kind: Ingress + name: wger + source: health - host: kit.live.bstein.dev path: /livekit/jwt backend: @@ -1558,6 +1858,65 @@ http_endpoints: kind: Ingress name: matrix-routing source: comms +- host: live.bstein.dev + path: /_matrix/client/r0/register + backend: + namespace: comms + service: matrix-guest-register + port: 8080 + workloads: &id003 + - kind: Deployment + name: matrix-guest-register + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/login + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: &id002 + - kind: Deployment + name: matrix-authentication-service + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/logout + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: *id002 + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/refresh + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: *id002 + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/register + backend: + namespace: comms + service: matrix-guest-register + port: 8080 + workloads: *id003 + via: + kind: Ingress + name: matrix-routing + source: comms - host: logs.bstein.dev path: / backend: @@ -1601,9 +1960,7 @@ http_endpoints: namespace: comms service: 
matrix-authentication-service port: 8080 - workloads: &id002 - - kind: Deployment - name: matrix-authentication-service + workloads: *id002 via: kind: Ingress name: matrix-routing @@ -1647,9 +2004,7 @@ http_endpoints: namespace: comms service: matrix-guest-register port: 8080 - workloads: &id003 - - kind: Deployment - name: matrix-guest-register + workloads: *id003 via: kind: Ingress name: matrix-routing @@ -1722,6 +2077,19 @@ http_endpoints: kind: Ingress name: monerod source: monerod +- host: money.bstein.dev + path: / + backend: + namespace: finance + service: firefly + port: 80 + workloads: + - kind: Deployment + name: firefly + via: + kind: Ingress + name: firefly + source: finance - host: notes.bstein.dev path: / backend: @@ -1845,7 +2213,6 @@ helmrelease_host_hints: - live.bstein.dev - matrix.live.bstein.dev comms:comms/othrys-synapse: - - bstein.dev - kit.live.bstein.dev - live.bstein.dev - matrix.live.bstein.dev @@ -1856,6 +2223,8 @@ helmrelease_host_hints: - registry.bstein.dev logging:logging/data-prepper: - registry.bstein.dev + longhorn:longhorn-system/longhorn: + - registry.bstein.dev mailu:mailu-mailserver/mailu: - bstein.dev - mail.bstein.dev @@ -1863,5 +2232,8 @@ helmrelease_host_hints: - alerts.bstein.dev monitoring:monitoring/grafana: - bstein.dev + - mail.bstein.dev - metrics.bstein.dev - sso.bstein.dev + monitoring:monitoring/kube-state-metrics: + - atlas.bstein.dev diff --git a/knowledge/catalog/metrics.json b/knowledge/catalog/metrics.json new file mode 100644 index 0000000..e929db5 --- /dev/null +++ b/knowledge/catalog/metrics.json @@ -0,0 +1,1880 @@ +[ + { + "dashboard": "Atlas GPU", + "panel_title": "Namespace GPU Share", + "panel_id": 1, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. 
Switching scope changes the denominator.", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "GPU Util by Namespace", + "panel_id": 2, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "GPU Util by Node", + "panel_id": 3, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "Top Pods by GPU Util", + "panel_id": 4, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (range)", + "panel_id": 1, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Attempts / Failures", + "panel_id": 2, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[$__interval]))", + "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "One-off Job Pods (age hours)", + "panel_id": 3, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Stale (>36h)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) 
(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Missing Success", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Suspended", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (1h)", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (24h)", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Runs (1h)", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[1h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Schedule Last Error (hours ago)", + "panel_id": 10, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Schedule Last Success (hours ago)", + "panel_id": 11, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Last Success (hours ago)", + "panel_id": 12, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - 
max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Last Schedule (hours ago)", + "panel_id": 13, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (1h)", + "panel_id": 14, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (30d)", + "panel_id": 15, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Access Requests", + "panel_id": 16, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(ariadne_access_requests_total)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne CI Coverage (%)", + "panel_id": 17, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "ariadne_ci_coverage_percent{repo=\"ariadne\"}" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne CI Tests (latest)", + "panel_id": 18, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "ariadne_ci_tests_total{repo=\"ariadne\"}" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (1d)", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (7d)", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"7d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Mail Bounces (1d)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "max(postmark_outbound_bounced{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Success Rate (1d)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": 
"prometheus", + "exprs": [ + "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Limit Used (30d)", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit_used_percent)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Send Limit (30d)", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Last Success", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_last_success_timestamp_seconds)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Exporter Errors", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(postmark_request_errors_total)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Bounce Rate (1d vs 7d)", + "panel_id": 13, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_bounce_rate)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Bounced (1d vs 7d)", + "panel_id": 14, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_bounced)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (1d vs 7d)", + "panel_id": 15, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_sent)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Exporter Errors", + "panel_id": 16, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(postmark_request_errors_total)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Ingress Success Rate (5m)", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Error Budget Burn (1h)", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Error Budget Burn (6h)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + 
"atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Edge P99 Latency (ms)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Ingress Traffic", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Egress Traffic", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Intra-Cluster Traffic", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Per-Node Throughput", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0) + sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Top Namespaces", + "panel_id": 9, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]) + rate(container_network_receive_bytes_total{namespace!=\"\"}[5m])) by (namespace))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Top Pods", + "panel_id": 10, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(rate(container_network_transmit_bytes_total{pod!=\"\"}[5m]) + rate(container_network_receive_bytes_total{pod!=\"\"}[5m])) by (namespace,pod))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Traefik Routers (req/s)", + 
"panel_id": 11, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum by (router) (rate(traefik_router_requests_total[5m])))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Traefik Entrypoints (req/s)", + "panel_id": 12, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Worker Nodes Ready", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane Ready", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane Workloads", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "API Server 5xx rate", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "API Server P99 latency", + "panel_id": 10, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "etcd P99 latency", + "panel_id": 11, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Node CPU", + "panel_id": 4, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Node RAM", + "panel_id": 5, 
+ "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane (incl. titan-db) CPU", + "panel_id": 6, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane (incl. titan-db) RAM", + "panel_id": 7, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Root Filesystem Usage", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control Plane Ready", + "panel_id": 2, + "panel_type": "gauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control Plane Workloads", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"}) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Stuck Terminating", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (((time() - 
kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Atlas Availability", + "panel_id": 27, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Problem Pods", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Workers Ready", + "panel_id": 1, + "panel_type": "gauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: CPU", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: RAM", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: NET (rx+tx)", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) 
((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: I/O (r+w)", + "panel_id": 10, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Sent (1d)", + "panel_id": 30, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Bounces (1d)", + "panel_id": 31, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "max(postmark_outbound_bounced{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Success Rate (1d)", + "panel_id": 32, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Limit Used (30d)", + "panel_id": 33, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit_used_percent)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Postgres Connections Used", + "panel_id": 34, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Postgres Hottest Connections", + "panel_id": 35, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(1, sum by (datname) (pg_stat_activity_count))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Astreae Usage", + "panel_id": 23, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": 
"Asteria Usage", + "panel_id": 24, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Astreae Free", + "panel_id": 25, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Asteria Free", + "panel_id": 26, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "One-off Job Pods (age hours)", + "panel_id": 40, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Ariadne Attempts / Failures", + "panel_id": 41, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[$__interval]))", + "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Ariadne Test Success Rate", + "panel_id": 42, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Tests with Failures (24h)", + "panel_id": 43, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace CPU Share", + "panel_id": 11, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. 
Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace GPU Share", + "panel_id": 12, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace RAM Share", + "panel_id": 13, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Worker Node CPU", + "panel_id": 14, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Worker Node RAM", + "panel_id": 15, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", 
\"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control plane CPU", + "panel_id": 16, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control plane RAM", + "panel_id": 17, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Node Pod Share", + "panel_id": 28, + "panel_type": "piechart", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Top Nodes by Pod Count", + "panel_id": 29, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Cluster Ingress Throughput", + "panel_id": 18, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Cluster Egress Throughput", + "panel_id": 19, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Intra-Cluster Throughput", + "panel_id": 20, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Root Filesystem Usage", + "panel_id": 21, 
+ "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Nodes Closest to Full Root Disks", + "panel_id": 22, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Problem Pods", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Stuck Terminating (>10m)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Control Plane Workloads", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Pods Not Running", + "panel_id": 5, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 6, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(time() - kube_pod_created{pod!=\"\"}) * 
on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) max by (namespace,pod,container,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Terminating >10m", + "panel_id": 7, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(((time() - kube_pod_deletion_timestamp{pod!=\"\"}) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) * on(namespace,pod) group_left(node) kube_pod_info)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Node Pod Share", + "panel_id": 8, + "panel_type": "piechart", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Top Nodes by Pod Count", + "panel_id": 9, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Namespace Plurality by Node v27", + "panel_id": 10, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) 
(kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Usage", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Usage", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Free", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Free", 
+ "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Per-Node Usage", + "panel_id": 5, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Per-Node Usage", + "panel_id": 6, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Usage History", + "panel_id": 7, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Usage History", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Maintenance Sweepers Ready", + "panel_id": 30, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Maintenance Cron Freshness (s)", + "panel_id": 31, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "time() - max by (cronjob) 
(kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=\"image-sweeper\"})" + ] + } +] diff --git a/knowledge/catalog/runbooks.json b/knowledge/catalog/runbooks.json index 0718562..960510d 100644 --- a/knowledge/catalog/runbooks.json +++ b/knowledge/catalog/runbooks.json @@ -85,5 +85,13 @@ "clusters/atlas/<...>" ], "body": "# \n\n## What this is\n\n## For users (how to)\n\n## For operators (where configured)\n\n## Troubleshooting (symptoms \u2192 checks)" + }, + { + "path": "software/metis.md", + "title": "metis", + "tags": [], + "entrypoints": [], + "source_paths": [], + "body": "# Metis (node recovery)\n\n## Node classes (current map)\n- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)\n- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)\n- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks)\n- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent)\n- amd64 agents: titan-22/24 (Debian 13, k3s agent)\n- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, plus any newcomers.\n\n### Jetson nodes (titan-20/21)\n- Ubuntu 20.04.6 (Focal), kernel 5.10.104-tegra, CRI containerd 2.0.5-k3s2, arch arm64.\n- Storage: NVMe 232G at / (ext4); onboard mmc partitions present but root on NVMe; 1.9T sda present (unused).\n- k3s agent with drop-in 99-nofile.conf.\n\n## Longhorn disk UUIDs (critical nodes)\n- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4)\n- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4)\n- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4)\n- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4)\n\n## Metis repo (~/Development/metis)\n- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`).\n- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints).\n- `AGENTS.md` in repo is untracked and holds raw notes.\n\n## Next implementation steps\n- Add per-class golden image refs and checksums (Harbor or file://) when ready.\n- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. 
Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths.\n- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection.\n- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited.\n\n## Node OS/Kernel/CRI snapshot (Jan 2026)\n- titan-04: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-05: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-06: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-07: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-08: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-09: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0a: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0b: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0c: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-10: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-11: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-12: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-13: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-14: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-15: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-17: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-18: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-19: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-20: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-21: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-22: Debian 13 (trixie), kernel 6.12.41+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64\n- titan-24: Debian 13 (trixie), kernel 6.12.57+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64\n\n\n### External hosts\n- titan-db: Ubuntu 24.10, kernel 6.11.0-1015-raspi, root on /dev/sda2 ext4 (465G), boot vfat /dev/sda1; PostgreSQL service enabled.\n- titan-jh: Arch Linux ARM (rolling), kernel 6.18.4-2-rpi, NVMe root ext4 238G (/), boot vfat 512M; ~495 packages installed (pacman -Q).\n- titan-23/oceanus: TODO audit (future).\n\n\n### Control plane Pis (titan-0a/0b/0c)\n- Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, containerd 2.0.5-k3s2.\n- Storage: 477G SSD root (/dev/sda2 ext4), /boot/firmware vfat (/dev/sda1). 
fstab uses LABEL=writable and LABEL=system-boot.\n- k3s server (control-plane taint expected); etcd snapshots not yet cataloged (TODO).\n\n\n## k3s versions\n- rpi5 workers/control-plane: k3s v1.33.3+k3s1 (crictl v1.31.0-k3s2)\n- rpi4 nodes: k3s v1.31.5+k3s1 (crictl v1.31.0-k3s2)\n- Jetson titan-20/21: k3s v1.33.3+k3s1 (per node info), crictl v1.31.0-k3s2" } ] diff --git a/knowledge/diagrams/atlas-http.mmd b/knowledge/diagrams/atlas-http.mmd index ab7c362..1aa7ac8 100644 --- a/knowledge/diagrams/atlas-http.mmd +++ b/knowledge/diagrams/atlas-http.mmd @@ -17,6 +17,11 @@ flowchart LR host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"] svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend + host_budget_bstein_dev["budget.bstein.dev"] + svc_finance_actual_budget["finance/actual-budget (Service)"] + host_budget_bstein_dev --> svc_finance_actual_budget + wl_finance_actual_budget["finance/actual-budget (Deployment)"] + svc_finance_actual_budget --> wl_finance_actual_budget host_call_live_bstein_dev["call.live.bstein.dev"] svc_comms_element_call["comms/element-call (Service)"] host_call_live_bstein_dev --> svc_comms_element_call @@ -37,6 +42,11 @@ flowchart LR host_cloud_bstein_dev --> svc_nextcloud_nextcloud wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"] svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud + host_health_bstein_dev["health.bstein.dev"] + svc_health_wger["health/wger (Service)"] + host_health_bstein_dev --> svc_health_wger + wl_health_wger["health/wger (Deployment)"] + svc_health_wger --> wl_health_wger host_kit_live_bstein_dev["kit.live.bstein.dev"] svc_comms_livekit_token_service["comms/livekit-token-service (Service)"] host_kit_live_bstein_dev --> svc_comms_livekit_token_service @@ -50,6 +60,14 @@ flowchart LR host_live_bstein_dev --> svc_comms_matrix_wellknown svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"] host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse + svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"] + host_live_bstein_dev --> svc_comms_matrix_guest_register + wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"] + svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register + svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"] + host_live_bstein_dev --> svc_comms_matrix_authentication_service + wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"] + svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service host_logs_bstein_dev["logs.bstein.dev"] svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"] host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs @@ -64,21 +82,20 @@ flowchart LR svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"] host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front host_matrix_live_bstein_dev["matrix.live.bstein.dev"] - svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"] host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service - wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"] - svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown host_matrix_live_bstein_dev --> 
svc_comms_othrys_synapse_matrix_synapse - svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"] host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register - wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"] - svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register host_monero_bstein_dev["monero.bstein.dev"] svc_crypto_monerod["crypto/monerod (Service)"] host_monero_bstein_dev --> svc_crypto_monerod wl_crypto_monerod["crypto/monerod (Deployment)"] svc_crypto_monerod --> wl_crypto_monerod + host_money_bstein_dev["money.bstein.dev"] + svc_finance_firefly["finance/firefly (Service)"] + host_money_bstein_dev --> svc_finance_firefly + wl_finance_firefly["finance/firefly (Deployment)"] + svc_finance_firefly --> wl_finance_firefly host_notes_bstein_dev["notes.bstein.dev"] svc_outline_outline["outline/outline (Service)"] host_notes_bstein_dev --> svc_outline_outline @@ -143,19 +160,29 @@ flowchart LR svc_comms_livekit wl_comms_livekit svc_comms_othrys_synapse_matrix_synapse - svc_comms_matrix_authentication_service - wl_comms_matrix_authentication_service svc_comms_matrix_guest_register wl_comms_matrix_guest_register + svc_comms_matrix_authentication_service + wl_comms_matrix_authentication_service end subgraph crypto[crypto] svc_crypto_monerod wl_crypto_monerod end + subgraph finance[finance] + svc_finance_actual_budget + wl_finance_actual_budget + svc_finance_firefly + wl_finance_firefly + end subgraph gitea[gitea] svc_gitea_gitea wl_gitea_gitea end + subgraph health[health] + svc_health_wger + wl_health_wger + end subgraph jellyfin[jellyfin] svc_jellyfin_pegasus wl_jellyfin_pegasus diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 0931b48..5db798d 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -70,6 +70,7 @@ WORKER_NODES = [ "titan-13", "titan-14", "titan-15", + "titan-16", "titan-17", "titan-18", "titan-19", @@ -207,7 +208,66 @@ def namespace_ram_raw(scope_var): def namespace_gpu_usage_instant(scope_var): - return f"sum(DCGM_FI_DEV_GPU_UTIL{{{namespace_gpu_selector(scope_var)}}}) by (namespace)" + return gpu_usage_by_namespace(scope_var) + + +def jetson_gpu_util_by_node(): + return 'max by (node) (jetson_gr3d_freq_percent{node!=""})' + + +def dcgm_gpu_util_by_node(): + dcgm_pod = 'label_replace(DCGM_FI_DEV_GPU_UTIL, "pod", "$1", "Hostname", "(.*)")' + dcgm_ns = 'label_replace(' + dcgm_pod + ', "namespace", "monitoring", "", "")' + return ( + "avg by (node) (" + f"{dcgm_ns} * on(namespace,pod) group_left(node) " + 'kube_pod_info{namespace="monitoring"}' + ")" + ) + + +def gpu_util_by_node(): + return f"{dcgm_gpu_util_by_node()} or {jetson_gpu_util_by_node()}" + + +def gpu_util_by_hostname(): + return 'label_replace(' + gpu_util_by_node() + ', "Hostname", "$1", "node", "(.*)")' + + +def gpu_node_labels(): + return 'kube_node_labels{label_accelerator=~".+"} or kube_node_labels{label_jetson="true"}' + + +def gpu_requests_by_namespace_node(scope_var): + return ( + "sum by (namespace,node) (" + f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} ' + "* on(namespace,pod) group_left(node) kube_pod_info " + f"* on(node) group_left() ({gpu_node_labels()})" + ")" + ) + + +def gpu_usage_by_namespace(scope_var): + requests_by_ns = gpu_requests_by_namespace_node(scope_var) + total_by_node = f"sum by (node) ({requests_by_ns})" + return ( + "sum by (namespace) (" + f"({requests_by_ns}) / clamp_min({total_by_node}, 1) " + f"* on(node) 
group_left() ({gpu_util_by_node()})" + ")" + ) + + +def jetson_gpu_usage_by_namespace(scope_var): + requests_by_ns = jetson_gpu_requests(scope_var) + total_by_node = f"sum by (node) ({requests_by_ns})" + return ( + "sum by (namespace) (" + f"({requests_by_ns}) / clamp_min({total_by_node}, 1) " + f"* on(node) group_left() {jetson_gpu_util_by_node()}" + ")" + ) def namespace_share_expr(resource_expr): @@ -227,7 +287,7 @@ def namespace_gpu_share_expr(scope_var): usage = namespace_gpu_usage_instant(scope_var) total = f"(sum({usage}) or on() vector(0))" share = f"100 * ({usage}) / clamp_min({total}, 1)" - idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)" + idle = 'label_replace(vector(100), "namespace", "idle", "", "") * scalar(' + total + " == bool 0)" return f"({share}) or ({idle})" @@ -333,9 +393,60 @@ GLUE_STALE = f"({GLUE_LAST_SUCCESS_AGE} > bool {GLUE_STALE_WINDOW_SEC})" GLUE_MISSING = f"({GLUE_JOBS} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time)" GLUE_STALE_ACTIVE = f"({GLUE_STALE} unless on(namespace,cronjob) {GLUE_SUSPENDED})" GLUE_MISSING_ACTIVE = f"({GLUE_MISSING} unless on(namespace,cronjob) {GLUE_SUSPENDED})" -GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE}))" -GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE})" -GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})" +GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE})) or on() vector(0)" +GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE}) or on() vector(0)" +GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED}) or on() vector(0)" +ARIADNE_TASK_ERRORS_RANGE = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[$__range]))' +ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))' +ARIADNE_TASK_ERRORS_1H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[1h]))' +ARIADNE_TASK_ERRORS_30D = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[30d]))' +ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))' +ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))' +ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))' +ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))' +ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))' +ARIADNE_TASK_ATTEMPTS_SERIES = 'sum(increase(ariadne_task_runs_total[$__interval]))' +ARIADNE_TASK_FAILURES_SERIES = 'sum(increase(ariadne_task_runs_total{status="error"}[$__interval]))' +ARIADNE_TASK_WARNINGS_SERIES = ( + 'sum(increase(ariadne_task_runs_total{status!~"ok|error"}[$__interval])) or on() vector(0)' +) +ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600" +ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600" +ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS = ( + "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600" +) +ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = ( + "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600" +) +ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" +ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}' +ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}' +ARIADNE_TEST_SUCCESS_RATE = ( + "100 * " + 
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[30d])) ' + "/ clamp_min(" + 'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[30d])), 1)' +) +ARIADNE_TEST_FAILURES_24H = ( + 'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))' +) +POSTGRES_CONN_USED = ( + 'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") ' + 'or label_replace(max(pg_settings_max_connections), "conn", "max", "__name__", ".*")' +) +POSTGRES_CONN_HOTTEST = 'topk(1, sum by (datname) (pg_stat_activity_count))' +ONEOFF_JOB_OWNER = ( + 'label_replace(kube_job_owner{owner_kind="CronJob"}, "owner_name", "$1", "job_name", "(.*)")' +) +ONEOFF_JOB_PODS = f'(kube_pod_owner{{owner_kind="Job"}} unless on(namespace, owner_name) {ONEOFF_JOB_OWNER})' +ONEOFF_JOB_POD_AGE_HOURS = ( + '((time() - kube_pod_start_time{pod!=""}) / 3600) ' + f'* on(namespace,pod) group_left(owner_name) {ONEOFF_JOB_PODS} ' + '* on(namespace,pod) group_left(phase) ' + 'max by (namespace,pod,phase) (kube_pod_status_phase{phase=~"Running|Succeeded"})' +) +GLUE_LAST_SUCCESS_RANGE_HOURS = f"(time() - max_over_time({GLUE_LAST_SUCCESS}[$__range])) / 3600" +GLUE_LAST_SCHEDULE_RANGE_HOURS = f"(time() - max_over_time({GLUE_LAST_SCHEDULE}[$__range])) / 3600" GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"] GPU_NODE_REGEX = "|".join(GPU_NODES) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" @@ -513,6 +624,7 @@ def timeseries_panel( grid, *, unit="none", + max_value=None, legend=None, legend_display="table", legend_placement="bottom", @@ -537,6 +649,8 @@ def timeseries_panel( "tooltip": {"mode": "multi"}, }, } + if max_value is not None: + panel["fieldConfig"]["defaults"]["max"] = max_value if legend: panel["targets"][0]["legendFormat"] = legend if legend_calcs: @@ -688,13 +802,22 @@ def bargauge_panel( grid, *, unit="none", + legend=None, links=None, limit=None, + sort_order="desc", thresholds=None, decimals=None, instant=False, + overrides=None, ): """Return a bar gauge panel with label-aware reduction.""" + cleaned_expr = expr.strip() + if not cleaned_expr.startswith(("sort(", "sort_desc(")): + if sort_order == "desc": + expr = f"sort_desc({expr})" + elif sort_order == "asc": + expr = f"sort({expr})" panel = { "id": panel_id, "type": "bargauge", @@ -702,7 +825,12 @@ def bargauge_panel( "datasource": PROM_DS, "gridPos": grid, "targets": [ - {"expr": expr, "refId": "A", "legendFormat": "{{node}}", **({"instant": True} if instant else {})} + { + "expr": expr, + "refId": "A", + "legendFormat": legend or "{{node}}", + **({"instant": True} if instant else {}), + } ], "fieldConfig": { "defaults": { @@ -732,6 +860,8 @@ def bargauge_panel( }, }, } + if overrides: + panel["fieldConfig"]["overrides"].extend(overrides) if decimals is not None: panel["fieldConfig"]["defaults"]["decimals"] = decimals if links: @@ -740,7 +870,7 @@ def bargauge_panel( panel["transformations"] = [ { "id": "sortBy", - "options": {"fields": ["Value"], "order": "desc"}, + "options": {"fields": ["Value"], "order": sort_order}, } ] if limit: @@ -780,6 +910,15 @@ def build_overview(): {"color": "red", "value": 3}, ], } + age_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 6}, + {"color": "orange", "value": 24}, + {"color": "red", "value": 48}, + ], + } row1_stats = [ { @@ -982,7 +1121,7 @@ def build_overview(): 30, "Mail Sent (1d)", 
'max(postmark_outbound_sent{window="1d"})', - {"h": 2, "w": 5, "x": 0, "y": 8}, + {"h": 3, "w": 4, "x": 0, "y": 8}, unit="none", links=link_to("atlas-mail"), ) @@ -993,7 +1132,7 @@ def build_overview(): "type": "stat", "title": "Mail Bounces (1d)", "datasource": PROM_DS, - "gridPos": {"h": 2, "w": 5, "x": 10, "y": 8}, + "gridPos": {"h": 3, "w": 4, "x": 8, "y": 8}, "targets": [ { "expr": 'max(postmark_outbound_bounce_rate{window="1d"})', @@ -1039,7 +1178,7 @@ def build_overview(): 32, "Mail Success Rate (1d)", 'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)', - {"h": 2, "w": 5, "x": 5, "y": 8}, + {"h": 3, "w": 4, "x": 4, "y": 8}, unit="percent", thresholds=mail_success_thresholds, decimals=1, @@ -1051,13 +1190,38 @@ def build_overview(): 33, "Mail Limit Used (30d)", "max(postmark_sending_limit_used_percent)", - {"h": 2, "w": 5, "x": 15, "y": 8}, + {"h": 3, "w": 4, "x": 12, "y": 8}, unit="percent", thresholds=mail_limit_thresholds, decimals=1, links=link_to("atlas-mail"), ) ) + panels.append( + stat_panel( + 34, + "Postgres Connections Used", + POSTGRES_CONN_USED, + {"h": 3, "w": 4, "x": 16, "y": 8}, + decimals=0, + text_mode="name_and_value", + legend="{{conn}}", + instant=True, + ) + ) + panels.append( + stat_panel( + 35, + "Postgres Hottest Connections", + POSTGRES_CONN_HOTTEST, + {"h": 3, "w": 4, "x": 20, "y": 8}, + unit="none", + decimals=0, + text_mode="name_and_value", + legend="{{datname}}", + instant=True, + ) + ) storage_panels = [ (23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"), @@ -1071,13 +1235,104 @@ def build_overview(): panel_id, title, expr, - {"h": 6, "w": 6, "x": 6 * idx, "y": 10}, + {"h": 3, "w": 6, "x": 6 * idx, "y": 11}, unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, links=link_to("atlas-storage"), ) ) + panels.append( + bargauge_panel( + 40, + "One-off Job Pods (age hours)", + ONEOFF_JOB_POD_AGE_HOURS, + {"h": 6, "w": 6, "x": 0, "y": 14}, + unit="h", + instant=True, + legend="{{namespace}}/{{pod}}", + thresholds=age_thresholds, + limit=8, + decimals=2, + ) + ) + panels.append( + { + "id": 41, + "type": "timeseries", + "title": "Ariadne Attempts / Failures", + "datasource": PROM_DS, + "gridPos": {"h": 6, "w": 6, "x": 6, "y": 14}, + "targets": [ + {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"}, + {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"}, + ], + "fieldConfig": { + "defaults": {"unit": "none"}, + "overrides": [ + { + "matcher": {"id": "byName", "options": "Attempts"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}} + ], + }, + { + "matcher": {"id": "byName", "options": "Failures"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}} + ], + }, + ], + }, + "options": { + "legend": {"displayMode": "table", "placement": "right"}, + "tooltip": {"mode": "multi"}, + }, + } + ) + panels.append( + timeseries_panel( + 42, + "Ariadne Test Success Rate", + ARIADNE_TEST_SUCCESS_RATE, + {"h": 6, "w": 6, "x": 12, "y": 14}, + unit="percent", + max_value=100, + legend=None, + legend_display="list", + ) + ) + panels.append( + bargauge_panel( + 43, + "Tests with Failures (24h)", + ARIADNE_TEST_FAILURES_24H, + {"h": 6, "w": 6, "x": 18, "y": 14}, + unit="none", + instant=True, + legend="{{result}}", + overrides=[ + { + "matcher": {"id": "byName", "options": "error"}, + "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}}], + }, + { + "matcher": {"id": 
"byName", "options": "failed"}, + "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}], + }, + ], + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 1}, + {"color": "orange", "value": 5}, + {"color": "red", "value": 10}, + ], + }, + ) + ) + cpu_scope = "$namespace_scope_cpu" gpu_scope = "$namespace_scope_gpu" ram_scope = "$namespace_scope_ram" @@ -1087,7 +1342,7 @@ def build_overview(): 11, "Namespace CPU Share", namespace_cpu_share_expr(cpu_scope), - {"h": 9, "w": 8, "x": 0, "y": 16}, + {"h": 9, "w": 8, "x": 0, "y": 20}, links=namespace_scope_links("namespace_scope_cpu"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1097,7 +1352,7 @@ def build_overview(): 12, "Namespace GPU Share", namespace_gpu_share_expr(gpu_scope), - {"h": 9, "w": 8, "x": 8, "y": 16}, + {"h": 9, "w": 8, "x": 8, "y": 20}, links=namespace_scope_links("namespace_scope_gpu"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1107,7 +1362,7 @@ def build_overview(): 13, "Namespace RAM Share", namespace_ram_share_expr(ram_scope), - {"h": 9, "w": 8, "x": 16, "y": 16}, + {"h": 9, "w": 8, "x": 16, "y": 20}, links=namespace_scope_links("namespace_scope_ram"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1119,7 +1374,7 @@ def build_overview(): 14, "Worker Node CPU", node_cpu_expr(worker_filter), - {"h": 12, "w": 12, "x": 0, "y": 32}, + {"h": 12, "w": 12, "x": 0, "y": 36}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1133,7 +1388,7 @@ def build_overview(): 15, "Worker Node RAM", node_mem_expr(worker_filter), - {"h": 12, "w": 12, "x": 12, "y": 32}, + {"h": 12, "w": 12, "x": 12, "y": 36}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1148,7 +1403,7 @@ def build_overview(): 16, "Control plane CPU", node_cpu_expr(CONTROL_ALL_REGEX), - {"h": 10, "w": 12, "x": 0, "y": 44}, + {"h": 10, "w": 12, "x": 0, "y": 48}, unit="percent", legend="{{node}}", legend_display="table", @@ -1160,7 +1415,7 @@ def build_overview(): 17, "Control plane RAM", node_mem_expr(CONTROL_ALL_REGEX), - {"h": 10, "w": 12, "x": 12, "y": 44}, + {"h": 10, "w": 12, "x": 12, "y": 48}, unit="percent", legend="{{node}}", legend_display="table", @@ -1173,7 +1428,7 @@ def build_overview(): 28, "Node Pod Share", '(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100', - {"h": 10, "w": 12, "x": 0, "y": 54}, + {"h": 10, "w": 12, "x": 0, "y": 58}, ) ) panels.append( @@ -1181,7 +1436,7 @@ def build_overview(): 29, "Top Nodes by Pod Count", 'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))', - {"h": 10, "w": 12, "x": 12, "y": 54}, + {"h": 10, "w": 12, "x": 12, "y": 58}, unit="none", limit=12, decimals=0, @@ -1203,7 +1458,7 @@ def build_overview(): 18, "Cluster Ingress Throughput", NET_INGRESS_EXPR, - {"h": 7, "w": 8, "x": 0, "y": 25}, + {"h": 7, "w": 8, "x": 0, "y": 29}, unit="Bps", legend="Ingress (Traefik)", legend_display="list", @@ -1216,7 +1471,7 @@ def build_overview(): 19, "Cluster Egress Throughput", NET_EGRESS_EXPR, - {"h": 7, "w": 8, "x": 8, "y": 25}, + {"h": 7, "w": 8, "x": 8, "y": 29}, unit="Bps", legend="Egress (Traefik)", legend_display="list", @@ -1229,7 +1484,7 @@ def build_overview(): 20, "Intra-Cluster Throughput", NET_INTERNAL_EXPR, - {"h": 7, "w": 8, "x": 16, "y": 25}, + {"h": 7, 
"w": 8, "x": 16, "y": 29}, unit="Bps", legend="Internal traffic", legend_display="list", @@ -1243,7 +1498,7 @@ def build_overview(): 21, "Root Filesystem Usage", root_usage_expr(), - {"h": 16, "w": 12, "x": 0, "y": 64}, + {"h": 16, "w": 12, "x": 0, "y": 68}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1258,7 +1513,7 @@ def build_overview(): 22, "Nodes Closest to Full Root Disks", f"topk(12, {root_usage_expr()})", - {"h": 16, "w": 12, "x": 12, "y": 64}, + {"h": 16, "w": 12, "x": 12, "y": 68}, unit="percent", thresholds=PERCENT_THRESHOLDS, links=link_to("atlas-storage"), @@ -2153,16 +2408,103 @@ def build_mail_dashboard(): } -def build_testing_dashboard(): +def build_jobs_dashboard(): panels = [] - sort_desc = [{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}] + age_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 6}, + {"color": "orange", "value": 24}, + {"color": "red", "value": 48}, + ], + } + recent_error_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "red", "value": None}, + {"color": "orange", "value": 1}, + {"color": "yellow", "value": 6}, + {"color": "green", "value": 24}, + ], + } + + task_error_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 1}, + {"color": "orange", "value": 3}, + {"color": "red", "value": 5}, + ], + } panels.append( - stat_panel( + bargauge_panel( 1, + "Ariadne Task Errors (range)", + ARIADNE_TASK_ERRORS_RANGE, + {"h": 7, "w": 8, "x": 0, "y": 0}, + unit="none", + instant=True, + legend="{{task}}", + thresholds=task_error_thresholds, + ) + ) + panels.append( + { + "id": 2, + "type": "timeseries", + "title": "Ariadne Attempts / Failures", + "datasource": PROM_DS, + "gridPos": {"h": 7, "w": 8, "x": 8, "y": 0}, + "targets": [ + {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"}, + {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"}, + ], + "fieldConfig": { + "defaults": {"unit": "none"}, + "overrides": [ + { + "matcher": {"id": "byName", "options": "Attempts"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}} + ], + }, + { + "matcher": {"id": "byName", "options": "Failures"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}} + ], + }, + ], + }, + "options": { + "legend": {"displayMode": "table", "placement": "right"}, + "tooltip": {"mode": "multi"}, + }, + } + ) + panels.append( + bargauge_panel( + 3, + "One-off Job Pods (age hours)", + ONEOFF_JOB_POD_AGE_HOURS, + {"h": 7, "w": 8, "x": 16, "y": 0}, + unit="h", + instant=True, + legend="{{namespace}}/{{pod}}", + thresholds=age_thresholds, + limit=12, + decimals=2, + ) + ) + panels.append( + stat_panel( + 4, "Glue Jobs Stale (>36h)", GLUE_STALE_COUNT, - {"h": 4, "w": 6, "x": 0, "y": 0}, + {"h": 4, "w": 4, "x": 0, "y": 7}, unit="none", thresholds={ "mode": "absolute", @@ -2176,64 +2518,164 @@ def build_testing_dashboard(): ) ) panels.append( - table_panel( - 2, - "Glue Jobs Missing Success", - GLUE_MISSING_ACTIVE, - {"h": 4, "w": 6, "x": 6, "y": 0}, - unit="none", - transformations=sort_desc, - instant=True, - ) - ) - panels.append( - table_panel( - 3, - "Glue Jobs Suspended", - GLUE_SUSPENDED, - {"h": 4, "w": 6, "x": 12, "y": 0}, - unit="none", - transformations=sort_desc, - instant=True, - ) - ) - panels.append( - table_panel( - 4, - "Glue Jobs Active Runs", - 
GLUE_ACTIVE, - {"h": 4, "w": 6, "x": 18, "y": 0}, - unit="none", - transformations=sort_desc, - instant=True, - ) - ) - panels.append( - table_panel( + stat_panel( 5, - "Glue Jobs Last Success (hours ago)", - GLUE_LAST_SUCCESS_AGE_HOURS, - {"h": 8, "w": 12, "x": 0, "y": 4}, + "Glue Jobs Missing Success", + GLUE_MISSING_COUNT, + {"h": 4, "w": 4, "x": 4, "y": 7}, + unit="none", + ) + ) + panels.append( + stat_panel( + 6, + "Glue Jobs Suspended", + GLUE_SUSPENDED_COUNT, + {"h": 4, "w": 4, "x": 8, "y": 7}, + unit="none", + ) + ) + panels.append( + stat_panel( + 7, + "Ariadne Task Errors (1h)", + ARIADNE_TASK_ERRORS_1H_TOTAL, + {"h": 4, "w": 4, "x": 12, "y": 7}, + unit="none", + ) + ) + panels.append( + stat_panel( + 8, + "Ariadne Task Errors (24h)", + ARIADNE_TASK_ERRORS_24H_TOTAL, + {"h": 4, "w": 4, "x": 16, "y": 7}, + unit="none", + ) + ) + panels.append( + stat_panel( + 9, + "Ariadne Task Runs (1h)", + ARIADNE_TASK_RUNS_1H_TOTAL, + {"h": 4, "w": 4, "x": 20, "y": 7}, + unit="none", + ) + ) + panels.append( + bargauge_panel( + 10, + "Ariadne Schedule Last Error (hours ago)", + ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS, + {"h": 6, "w": 12, "x": 0, "y": 17}, unit="h", - transformations=sort_desc, instant=True, + legend="{{task}}", + thresholds=recent_error_thresholds, + decimals=2, + ) + ) + panels.append( + bargauge_panel( + 11, + "Ariadne Schedule Last Success (hours ago)", + ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS, + {"h": 6, "w": 12, "x": 12, "y": 17}, + unit="h", + instant=True, + legend="{{task}}", + thresholds=age_thresholds, + decimals=2, + ) + ) + panels.append( + bargauge_panel( + 12, + "Glue Jobs Last Success (hours ago)", + GLUE_LAST_SUCCESS_RANGE_HOURS, + {"h": 6, "w": 12, "x": 0, "y": 23}, + unit="h", + instant=True, + legend="{{namespace}}/{{cronjob}}", + thresholds=age_thresholds, + decimals=2, + ) + ) + panels.append( + bargauge_panel( + 13, + "Glue Jobs Last Schedule (hours ago)", + GLUE_LAST_SCHEDULE_RANGE_HOURS, + {"h": 6, "w": 12, "x": 12, "y": 23}, + unit="h", + instant=True, + legend="{{namespace}}/{{cronjob}}", + thresholds=age_thresholds, + decimals=2, + ) + ) + panels.append( + bargauge_panel( + 14, + "Ariadne Task Errors (1h)", + ARIADNE_TASK_ERRORS_1H, + {"h": 6, "w": 12, "x": 0, "y": 29}, + unit="none", + instant=True, + legend="{{task}}", + thresholds=task_error_thresholds, + ) + ) + panels.append( + bargauge_panel( + 15, + "Ariadne Task Errors (30d)", + ARIADNE_TASK_ERRORS_30D, + {"h": 6, "w": 12, "x": 12, "y": 29}, + unit="none", + instant=True, + legend="{{task}}", + thresholds=task_error_thresholds, + ) + ) + panels.append( + bargauge_panel( + 16, + "Ariadne Access Requests", + ARIADNE_ACCESS_REQUESTS, + {"h": 6, "w": 8, "x": 0, "y": 11}, + unit="none", + instant=True, + legend="{{status}}", + ) + ) + panels.append( + stat_panel( + 17, + "Ariadne CI Coverage (%)", + ARIADNE_CI_COVERAGE, + {"h": 6, "w": 4, "x": 8, "y": 11}, + unit="percent", + decimals=1, + instant=True, + legend="{{branch}}", ) ) panels.append( table_panel( - 6, - "Glue Jobs Last Schedule (hours ago)", - GLUE_LAST_SCHEDULE_AGE_HOURS, - {"h": 8, "w": 12, "x": 12, "y": 4}, - unit="h", - transformations=sort_desc, + 18, + "Ariadne CI Tests (latest)", + ARIADNE_CI_TESTS, + {"h": 6, "w": 12, "x": 12, "y": 11}, + unit="none", + transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}], instant=True, ) ) return { - "uid": "atlas-testing", - "title": "Atlas Testing", + "uid": "atlas-jobs", + "title": "Atlas Jobs", "folderUid": 
PRIVATE_FOLDER, "editable": True, "panels": panels, @@ -2241,7 +2683,7 @@ def build_testing_dashboard(): "annotations": {"list": []}, "schemaVersion": 39, "style": "dark", - "tags": ["atlas", "testing"], + "tags": ["atlas", "jobs", "glue"], } @@ -2274,7 +2716,7 @@ def build_gpu_dashboard(): timeseries_panel( 3, "GPU Util by Node", - 'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})', + gpu_util_by_hostname(), {"h": 8, "w": 12, "x": 0, "y": 8}, unit="percent", legend="{{Hostname}}", @@ -2338,9 +2780,9 @@ DASHBOARDS = { "builder": build_mail_dashboard, "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-mail.yaml", }, - "atlas-testing": { - "builder": build_testing_dashboard, - "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-testing.yaml", + "atlas-jobs": { + "builder": build_jobs_dashboard, + "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-jobs.yaml", }, "atlas-gpu": { "builder": build_gpu_dashboard, diff --git a/scripts/knowledge_render_atlas.py b/scripts/knowledge_render_atlas.py index c7f9f26..1e305cb 100644 --- a/scripts/knowledge_render_atlas.py +++ b/scripts/knowledge_render_atlas.py @@ -20,11 +20,13 @@ import subprocess import sys from dataclasses import dataclass from pathlib import Path +import shutil from typing import Any, Iterable import yaml REPO_ROOT = Path(__file__).resolve().parents[1] +DASHBOARD_DIR = REPO_ROOT / "services" / "monitoring" / "dashboards" CLUSTER_SCOPED_KINDS = { "Namespace", @@ -60,6 +62,70 @@ def _run(cmd: list[str], *, cwd: Path) -> str: return res.stdout +def _sync_tree(source: Path, dest: Path) -> None: + if dest.exists(): + shutil.rmtree(dest) + shutil.copytree(source, dest) + + +def _iter_dashboard_panels(dashboard: dict[str, Any]) -> Iterable[dict[str, Any]]: + panels = dashboard.get("panels") if isinstance(dashboard.get("panels"), list) else [] + for panel in panels: + if not isinstance(panel, dict): + continue + if panel.get("type") == "row" and isinstance(panel.get("panels"), list): + yield from _iter_dashboard_panels({"panels": panel.get("panels")}) + continue + yield panel + + +def _extract_metrics_index(dashboard_dir: Path) -> list[dict[str, Any]]: + index: list[dict[str, Any]] = [] + for path in sorted(dashboard_dir.glob("*.json")): + try: + data = json.loads(path.read_text(encoding="utf-8")) + except json.JSONDecodeError: + continue + if not isinstance(data, dict): + continue + dash_title = data.get("title") or path.stem + dash_tags = data.get("tags") or [] + for panel in _iter_dashboard_panels(data): + targets = panel.get("targets") + if not isinstance(targets, list): + continue + exprs: list[str] = [] + for target in targets: + if not isinstance(target, dict): + continue + expr = target.get("expr") + if isinstance(expr, str) and expr.strip(): + exprs.append(expr.strip()) + if not exprs: + continue + datasource = panel.get("datasource") or {} + if isinstance(datasource, dict): + ds_uid = datasource.get("uid") + ds_type = datasource.get("type") + else: + ds_uid = None + ds_type = None + index.append( + { + "dashboard": dash_title, + "panel_title": panel.get("title") or "", + "panel_id": panel.get("id"), + "panel_type": panel.get("type"), + "description": panel.get("description") or "", + "tags": dash_tags, + "datasource_uid": ds_uid, + "datasource_type": ds_type, + "exprs": exprs, + } + ) + return index + + def kustomize_build(path: Path) -> str: rel = path.relative_to(REPO_ROOT) try: @@ -472,6 +538,11 @@ def main() -> int: action="store_true", help="Write generated files (otherwise just 
print a summary).", ) + ap.add_argument( + "--sync-comms", + action="store_true", + help="Mirror rendered knowledge into services/comms/knowledge for atlasbot.", + ) args = ap.parse_args() out_dir = REPO_ROOT / args.out @@ -504,6 +575,7 @@ def main() -> int: summary_path = out_dir / "catalog" / "atlas-summary.json" diagram_path = out_dir / "diagrams" / "atlas-http.mmd" runbooks_json_path = out_dir / "catalog" / "runbooks.json" + metrics_json_path = out_dir / "catalog" / "metrics.json" catalog_rel = catalog_path.relative_to(REPO_ROOT).as_posix() catalog_path.write_text( @@ -517,9 +589,14 @@ def main() -> int: diagram_path.write_text(diagram, encoding="utf-8") # Render runbooks into JSON for lightweight, dependency-free consumption in-cluster. - runbooks_dir = out_dir / "runbooks" + runbook_dirs = [ + out_dir / "runbooks", + out_dir / "software", + ] runbooks: list[dict[str, Any]] = [] - if runbooks_dir.exists(): + for runbooks_dir in runbook_dirs: + if not runbooks_dir.exists(): + continue for md_file in sorted(runbooks_dir.glob("*.md")): raw = md_file.read_text(encoding="utf-8") fm: dict[str, Any] = {} @@ -543,12 +620,22 @@ def main() -> int: } ) runbooks_json_path.write_text(json.dumps(runbooks, indent=2, sort_keys=False) + "\n", encoding="utf-8") + metrics_index = _extract_metrics_index(DASHBOARD_DIR) + metrics_json_path.write_text( + json.dumps(metrics_index, indent=2, sort_keys=False) + "\n", encoding="utf-8" + ) print(f"Wrote {catalog_path.relative_to(REPO_ROOT)}") print(f"Wrote {catalog_json_path.relative_to(REPO_ROOT)}") print(f"Wrote {summary_path.relative_to(REPO_ROOT)}") print(f"Wrote {diagram_path.relative_to(REPO_ROOT)}") print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}") + print(f"Wrote {metrics_json_path.relative_to(REPO_ROOT)}") + + if args.sync_comms: + comms_dir = REPO_ROOT / "services" / "comms" / "knowledge" + _sync_tree(out_dir, comms_dir) + print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {comms_dir.relative_to(REPO_ROOT)}") return 0 diff --git a/services/ai-llm/deployment.yaml b/services/ai-llm/deployment.yaml index fa35440..bf012c0 100644 --- a/services/ai-llm/deployment.yaml +++ b/services/ai-llm/deployment.yaml @@ -20,8 +20,9 @@ spec: labels: app: ollama annotations: - ai.bstein.dev/model: qwen2.5-coder:7b-instruct-q4_0 - ai.bstein.dev/gpu: GPU pool (titan-20/21/22/24) + ai.bstein.dev/model: qwen2.5:14b-instruct-q4_0 + ai.bstein.dev/gpu: GPU pool (titan-22/24) + ai.bstein.dev/restartedAt: "2026-01-26T12:00:00Z" spec: affinity: nodeAffinity: @@ -31,8 +32,6 @@ spec: - key: kubernetes.io/hostname operator: In values: - - titan-20 - - titan-21 - titan-22 - titan-24 runtimeClassName: nvidia @@ -53,7 +52,7 @@ spec: - name: OLLAMA_MODELS value: /root/.ollama - name: OLLAMA_MODEL - value: qwen2.5-coder:7b-instruct-q4_0 + value: qwen2.5:14b-instruct-q4_0 command: - /bin/sh - -c @@ -68,8 +67,8 @@ spec: mountPath: /root/.ollama resources: requests: - cpu: 250m - memory: 1Gi + cpu: 500m + memory: 2Gi nvidia.com/gpu.shared: 1 limits: nvidia.com/gpu.shared: 1 @@ -96,10 +95,10 @@ spec: mountPath: /root/.ollama resources: requests: - cpu: "2" - memory: 8Gi + cpu: "4" + memory: 16Gi nvidia.com/gpu.shared: 1 limits: - cpu: "4" - memory: 12Gi + cpu: "8" + memory: 24Gi nvidia.com/gpu.shared: 1 diff --git a/services/bstein-dev-home/backend-deployment.yaml b/services/bstein-dev-home/backend-deployment.yaml index 376622c..ba7d6f8 100644 --- a/services/bstein-dev-home/backend-deployment.yaml +++ b/services/bstein-dev-home/backend-deployment.yaml @@ -28,6 +28,7 @@ spec: {{ with 
secret "kv/data/atlas/shared/chat-ai-keys-runtime" }} export CHAT_KEY_MATRIX="{{ .Data.data.matrix }}" export CHAT_KEY_HOMEPAGE="{{ .Data.data.homepage }}" + export AI_ATLASBOT_TOKEN="{{ .Data.data.homepage }}" {{ end }} {{ with secret "kv/data/atlas/shared/portal-e2e-client" }} export PORTAL_E2E_CLIENT_ID="{{ .Data.data.client_id }}" @@ -58,14 +59,18 @@ spec: args: - >- . /vault/secrets/portal-env.sh - && exec gunicorn -b 0.0.0.0:8080 --workers 2 --timeout 180 app:app + && exec gunicorn -b 0.0.0.0:8080 --workers 2 --timeout 600 app:app env: - name: AI_CHAT_API value: http://ollama.ai.svc.cluster.local:11434 - name: AI_CHAT_MODEL value: qwen2.5-coder:7b-instruct-q4_0 - name: AI_CHAT_TIMEOUT_SEC - value: "60" + value: "480" + - name: AI_ATLASBOT_ENDPOINT + value: http://atlasbot.comms.svc.cluster.local:8090/v1/answer + - name: AI_ATLASBOT_TIMEOUT_SEC + value: "30" - name: AI_NODE_NAME valueFrom: fieldRef: @@ -91,10 +96,28 @@ spec: value: atlas - name: KEYCLOAK_ADMIN_CLIENT_ID value: bstein-dev-home-admin + - name: ARIADNE_URL + value: http://ariadne.maintenance.svc.cluster.local + - name: ARIADNE_TIMEOUT_SEC + value: "10" - name: ACCOUNT_ALLOWED_GROUPS value: "" - name: HTTP_CHECK_TIMEOUT_SEC value: "2" + - name: PORTAL_DB_POOL_MIN + value: "0" + - name: PORTAL_DB_POOL_MAX + value: "5" + - name: PORTAL_DB_CONNECT_TIMEOUT_SEC + value: "5" + - name: PORTAL_DB_LOCK_TIMEOUT_SEC + value: "5" + - name: PORTAL_DB_STATEMENT_TIMEOUT_SEC + value: "30" + - name: PORTAL_DB_IDLE_IN_TX_TIMEOUT_SEC + value: "10" + - name: PORTAL_RUN_MIGRATIONS + value: "false" - name: ACCESS_REQUEST_SUBMIT_RATE_LIMIT value: "30" - name: ACCESS_REQUEST_SUBMIT_RATE_WINDOW_SEC diff --git a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml index 40d74fe..e572406 100644 --- a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml +++ b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml @@ -47,6 +47,8 @@ spec: env: - name: UPSTREAM_URL value: http://bstein-dev-home-backend/api/chat + - name: UPSTREAM_TIMEOUT_SEC + value: "600" ports: - name: http containerPort: 8080 @@ -65,10 +67,10 @@ spec: resources: requests: cpu: 20m - memory: 64Mi + memory: 128Mi limits: cpu: 200m - memory: 256Mi + memory: 512Mi volumeMounts: - name: code mountPath: /app/gateway.py diff --git a/services/bstein-dev-home/image.yaml b/services/bstein-dev-home/image.yaml index 3b6c757..eed2736 100644 --- a/services/bstein-dev-home/image.yaml +++ b/services/bstein-dev-home/image.yaml @@ -7,6 +7,8 @@ metadata: spec: image: registry.bstein.dev/bstein/bstein-dev-home-frontend interval: 1m0s + secretRef: + name: harbor-regcred --- apiVersion: image.toolkit.fluxcd.io/v1beta2 kind: ImagePolicy @@ -28,6 +30,8 @@ metadata: spec: image: registry.bstein.dev/bstein/bstein-dev-home-backend interval: 1m0s + secretRef: + name: harbor-regcred --- apiVersion: image.toolkit.fluxcd.io/v1beta2 kind: ImagePolicy diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index f9d3c87..f62fb17 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -16,13 +16,13 @@ resources: - backend-deployment.yaml - backend-service.yaml - vaultwarden-cred-sync-cronjob.yaml - - portal-onboarding-e2e-test-job.yaml + - oneoffs/portal-onboarding-e2e-test-job.yaml - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-102 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"} + 
newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-103 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"} + newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home diff --git a/services/bstein-dev-home/oneoffs/migrations/kustomization.yaml b/services/bstein-dev-home/oneoffs/migrations/kustomization.yaml new file mode 100644 index 0000000..1d1dfc8 --- /dev/null +++ b/services/bstein-dev-home/oneoffs/migrations/kustomization.yaml @@ -0,0 +1,6 @@ +# services/bstein-dev-home/oneoffs/migrations/kustomization.yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: bstein-dev-home +resources: + - portal-migrate-job.yaml diff --git a/services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml b/services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml new file mode 100644 index 0000000..1f7e092 --- /dev/null +++ b/services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml @@ -0,0 +1,48 @@ +# services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml +# One-off job for bstein-dev-home/bstein-dev-home-portal-migrate-36. +# Purpose: bstein dev home portal migrate 36 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. +apiVersion: batch/v1 +kind: Job +metadata: + name: bstein-dev-home-portal-migrate-36 + namespace: bstein-dev-home + annotations: + kustomize.toolkit.fluxcd.io/force: "true" +spec: + suspend: true + backoffLimit: 1 + ttlSecondsAfterFinished: 3600 + template: + metadata: + labels: + app: bstein-dev-home-portal-migrate + annotations: + vault.hashicorp.com/agent-inject: "true" + vault.hashicorp.com/role: "bstein-dev-home" + vault.hashicorp.com/agent-inject-secret-portal-env.sh: "kv/data/atlas/portal/atlas-portal-db" + vault.hashicorp.com/agent-inject-template-portal-env.sh: | + {{ with secret "kv/data/atlas/portal/atlas-portal-db" }} + export PORTAL_DATABASE_URL="{{ .Data.data.PORTAL_DATABASE_URL }}" + {{ end }} + spec: + serviceAccountName: bstein-dev-home + restartPolicy: Never + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" + imagePullSecrets: + - name: harbor-regcred + containers: + - name: migrate + image: registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-95 + imagePullPolicy: Always + command: ["/bin/sh", "-c"] + args: + - >- + . /vault/secrets/portal-env.sh + && exec python -m atlas_portal.migrate + env: + - name: PORTAL_RUN_MIGRATIONS + value: "true" diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml similarity index 88% rename from services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml rename to services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml index f22272e..9923499 100644 --- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml +++ b/services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml @@ -1,10 +1,15 @@ -# services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml +# services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml +# One-off job for bstein-dev-home/portal-onboarding-e2e-test-27. +# Purpose: portal onboarding e2e test 27 (see container args/env in this file). 
+# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: - name: portal-onboarding-e2e-test-19 + name: portal-onboarding-e2e-test-27 namespace: bstein-dev-home spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/bstein-dev-home/scripts/gateway.py b/services/bstein-dev-home/scripts/gateway.py index 3ca2fa1..19d3606 100644 --- a/services/bstein-dev-home/scripts/gateway.py +++ b/services/bstein-dev-home/scripts/gateway.py @@ -6,6 +6,7 @@ from urllib import request, error UPSTREAM = os.environ.get("UPSTREAM_URL", "http://bstein-dev-home-backend/api/chat") KEY_MATRIX = os.environ.get("CHAT_KEY_MATRIX", "") KEY_HOMEPAGE = os.environ.get("CHAT_KEY_HOMEPAGE", "") +UPSTREAM_TIMEOUT_SEC = float(os.environ.get("UPSTREAM_TIMEOUT_SEC", "90")) ALLOWED = {k for k in (KEY_MATRIX, KEY_HOMEPAGE) if k} @@ -41,7 +42,7 @@ class Handler(BaseHTTPRequestHandler): headers={"Content-Type": "application/json"}, method="POST", ) - with request.urlopen(upstream_req, timeout=90) as resp: + with request.urlopen(upstream_req, timeout=UPSTREAM_TIMEOUT_SEC) as resp: data = resp.read() self.send_response(resp.status) for k, v in resp.headers.items(): diff --git a/services/bstein-dev-home/secretproviderclass.yaml b/services/bstein-dev-home/secretproviderclass.yaml index f330fe6..2fa714a 100644 --- a/services/bstein-dev-home/secretproviderclass.yaml +++ b/services/bstein-dev-home/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "bstein-dev-home" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/bstein-dev-home" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml b/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml index 29141fe..acd851b 100644 --- a/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml +++ b/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml @@ -8,6 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "*/15 * * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 4618053..b65aef0 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-4 + checksum/atlasbot-configmap: manual-atlasbot-101 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" @@ -73,12 +73,33 @@ spec: value: /kb - name: VM_URL value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428 + - name: ARIADNE_STATE_URL + value: http://ariadne.maintenance.svc.cluster.local/api/internal/cluster/state - name: BOT_USER value: atlasbot + - name: BOT_MENTIONS + value: atlasbot,aatlasbot,atlas_quick,atlas_smart - name: OLLAMA_URL - value: https://chat.ai.bstein.dev/ + value: http://ollama.ai.svc.cluster.local:11434 - name: OLLAMA_MODEL - value: qwen2.5-coder:7b-instruct-q4_0 + value: qwen2.5:14b-instruct + - name: ATLASBOT_MODEL_FAST + value: qwen2.5:14b-instruct-q4_0 + - name: ATLASBOT_MODEL_DEEP + value: qwen2.5:14b-instruct + - name: 
OLLAMA_FALLBACK_MODEL + value: qwen2.5:14b-instruct-q4_0 + - name: OLLAMA_TIMEOUT_SEC + value: "600" + - name: ATLASBOT_THINKING_INTERVAL_SEC + value: "120" + - name: ATLASBOT_SNAPSHOT_TTL_SEC + value: "30" + - name: ATLASBOT_HTTP_PORT + value: "8090" + ports: + - name: http + containerPort: 8090 resources: requests: cpu: 100m @@ -110,6 +131,8 @@ spec: path: catalog/atlas.json - key: atlas-summary.json path: catalog/atlas-summary.json + - key: metrics.json + path: catalog/metrics.json - key: runbooks.json path: catalog/runbooks.json - key: atlas-http.mmd diff --git a/services/comms/atlasbot-service.yaml b/services/comms/atlasbot-service.yaml new file mode 100644 index 0000000..c8b3570 --- /dev/null +++ b/services/comms/atlasbot-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: atlasbot + namespace: comms + labels: + app: atlasbot +spec: + selector: + app: atlasbot + ports: + - name: http + port: 8090 + targetPort: 8090 + type: ClusterIP diff --git a/services/comms/guest-name-job.yaml b/services/comms/guest-name-job.yaml index 21a8af5..3eae2dd 100644 --- a/services/comms/guest-name-job.yaml +++ b/services/comms/guest-name-job.yaml @@ -8,7 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "*/1 * * * *" - suspend: false + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 1 diff --git a/services/comms/helmrelease.yaml b/services/comms/helmrelease.yaml index 4456348..eeac49e 100644 --- a/services/comms/helmrelease.yaml +++ b/services/comms/helmrelease.yaml @@ -140,6 +140,7 @@ spec: autocreate_auto_join_rooms: true default_room_version: "11" experimental_features: + msc4108_enabled: true msc3266_enabled: true msc4143_enabled: true msc4222_enabled: true diff --git a/services/comms/knowledge/catalog/atlas-summary.json b/services/comms/knowledge/catalog/atlas-summary.json index fa35051..ea825ce 100644 --- a/services/comms/knowledge/catalog/atlas-summary.json +++ b/services/comms/knowledge/catalog/atlas-summary.json @@ -1,8 +1,8 @@ { "counts": { - "helmrelease_host_hints": 17, - "http_endpoints": 37, - "services": 43, - "workloads": 54 + "helmrelease_host_hints": 19, + "http_endpoints": 45, + "services": 47, + "workloads": 74 } } diff --git a/services/comms/knowledge/catalog/atlas.json b/services/comms/knowledge/catalog/atlas.json index 0d97bcd..951c807 100644 --- a/services/comms/knowledge/catalog/atlas.json +++ b/services/comms/knowledge/catalog/atlas.json @@ -11,6 +11,21 @@ "path": "services/bstein-dev-home", "targetNamespace": "bstein-dev-home" }, + { + "name": "bstein-dev-home-migrations", + "path": "services/bstein-dev-home/migrations", + "targetNamespace": "bstein-dev-home" + }, + { + "name": "cert-manager", + "path": "infrastructure/cert-manager", + "targetNamespace": "cert-manager" + }, + { + "name": "cert-manager-cleanup", + "path": "infrastructure/cert-manager/cleanup", + "targetNamespace": "cert-manager" + }, { "name": "comms", "path": "services/comms", @@ -26,6 +41,11 @@ "path": "services/crypto", "targetNamespace": "crypto" }, + { + "name": "finance", + "path": "services/finance", + "targetNamespace": "finance" + }, { "name": "flux-system", "path": "clusters/atlas/flux-system", @@ -46,6 +66,11 @@ "path": "services/harbor", "targetNamespace": "harbor" }, + { + "name": "health", + "path": "services/health", + "targetNamespace": "health" + }, { "name": "helm", "path": "infrastructure/sources/helm", @@ -71,6 +96,16 @@ "path": "services/logging", "targetNamespace": null }, + { + "name": "longhorn", + 
"path": "infrastructure/longhorn/core", + "targetNamespace": "longhorn-system" + }, + { + "name": "longhorn-adopt", + "path": "infrastructure/longhorn/adopt", + "targetNamespace": "longhorn-system" + }, { "name": "longhorn-ui", "path": "infrastructure/longhorn/ui-ingress", @@ -161,11 +196,21 @@ "path": "infrastructure/vault-csi", "targetNamespace": "kube-system" }, + { + "name": "vault-injector", + "path": "infrastructure/vault-injector", + "targetNamespace": "vault" + }, { "name": "vaultwarden", "path": "services/vaultwarden", "targetNamespace": "vaultwarden" }, + { + "name": "wallet-monero-temp", + "path": "services/crypto/wallet-monero-temp", + "targetNamespace": "crypto" + }, { "name": "xmr-miner", "path": "services/crypto/xmr-miner", @@ -199,7 +244,7 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92" + "registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-157" ] }, { @@ -215,7 +260,20 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92" + "registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-157" + ] + }, + { + "kind": "Deployment", + "namespace": "bstein-dev-home", + "name": "bstein-dev-home-vault-sync", + "labels": { + "app": "bstein-dev-home-vault-sync" + }, + "serviceAccountName": "bstein-dev-home-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" ] }, { @@ -225,7 +283,7 @@ "labels": { "app": "chat-ai-gateway" }, - "serviceAccountName": null, + "serviceAccountName": "bstein-dev-home", "nodeSelector": { "kubernetes.io/arch": "arm64", "node-role.kubernetes.io/worker": "true" @@ -249,6 +307,19 @@ "python:3.11-slim" ] }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "comms-vault-sync", + "labels": { + "app": "comms-vault-sync" + }, + "serviceAccountName": "comms-vault", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "comms", @@ -256,7 +327,7 @@ "labels": { "app": "coturn" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -286,7 +357,7 @@ "labels": { "app": "livekit" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -301,12 +372,12 @@ "labels": { "app": "livekit-token-service" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, "images": [ - "ghcr.io/element-hq/lk-jwt-service:0.3.0" + "registry.bstein.dev/tools/lk-jwt-service-vault:0.3.0" ] }, { @@ -316,7 +387,7 @@ "labels": { "app": "matrix-authentication-service" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -331,7 +402,7 @@ "labels": { "app.kubernetes.io/name": "matrix-guest-register" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": {}, "images": [ "python:3.11-slim" @@ -365,6 +436,19 @@ "ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9" ] }, + { + "kind": "Deployment", + "namespace": "crypto", + "name": "crypto-vault-sync", + "labels": { + "app": "crypto-vault-sync" + }, + "serviceAccountName": "crypto-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "crypto", @@ -372,7 +456,7 @@ "labels": { "app": "monero-p2pool" }, - "serviceAccountName": null, + "serviceAccountName": "crypto-vault-sync", 
"nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -395,6 +479,53 @@ "registry.bstein.dev/crypto/monerod:0.18.4.1" ] }, + { + "kind": "Deployment", + "namespace": "crypto", + "name": "wallet-monero-temp", + "labels": { + "app": "wallet-monero-temp" + }, + "serviceAccountName": "crypto-vault-sync", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1" + ] + }, + { + "kind": "Deployment", + "namespace": "finance", + "name": "actual-budget", + "labels": { + "app": "actual-budget" + }, + "serviceAccountName": "finance-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "actualbudget/actual-server:26.1.0-alpine@sha256:34aae5813fdfee12af2a50c4d0667df68029f1d61b90f45f282473273eb70d0d" + ] + }, + { + "kind": "Deployment", + "namespace": "finance", + "name": "firefly", + "labels": { + "app": "firefly" + }, + "serviceAccountName": "finance-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "fireflyiii/core:version-6.4.15" + ] + }, { "kind": "Deployment", "namespace": "flux-system", @@ -516,7 +647,7 @@ "labels": { "app": "gitea" }, - "serviceAccountName": null, + "serviceAccountName": "gitea-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -524,6 +655,36 @@ "gitea/gitea:1.23" ] }, + { + "kind": "Deployment", + "namespace": "harbor", + "name": "harbor-vault-sync", + "labels": { + "app": "harbor-vault-sync" + }, + "serviceAccountName": "harbor-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "Deployment", + "namespace": "health", + "name": "wger", + "labels": { + "app": "wger" + }, + "serviceAccountName": "health-vault-sync", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "nginx:1.27.5-alpine@sha256:65645c7bb6a0661892a8b03b89d0743208a18dd2f3f17a54ef4b76fb8e2f2a10", + "wger/server@sha256:710588b78af4e0aa0b4d8a8061e4563e16eae80eeaccfe7f9e0d9cbdd7f0cbc5" + ] + }, { "kind": "Deployment", "namespace": "jellyfin", @@ -531,7 +692,7 @@ "labels": { "app": "jellyfin" }, - "serviceAccountName": null, + "serviceAccountName": "pegasus-vault-sync", "nodeSelector": {}, "images": [ "docker.io/jellyfin/jellyfin:10.11.5" @@ -544,14 +705,27 @@ "labels": { "app": "pegasus" }, - "serviceAccountName": null, + "serviceAccountName": "pegasus-vault-sync", "nodeSelector": { "kubernetes.io/arch": "arm64", "node-role.kubernetes.io/worker": "true" }, "images": [ "alpine:3.20", - "registry.bstein.dev/streaming/pegasus:1.2.32" + "registry.bstein.dev/streaming/pegasus-vault:1.2.32" + ] + }, + { + "kind": "Deployment", + "namespace": "jellyfin", + "name": "pegasus-vault-sync", + "labels": { + "app": "pegasus-vault-sync" + }, + "serviceAccountName": "pegasus-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" ] }, { @@ -570,6 +744,35 @@ "jenkins/jenkins:2.528.3-jdk21" ] }, + { + "kind": "Deployment", + "namespace": "jenkins", + "name": "jenkins-vault-sync", + "labels": { + "app": "jenkins-vault-sync" + }, + "serviceAccountName": "jenkins-vault-sync", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "DaemonSet", + "namespace": "kube-system", + "name": "ntp-sync", + "labels": { + "app": "ntp-sync" + }, + "serviceAccountName": null, + "nodeSelector": 
{}, + "images": [ + "public.ecr.aws/docker/library/busybox:1.36.1" + ] + }, { "kind": "DaemonSet", "namespace": "kube-system", @@ -636,6 +839,21 @@ "hashicorp/vault-csi-provider:1.7.0" ] }, + { + "kind": "Deployment", + "namespace": "kube-system", + "name": "coredns", + "labels": { + "k8s-app": "kube-dns" + }, + "serviceAccountName": "coredns", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "registry.bstein.dev/infra/coredns:1.12.1" + ] + }, { "kind": "DaemonSet", "namespace": "logging", @@ -681,6 +899,19 @@ "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" ] }, + { + "kind": "Deployment", + "namespace": "logging", + "name": "logging-vault-sync", + "labels": { + "app": "logging-vault-sync" + }, + "serviceAccountName": "logging-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "logging", @@ -688,12 +919,27 @@ "labels": { "app": "oauth2-proxy-logs" }, - "serviceAccountName": null, + "serviceAccountName": "logging-vault-sync", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, "images": [ - "quay.io/oauth2-proxy/oauth2-proxy:v7.6.0" + "registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0" + ] + }, + { + "kind": "Deployment", + "namespace": "longhorn-system", + "name": "longhorn-vault-sync", + "labels": { + "app": "longhorn-vault-sync" + }, + "serviceAccountName": "longhorn-vault-sync", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "alpine:3.20" ] }, { @@ -703,7 +949,7 @@ "labels": { "app": "oauth2-proxy-longhorn" }, - "serviceAccountName": null, + "serviceAccountName": "longhorn-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -729,14 +975,45 @@ { "kind": "Deployment", "namespace": "mailu-mailserver", - "name": "mailu-sync-listener", + "name": "mailu-vault-sync", "labels": { - "app": "mailu-sync-listener" + "app": "mailu-vault-sync" }, - "serviceAccountName": null, + "serviceAccountName": "mailu-vault-sync", "nodeSelector": {}, "images": [ - "python:3.11-alpine" + "alpine:3.20" + ] + }, + { + "kind": "DaemonSet", + "namespace": "maintenance", + "name": "disable-k3s-traefik", + "labels": { + "app": "disable-k3s-traefik" + }, + "serviceAccountName": "disable-k3s-traefik", + "nodeSelector": { + "node-role.kubernetes.io/control-plane": "true" + }, + "images": [ + "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" + ] + }, + { + "kind": "DaemonSet", + "namespace": "maintenance", + "name": "k3s-agent-restart", + "labels": { + "app": "k3s-agent-restart" + }, + "serviceAccountName": "node-nofile", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" ] }, { @@ -767,6 +1044,35 @@ "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" ] }, + { + "kind": "Deployment", + "namespace": "maintenance", + "name": "ariadne", + "labels": { + "app": "ariadne" + }, + "serviceAccountName": "ariadne", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/bstein/ariadne:0.1.0-49" + ] + }, + { + "kind": "Deployment", + "namespace": "maintenance", + "name": "maintenance-vault-sync", + "labels": { + "app": "maintenance-vault-sync" + }, + "serviceAccountName": "maintenance-vault-sync", + "nodeSelector": {}, + "images": [ 
+ "alpine:3.20" + ] + }, { "kind": "DaemonSet", "namespace": "monitoring", @@ -795,6 +1101,19 @@ "python:3.10-slim" ] }, + { + "kind": "Deployment", + "namespace": "monitoring", + "name": "monitoring-vault-sync", + "labels": { + "app": "monitoring-vault-sync" + }, + "serviceAccountName": "monitoring-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "monitoring", @@ -802,7 +1121,7 @@ "labels": { "app": "postmark-exporter" }, - "serviceAccountName": null, + "serviceAccountName": "monitoring-vault-sync", "nodeSelector": {}, "images": [ "python:3.12-alpine" @@ -830,7 +1149,7 @@ "labels": { "app": "nextcloud" }, - "serviceAccountName": null, + "serviceAccountName": "nextcloud-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -845,7 +1164,7 @@ "labels": { "app": "outline" }, - "serviceAccountName": null, + "serviceAccountName": "outline-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -875,7 +1194,7 @@ "labels": { "app": "planka" }, - "serviceAccountName": null, + "serviceAccountName": "planka-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -895,7 +1214,8 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "postgres:15" + "postgres:15", + "quay.io/prometheuscommunity/postgres-exporter:v0.15.0" ] }, { @@ -905,8 +1225,11 @@ "labels": { "app": "keycloak" }, - "serviceAccountName": null, - "nodeSelector": {}, + "serviceAccountName": "sso-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, "images": [ "quay.io/keycloak/keycloak:26.0.7" ] @@ -918,12 +1241,25 @@ "labels": { "app": "oauth2-proxy" }, - "serviceAccountName": null, + "serviceAccountName": "sso-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, "images": [ - "quay.io/oauth2-proxy/oauth2-proxy:v7.6.0" + "registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0" + ] + }, + { + "kind": "Deployment", + "namespace": "sso", + "name": "sso-vault-sync", + "labels": { + "app": "sso-vault-sync" + }, + "serviceAccountName": "sso-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" ] }, { @@ -933,7 +1269,7 @@ "labels": { "app": "openldap" }, - "serviceAccountName": null, + "serviceAccountName": "sso-vault", "nodeSelector": { "kubernetes.io/arch": "arm64", "node-role.kubernetes.io/worker": "true" @@ -951,7 +1287,7 @@ }, "serviceAccountName": "sui-metrics", "nodeSelector": { - "kubernetes.io/hostname": "titan-24" + "hardware": "rpi5" }, "images": [ "victoriametrics/vmagent:v1.103.0" @@ -962,7 +1298,9 @@ "namespace": "traefik", "name": "traefik", "labels": { - "app": "traefik" + "app": "traefik", + "app.kubernetes.io/instance": "traefik-kube-system", + "app.kubernetes.io/name": "traefik" }, "serviceAccountName": "traefik-ingress-controller", "nodeSelector": { @@ -995,10 +1333,13 @@ "labels": { "app": "vaultwarden" }, - "serviceAccountName": null, - "nodeSelector": {}, + "serviceAccountName": "vaultwarden-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, "images": [ - "vaultwarden/server:1.33.2" + "vaultwarden/server:1.35.2" ] } ], @@ -1565,6 +1906,54 @@ } ] }, + { + "namespace": "crypto", + "name": "wallet-monero-temp", + "type": "ClusterIP", + "selector": { + "app": "wallet-monero-temp" + }, + "ports": [ + { + "name": "rpc", + "port": 18083, + "targetPort": 18083, + "protocol": "TCP" + } + ] + }, + { + "namespace": "finance", + "name": "actual-budget", + "type": "ClusterIP", + "selector": { + 
"app": "actual-budget" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 5006, + "protocol": "TCP" + } + ] + }, + { + "namespace": "finance", + "name": "firefly", + "type": "ClusterIP", + "selector": { + "app": "firefly" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 8080, + "protocol": "TCP" + } + ] + }, { "namespace": "flux-system", "name": "notification-controller", @@ -1632,7 +2021,7 @@ { "namespace": "gitea", "name": "gitea-ssh", - "type": "NodePort", + "type": "LoadBalancer", "selector": { "app": "gitea" }, @@ -1645,6 +2034,22 @@ } ] }, + { + "namespace": "health", + "name": "wger", + "type": "ClusterIP", + "selector": { + "app": "wger" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, { "namespace": "jellyfin", "name": "jellyfin", @@ -1699,29 +2104,6 @@ } ] }, - { - "namespace": "kube-system", - "name": "traefik", - "type": "LoadBalancer", - "selector": { - "app.kubernetes.io/instance": "traefik-kube-system", - "app.kubernetes.io/name": "traefik" - }, - "ports": [ - { - "name": "web", - "port": 80, - "targetPort": "web", - "protocol": "TCP" - }, - { - "name": "websecure", - "port": 443, - "targetPort": "websecure", - "protocol": "TCP" - } - ] - }, { "namespace": "logging", "name": "oauth2-proxy-logs", @@ -1803,17 +2185,17 @@ ] }, { - "namespace": "mailu-mailserver", - "name": "mailu-sync-listener", + "namespace": "maintenance", + "name": "ariadne", "type": "ClusterIP", "selector": { - "app": "mailu-sync-listener" + "app": "ariadne" }, "ports": [ { "name": "http", - "port": 8080, - "targetPort": 8080, + "port": 80, + "targetPort": "http", "protocol": "TCP" } ] @@ -1959,6 +2341,12 @@ "port": 5432, "targetPort": 5432, "protocol": "TCP" + }, + { + "name": "metrics", + "port": 9187, + "targetPort": 9187, + "protocol": "TCP" } ] }, @@ -2032,6 +2420,28 @@ } ] }, + { + "namespace": "traefik", + "name": "traefik", + "type": "LoadBalancer", + "selector": { + "app": "traefik" + }, + "ports": [ + { + "name": "web", + "port": 80, + "targetPort": "web", + "protocol": "TCP" + }, + { + "name": "websecure", + "port": 443, + "targetPort": "websecure", + "protocol": "TCP" + } + ] + }, { "namespace": "traefik", "name": "traefik-metrics", @@ -2210,6 +2620,26 @@ "source": "bstein-dev-home" } }, + { + "host": "budget.bstein.dev", + "path": "/", + "backend": { + "namespace": "finance", + "service": "actual-budget", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "actual-budget" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "actual-budget", + "source": "finance" + } + }, { "host": "call.live.bstein.dev", "path": "/", @@ -2290,6 +2720,26 @@ "source": "nextcloud" } }, + { + "host": "health.bstein.dev", + "path": "/", + "backend": { + "namespace": "health", + "service": "wger", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "wger" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "wger", + "source": "health" + } + }, { "host": "kit.live.bstein.dev", "path": "/livekit/jwt", @@ -2385,6 +2835,106 @@ "source": "comms" } }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/r0/register", + "backend": { + "namespace": "comms", + "service": "matrix-guest-register", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-guest-register" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/login", + "backend": { + 
"namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/logout", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/refresh", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/register", + "backend": { + "namespace": "comms", + "service": "matrix-guest-register", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-guest-register" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, { "host": "logs.bstein.dev", "path": "/", @@ -2650,6 +3200,26 @@ "source": "monerod" } }, + { + "host": "money.bstein.dev", + "path": "/", + "backend": { + "namespace": "finance", + "service": "firefly", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "firefly" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "firefly", + "source": "finance" + } + }, { "host": "notes.bstein.dev", "path": "/", @@ -2838,7 +3408,6 @@ "matrix.live.bstein.dev" ], "comms:comms/othrys-synapse": [ - "bstein.dev", "kit.live.bstein.dev", "live.bstein.dev", "matrix.live.bstein.dev", @@ -2853,6 +3422,9 @@ "logging:logging/data-prepper": [ "registry.bstein.dev" ], + "longhorn:longhorn-system/longhorn": [ + "registry.bstein.dev" + ], "mailu:mailu-mailserver/mailu": [ "bstein.dev", "mail.bstein.dev" @@ -2862,8 +3434,12 @@ ], "monitoring:monitoring/grafana": [ "bstein.dev", + "mail.bstein.dev", "metrics.bstein.dev", "sso.bstein.dev" + ], + "monitoring:monitoring/kube-state-metrics": [ + "atlas.bstein.dev" ] } } diff --git a/services/comms/knowledge/catalog/atlas.yaml b/services/comms/knowledge/catalog/atlas.yaml index 6529e1a..637b5f9 100644 --- a/services/comms/knowledge/catalog/atlas.yaml +++ b/services/comms/knowledge/catalog/atlas.yaml @@ -1,4 +1,4 @@ -# services/comms/knowledge/catalog/atlas.yaml +# knowledge/catalog/atlas.yaml # Generated by scripts/knowledge_render_atlas.py (do not edit by hand) cluster: atlas sources: @@ -8,6 +8,15 @@ sources: - name: bstein-dev-home path: services/bstein-dev-home targetNamespace: bstein-dev-home +- name: bstein-dev-home-migrations + path: services/bstein-dev-home/migrations + targetNamespace: bstein-dev-home +- name: cert-manager + path: infrastructure/cert-manager + targetNamespace: cert-manager +- name: cert-manager-cleanup + path: infrastructure/cert-manager/cleanup + targetNamespace: cert-manager - name: comms path: services/comms targetNamespace: comms @@ -17,6 +26,9 @@ sources: - name: crypto path: services/crypto targetNamespace: crypto +- name: finance + path: services/finance + targetNamespace: finance - name: flux-system path: clusters/atlas/flux-system targetNamespace: null @@ -29,6 +41,9 @@ sources: - name: harbor path: 
services/harbor targetNamespace: harbor +- name: health + path: services/health + targetNamespace: health - name: helm path: infrastructure/sources/helm targetNamespace: flux-system @@ -44,6 +59,12 @@ sources: - name: logging path: services/logging targetNamespace: null +- name: longhorn + path: infrastructure/longhorn/core + targetNamespace: longhorn-system +- name: longhorn-adopt + path: infrastructure/longhorn/adopt + targetNamespace: longhorn-system - name: longhorn-ui path: infrastructure/longhorn/ui-ingress targetNamespace: longhorn-system @@ -98,9 +119,15 @@ sources: - name: vault-csi path: infrastructure/vault-csi targetNamespace: kube-system +- name: vault-injector + path: infrastructure/vault-injector + targetNamespace: vault - name: vaultwarden path: services/vaultwarden targetNamespace: vaultwarden +- name: wallet-monero-temp + path: services/crypto/wallet-monero-temp + targetNamespace: crypto - name: xmr-miner path: services/crypto/xmr-miner targetNamespace: crypto @@ -124,7 +151,7 @@ workloads: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - - registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92 + - registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-157 - kind: Deployment namespace: bstein-dev-home name: bstein-dev-home-frontend @@ -135,13 +162,22 @@ workloads: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - - registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92 + - registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-157 +- kind: Deployment + namespace: bstein-dev-home + name: bstein-dev-home-vault-sync + labels: + app: bstein-dev-home-vault-sync + serviceAccountName: bstein-dev-home-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: bstein-dev-home name: chat-ai-gateway labels: app: chat-ai-gateway - serviceAccountName: null + serviceAccountName: bstein-dev-home nodeSelector: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' @@ -157,12 +193,21 @@ workloads: hardware: rpi5 images: - python:3.11-slim +- kind: Deployment + namespace: comms + name: comms-vault-sync + labels: + app: comms-vault-sync + serviceAccountName: comms-vault + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: comms name: coturn labels: app: coturn - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: @@ -182,7 +227,7 @@ workloads: name: livekit labels: app: livekit - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: @@ -192,17 +237,17 @@ workloads: name: livekit-token-service labels: app: livekit-token-service - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: - - ghcr.io/element-hq/lk-jwt-service:0.3.0 + - registry.bstein.dev/tools/lk-jwt-service-vault:0.3.0 - kind: Deployment namespace: comms name: matrix-authentication-service labels: app: matrix-authentication-service - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: @@ -212,7 +257,7 @@ workloads: name: matrix-guest-register labels: app.kubernetes.io/name: matrix-guest-register - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: {} images: - python:3.11-slim @@ -235,12 +280,21 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9 +- kind: Deployment + namespace: crypto + name: 
crypto-vault-sync + labels: + app: crypto-vault-sync + serviceAccountName: crypto-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: crypto name: monero-p2pool labels: app: monero-p2pool - serviceAccountName: null + serviceAccountName: crypto-vault-sync nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -255,6 +309,38 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - registry.bstein.dev/crypto/monerod:0.18.4.1 +- kind: Deployment + namespace: crypto + name: wallet-monero-temp + labels: + app: wallet-monero-temp + serviceAccountName: crypto-vault-sync + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1 +- kind: Deployment + namespace: finance + name: actual-budget + labels: + app: actual-budget + serviceAccountName: finance-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - actualbudget/actual-server:26.1.0-alpine@sha256:34aae5813fdfee12af2a50c4d0667df68029f1d61b90f45f282473273eb70d0d +- kind: Deployment + namespace: finance + name: firefly + labels: + app: firefly + serviceAccountName: finance-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - fireflyiii/core:version-6.4.15 - kind: Deployment namespace: flux-system name: helm-controller @@ -344,17 +430,38 @@ workloads: name: gitea labels: app: gitea - serviceAccountName: null + serviceAccountName: gitea-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: - gitea/gitea:1.23 +- kind: Deployment + namespace: harbor + name: harbor-vault-sync + labels: + app: harbor-vault-sync + serviceAccountName: harbor-vault-sync + nodeSelector: {} + images: + - alpine:3.20 +- kind: Deployment + namespace: health + name: wger + labels: + app: wger + serviceAccountName: health-vault-sync + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - nginx:1.27.5-alpine@sha256:65645c7bb6a0661892a8b03b89d0743208a18dd2f3f17a54ef4b76fb8e2f2a10 + - wger/server@sha256:710588b78af4e0aa0b4d8a8061e4563e16eae80eeaccfe7f9e0d9cbdd7f0cbc5 - kind: Deployment namespace: jellyfin name: jellyfin labels: app: jellyfin - serviceAccountName: null + serviceAccountName: pegasus-vault-sync nodeSelector: {} images: - docker.io/jellyfin/jellyfin:10.11.5 @@ -363,13 +470,22 @@ workloads: name: pegasus labels: app: pegasus - serviceAccountName: null + serviceAccountName: pegasus-vault-sync nodeSelector: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - alpine:3.20 - - registry.bstein.dev/streaming/pegasus:1.2.32 + - registry.bstein.dev/streaming/pegasus-vault:1.2.32 +- kind: Deployment + namespace: jellyfin + name: pegasus-vault-sync + labels: + app: pegasus-vault-sync + serviceAccountName: pegasus-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: jenkins name: jenkins @@ -381,6 +497,26 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - jenkins/jenkins:2.528.3-jdk21 +- kind: Deployment + namespace: jenkins + name: jenkins-vault-sync + labels: + app: jenkins-vault-sync + serviceAccountName: jenkins-vault-sync + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - alpine:3.20 +- kind: DaemonSet + namespace: kube-system + name: ntp-sync + labels: + app: ntp-sync + serviceAccountName: null + nodeSelector: {} + images: + - public.ecr.aws/docker/library/busybox:1.36.1 - kind: DaemonSet 
namespace: kube-system name: nvidia-device-plugin-jetson @@ -427,6 +563,16 @@ workloads: kubernetes.io/os: linux images: - hashicorp/vault-csi-provider:1.7.0 +- kind: Deployment + namespace: kube-system + name: coredns + labels: + k8s-app: kube-dns + serviceAccountName: coredns + nodeSelector: + kubernetes.io/os: linux + images: + - registry.bstein.dev/infra/coredns:1.12.1 - kind: DaemonSet namespace: logging name: node-image-gc-rpi4 @@ -457,22 +603,41 @@ workloads: hardware: rpi5 images: - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 +- kind: Deployment + namespace: logging + name: logging-vault-sync + labels: + app: logging-vault-sync + serviceAccountName: logging-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: logging name: oauth2-proxy-logs labels: app: oauth2-proxy-logs - serviceAccountName: null + serviceAccountName: logging-vault-sync nodeSelector: node-role.kubernetes.io/worker: 'true' images: - - quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 + - registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0 +- kind: Deployment + namespace: longhorn-system + name: longhorn-vault-sync + labels: + app: longhorn-vault-sync + serviceAccountName: longhorn-vault-sync + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - alpine:3.20 - kind: Deployment namespace: longhorn-system name: oauth2-proxy-longhorn labels: app: oauth2-proxy-longhorn - serviceAccountName: null + serviceAccountName: longhorn-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -489,13 +654,34 @@ workloads: - registry.bstein.dev/bstein/kubectl:1.35.0 - kind: Deployment namespace: mailu-mailserver - name: mailu-sync-listener + name: mailu-vault-sync labels: - app: mailu-sync-listener - serviceAccountName: null + app: mailu-vault-sync + serviceAccountName: mailu-vault-sync nodeSelector: {} images: - - python:3.11-alpine + - alpine:3.20 +- kind: DaemonSet + namespace: maintenance + name: disable-k3s-traefik + labels: + app: disable-k3s-traefik + serviceAccountName: disable-k3s-traefik + nodeSelector: + node-role.kubernetes.io/control-plane: 'true' + images: + - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 +- kind: DaemonSet + namespace: maintenance + name: k3s-agent-restart + labels: + app: k3s-agent-restart + serviceAccountName: node-nofile + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 - kind: DaemonSet namespace: maintenance name: node-image-sweeper @@ -515,6 +701,26 @@ workloads: nodeSelector: {} images: - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 +- kind: Deployment + namespace: maintenance + name: ariadne + labels: + app: ariadne + serviceAccountName: ariadne + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - registry.bstein.dev/bstein/ariadne:0.1.0-49 +- kind: Deployment + namespace: maintenance + name: maintenance-vault-sync + labels: + app: maintenance-vault-sync + serviceAccountName: maintenance-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: DaemonSet namespace: monitoring name: dcgm-exporter @@ -534,12 +740,21 @@ workloads: jetson: 'true' images: - python:3.10-slim +- kind: Deployment + namespace: monitoring + name: monitoring-vault-sync + labels: + app: monitoring-vault-sync + serviceAccountName: monitoring-vault-sync + 
nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: monitoring name: postmark-exporter labels: app: postmark-exporter - serviceAccountName: null + serviceAccountName: monitoring-vault-sync nodeSelector: {} images: - python:3.12-alpine @@ -558,7 +773,7 @@ workloads: name: nextcloud labels: app: nextcloud - serviceAccountName: null + serviceAccountName: nextcloud-vault nodeSelector: hardware: rpi5 images: @@ -568,7 +783,7 @@ workloads: name: outline labels: app: outline - serviceAccountName: null + serviceAccountName: outline-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -588,7 +803,7 @@ workloads: name: planka labels: app: planka - serviceAccountName: null + serviceAccountName: planka-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -603,13 +818,16 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - postgres:15 + - quay.io/prometheuscommunity/postgres-exporter:v0.15.0 - kind: Deployment namespace: sso name: keycloak labels: app: keycloak - serviceAccountName: null - nodeSelector: {} + serviceAccountName: sso-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' images: - quay.io/keycloak/keycloak:26.0.7 - kind: Deployment @@ -617,17 +835,26 @@ workloads: name: oauth2-proxy labels: app: oauth2-proxy - serviceAccountName: null + serviceAccountName: sso-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: - - quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 + - registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0 +- kind: Deployment + namespace: sso + name: sso-vault-sync + labels: + app: sso-vault-sync + serviceAccountName: sso-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: StatefulSet namespace: sso name: openldap labels: app: openldap - serviceAccountName: null + serviceAccountName: sso-vault nodeSelector: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' @@ -640,7 +867,7 @@ workloads: app: sui-metrics serviceAccountName: sui-metrics nodeSelector: - kubernetes.io/hostname: titan-24 + hardware: rpi5 images: - victoriametrics/vmagent:v1.103.0 - kind: Deployment @@ -648,6 +875,8 @@ workloads: name: traefik labels: app: traefik + app.kubernetes.io/instance: traefik-kube-system + app.kubernetes.io/name: traefik serviceAccountName: traefik-ingress-controller nodeSelector: node-role.kubernetes.io/worker: 'true' @@ -669,10 +898,12 @@ workloads: name: vaultwarden labels: app: vaultwarden - serviceAccountName: null - nodeSelector: {} + serviceAccountName: vaultwarden-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' images: - - vaultwarden/server:1.33.2 + - vaultwarden/server:1.35.2 services: - namespace: ai name: ollama @@ -1040,6 +1271,36 @@ services: port: 3333 targetPort: 3333 protocol: TCP +- namespace: crypto + name: wallet-monero-temp + type: ClusterIP + selector: + app: wallet-monero-temp + ports: + - name: rpc + port: 18083 + targetPort: 18083 + protocol: TCP +- namespace: finance + name: actual-budget + type: ClusterIP + selector: + app: actual-budget + ports: + - name: http + port: 80 + targetPort: 5006 + protocol: TCP +- namespace: finance + name: firefly + type: ClusterIP + selector: + app: firefly + ports: + - name: http + port: 80 + targetPort: 8080 + protocol: TCP - namespace: flux-system name: notification-controller type: ClusterIP @@ -1082,7 +1343,7 @@ services: protocol: TCP - namespace: gitea name: gitea-ssh - type: NodePort + type: LoadBalancer selector: app: gitea ports: @@ -1090,6 
+1351,16 @@ services: port: 2242 targetPort: 2242 protocol: TCP +- namespace: health + name: wger + type: ClusterIP + selector: + app: wger + ports: + - name: http + port: 80 + targetPort: http + protocol: TCP - namespace: jellyfin name: jellyfin type: ClusterIP @@ -1124,21 +1395,6 @@ services: port: 50000 targetPort: 50000 protocol: TCP -- namespace: kube-system - name: traefik - type: LoadBalancer - selector: - app.kubernetes.io/instance: traefik-kube-system - app.kubernetes.io/name: traefik - ports: - - name: web - port: 80 - targetPort: web - protocol: TCP - - name: websecure - port: 443 - targetPort: websecure - protocol: TCP - namespace: logging name: oauth2-proxy-logs type: ClusterIP @@ -1191,15 +1447,15 @@ services: port: 4190 targetPort: 4190 protocol: TCP -- namespace: mailu-mailserver - name: mailu-sync-listener +- namespace: maintenance + name: ariadne type: ClusterIP selector: - app: mailu-sync-listener + app: ariadne ports: - name: http - port: 8080 - targetPort: 8080 + port: 80 + targetPort: http protocol: TCP - namespace: monitoring name: dcgm-exporter @@ -1291,6 +1547,10 @@ services: port: 5432 targetPort: 5432 protocol: TCP + - name: metrics + port: 9187 + targetPort: 9187 + protocol: TCP - namespace: sso name: keycloak type: ClusterIP @@ -1335,6 +1595,20 @@ services: port: 8429 targetPort: 8429 protocol: TCP +- namespace: traefik + name: traefik + type: LoadBalancer + selector: + app: traefik + ports: + - name: web + port: 80 + targetPort: web + protocol: TCP + - name: websecure + port: 443 + targetPort: websecure + protocol: TCP - namespace: traefik name: traefik-metrics type: ClusterIP @@ -1447,6 +1721,19 @@ http_endpoints: kind: Ingress name: bstein-dev-home source: bstein-dev-home +- host: budget.bstein.dev + path: / + backend: + namespace: finance + service: actual-budget + port: 80 + workloads: + - kind: Deployment + name: actual-budget + via: + kind: Ingress + name: actual-budget + source: finance - host: call.live.bstein.dev path: / backend: @@ -1499,6 +1786,19 @@ http_endpoints: kind: Ingress name: nextcloud source: nextcloud +- host: health.bstein.dev + path: / + backend: + namespace: health + service: wger + port: 80 + workloads: + - kind: Deployment + name: wger + via: + kind: Ingress + name: wger + source: health - host: kit.live.bstein.dev path: /livekit/jwt backend: @@ -1558,6 +1858,65 @@ http_endpoints: kind: Ingress name: matrix-routing source: comms +- host: live.bstein.dev + path: /_matrix/client/r0/register + backend: + namespace: comms + service: matrix-guest-register + port: 8080 + workloads: &id003 + - kind: Deployment + name: matrix-guest-register + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/login + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: &id002 + - kind: Deployment + name: matrix-authentication-service + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/logout + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: *id002 + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/refresh + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: *id002 + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/register + backend: + namespace: comms + service: 
matrix-guest-register + port: 8080 + workloads: *id003 + via: + kind: Ingress + name: matrix-routing + source: comms - host: logs.bstein.dev path: / backend: @@ -1601,9 +1960,7 @@ http_endpoints: namespace: comms service: matrix-authentication-service port: 8080 - workloads: &id002 - - kind: Deployment - name: matrix-authentication-service + workloads: *id002 via: kind: Ingress name: matrix-routing @@ -1647,9 +2004,7 @@ http_endpoints: namespace: comms service: matrix-guest-register port: 8080 - workloads: &id003 - - kind: Deployment - name: matrix-guest-register + workloads: *id003 via: kind: Ingress name: matrix-routing @@ -1722,6 +2077,19 @@ http_endpoints: kind: Ingress name: monerod source: monerod +- host: money.bstein.dev + path: / + backend: + namespace: finance + service: firefly + port: 80 + workloads: + - kind: Deployment + name: firefly + via: + kind: Ingress + name: firefly + source: finance - host: notes.bstein.dev path: / backend: @@ -1845,7 +2213,6 @@ helmrelease_host_hints: - live.bstein.dev - matrix.live.bstein.dev comms:comms/othrys-synapse: - - bstein.dev - kit.live.bstein.dev - live.bstein.dev - matrix.live.bstein.dev @@ -1856,6 +2223,8 @@ helmrelease_host_hints: - registry.bstein.dev logging:logging/data-prepper: - registry.bstein.dev + longhorn:longhorn-system/longhorn: + - registry.bstein.dev mailu:mailu-mailserver/mailu: - bstein.dev - mail.bstein.dev @@ -1863,5 +2232,8 @@ helmrelease_host_hints: - alerts.bstein.dev monitoring:monitoring/grafana: - bstein.dev + - mail.bstein.dev - metrics.bstein.dev - sso.bstein.dev + monitoring:monitoring/kube-state-metrics: + - atlas.bstein.dev diff --git a/services/comms/knowledge/catalog/metrics.json b/services/comms/knowledge/catalog/metrics.json new file mode 100644 index 0000000..e929db5 --- /dev/null +++ b/services/comms/knowledge/catalog/metrics.json @@ -0,0 +1,1880 @@ +[ + { + "dashboard": "Atlas GPU", + "panel_title": "Namespace GPU Share", + "panel_id": 1, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. 
Switching scope changes the denominator.", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "GPU Util by Namespace", + "panel_id": 2, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "GPU Util by Node", + "panel_id": 3, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "Top Pods by GPU Util", + "panel_id": 4, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (range)", + "panel_id": 1, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Attempts / Failures", + "panel_id": 2, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[$__interval]))", + "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "One-off Job Pods (age hours)", + "panel_id": 3, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Stale (>36h)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) 
(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Missing Success", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Suspended", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (1h)", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (24h)", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Runs (1h)", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[1h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Schedule Last Error (hours ago)", + "panel_id": 10, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Schedule Last Success (hours ago)", + "panel_id": 11, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Last Success (hours ago)", + "panel_id": 12, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - 
max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Last Schedule (hours ago)", + "panel_id": 13, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (1h)", + "panel_id": 14, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (30d)", + "panel_id": 15, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Access Requests", + "panel_id": 16, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(ariadne_access_requests_total)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne CI Coverage (%)", + "panel_id": 17, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "ariadne_ci_coverage_percent{repo=\"ariadne\"}" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne CI Tests (latest)", + "panel_id": 18, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "ariadne_ci_tests_total{repo=\"ariadne\"}" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (1d)", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (7d)", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"7d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Mail Bounces (1d)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "max(postmark_outbound_bounced{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Success Rate (1d)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": 
"prometheus", + "exprs": [ + "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Limit Used (30d)", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit_used_percent)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Send Limit (30d)", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Last Success", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_last_success_timestamp_seconds)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Exporter Errors", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(postmark_request_errors_total)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Bounce Rate (1d vs 7d)", + "panel_id": 13, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_bounce_rate)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Bounced (1d vs 7d)", + "panel_id": 14, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_bounced)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (1d vs 7d)", + "panel_id": 15, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_sent)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Exporter Errors", + "panel_id": 16, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(postmark_request_errors_total)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Ingress Success Rate (5m)", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Error Budget Burn (1h)", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Error Budget Burn (6h)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + 
"atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Edge P99 Latency (ms)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Ingress Traffic", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Egress Traffic", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Intra-Cluster Traffic", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Per-Node Throughput", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0) + sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Top Namespaces", + "panel_id": 9, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]) + rate(container_network_receive_bytes_total{namespace!=\"\"}[5m])) by (namespace))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Top Pods", + "panel_id": 10, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(rate(container_network_transmit_bytes_total{pod!=\"\"}[5m]) + rate(container_network_receive_bytes_total{pod!=\"\"}[5m])) by (namespace,pod))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Traefik Routers (req/s)", + 
"panel_id": 11, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum by (router) (rate(traefik_router_requests_total[5m])))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Traefik Entrypoints (req/s)", + "panel_id": 12, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Worker Nodes Ready", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane Ready", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane Workloads", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "API Server 5xx rate", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "API Server P99 latency", + "panel_id": 10, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "etcd P99 latency", + "panel_id": 11, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Node CPU", + "panel_id": 4, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Node RAM", + "panel_id": 5, 
+ "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane (incl. titan-db) CPU", + "panel_id": 6, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane (incl. titan-db) RAM", + "panel_id": 7, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Root Filesystem Usage", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control Plane Ready", + "panel_id": 2, + "panel_type": "gauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control Plane Workloads", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"}) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Stuck Terminating", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (((time() - 
kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Atlas Availability", + "panel_id": 27, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Problem Pods", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Workers Ready", + "panel_id": 1, + "panel_type": "gauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: CPU", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: RAM", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: NET (rx+tx)", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) 
((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: I/O (r+w)", + "panel_id": 10, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Sent (1d)", + "panel_id": 30, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Bounces (1d)", + "panel_id": 31, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "max(postmark_outbound_bounced{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Success Rate (1d)", + "panel_id": 32, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Limit Used (30d)", + "panel_id": 33, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit_used_percent)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Postgres Connections Used", + "panel_id": 34, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Postgres Hottest Connections", + "panel_id": 35, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(1, sum by (datname) (pg_stat_activity_count))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Astreae Usage", + "panel_id": 23, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": 
"Asteria Usage", + "panel_id": 24, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Astreae Free", + "panel_id": 25, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Asteria Free", + "panel_id": 26, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "One-off Job Pods (age hours)", + "panel_id": 40, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Ariadne Attempts / Failures", + "panel_id": 41, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[$__interval]))", + "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Ariadne Test Success Rate", + "panel_id": 42, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Tests with Failures (24h)", + "panel_id": 43, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace CPU Share", + "panel_id": 11, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. 
Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace GPU Share", + "panel_id": 12, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace RAM Share", + "panel_id": 13, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Worker Node CPU", + "panel_id": 14, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Worker Node RAM", + "panel_id": 15, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", 
\"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control plane CPU", + "panel_id": 16, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control plane RAM", + "panel_id": 17, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Node Pod Share", + "panel_id": 28, + "panel_type": "piechart", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Top Nodes by Pod Count", + "panel_id": 29, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Cluster Ingress Throughput", + "panel_id": 18, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Cluster Egress Throughput", + "panel_id": 19, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Intra-Cluster Throughput", + "panel_id": 20, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Root Filesystem Usage", + "panel_id": 21, 
+ "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Nodes Closest to Full Root Disks", + "panel_id": 22, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Problem Pods", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Stuck Terminating (>10m)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Control Plane Workloads", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Pods Not Running", + "panel_id": 5, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 6, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(time() - kube_pod_created{pod!=\"\"}) * 
on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) max by (namespace,pod,container,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Terminating >10m", + "panel_id": 7, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(((time() - kube_pod_deletion_timestamp{pod!=\"\"}) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) * on(namespace,pod) group_left(node) kube_pod_info)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Node Pod Share", + "panel_id": 8, + "panel_type": "piechart", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Top Nodes by Pod Count", + "panel_id": 9, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Namespace Plurality by Node v27", + "panel_id": 10, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) 
(kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Usage", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Usage", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Free", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Free", 
+ "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Per-Node Usage", + "panel_id": 5, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Per-Node Usage", + "panel_id": 6, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Usage History", + "panel_id": 7, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Usage History", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Maintenance Sweepers Ready", + "panel_id": 30, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Maintenance Cron Freshness (s)", + "panel_id": 31, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "time() - max by (cronjob) 
(kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=\"image-sweeper\"})" + ] + } +] diff --git a/services/comms/knowledge/catalog/runbooks.json b/services/comms/knowledge/catalog/runbooks.json index d7356ca..960510d 100644 --- a/services/comms/knowledge/catalog/runbooks.json +++ b/services/comms/knowledge/catalog/runbooks.json @@ -20,6 +20,22 @@ ], "body": "# CI: Gitea \u2192 Jenkins pipeline\n\n## What this is\nAtlas uses Gitea for source control and Jenkins for CI. Authentication is via Keycloak (SSO).\n\n## Where it is configured\n- Gitea manifests: `services/gitea/`\n- Jenkins manifests: `services/jenkins/`\n- Credential sync helpers: `scripts/gitea_cred_sync.sh`, `scripts/jenkins_cred_sync.sh`\n\n## What users do (typical flow)\n- Create a repo in Gitea.\n- Create/update a Jenkins job/pipeline that can fetch the repo.\n- Configure a webhook (or SCM polling) so pushes trigger builds.\n\n## Troubleshooting (common)\n- \u201cWebhook not firing\u201d: confirm ingress host, webhook URL, and Jenkins job is reachable.\n- \u201cAuth denied cloning\u201d: confirm Keycloak group membership and that Jenkins has a valid token/credential configured." }, + { + "path": "runbooks/comms-verify.md", + "title": "Othrys verification checklist", + "tags": [ + "comms", + "matrix", + "element", + "livekit" + ], + "entrypoints": [ + "https://live.bstein.dev", + "https://matrix.live.bstein.dev" + ], + "source_paths": [], + "body": "1) Guest join:\n- Open a private window and visit:\n `https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join`\n- Confirm the guest join flow works and the displayname becomes `-`.\n\n2) Keycloak login:\n- Log in from `https://live.bstein.dev` and confirm MAS -> Keycloak -> Element redirect.\n\n3) Video rooms:\n- Start an Element Call room and confirm audio/video with a second account.\n- Check that guests can read public rooms but cannot start calls.\n\n4) Well-known:\n- `https://live.bstein.dev/.well-known/matrix/client` returns JSON.\n- `https://matrix.live.bstein.dev/.well-known/matrix/client` returns JSON.\n\n5) TURN reachability:\n- Confirm `turn.live.bstein.dev:3478` and `turns:5349` are reachable from WAN." 
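Note on the Othrys verification checklist above: steps 4 and 5 are easy to script. A minimal sketch follows; it assumes the `requests` package is available, that the client well-known payload exposes the standard `m.homeserver` key, and that the coturn listener also answers on TCP (the probe below is TCP-only, so a UDP-only deployment would need a STUN client instead). Hostnames and ports are taken verbatim from the runbook.

```python
#!/usr/bin/env python3
"""Sketch of the scriptable parts of the Othrys verification checklist.

Assumptions: `requests` is installed and the TURN listener answers on TCP as
well as UDP; a UDP-only coturn setup would need a STUN client instead.
"""
import socket

import requests

WELL_KNOWN_URLS = [
    "https://live.bstein.dev/.well-known/matrix/client",
    "https://matrix.live.bstein.dev/.well-known/matrix/client",
]
TURN_ENDPOINTS = [("turn.live.bstein.dev", 3478), ("turn.live.bstein.dev", 5349)]


def check_well_known() -> None:
    # Checklist item 4: both well-known endpoints must return valid JSON.
    for url in WELL_KNOWN_URLS:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        body = resp.json()  # raises ValueError if the response is not JSON
        assert "m.homeserver" in body, f"{url}: missing m.homeserver"
        print(f"ok: {url}")


def check_turn_tcp() -> None:
    # Checklist item 5: TURN ports reachable from outside (TCP reachability only).
    for host, port in TURN_ENDPOINTS:
        with socket.create_connection((host, port), timeout=5):
            print(f"ok: {host}:{port} reachable (TCP)")


if __name__ == "__main__":
    check_well_known()
    check_turn_tcp()
```

Run from any host with WAN egress this covers the same ground as checklist items 4 and 5; items 1 to 3 (guest join, Keycloak login, video rooms) remain manual.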
+ }, { "path": "runbooks/kb-authoring.md", "title": "KB authoring: what to write (and what not to)", @@ -69,5 +85,13 @@ "clusters/atlas/<...>" ], "body": "# \n\n## What this is\n\n## For users (how to)\n\n## For operators (where configured)\n\n## Troubleshooting (symptoms \u2192 checks)" + }, + { + "path": "software/metis.md", + "title": "metis", + "tags": [], + "entrypoints": [], + "source_paths": [], + "body": "# Metis (node recovery)\n\n## Node classes (current map)\n- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)\n- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)\n- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks)\n- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent)\n- amd64 agents: titan-22/24 (Debian 13, k3s agent)\n- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, plus any newcomers.\n\n### Jetson nodes (titan-20/21)\n- Ubuntu 20.04.6 (Focal), kernel 5.10.104-tegra, CRI containerd 2.0.5-k3s2, arch arm64.\n- Storage: NVMe 232G at / (ext4); onboard mmc partitions present but root on NVMe; 1.9T sda present (unused).\n- k3s agent with drop-in 99-nofile.conf.\n\n## Longhorn disk UUIDs (critical nodes)\n- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4)\n- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4)\n- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4)\n- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4)\n\n## Metis repo (~/Development/metis)\n- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`).\n- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints).\n- `AGENTS.md` in repo is untracked and holds raw notes.\n\n## Next implementation steps\n- Add per-class golden image refs and checksums (Harbor or file://) when ready.\n- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. 
Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths.\n- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection.\n- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited.\n\n## Node OS/Kernel/CRI snapshot (Jan 2026)\n- titan-04: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-05: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-06: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-07: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-08: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-09: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0a: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0b: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0c: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-10: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-11: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-12: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-13: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-14: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-15: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-17: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-18: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-19: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-20: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-21: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-22: Debian 13 (trixie), kernel 6.12.41+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64\n- titan-24: Debian 13 (trixie), kernel 6.12.57+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64\n\n\n### External hosts\n- titan-db: Ubuntu 24.10, kernel 6.11.0-1015-raspi, root on /dev/sda2 ext4 (465G), boot vfat /dev/sda1; PostgreSQL service enabled.\n- titan-jh: Arch Linux ARM (rolling), kernel 6.18.4-2-rpi, NVMe root ext4 238G (/), boot vfat 512M; ~495 packages installed (pacman -Q).\n- titan-23/oceanus: TODO audit (future).\n\n\n### Control plane Pis (titan-0a/0b/0c)\n- Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, containerd 2.0.5-k3s2.\n- Storage: 477G SSD root (/dev/sda2 ext4), /boot/firmware vfat (/dev/sda1). 
fstab uses LABEL=writable and LABEL=system-boot.\n- k3s server (control-plane taint expected); etcd snapshots not yet cataloged (TODO).\n\n\n## k3s versions\n- rpi5 workers/control-plane: k3s v1.33.3+k3s1 (crictl v1.31.0-k3s2)\n- rpi4 nodes: k3s v1.31.5+k3s1 (crictl v1.31.0-k3s2)\n- Jetson titan-20/21: k3s v1.33.3+k3s1 (per node info), crictl v1.31.0-k3s2" } ] diff --git a/services/comms/knowledge/diagrams/atlas-http.mmd b/services/comms/knowledge/diagrams/atlas-http.mmd index ab7c362..1aa7ac8 100644 --- a/services/comms/knowledge/diagrams/atlas-http.mmd +++ b/services/comms/knowledge/diagrams/atlas-http.mmd @@ -17,6 +17,11 @@ flowchart LR host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"] svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend + host_budget_bstein_dev["budget.bstein.dev"] + svc_finance_actual_budget["finance/actual-budget (Service)"] + host_budget_bstein_dev --> svc_finance_actual_budget + wl_finance_actual_budget["finance/actual-budget (Deployment)"] + svc_finance_actual_budget --> wl_finance_actual_budget host_call_live_bstein_dev["call.live.bstein.dev"] svc_comms_element_call["comms/element-call (Service)"] host_call_live_bstein_dev --> svc_comms_element_call @@ -37,6 +42,11 @@ flowchart LR host_cloud_bstein_dev --> svc_nextcloud_nextcloud wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"] svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud + host_health_bstein_dev["health.bstein.dev"] + svc_health_wger["health/wger (Service)"] + host_health_bstein_dev --> svc_health_wger + wl_health_wger["health/wger (Deployment)"] + svc_health_wger --> wl_health_wger host_kit_live_bstein_dev["kit.live.bstein.dev"] svc_comms_livekit_token_service["comms/livekit-token-service (Service)"] host_kit_live_bstein_dev --> svc_comms_livekit_token_service @@ -50,6 +60,14 @@ flowchart LR host_live_bstein_dev --> svc_comms_matrix_wellknown svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"] host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse + svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"] + host_live_bstein_dev --> svc_comms_matrix_guest_register + wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"] + svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register + svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"] + host_live_bstein_dev --> svc_comms_matrix_authentication_service + wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"] + svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service host_logs_bstein_dev["logs.bstein.dev"] svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"] host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs @@ -64,21 +82,20 @@ flowchart LR svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"] host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front host_matrix_live_bstein_dev["matrix.live.bstein.dev"] - svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"] host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service - wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"] - svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service host_matrix_live_bstein_dev --> 
svc_comms_matrix_wellknown host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse - svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"] host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register - wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"] - svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register host_monero_bstein_dev["monero.bstein.dev"] svc_crypto_monerod["crypto/monerod (Service)"] host_monero_bstein_dev --> svc_crypto_monerod wl_crypto_monerod["crypto/monerod (Deployment)"] svc_crypto_monerod --> wl_crypto_monerod + host_money_bstein_dev["money.bstein.dev"] + svc_finance_firefly["finance/firefly (Service)"] + host_money_bstein_dev --> svc_finance_firefly + wl_finance_firefly["finance/firefly (Deployment)"] + svc_finance_firefly --> wl_finance_firefly host_notes_bstein_dev["notes.bstein.dev"] svc_outline_outline["outline/outline (Service)"] host_notes_bstein_dev --> svc_outline_outline @@ -143,19 +160,29 @@ flowchart LR svc_comms_livekit wl_comms_livekit svc_comms_othrys_synapse_matrix_synapse - svc_comms_matrix_authentication_service - wl_comms_matrix_authentication_service svc_comms_matrix_guest_register wl_comms_matrix_guest_register + svc_comms_matrix_authentication_service + wl_comms_matrix_authentication_service end subgraph crypto[crypto] svc_crypto_monerod wl_crypto_monerod end + subgraph finance[finance] + svc_finance_actual_budget + wl_finance_actual_budget + svc_finance_firefly + wl_finance_firefly + end subgraph gitea[gitea] svc_gitea_gitea wl_gitea_gitea end + subgraph health[health] + svc_health_wger + wl_health_wger + end subgraph jellyfin[jellyfin] svc_jellyfin_pegasus wl_jellyfin_pegasus diff --git a/services/comms/knowledge/metis.md b/services/comms/knowledge/metis.md new file mode 100644 index 0000000..5b0d06b --- /dev/null +++ b/services/comms/knowledge/metis.md @@ -0,0 +1,26 @@ +# Metis (node recovery) + +## Node classes (current map) +- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent) +- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint) +- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks) +- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent) +- amd64 agents: titan-22/24 (Debian 13, k3s agent) +- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, future titan-20/21 (when added), plus any newcomers. + +## Longhorn disk UUIDs (critical nodes) +- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4) +- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4) +- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4) +- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4) + +## Metis repo (~/Development/metis) +- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`). +- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints). +- `AGENTS.md` in repo is untracked and holds raw notes. + +## Next implementation steps +- Add per-class golden image refs and checksums (Harbor or file://) when ready. 
+- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths. +- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection. +- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited. diff --git a/services/comms/knowledge/runbooks/comms-verify.md b/services/comms/knowledge/runbooks/comms-verify.md new file mode 100644 index 0000000..8c09d0a --- /dev/null +++ b/services/comms/knowledge/runbooks/comms-verify.md @@ -0,0 +1,30 @@ +--- +title: Othrys verification checklist +tags: + - comms + - matrix + - element + - livekit +entrypoints: + - https://live.bstein.dev + - https://matrix.live.bstein.dev +--- + +1) Guest join: +- Open a private window and visit: + `https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join` +- Confirm the guest join flow works and the displayname becomes `-`. + +2) Keycloak login: +- Log in from `https://live.bstein.dev` and confirm MAS -> Keycloak -> Element redirect. + +3) Video rooms: +- Start an Element Call room and confirm audio/video with a second account. +- Check that guests can read public rooms but cannot start calls. + +4) Well-known: +- `https://live.bstein.dev/.well-known/matrix/client` returns JSON. +- `https://matrix.live.bstein.dev/.well-known/matrix/client` returns JSON. + +5) TURN reachability: +- Confirm `turn.live.bstein.dev:3478` and `turns:5349` are reachable from WAN. diff --git a/services/comms/knowledge/software/metis.md b/services/comms/knowledge/software/metis.md new file mode 100644 index 0000000..7ca3b39 --- /dev/null +++ b/services/comms/knowledge/software/metis.md @@ -0,0 +1,73 @@ +# Metis (node recovery) + +## Node classes (current map) +- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent) +- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint) +- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks) +- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent) +- amd64 agents: titan-22/24 (Debian 13, k3s agent) +- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, plus any newcomers. + +### Jetson nodes (titan-20/21) +- Ubuntu 20.04.6 (Focal), kernel 5.10.104-tegra, CRI containerd 2.0.5-k3s2, arch arm64. +- Storage: NVMe 232G at / (ext4); onboard mmc partitions present but root on NVMe; 1.9T sda present (unused). +- k3s agent with drop-in 99-nofile.conf. + +## Longhorn disk UUIDs (critical nodes) +- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4) +- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4) +- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4) +- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4) + +## Metis repo (~/Development/metis) +- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`). 
+- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints). +- `AGENTS.md` in repo is untracked and holds raw notes. + +## Next implementation steps +- Add per-class golden image refs and checksums (Harbor or file://) when ready. +- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths. +- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection. +- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited. + +## Node OS/Kernel/CRI snapshot (Jan 2026) +- titan-04: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-05: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-06: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-07: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-08: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-09: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-0a: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-0b: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-0c: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-10: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-11: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-12: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-13: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-14: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-15: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-17: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-18: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-19: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-20: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-21: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-22: Debian 13 (trixie), kernel 6.12.41+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64 +- titan-24: Debian 13 (trixie), kernel 6.12.57+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64 + + +### External hosts +- titan-db: Ubuntu 24.10, kernel 6.11.0-1015-raspi, root on /dev/sda2 ext4 (465G), boot vfat /dev/sda1; PostgreSQL service enabled. +- titan-jh: Arch Linux ARM (rolling), kernel 6.18.4-2-rpi, NVMe root ext4 238G (/), boot vfat 512M; ~495 packages installed (pacman -Q). +- titan-23/oceanus: TODO audit (future). + + +### Control plane Pis (titan-0a/0b/0c) +- Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, containerd 2.0.5-k3s2. 
+- Storage: 477G SSD root (/dev/sda2 ext4), /boot/firmware vfat (/dev/sda1). fstab uses LABEL=writable and LABEL=system-boot. +- k3s server (control-plane taint expected); etcd snapshots not yet cataloged (TODO). + + +## k3s versions +- rpi5 workers/control-plane: k3s v1.33.3+k3s1 (crictl v1.31.0-k3s2) +- rpi4 nodes: k3s v1.31.5+k3s1 (crictl v1.31.0-k3s2) +- Jetson titan-20/21: k3s v1.33.3+k3s1 (per node info), crictl v1.31.0-k3s2 diff --git a/services/comms/kustomization.yaml b/services/comms/kustomization.yaml index 3360067..969ca58 100644 --- a/services/comms/kustomization.yaml +++ b/services/comms/kustomization.yaml @@ -14,6 +14,7 @@ resources: - guest-register-deployment.yaml - guest-register-service.yaml - atlasbot-deployment.yaml + - atlasbot-service.yaml - wellknown.yaml - atlasbot-rbac.yaml - mas-secrets-ensure-rbac.yaml @@ -21,23 +22,24 @@ resources: - mas-db-ensure-rbac.yaml - synapse-signingkey-ensure-rbac.yaml - vault-sync-deployment.yaml - - mas-admin-client-secret-ensure-job.yaml - - mas-db-ensure-job.yaml - - comms-secrets-ensure-job.yaml - - synapse-signingkey-ensure-job.yaml - - synapse-seeder-admin-ensure-job.yaml - - synapse-user-seed-job.yaml - - mas-local-users-ensure-job.yaml + - oneoffs/mas-admin-client-secret-ensure-job.yaml + - oneoffs/mas-db-ensure-job.yaml + - oneoffs/comms-secrets-ensure-job.yaml + - oneoffs/synapse-admin-ensure-job.yaml + - oneoffs/synapse-signingkey-ensure-job.yaml + - oneoffs/synapse-seeder-admin-ensure-job.yaml + - oneoffs/synapse-user-seed-job.yaml + - oneoffs/mas-local-users-ensure-job.yaml - mas-deployment.yaml - livekit-token-deployment.yaml - livekit.yaml - coturn.yaml - seed-othrys-room.yaml - guest-name-job.yaml - - othrys-kick-numeric-job.yaml + - oneoffs/othrys-kick-numeric-job.yaml - pin-othrys-job.yaml - reset-othrys-room-job.yaml - - bstein-force-leave-job.yaml + - oneoffs/bstein-force-leave-job.yaml - livekit-ingress.yaml - livekit-middlewares.yaml - matrix-ingress.yaml @@ -73,5 +75,6 @@ configMapGenerator: - INDEX.md=knowledge/INDEX.md - atlas.json=knowledge/catalog/atlas.json - atlas-summary.json=knowledge/catalog/atlas-summary.json + - metrics.json=knowledge/catalog/metrics.json - runbooks.json=knowledge/catalog/runbooks.json - atlas-http.mmd=knowledge/diagrams/atlas-http.mmd diff --git a/services/comms/mas-configmap.yaml b/services/comms/mas-configmap.yaml index 5e6cfdd..9d2c11e 100644 --- a/services/comms/mas-configmap.yaml +++ b/services/comms/mas-configmap.yaml @@ -72,7 +72,7 @@ data: template: "{{ user.name }}" email: action: force - template: "{{ user.email }}" + template: "{{ user.mailu_email }}" policy: data: diff --git a/services/comms/bstein-force-leave-job.yaml b/services/comms/oneoffs/bstein-force-leave-job.yaml similarity index 96% rename from services/comms/bstein-force-leave-job.yaml rename to services/comms/oneoffs/bstein-force-leave-job.yaml index 0286f8c..7efe826 100644 --- a/services/comms/bstein-force-leave-job.yaml +++ b/services/comms/oneoffs/bstein-force-leave-job.yaml @@ -1,10 +1,15 @@ -# services/comms/bstein-force-leave-job.yaml +# services/comms/oneoffs/bstein-force-leave-job.yaml +# One-off job for comms/bstein-leave-rooms-12. +# Purpose: bstein leave rooms 12 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
apiVersion: batch/v1 kind: Job metadata: name: bstein-leave-rooms-12 namespace: comms spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/comms/comms-secrets-ensure-job.yaml b/services/comms/oneoffs/comms-secrets-ensure-job.yaml similarity index 91% rename from services/comms/comms-secrets-ensure-job.yaml rename to services/comms/oneoffs/comms-secrets-ensure-job.yaml index b71dd40..35ca73c 100644 --- a/services/comms/comms-secrets-ensure-job.yaml +++ b/services/comms/oneoffs/comms-secrets-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/comms/comms-secrets-ensure-job.yaml +# services/comms/oneoffs/comms-secrets-ensure-job.yaml +# One-off job for comms/comms-secrets-ensure-7. +# Purpose: comms secrets ensure 7 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: - name: comms-secrets-ensure-6 + name: comms-secrets-ensure-7 namespace: comms spec: + suspend: true backoffLimit: 1 ttlSecondsAfterFinished: 3600 template: diff --git a/services/comms/mas-admin-client-secret-ensure-job.yaml b/services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml similarity index 90% rename from services/comms/mas-admin-client-secret-ensure-job.yaml rename to services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml index 7b05cca..e1d5458 100644 --- a/services/comms/mas-admin-client-secret-ensure-job.yaml +++ b/services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml @@ -1,4 +1,8 @@ -# services/comms/mas-admin-client-secret-ensure-job.yaml +# services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml +# One-off job for comms/mas-admin-client-secret-writer. +# Purpose: mas admin client secret writer (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: v1 kind: ServiceAccount metadata: @@ -41,6 +45,7 @@ metadata: name: mas-admin-client-secret-ensure-11 namespace: comms spec: + suspend: true backoffLimit: 2 template: spec: diff --git a/services/comms/mas-db-ensure-job.yaml b/services/comms/oneoffs/mas-db-ensure-job.yaml similarity index 91% rename from services/comms/mas-db-ensure-job.yaml rename to services/comms/oneoffs/mas-db-ensure-job.yaml index 56707a9..44137da 100644 --- a/services/comms/mas-db-ensure-job.yaml +++ b/services/comms/oneoffs/mas-db-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/comms/mas-db-ensure-job.yaml +# services/comms/oneoffs/mas-db-ensure-job.yaml +# One-off job for comms/mas-db-ensure-22. +# Purpose: mas db ensure 22 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
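Because a Job's pod template is immutable once created, these one-offs are re-run by bumping the numeric suffix in `metadata.name` (comms-secrets-ensure-6 becomes -7 above) rather than editing the existing Job. A minimal sketch of a hypothetical helper that performs the bump as a plain text edit so the header comments survive; it only touches the first numbered `name:` in a file:

```python
# Hypothetical helper (not part of this change): bump the numeric run suffix in a
# one-off Job manifest, e.g. comms-secrets-ensure-6 -> comms-secrets-ensure-7.
# Plain text edit on purpose: round-tripping through a YAML parser would drop the
# header comments. Only the first numbered name: is bumped (fine for single-Job files).
import re
import sys
from pathlib import Path

NAME_RE = re.compile(r"^(\s*name:\s*)([a-z0-9-]+?)-(\d+)\s*$", re.MULTILINE)


def bump_oneoff(path: Path) -> str:
    text = path.read_text(encoding="utf-8")

    def _bump(match: re.Match) -> str:
        prefix, base, run = match.groups()
        return f"{prefix}{base}-{int(run) + 1}"

    new_text, count = NAME_RE.subn(_bump, text, count=1)
    if not count:
        raise SystemExit(f"no numbered metadata.name found in {path}")
    path.write_text(new_text, encoding="utf-8")
    return NAME_RE.search(new_text).group(0).strip()


if __name__ == "__main__":
    print(bump_oneoff(Path(sys.argv[1])))
```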
apiVersion: batch/v1 kind: Job metadata: name: mas-db-ensure-22 namespace: comms spec: + suspend: true backoffLimit: 1 ttlSecondsAfterFinished: 600 template: diff --git a/services/comms/mas-local-users-ensure-job.yaml b/services/comms/oneoffs/mas-local-users-ensure-job.yaml similarity index 96% rename from services/comms/mas-local-users-ensure-job.yaml rename to services/comms/oneoffs/mas-local-users-ensure-job.yaml index 5802009..7b51072 100644 --- a/services/comms/mas-local-users-ensure-job.yaml +++ b/services/comms/oneoffs/mas-local-users-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/comms/mas-local-users-ensure-job.yaml +# services/comms/oneoffs/mas-local-users-ensure-job.yaml +# One-off job for comms/mas-local-users-ensure-18. +# Purpose: mas local users ensure 18 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: - name: mas-local-users-ensure-15 + name: mas-local-users-ensure-18 namespace: comms spec: + suspend: true backoffLimit: 1 ttlSecondsAfterFinished: 3600 template: diff --git a/services/comms/othrys-kick-numeric-job.yaml b/services/comms/oneoffs/othrys-kick-numeric-job.yaml similarity index 96% rename from services/comms/othrys-kick-numeric-job.yaml rename to services/comms/oneoffs/othrys-kick-numeric-job.yaml index 0d3914a..e38a6bb 100644 --- a/services/comms/othrys-kick-numeric-job.yaml +++ b/services/comms/oneoffs/othrys-kick-numeric-job.yaml @@ -1,10 +1,15 @@ -# services/comms/othrys-kick-numeric-job.yaml +# services/comms/oneoffs/othrys-kick-numeric-job.yaml +# One-off job for comms/othrys-kick-numeric-8. +# Purpose: othrys kick numeric 8 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: othrys-kick-numeric-8 namespace: comms spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/comms/oneoffs/synapse-admin-ensure-job.yaml b/services/comms/oneoffs/synapse-admin-ensure-job.yaml new file mode 100644 index 0000000..95bc9f2 --- /dev/null +++ b/services/comms/oneoffs/synapse-admin-ensure-job.yaml @@ -0,0 +1,219 @@ +# services/comms/oneoffs/synapse-admin-ensure-job.yaml +# One-off job for comms/synapse-admin-ensure-3. +# Purpose: synapse admin ensure 3 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
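Every manifest moved under oneoffs/ now carries `spec.suspend: true`. A guard in the style of the existing ci/tests/glue pytest suite could enforce that; the test below is a sketch rather than an existing file, and assumes PyYAML plus the `services/*/oneoffs/*.yaml` layout with the repo root as the working directory:

```python
# Hypothetical guard test (not present in ci/tests today): every manifest under
# services/*/oneoffs/ should keep its Jobs suspended so Flux never runs them
# implicitly on reconcile.
from pathlib import Path

import yaml


def test_oneoff_jobs_are_suspended():
    offenders = []
    for path in Path("services").glob("*/oneoffs/*.yaml"):
        for doc in yaml.safe_load_all(path.read_text(encoding="utf-8")):
            if not isinstance(doc, dict) or doc.get("kind") != "Job":
                continue
            if doc.get("spec", {}).get("suspend") is not True:
                offenders.append(str(path))
    assert not offenders, f"one-off Jobs missing spec.suspend: true: {offenders}"
```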
+apiVersion: batch/v1 +kind: Job +metadata: + name: synapse-admin-ensure-3 + namespace: comms +spec: + suspend: true + backoffLimit: 0 + ttlSecondsAfterFinished: 3600 + template: + spec: + serviceAccountName: comms-secrets-ensure + restartPolicy: Never + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/worker + operator: Exists + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: kubernetes.io/arch + operator: In + values: ["arm64"] + containers: + - name: ensure + image: python:3.11-slim + env: + - name: VAULT_ADDR + value: http://vault.vault.svc.cluster.local:8200 + - name: VAULT_ROLE + value: comms-secrets + - name: SYNAPSE_ADMIN_URL + value: http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008 + command: + - /bin/sh + - -c + - | + set -euo pipefail + pip install --no-cache-dir psycopg2-binary bcrypt + python - <<'PY' + import json + import os + import secrets + import string + import time + import urllib.error + import urllib.request + + import bcrypt + import psycopg2 + + VAULT_ADDR = os.environ.get("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200").rstrip("/") + VAULT_ROLE = os.environ.get("VAULT_ROLE", "comms-secrets") + SA_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token" + PGHOST = "postgres-service.postgres.svc.cluster.local" + PGPORT = 5432 + PGDATABASE = "synapse" + PGUSER = "synapse" + + def log(msg: str) -> None: + print(msg, flush=True) + + def request_json(url: str, payload: dict | None = None) -> dict: + data = None + headers = {"Content-Type": "application/json"} + if payload is not None: + data = json.dumps(payload).encode("utf-8") + req = urllib.request.Request(url, data=data, headers=headers, method="POST" if data else "GET") + with urllib.request.urlopen(req, timeout=30) as resp: + return json.loads(resp.read().decode("utf-8")) + + def vault_login() -> str: + with open(SA_TOKEN_PATH, "r", encoding="utf-8") as f: + jwt = f.read().strip() + payload = {"jwt": jwt, "role": VAULT_ROLE} + resp = request_json(f"{VAULT_ADDR}/v1/auth/kubernetes/login", payload) + token = resp.get("auth", {}).get("client_token") + if not token: + raise RuntimeError("vault login failed") + return token + + def vault_get(token: str, path: str) -> dict: + req = urllib.request.Request( + f"{VAULT_ADDR}/v1/kv/data/atlas/{path}", + headers={"X-Vault-Token": token}, + ) + try: + with urllib.request.urlopen(req, timeout=30) as resp: + payload = json.loads(resp.read().decode("utf-8")) + return payload.get("data", {}).get("data", {}) + except urllib.error.HTTPError as exc: + if exc.code == 404: + return {} + raise + + def vault_put(token: str, path: str, data: dict) -> None: + payload = {"data": data} + req = urllib.request.Request( + f"{VAULT_ADDR}/v1/kv/data/atlas/{path}", + data=json.dumps(payload).encode("utf-8"), + headers={"X-Vault-Token": token, "Content-Type": "application/json"}, + method="POST", + ) + with urllib.request.urlopen(req, timeout=30) as resp: + resp.read() + + def random_password(length: int = 32) -> str: + alphabet = string.ascii_letters + string.digits + return "".join(secrets.choice(alphabet) for _ in range(length)) + + def ensure_admin_creds(token: str) -> dict: + data = vault_get(token, "comms/synapse-admin") + username = (data.get("username") or "").strip() or "synapse-admin" + password = (data.get("password") or "").strip() + if not password: + password = random_password() + 
data["username"] = username + data["password"] = password + vault_put(token, "comms/synapse-admin", data) + return data + + def ensure_user(cur, cols, user_id, password, admin): + now_ms = int(time.time() * 1000) + values = { + "name": user_id, + "password_hash": bcrypt.hashpw(password.encode(), bcrypt.gensalt()).decode(), + "creation_ts": now_ms, + } + + def add_flag(name, flag): + if name not in cols: + return + if cols[name]["type"] in ("smallint", "integer"): + values[name] = int(flag) + else: + values[name] = bool(flag) + + add_flag("admin", admin) + add_flag("deactivated", False) + add_flag("shadow_banned", False) + add_flag("is_guest", False) + + columns = list(values.keys()) + placeholders = ", ".join(["%s"] * len(columns)) + updates = ", ".join([f"{col}=EXCLUDED.{col}" for col in columns if col != "name"]) + query = f"INSERT INTO users ({', '.join(columns)}) VALUES ({placeholders}) ON CONFLICT (name) DO UPDATE SET {updates};" + cur.execute(query, [values[c] for c in columns]) + + def get_cols(cur): + cur.execute( + """ + SELECT column_name, is_nullable, column_default, data_type + FROM information_schema.columns + WHERE table_schema = 'public' AND table_name = 'users' + """ + ) + cols = {} + for name, is_nullable, default, data_type in cur.fetchall(): + cols[name] = { + "nullable": is_nullable == "YES", + "default": default, + "type": data_type, + } + return cols + + def ensure_access_token(cur, user_id, token_value): + cur.execute("SELECT COALESCE(MAX(id), 0) + 1 FROM access_tokens") + token_id = cur.fetchone()[0] + cur.execute( + """ + INSERT INTO access_tokens (id, user_id, token, device_id, valid_until_ms) + VALUES (%s, %s, %s, %s, NULL) + ON CONFLICT (token) DO NOTHING + """, + (token_id, user_id, token_value, "ariadne-admin"), + ) + + vault_token = vault_login() + admin_data = ensure_admin_creds(vault_token) + if admin_data.get("access_token"): + log("synapse admin token already present") + raise SystemExit(0) + + synapse_db = vault_get(vault_token, "comms/synapse-db") + pg_password = synapse_db.get("POSTGRES_PASSWORD") + if not pg_password: + raise RuntimeError("synapse db password missing") + + user_id = f"@{admin_data['username']}:live.bstein.dev" + conn = psycopg2.connect( + host=PGHOST, + port=PGPORT, + dbname=PGDATABASE, + user=PGUSER, + password=pg_password, + ) + token_value = secrets.token_urlsafe(32) + try: + with conn: + with conn.cursor() as cur: + cols = get_cols(cur) + ensure_user(cur, cols, user_id, admin_data["password"], True) + ensure_access_token(cur, user_id, token_value) + finally: + conn.close() + + admin_data["access_token"] = token_value + vault_put(vault_token, "comms/synapse-admin", admin_data) + log("synapse admin token stored") + PY diff --git a/services/comms/synapse-seeder-admin-ensure-job.yaml b/services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml similarity index 92% rename from services/comms/synapse-seeder-admin-ensure-job.yaml rename to services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml index 9905658..1d8972e 100644 --- a/services/comms/synapse-seeder-admin-ensure-job.yaml +++ b/services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/comms/synapse-seeder-admin-ensure-job.yaml +# services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml +# One-off job for comms/synapse-seeder-admin-ensure-9. +# Purpose: synapse seeder admin ensure 9 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. 
+# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: - name: synapse-seeder-admin-ensure-7 + name: synapse-seeder-admin-ensure-9 namespace: comms spec: + suspend: true backoffLimit: 2 template: metadata: diff --git a/services/comms/synapse-signingkey-ensure-job.yaml b/services/comms/oneoffs/synapse-signingkey-ensure-job.yaml similarity index 88% rename from services/comms/synapse-signingkey-ensure-job.yaml rename to services/comms/oneoffs/synapse-signingkey-ensure-job.yaml index 402a820..bbc4595 100644 --- a/services/comms/synapse-signingkey-ensure-job.yaml +++ b/services/comms/oneoffs/synapse-signingkey-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/comms/synapse-signingkey-ensure-job.yaml +# services/comms/oneoffs/synapse-signingkey-ensure-job.yaml +# One-off job for comms/othrys-synapse-signingkey-ensure-7. +# Purpose: othrys synapse signingkey ensure 7 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: othrys-synapse-signingkey-ensure-7 namespace: comms spec: + suspend: true backoffLimit: 2 template: spec: diff --git a/services/comms/synapse-user-seed-job.yaml b/services/comms/oneoffs/synapse-user-seed-job.yaml similarity index 95% rename from services/comms/synapse-user-seed-job.yaml rename to services/comms/oneoffs/synapse-user-seed-job.yaml index 7fef796..a732739 100644 --- a/services/comms/synapse-user-seed-job.yaml +++ b/services/comms/oneoffs/synapse-user-seed-job.yaml @@ -1,10 +1,15 @@ -# services/comms/synapse-user-seed-job.yaml +# services/comms/oneoffs/synapse-user-seed-job.yaml +# One-off job for comms/synapse-user-seed-8. +# Purpose: synapse user seed 8 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
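The synapse-admin-ensure script above stores `username`, `password`, and `access_token` under kv/data/atlas/comms/synapse-admin. A consumer-side sketch (stdlib only) that reads the token back and checks it against the standard Matrix whoami endpoint; the `VAULT_TOKEN` environment variable stands in for the Kubernetes-auth login the job performs, and the default URLs mirror the job's env values:

```python
# Hypothetical consumer sketch (e.g. for a bot that needs the admin token): read the
# credentials that synapse-admin-ensure stores in Vault and verify the access token.
# Assumes VAULT_ADDR/VAULT_TOKEN in the environment and the in-cluster Synapse URL.
import json
import os
import urllib.request

VAULT_ADDR = os.environ.get("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200").rstrip("/")
SYNAPSE_URL = os.environ.get("SYNAPSE_ADMIN_URL", "http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008")


def _get_json(url: str, headers: dict[str, str]) -> dict:
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read().decode("utf-8"))


def admin_token() -> str:
    # KV v2 read: payload nests the secret under data.data.
    data = _get_json(
        f"{VAULT_ADDR}/v1/kv/data/atlas/comms/synapse-admin",
        {"X-Vault-Token": os.environ["VAULT_TOKEN"]},
    )
    return data["data"]["data"]["access_token"]


def whoami(token: str) -> str:
    # Standard Matrix client endpoint; returns the user_id the token belongs to.
    data = _get_json(
        f"{SYNAPSE_URL}/_matrix/client/v3/account/whoami",
        {"Authorization": f"Bearer {token}"},
    )
    return data.get("user_id", "")


if __name__ == "__main__":
    print(whoami(admin_token()))
```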
apiVersion: batch/v1 kind: Job metadata: - name: synapse-user-seed-7 + name: synapse-user-seed-8 namespace: comms spec: + suspend: true backoffLimit: 1 ttlSecondsAfterFinished: 3600 template: diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index e8bd1a8..be256c0 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3,7 +3,9 @@ import json import os import re import ssl +import threading import time +from http.server import BaseHTTPRequestHandler, HTTPServer from typing import Any from urllib import error, parse, request @@ -14,17 +16,31 @@ PASSWORD = os.environ["BOT_PASS"] ROOM_ALIAS = "#othrys:live.bstein.dev" OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/") -MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b-instruct-q4_0") +MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5:14b-instruct") +MODEL_FAST = os.environ.get("ATLASBOT_MODEL_FAST", "") +MODEL_DEEP = os.environ.get("ATLASBOT_MODEL_DEEP", "") +FALLBACK_MODEL = os.environ.get("OLLAMA_FALLBACK_MODEL", "") API_KEY = os.environ.get("CHAT_API_KEY", "") +OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480")) +ATLASBOT_HTTP_PORT = int(os.environ.get("ATLASBOT_HTTP_PORT", "8090")) +ATLASBOT_INTERNAL_TOKEN = os.environ.get("ATLASBOT_INTERNAL_TOKEN") or os.environ.get("CHAT_API_HOMEPAGE", "") +SNAPSHOT_TTL_SEC = int(os.environ.get("ATLASBOT_SNAPSHOT_TTL_SEC", "30")) KB_DIR = os.environ.get("KB_DIR", "") VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428") +ARIADNE_STATE_URL = os.environ.get("ARIADNE_STATE_URL", "") +ARIADNE_STATE_TOKEN = os.environ.get("ARIADNE_STATE_TOKEN", "") BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{USER},atlas") SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev") MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500")) MAX_TOOL_CHARS = int(os.environ.get("ATLASBOT_MAX_TOOL_CHARS", "2500")) +MAX_FACTS_CHARS = int(os.environ.get("ATLASBOT_MAX_FACTS_CHARS", "8000")) +MAX_CONTEXT_CHARS = int(os.environ.get("ATLASBOT_MAX_CONTEXT_CHARS", "12000")) +THINKING_INTERVAL_SEC = int(os.environ.get("ATLASBOT_THINKING_INTERVAL_SEC", "120")) +OLLAMA_RETRIES = int(os.environ.get("ATLASBOT_OLLAMA_RETRIES", "2")) +OLLAMA_SERIALIZE = os.environ.get("ATLASBOT_OLLAMA_SERIALIZE", "true").lower() != "false" TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9_.-]{1,}", re.IGNORECASE) HOST_RE = re.compile(r"(?i)([a-z0-9-]+(?:\\.[a-z0-9-]+)+)") @@ -52,11 +68,34 @@ STOPWORDS = { "help", "atlas", "othrys", + "system", + "systems", + "service", + "services", + "app", + "apps", + "platform", + "software", + "tool", + "tools", } METRIC_HINT_WORDS = { + "bandwidth", + "connections", + "cpu", + "database", + "db", + "disk", "health", + "memory", + "network", + "node", + "nodes", + "postgres", "status", + "storage", + "usage", "down", "slow", "error", @@ -69,11 +108,221 @@ METRIC_HINT_WORDS = { "pending", "unreachable", "latency", + "pod", + "pods", } +CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL) +TITAN_NODE_RE = re.compile(r"\btitan-[0-9a-z]{2}\b", re.IGNORECASE) +TITAN_RANGE_RE = re.compile(r"\btitan-([0-9a-z]{2})/([0-9a-z]{2})\b", re.IGNORECASE) +_DASH_CHARS = "\u2010\u2011\u2012\u2013\u2014\u2015\u2212\uFE63\uFF0D" +CONFIDENCE_RE = re.compile(r"confidence\s*:\s*(high|medium|low)\b", re.IGNORECASE) + +OPERATION_HINTS = { + "count": ("how many", "count", "number", "total"), + "list": ("list", "which", "what are", 
"show", "names"), + "top": ("top", "hottest", "highest", "most", "largest", "max", "maximum", "busiest", "busy"), + "bottom": ("lowest", "least", "minimum", "min", "smallest"), + "status": ("ready", "not ready", "unready", "down", "missing", "status"), +} + +METRIC_HINTS = { + "cpu": ("cpu",), + "ram": ("ram", "memory", "mem"), + "net": ("net", "network", "bandwidth", "throughput"), + "io": ("io", "disk", "storage"), + "connections": ("connections", "conn", "postgres", "database", "db"), + "pods": ("pods", "pod"), +} + +CLUSTER_HINT_WORDS = { + "atlas", + "titan", + "cluster", + "k8s", + "kubernetes", + "health", + "node", + "nodes", + "hardware", + "architecture", + "worker", + "workers", + "pod", + "pods", + "namespace", + "service", + "deployment", + "daemonset", + "statefulset", + "snapshot", + "anomaly", + "anomalies", + "monitor", + "monitoring", + "runbook", + "runbooks", + "documentation", + "docs", + "playbook", + "utilization", + "usage", + "grafana", + "victoria", + "prometheus", + "ariadne", + "mailu", + "nextcloud", + "vaultwarden", + "firefly", + "wger", + "jellyfin", + "planka", + "budget", + "element", + "synapse", + "mas", + "comms", + "longhorn", + "harbor", + "jenkins", + "gitea", + "flux", + "keycloak", + "postgres", + "database", + "db", + "atlasbot", + "jetson", + "rpi", + "raspberry", + "amd64", + "arm64", +} + +_INSIGHT_HINT_WORDS = { + "interesting", + "unconventional", + "surprising", + "weird", + "odd", + "unusual", + "outlier", + "fun", + "cool", + "unique", + "notable", + "coolest", + "risk", + "risky", + "favorite", + "favourite", + "trivia", + "anomaly", + "anomalies", + "monitor", + "monitoring", + "alert", + "alerts", + "stand out", + "stands out", +} + +_OVERVIEW_HINT_WORDS = { + "overview", + "summary", + "describe", + "explain", + "tell me about", + "what do you know", + "health", +} + +_OLLAMA_LOCK = threading.Lock() + +HARDWARE_HINTS = { + "amd64": ("amd64", "x86", "x86_64", "x86-64"), + "jetson": ("jetson",), + "rpi4": ("rpi4", "raspberry pi 4", "raspberry pi-4"), + "rpi5": ("rpi5", "raspberry pi 5", "raspberry pi-5"), + "rpi": ("rpi", "raspberry"), + "arm64": ("arm64", "aarch64"), +} + +def normalize_query(text: str) -> str: + cleaned = (text or "").lower() + for ch in _DASH_CHARS: + cleaned = cleaned.replace(ch, "-") + cleaned = cleaned.replace("_", " ") + cleaned = re.sub(r"\s+", " ", cleaned).strip() + return cleaned + def _tokens(text: str) -> list[str]: - toks = [t.lower() for t in TOKEN_RE.findall(text or "")] - return [t for t in toks if t not in STOPWORDS and len(t) >= 2] + cleaned = re.sub(r"[\\_/]", " ", text or "") + toks = [t.lower() for t in TOKEN_RE.findall(cleaned)] + expanded: list[str] = [] + synonyms = { + "network": "net", + "net": "network", + "memory": "ram", + "ram": "memory", + "i/o": "io", + } + for token in toks: + expanded.append(token) + if "-" in token: + expanded.extend(part for part in token.split("-") if part) + for token in list(expanded): + if token in synonyms: + expanded.append(synonyms[token]) + if token.endswith("s") and len(token) > 3: + expanded.append(token.rstrip("s")) + return [t for t in expanded if t not in STOPWORDS and len(t) >= 2] + + +def _ensure_confidence(text: str) -> str: + if not text: + return "" + lines = text.strip().splitlines() + for idx, line in enumerate(lines): + match = CONFIDENCE_RE.search(line) + if match: + level = match.group(1).lower() + lines[idx] = CONFIDENCE_RE.sub(f"Confidence: {level}", line) + return "\n".join(lines) + lines.append("Confidence: medium") + return "\n".join(lines) 
+ + +def _ollama_endpoint() -> str: + url = (OLLAMA_URL or "").strip() + if not url: + return "" + if url.endswith("/api/chat"): + return url + return url.rstrip("/") + "/api/chat" + + +def _history_to_messages(lines: list[str]) -> list[dict[str, str]]: + messages: list[dict[str, str]] = [] + for line in lines: + raw = (line or "").strip() + if not raw: + continue + role = "user" + content = raw + lowered = raw.lower() + if lowered.startswith("atlas:"): + role = "assistant" + content = raw.split(":", 1)[1].strip() + elif lowered.startswith("user:"): + role = "user" + content = raw.split(":", 1)[1].strip() + elif ":" in raw: + content = raw.split(":", 1)[1].strip() + if content: + messages.append({"role": role, "content": content}) + return messages # Mention detection (Matrix rich mentions + plain @atlas). @@ -97,15 +346,60 @@ def normalize_user_id(token: str) -> str: MENTION_USER_IDS = {normalize_user_id(t).lower() for t in MENTION_TOKENS if normalize_user_id(t)} +def _body_mentions_token(body: str) -> bool: + lower = (body or "").strip().lower() + if not lower: + return False + for token in MENTION_LOCALPARTS: + for prefix in (token, f"@{token}"): + if lower.startswith(prefix + ":") or lower.startswith(prefix + ",") or lower.startswith(prefix + " "): + return True + return False + def is_mentioned(content: dict, body: str) -> bool: if MENTION_RE.search(body or "") is not None: return True + if _body_mentions_token(body or ""): + return True mentions = content.get("m.mentions", {}) user_ids = mentions.get("user_ids", []) if not isinstance(user_ids, list): return False return any(isinstance(uid, str) and uid.lower() in MENTION_USER_IDS for uid in user_ids) +def _strip_bot_mention(text: str) -> str: + if not text: + return "" + if not MENTION_LOCALPARTS: + return text.strip() + names = [re.escape(name) for name in MENTION_LOCALPARTS if name] + if not names: + return text.strip() + pattern = r"^(?:\s*@?(?:" + "|".join(names) + r")(?::)?\s+)+" + cleaned = re.sub(pattern, "", text, flags=re.IGNORECASE).strip() + return cleaned or text.strip() + + +def _detect_mode_from_body(body: str, *, default: str = "deep") -> str: + lower = normalize_query(body or "") + if "atlas_quick" in lower or "atlas-quick" in lower: + return "fast" + if "atlas_smart" in lower or "atlas-smart" in lower: + return "deep" + if lower.startswith("quick ") or lower.startswith("fast "): + return "fast" + if lower.startswith("smart ") or lower.startswith("deep "): + return "deep" + return default + + +def _model_for_mode(mode: str) -> str: + if mode == "fast" and MODEL_FAST: + return MODEL_FAST + if mode == "deep" and MODEL_DEEP: + return MODEL_DEEP + return MODEL + # Matrix HTTP helper. 
def req(method: str, path: str, token: str | None = None, body=None, timeout=60, base: str | None = None): @@ -149,6 +443,8 @@ def send_msg(token: str, room: str, text: str): KB = {"catalog": {}, "runbooks": []} _HOST_INDEX: dict[str, list[dict]] = {} _NAME_INDEX: set[str] = set() +_METRIC_INDEX: list[dict[str, Any]] = [] +NODE_REGEX = re.compile(r'node=~"([^"]+)"') def _load_json_file(path: str) -> Any | None: try: @@ -158,11 +454,12 @@ def _load_json_file(path: str) -> Any | None: return None def load_kb(): - global KB, _HOST_INDEX, _NAME_INDEX + global KB, _HOST_INDEX, _NAME_INDEX, _METRIC_INDEX if not KB_DIR: return catalog = _load_json_file(os.path.join(KB_DIR, "catalog", "atlas.json")) or {} runbooks = _load_json_file(os.path.join(KB_DIR, "catalog", "runbooks.json")) or [] + metrics = _load_json_file(os.path.join(KB_DIR, "catalog", "metrics.json")) or [] KB = {"catalog": catalog, "runbooks": runbooks} host_index: dict[str, list[dict]] = collections.defaultdict(list) @@ -180,15 +477,16 @@ def load_kb(): if isinstance(w, dict) and w.get("name"): names.add(str(w["name"]).lower()) _NAME_INDEX = names + _METRIC_INDEX = metrics if isinstance(metrics, list) else [] -def kb_retrieve(query: str, *, limit: int = 3) -> str: +def _score_kb_docs(query: str) -> list[dict[str, Any]]: q = (query or "").strip() if not q or not KB.get("runbooks"): - return "" + return [] ql = q.lower() q_tokens = _tokens(q) if not q_tokens: - return "" + return [] scored: list[tuple[int, dict]] = [] for doc in KB.get("runbooks", []): @@ -208,9 +506,16 @@ def kb_retrieve(query: str, *, limit: int = 3) -> str: score += 4 if score: scored.append((score, doc)) - scored.sort(key=lambda x: x[0], reverse=True) - picked = [d for _, d in scored[:limit]] + return [d for _, d in scored] + + +def kb_retrieve(query: str, *, limit: int = 3) -> str: + q = (query or "").strip() + if not q: + return "" + scored = _score_kb_docs(q) + picked = scored[:limit] if not picked: return "" @@ -228,6 +533,1684 @@ def kb_retrieve(query: str, *, limit: int = 3) -> str: used += len(chunk) return "\n".join(parts).strip() + +def kb_retrieve_titles(query: str, *, limit: int = 4) -> str: + scored = _score_kb_docs(query) + picked = scored[:limit] + if not picked: + return "" + parts = ["Relevant runbooks:"] + for doc in picked: + title = doc.get("title") or doc.get("path") or "runbook" + path = doc.get("path") or "" + if path: + parts.append(f"- {title} ({path})") + else: + parts.append(f"- {title}") + return "\n".join(parts) + +def _extract_titan_nodes(text: str) -> list[str]: + cleaned = normalize_query(text) + names = {n.lower() for n in TITAN_NODE_RE.findall(cleaned) if n} + for match in re.finditer(r"titan-([0-9a-z]{2}(?:[/,][0-9a-z]{2})+)", cleaned, re.IGNORECASE): + tail = match.group(1) + for part in re.split(r"[/,]", tail): + part = part.strip() + if part: + names.add(f"titan-{part.lower()}") + for match in TITAN_RANGE_RE.finditer(cleaned): + left, right = match.groups() + if left: + names.add(f"titan-{left.lower()}") + if right: + names.add(f"titan-{right.lower()}") + return sorted(names) + +def _humanize_rate(value: str, *, unit: str) -> str: + try: + val = float(value) + except (TypeError, ValueError): + return value + if unit == "%": + return f"{val:.1f}%" + if val >= 1024 * 1024: + return f"{val / (1024 * 1024):.2f} MB/s" + if val >= 1024: + return f"{val / 1024:.2f} KB/s" + return f"{val:.2f} B/s" + +def _has_any(text: str, phrases: tuple[str, ...]) -> bool: + for phrase in phrases: + if " " in phrase: + if phrase in text: + return True 
+ else: + if re.search(rf"\b{re.escape(phrase)}\b", text): + return True + return False + +def _detect_operation(q: str) -> str | None: + if _has_any(q, OPERATION_HINTS["top"]): + return "top" + if _has_any(q, OPERATION_HINTS["bottom"]): + return "bottom" + for op, phrases in OPERATION_HINTS.items(): + if op in ("top", "bottom"): + continue + if _has_any(q, phrases): + return op + return None + +def _detect_metric(q: str) -> str | None: + q = normalize_query(q) + if _has_any(q, ("disk", "storage")): + return "io" + if _has_any(q, ("io",)) and not _has_any(q, METRIC_HINTS["net"]): + return "io" + for metric, phrases in METRIC_HINTS.items(): + if _has_any(q, phrases): + return metric + tokens = set(_tokens(q)) + expanded: set[str] = set(tokens) + for token in list(tokens): + for part in re.split(r"[-_]", token): + part = part.strip() + if len(part) >= 2: + expanded.add(part) + if part.endswith("s") and len(part) >= 4: + expanded.add(part[:-1]) + tokens = expanded + for metric, phrases in METRIC_HINTS.items(): + for phrase in phrases: + if " " in phrase: + if phrase in q: + return metric + elif phrase in tokens: + return metric + return None + +def _detect_hardware_filters(q: str) -> tuple[set[str], set[str]]: + include: set[str] = set() + exclude: set[str] = set() + if any(term in q for term in ("gpu", "gpus", "accelerator", "accelerators", "cuda", "nvidia")): + include.add("jetson") + rpi_specific = any( + phrase in q + for phrase in ( + "rpi4", + "rpi5", + "raspberry pi 4", + "raspberry pi 5", + "raspberry pi-4", + "raspberry pi-5", + ) + ) + for hardware, phrases in HARDWARE_HINTS.items(): + if hardware == "rpi" and rpi_specific: + continue + for phrase in phrases: + if f"non {phrase}" in q or f"non-{phrase}" in q or f"not {phrase}" in q: + exclude.add(hardware) + elif phrase in q: + include.add(hardware) + return include, exclude + + +def _detect_role_filters(q: str) -> set[str]: + roles: set[str] = set() + if "control-plane" in q or "control plane" in q: + roles.add("control-plane") + if "master" in q: + roles.add("master") + if "accelerator" in q: + roles.add("accelerator") + return roles + +def _detect_entity(q: str) -> str | None: + if ( + "node" in q + or "nodes" in q + or "worker" in q + or "hardware" in q + or "architecture" in q + or "machine" in q + or "machines" in q + or "host" in q + or "hosts" in q + or "hostname" in q + or "hostnames" in q + or TITAN_NODE_RE.search(q) + ): + return "node" + if "pod" in q or "pods" in q: + return "pod" + if "namespace" in q or "namespaces" in q: + return "namespace" + return None + +def _metric_entry_score(entry: dict[str, Any], tokens: list[str], *, metric: str | None, op: str | None) -> int: + hay = _metric_tokens(entry) + score = 0 + for t in set(tokens): + if t in hay: + score += 2 if t in (entry.get("panel_title") or "").lower() else 1 + if metric: + for phrase in METRIC_HINTS.get(metric, (metric,)): + if phrase in hay: + score += 3 + if op == "top" and ("hottest" in hay or "top" in hay): + score += 3 + if "node" in hay: + score += 1 + return score + +def _select_metric_entry(tokens: list[str], *, metric: str | None, op: str | None) -> dict[str, Any] | None: + scored: list[tuple[int, dict[str, Any]]] = [] + for entry in _METRIC_INDEX: + if not isinstance(entry, dict): + continue + score = _metric_entry_score(entry, tokens, metric=metric, op=op) + if score: + scored.append((score, entry)) + if not scored: + return None + scored.sort(key=lambda item: item[0], reverse=True) + return scored[0][1] + +def _apply_node_filter(expr: str, 
node_regex: str | None) -> str: + if not node_regex: + return expr + needle = 'node_uname_info{nodename!=""}' + replacement = f'node_uname_info{{nodename!=\"\",nodename=~\"{node_regex}\"}}' + return expr.replace(needle, replacement) + +def _metric_expr_uses_percent(entry: dict[str, Any]) -> bool: + exprs = entry.get("exprs") + expr = exprs[0] if isinstance(exprs, list) and exprs else "" + return "* 100" in expr or "*100" in expr + + +def _format_metric_value(value: str, *, percent: bool, rate: bool = False) -> str: + try: + num = float(value) + except (TypeError, ValueError): + return value + if percent: + return f"{num:.1f}%" + if rate: + return _humanize_rate(value, unit="rate") + if abs(num) >= 1: + return f"{num:.2f}".rstrip("0").rstrip(".") + return f"{num:.4f}".rstrip("0").rstrip(".") + + +def _format_metric_label(metric: dict[str, Any]) -> str: + label_parts = [] + for k in ("namespace", "pod", "container", "node", "instance", "job", "phase"): + if metric.get(k): + label_parts.append(f"{k}={metric.get(k)}") + if not label_parts: + for k in sorted(metric.keys()): + if k.startswith("__"): + continue + label_parts.append(f"{k}={metric.get(k)}") + if len(label_parts) >= 4: + break + return ", ".join(label_parts) if label_parts else "series" + + +def _primary_series_metric(res: dict | None) -> tuple[str | None, str | None]: + series = _vm_value_series(res or {}) + if not series: + return (None, None) + first = series[0] + metric = first.get("metric") if isinstance(first, dict) else {} + value = first.get("value") if isinstance(first, dict) else [] + node = metric.get("node") if isinstance(metric, dict) else None + val = value[1] if isinstance(value, list) and len(value) > 1 else None + return (node, val) + + +def _format_metric_answer(entry: dict[str, Any], res: dict | None) -> str: + series = _vm_value_series(res) + panel = entry.get("panel_title") or "Metric" + if not series: + return "" + percent = _metric_expr_uses_percent(entry) + lines: list[str] = [] + for r in series[:5]: + if not isinstance(r, dict): + continue + metric = r.get("metric") or {} + value = r.get("value") or [] + val = value[1] if isinstance(value, list) and len(value) > 1 else "" + label = _format_metric_label(metric if isinstance(metric, dict) else {}) + lines.append(f"{label}: {_format_metric_value(val, percent=percent)}") + if not lines: + return "" + if len(lines) == 1: + return f"{panel}: {lines[0]}." 
+ return f"{panel}:\n" + "\n".join(f"- {line}" for line in lines) + +def _inventory_filter( + inventory: list[dict[str, Any]], + *, + include_hw: set[str], + exclude_hw: set[str], + only_workers: bool, + only_ready: bool | None, + nodes_in_query: list[str], +) -> list[dict[str, Any]]: + results = inventory + if nodes_in_query: + results = [node for node in results if node.get("name") in nodes_in_query] + if only_workers: + results = [node for node in results if node.get("is_worker") is True] + if only_ready is True: + results = [node for node in results if node.get("ready") is True] + if only_ready is False: + results = [node for node in results if node.get("ready") is False] + if include_hw: + results = [node for node in results if _hardware_match(node, include_hw)] + if exclude_hw: + results = [node for node in results if not _hardware_match(node, exclude_hw)] + return results + +def _hardware_match(node: dict[str, Any], filters: set[str]) -> bool: + hw = node.get("hardware") or "" + arch = node.get("arch") or "" + for f in filters: + if f == "rpi" and hw in ("rpi4", "rpi5", "rpi"): + return True + if f == "arm64" and arch == "arm64": + return True + if hw == f: + return True + if f == "amd64" and arch == "amd64": + return True + return False + +def _node_roles(labels: dict[str, Any]) -> list[str]: + roles: list[str] = [] + for key in labels.keys(): + if key.startswith("node-role.kubernetes.io/"): + role = key.split("/", 1)[-1] + if role: + roles.append(role) + return sorted(set(roles)) + +def _hardware_class(labels: dict[str, Any]) -> str: + if str(labels.get("jetson") or "").lower() == "true": + return "jetson" + hardware = (labels.get("hardware") or "").strip().lower() + if hardware in ("rpi4", "rpi5", "rpi"): + return hardware + arch = labels.get("kubernetes.io/arch") or labels.get("beta.kubernetes.io/arch") or "" + if arch == "amd64": + return "amd64" + if arch == "arm64": + return "arm64-unknown" + return "unknown" + +def node_inventory_live() -> list[dict[str, Any]]: + try: + data = k8s_get("/api/v1/nodes?limit=500") + except Exception: + return [] + items = data.get("items") or [] + inventory: list[dict[str, Any]] = [] + for node in items if isinstance(items, list) else []: + meta = node.get("metadata") or {} + labels = meta.get("labels") or {} + name = meta.get("name") or "" + if not name: + continue + inventory.append( + { + "name": name, + "arch": labels.get("kubernetes.io/arch") or labels.get("beta.kubernetes.io/arch") or "", + "hardware": _hardware_class(labels), + "roles": _node_roles(labels), + "is_worker": _node_is_worker(node), + "ready": _node_ready_status(node), + } + ) + return sorted(inventory, key=lambda item: item["name"]) + + +def node_inventory() -> list[dict[str, Any]]: + snapshot = _snapshot_state() + inventory = _snapshot_inventory(snapshot) + if inventory: + return inventory + return node_inventory_live() + +def _group_nodes(inventory: list[dict[str, Any]]) -> dict[str, list[str]]: + grouped: dict[str, list[str]] = collections.defaultdict(list) + for node in inventory: + grouped[node.get("hardware") or "unknown"].append(node["name"]) + return {k: sorted(v) for k, v in grouped.items()} + +def node_inventory_context(query: str, inventory: list[dict[str, Any]] | None = None) -> str: + q = normalize_query(query) + if not any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster")): + return "" + if inventory is None: + inventory = node_inventory() + if not inventory: + return "" + groups = _group_nodes(inventory) + 
total = len(inventory) + ready = sum(1 for node in inventory if node.get("ready") is True) + not_ready = sum(1 for node in inventory if node.get("ready") is False) + lines: list[str] = [ + "Node inventory (live):", + f"- total: {total}, ready: {ready}, not ready: {not_ready}", + ] + for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"): + if key in groups: + lines.append(f"- {key}: {', '.join(groups[key])}") + non_rpi = sorted(set(groups.get("jetson", [])) | set(groups.get("amd64", []))) + if non_rpi: + lines.append(f"- non_raspberry_pi (derived): {', '.join(non_rpi)}") + unknowns = groups.get("arm64-unknown", []) + groups.get("unknown", []) + if unknowns: + lines.append("- note: nodes labeled arm64-unknown/unknown may still be Raspberry Pi unless tagged.") + expected_workers = expected_worker_nodes_from_metrics() + if expected_workers: + ready_workers, not_ready_workers = worker_nodes_status() + missing = sorted(set(expected_workers) - set(ready_workers + not_ready_workers)) + lines.append(f"- expected_workers (grafana): {', '.join(expected_workers)}") + lines.append(f"- workers_ready: {', '.join(ready_workers)}") + if not_ready_workers: + lines.append(f"- workers_not_ready: {', '.join(not_ready_workers)}") + if missing: + lines.append(f"- workers_missing (derived): {', '.join(missing)}") + return "\n".join(lines) + +def node_inventory_for_prompt(prompt: str) -> list[dict[str, Any]]: + q = normalize_query(prompt) + if any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster", "worker")): + return node_inventory() + return [] + +def _nodes_by_arch(inventory: list[dict[str, Any]]) -> dict[str, list[str]]: + grouped: dict[str, list[str]] = collections.defaultdict(list) + for node in inventory: + grouped[(node.get("arch") or "unknown")].append(node["name"]) + return {k: sorted(v) for k, v in grouped.items()} + +def _node_usage_table(metrics: dict[str, Any], *, allowed_nodes: set[str] | None = None) -> list[dict[str, Any]]: + usage = metrics.get("node_usage") if isinstance(metrics.get("node_usage"), dict) else {} + per_node: dict[str, dict[str, Any]] = {} + for metric_name, entries in usage.items() if isinstance(usage, dict) else []: + if not isinstance(entries, list): + continue + for entry in entries: + if not isinstance(entry, dict): + continue + node = entry.get("node") + if not isinstance(node, str) or not node: + continue + if allowed_nodes and node not in allowed_nodes: + continue + per_node.setdefault(node, {})[metric_name] = entry.get("value") + return [{"node": node, **vals} for node, vals in sorted(per_node.items())] + +def _usage_extremes(usage_table: list[dict[str, Any]]) -> dict[str, tuple[str, float]]: + extremes: dict[str, tuple[str, float]] = {} + for metric in ("cpu", "ram", "net", "io"): + values: list[tuple[str, float]] = [] + for entry in usage_table: + node = entry.get("node") + raw = entry.get(metric) + if not node or raw is None: + continue + try: + value = float(raw) + except (TypeError, ValueError): + continue + values.append((node, value)) + if not values: + continue + lowest = min(values, key=lambda item: item[1]) + highest = max(values, key=lambda item: item[1]) + extremes[f"min_{metric}"] = lowest + extremes[f"max_{metric}"] = highest + return extremes + +def _workloads_for_facts(workloads: list[dict[str, Any]], limit: int = 25) -> list[dict[str, Any]]: + cleaned: list[dict[str, Any]] = [] + for entry in workloads: + if not isinstance(entry, dict): + continue + cleaned.append( + { + "namespace": 
entry.get("namespace"), + "workload": entry.get("workload"), + "pods_total": entry.get("pods_total"), + "pods_running": entry.get("pods_running"), + "primary_node": entry.get("primary_node"), + "nodes": entry.get("nodes"), + } + ) + cleaned.sort( + key=lambda item: ( + -(item.get("pods_total") or 0), + str(item.get("namespace") or ""), + str(item.get("workload") or ""), + ) + ) + return cleaned[:limit] + +def _workloads_for_prompt(prompt: str, workloads: list[dict[str, Any]], limit: int = 12) -> list[dict[str, Any]]: + tokens = set(_tokens(prompt)) + if tokens: + matched: list[dict[str, Any]] = [] + for entry in workloads: + if not isinstance(entry, dict): + continue + entry_tokens = _workload_tokens(entry) + if entry_tokens & tokens: + matched.append(entry) + if matched: + return _workloads_for_facts(matched, limit=limit) + return _workloads_for_facts(workloads, limit=limit) + +def facts_context( + prompt: str, + *, + inventory: list[dict[str, Any]] | None, + snapshot: dict[str, Any] | None, + workloads: list[dict[str, Any]] | None, +) -> str: + inv = inventory or [] + nodes_in_query = _extract_titan_nodes(prompt) + metrics = _snapshot_metrics(snapshot) + nodes = snapshot.get("nodes") if isinstance(snapshot, dict) else {} + summary = snapshot.get("nodes_summary") if isinstance(snapshot, dict) else {} + expected_workers = expected_worker_nodes_from_metrics() + ready_workers, not_ready_workers = worker_nodes_status(inv) if inv else ([], []) + total = summary.get("total") if isinstance(summary, dict) and summary.get("total") is not None else nodes.get("total") + ready = summary.get("ready") if isinstance(summary, dict) and summary.get("ready") is not None else nodes.get("ready") + not_ready = summary.get("not_ready") if isinstance(summary, dict) and summary.get("not_ready") is not None else nodes.get("not_ready") + not_ready_names = summary.get("not_ready_names") if isinstance(summary, dict) else nodes.get("not_ready_names") + by_hardware = _group_nodes(inv) if inv else {} + by_arch = _nodes_by_arch(inv) if inv else {} + control_plane_nodes = [ + node["name"] + for node in inv + if any(role in ("control-plane", "master") for role in (node.get("roles") or [])) + ] + worker_nodes = [node["name"] for node in inv if node.get("is_worker") is True] + + lines: list[str] = ["Facts (live snapshot):"] + if total is not None: + lines.append(f"- nodes_total={total}, ready={ready}, not_ready={not_ready}") + if isinstance(summary, dict): + by_arch_counts = summary.get("by_arch") + if isinstance(by_arch_counts, dict) and by_arch_counts: + parts = [f"{arch}={count}" for arch, count in sorted(by_arch_counts.items())] + lines.append(f"- nodes_by_arch: {', '.join(parts)}") + if not_ready_names: + lines.append(f"- nodes_not_ready: {', '.join(not_ready_names)}") + for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"): + nodes_list = by_hardware.get(key) or [] + if nodes_list: + lines.append(f"- {key}: {', '.join(nodes_list)}") + if by_hardware: + counts = {key: len(nodes_list) for key, nodes_list in by_hardware.items() if nodes_list} + if counts: + parts = [f"{key}={count}" for key, count in sorted(counts.items())] + lines.append(f"- nodes_by_hardware_count: {', '.join(parts)}") + non_rpi = sorted(set(by_hardware.get("jetson", [])) | set(by_hardware.get("amd64", []))) + if non_rpi: + lines.append(f"- non_raspberry_pi: {', '.join(non_rpi)}") + for key, nodes_list in sorted(by_arch.items()): + if nodes_list: + lines.append(f"- arch {key}: {', '.join(nodes_list)}") + if control_plane_nodes: + 
lines.append(f"- control_plane_nodes: {', '.join(control_plane_nodes)}") + control_plane_by_hw: dict[str, list[str]] = collections.defaultdict(list) + for node in inv: + if node.get("name") in control_plane_nodes: + control_plane_by_hw[node.get("hardware") or "unknown"].append(node["name"]) + parts = [f"{hw}={', '.join(sorted(nodes))}" for hw, nodes in sorted(control_plane_by_hw.items())] + if parts: + lines.append(f"- control_plane_by_hardware: {', '.join(parts)}") + if worker_nodes: + lines.append(f"- worker_nodes: {', '.join(worker_nodes)}") + if ready_workers or not_ready_workers: + lines.append(f"- workers_ready: {', '.join(ready_workers) if ready_workers else 'none'}") + if not_ready_workers: + lines.append(f"- workers_not_ready: {', '.join(not_ready_workers)}") + if expected_workers and any(word in normalize_query(prompt) for word in ("missing", "expected", "should", "not ready", "unready")): + missing = sorted( + set(expected_workers) + - {n.get("name") for n in inv if isinstance(n, dict) and n.get("name")} + ) + lines.append(f"- expected_workers: {', '.join(expected_workers)}") + if missing: + lines.append(f"- expected_workers_missing: {', '.join(missing)}") + + hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {} + usage_metrics = metrics.get("node_usage") if isinstance(metrics.get("node_usage"), dict) else {} + for key in ("cpu", "ram", "net", "io"): + entry = hottest.get(key) if isinstance(hottest.get(key), dict) else {} + node = entry.get("node") + value = entry.get("value") + if not node or value is None: + usage = usage_metrics.get(key) if isinstance(usage_metrics.get(key), list) else [] + pick = _node_usage_top(usage, allowed_nodes=None) + if pick: + node, value = pick + if node and value is not None: + value_fmt = _format_metric_value( + str(value), + percent=key in ("cpu", "ram"), + rate=key in ("net", "io"), + ) + lines.append(f"- hottest_{key}: {node} ({value_fmt})") + + postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {} + if isinstance(postgres, dict) and postgres: + used = postgres.get("used") + max_conn = postgres.get("max") + if used is not None and max_conn is not None: + lines.append(f"- postgres_connections: {used} used / {max_conn} max") + hottest_db = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {} + if hottest_db.get("label"): + lines.append( + f"- postgres_hottest_db: {hottest_db.get('label')} ({hottest_db.get('value')})" + ) + + for key in ("pods_running", "pods_pending", "pods_failed", "pods_succeeded"): + value = metrics.get(key) + if value is not None: + lines.append(f"- {key}: {value}") + if workloads: + ns_counts: dict[str, int] = collections.defaultdict(int) + for entry in workloads: + if not isinstance(entry, dict): + continue + ns = entry.get("namespace") or "" + pods = entry.get("pods_running") + if pods is None: + pods = entry.get("pods_total") + try: + pods_val = int(pods) + except (TypeError, ValueError): + pods_val = 0 + if ns: + ns_counts[ns] += pods_val + if ns_counts: + top_ns = sorted(ns_counts.items(), key=lambda item: item[1], reverse=True)[:5] + parts = [f"{ns}={count}" for ns, count in top_ns] + lines.append(f"- pods_by_namespace: {', '.join(parts)}") + + top_restarts = metrics.get("top_restarts_1h") if isinstance(metrics.get("top_restarts_1h"), list) else [] + if top_restarts: + items = [] + for entry in top_restarts[:5]: + if not isinstance(entry, dict): + continue + metric = entry.get("metric") or 
{} + pod = metric.get("pod") or metric.get("name") or "" + ns = metric.get("namespace") or "" + value = entry.get("value") + label = f"{ns}/{pod}".strip("/") + if label and value is not None: + items.append(f"{label}={value}") + if items: + lines.append(f"- top_restarts_1h: {', '.join(items)}") + + allowed_nodes = {node.get("name") for node in inv if isinstance(node, dict) and node.get("name")} + usage_table = _node_usage_table(metrics, allowed_nodes=allowed_nodes or None) + if usage_table: + lines.append("- node_usage (cpu/ram/net/io):") + for entry in usage_table: + node = entry.get("node") + if not node: + continue + cpu = _format_metric_value(str(entry.get("cpu")), percent=True) if entry.get("cpu") is not None else "" + ram = _format_metric_value(str(entry.get("ram")), percent=True) if entry.get("ram") is not None else "" + net = ( + _format_metric_value(str(entry.get("net")), percent=False, rate=True) + if entry.get("net") is not None + else "" + ) + io_val = ( + _format_metric_value(str(entry.get("io")), percent=False, rate=True) + if entry.get("io") is not None + else "" + ) + lines.append(f" - {node}: cpu={cpu}, ram={ram}, net={net}, io={io_val}") + extremes = _usage_extremes(usage_table) + for metric in ("cpu", "ram", "net", "io"): + min_key = f"min_{metric}" + if min_key not in extremes: + continue + node, value = extremes[min_key] + value_fmt = _format_metric_value( + str(value), + percent=metric in ("cpu", "ram"), + rate=metric in ("net", "io"), + ) + lines.append(f"- lowest_{metric}: {node} ({value_fmt})") + for metric in ("cpu", "ram"): + hottest_parts: list[str] = [] + lowest_parts: list[str] = [] + for hw, nodes_list in sorted(by_hardware.items()): + entries = [] + for entry in usage_table: + node = entry.get("node") + if node in nodes_list and entry.get(metric) is not None: + try: + value = float(entry.get(metric)) + except (TypeError, ValueError): + continue + entries.append((node, value)) + if not entries: + continue + max_node, max_val = max(entries, key=lambda item: item[1]) + min_node, min_val = min(entries, key=lambda item: item[1]) + hottest_parts.append( + f"{hw}={max_node} ({_format_metric_value(str(max_val), percent=True)})" + ) + lowest_parts.append( + f"{hw}={min_node} ({_format_metric_value(str(min_val), percent=True)})" + ) + if hottest_parts: + lines.append(f"- hottest_{metric}_by_hardware: {', '.join(hottest_parts)}") + if lowest_parts: + lines.append(f"- lowest_{metric}_by_hardware: {', '.join(lowest_parts)}") + + if nodes_in_query: + lines.append("- node_details:") + for name in nodes_in_query: + detail = next((n for n in inv if n.get("name") == name), None) + if not detail: + lines.append(f" - {name}: not found in snapshot") + continue + roles = ",".join(detail.get("roles") or []) or "none" + lines.append( + f" - {name}: hardware={detail.get('hardware')}, arch={detail.get('arch')}, " + f"ready={detail.get('ready')}, roles={roles}" + ) + + workload_entries = _workloads_for_prompt(prompt, workloads or []) + if workload_entries: + lines.append("- workloads:") + for entry in workload_entries: + if not isinstance(entry, dict): + continue + ns = entry.get("namespace") or "" + wl = entry.get("workload") or "" + primary = entry.get("primary_node") or "" + pods_total = entry.get("pods_total") + pods_running = entry.get("pods_running") + label = f"{ns}/{wl}" if ns and wl else (wl or ns) + if not label: + continue + if primary: + lines.append( + f" - {label}: primary_node={primary}, pods_total={pods_total}, pods_running={pods_running}" + ) + else: + lines.append(f" 
- {label}: pods_total={pods_total}, pods_running={pods_running}") + top = max( + (entry for entry in workload_entries if isinstance(entry.get("pods_total"), (int, float))), + key=lambda item: item.get("pods_total", 0), + default=None, + ) + if isinstance(top, dict) and top.get("pods_total") is not None: + label = f"{top.get('namespace')}/{top.get('workload')}".strip("/") + lines.append(f"- workload_most_pods: {label} ({top.get('pods_total')})") + zero_running = [ + entry + for entry in workload_entries + if isinstance(entry.get("pods_running"), (int, float)) and entry.get("pods_running") == 0 + ] + if zero_running: + labels = [] + for entry in zero_running: + label = f"{entry.get('namespace')}/{entry.get('workload')}".strip("/") + if label: + labels.append(label) + if labels: + lines.append(f"- workloads_zero_running: {', '.join(labels)}") + + rendered = "\n".join(lines) + return rendered[:MAX_FACTS_CHARS] + +def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]: + names = [node["name"] for node in inventory] + ready = [node["name"] for node in inventory if node.get("ready") is True] + not_ready = [node["name"] for node in inventory if node.get("ready") is False] + groups = _group_nodes(inventory) + workers = [node for node in inventory if node.get("is_worker") is True] + worker_names = [node["name"] for node in workers] + worker_ready = [node["name"] for node in workers if node.get("ready") is True] + worker_not_ready = [node["name"] for node in workers if node.get("ready") is False] + expected_workers = expected_worker_nodes_from_metrics() + expected_ready = [n for n in expected_workers if n in ready] if expected_workers else [] + expected_not_ready = [n for n in expected_workers if n in not_ready] if expected_workers else [] + expected_missing = [n for n in expected_workers if n not in names] if expected_workers else [] + return { + "names": sorted(names), + "ready": sorted(ready), + "not_ready": sorted(not_ready), + "groups": groups, + "worker_names": sorted(worker_names), + "worker_ready": sorted(worker_ready), + "worker_not_ready": sorted(worker_not_ready), + "expected_workers": expected_workers, + "expected_ready": sorted(expected_ready), + "expected_not_ready": sorted(expected_not_ready), + "expected_missing": sorted(expected_missing), + } + + +def _workload_tokens(entry: dict[str, Any]) -> set[str]: + tokens: set[str] = set() + for key in ("workload", "namespace"): + value = entry.get(key) + if isinstance(value, str) and value: + tokens.update(_tokens(value)) + return tokens + + +def _workload_query_target(prompt: str) -> str: + tokens = set(_tokens(prompt)) + matches = sorted(tokens & _NAME_INDEX) if _NAME_INDEX else [] + return matches[0] if matches else "" + + +def _select_workload(prompt: str, workloads: list[dict[str, Any]]) -> dict[str, Any] | None: + q_tokens = set(_tokens(prompt)) + if not q_tokens: + return None + scored: list[tuple[int, dict[str, Any]]] = [] + for entry in workloads: + if not isinstance(entry, dict): + continue + tokens = _workload_tokens(entry) + score = len(tokens & q_tokens) + name = (entry.get("workload") or "").lower() + namespace = (entry.get("namespace") or "").lower() + if name and name in q_tokens: + score += 5 + if namespace and namespace in q_tokens: + score += 3 + if score: + scored.append((score, entry)) + if not scored: + return None + scored.sort(key=lambda item: item[0], reverse=True) + return scored[0][1] + + +def _format_confidence(answer: str, confidence: str) -> str: + if not answer: + return "" + return 
f"{answer}\nConfidence: {confidence}." + + +def workload_answer(prompt: str, workloads: list[dict[str, Any]]) -> str: + q = normalize_query(prompt) + if not any(word in q for word in ("where", "which", "node", "run", "running", "host", "located")): + return "" + target = _workload_query_target(prompt) + entry = _select_workload(prompt, workloads) + if not entry: + return "" + workload = entry.get("workload") or "" + namespace = entry.get("namespace") or "" + if target: + workload_l = str(workload).lower() + namespace_l = str(namespace).lower() + if workload_l != target and namespace_l == target and "namespace" not in q and "workload" not in q: + return "" + nodes = entry.get("nodes") if isinstance(entry.get("nodes"), dict) else {} + primary = entry.get("primary_node") or "" + if not workload or not nodes: + return "" + parts = [] + if primary: + parts.append(f"{primary} (primary)") + for node, count in sorted(nodes.items(), key=lambda item: (-item[1], item[0])): + if node == primary: + continue + parts.append(f"{node} ({count} pod{'s' if count != 1 else ''})") + node_text = ", ".join(parts) if parts else primary + answer = f"{workload} runs in {namespace}. Nodes: {node_text}." + return _format_confidence(answer, "medium") + + +def _snapshot_metrics(snapshot: dict[str, Any] | None) -> dict[str, Any]: + if not snapshot: + return {} + metrics = snapshot.get("metrics") + return metrics if isinstance(metrics, dict) else {} + + +def _node_usage_top( + usage: list[dict[str, Any]], + *, + allowed_nodes: set[str] | None, +) -> tuple[str, float] | None: + best_node = "" + best_val = None + for item in usage if isinstance(usage, list) else []: + if not isinstance(item, dict): + continue + node = item.get("node") or "" + if allowed_nodes and node not in allowed_nodes: + continue + value = item.get("value") + try: + numeric = float(value) + except (TypeError, ValueError): + continue + if best_val is None or numeric > best_val: + best_val = numeric + best_node = node + if best_node and best_val is not None: + return best_node, best_val + return None + + +def _node_usage_bottom( + usage: list[dict[str, Any]], + *, + allowed_nodes: set[str] | None, +) -> tuple[str, float] | None: + best_node: str | None = None + best_val: float | None = None + for item in usage: + if not isinstance(item, dict): + continue + node = item.get("node") + if not node or not isinstance(node, str): + continue + if allowed_nodes and node not in allowed_nodes: + continue + value = item.get("value") + try: + numeric = float(value) + except (TypeError, ValueError): + continue + if best_val is None or numeric < best_val: + best_val = numeric + best_node = node + if best_node and best_val is not None: + return best_node, best_val + return None + + +def snapshot_metric_answer( + prompt: str, + *, + snapshot: dict[str, Any] | None, + inventory: list[dict[str, Any]], +) -> str: + if not snapshot: + return "" + metrics = _snapshot_metrics(snapshot) + if not metrics: + return "" + q = normalize_query(prompt) + metric = _detect_metric(q) + op = _detect_operation(q) + if op == "list" and metric in {"cpu", "ram", "net", "io"}: + op = "top" + include_hw, exclude_hw = _detect_hardware_filters(q) + nodes_in_query = _extract_titan_nodes(q) + only_workers = "worker" in q or "workers" in q + + filtered = _inventory_filter( + inventory, + include_hw=include_hw, + exclude_hw=exclude_hw, + only_workers=only_workers, + only_ready=None, + nodes_in_query=nodes_in_query, + ) + allowed_nodes = {node["name"] for node in filtered} if filtered else None + + if 
metric in {"cpu", "ram", "net", "io"} and op in {"top", "bottom", "status", None}: + usage = metrics.get("node_usage", {}).get(metric, []) + pick = _node_usage_bottom if op == "bottom" else _node_usage_top + chosen = pick(usage, allowed_nodes=allowed_nodes) + if chosen: + node, val = chosen + percent = metric in {"cpu", "ram"} + value = _format_metric_value(str(val), percent=percent, rate=metric in {"net", "io"}) + scope = "" + if include_hw: + scope = f" among {' and '.join(sorted(include_hw))}" + label = "Lowest" if op == "bottom" else "Hottest" + answer = f"{label} node{scope}: {node} ({value})." + if allowed_nodes and len(allowed_nodes) != len(inventory) and op != "bottom": + overall = _node_usage_top(usage, allowed_nodes=None) + if overall and overall[0] != node: + overall_val = _format_metric_value( + str(overall[1]), + percent=percent, + rate=metric in {"net", "io"}, + ) + answer += f" Overall hottest: {overall[0]} ({overall_val})." + return _format_confidence(answer, "high") + + if metric == "connections" or "postgres" in q: + postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {} + used = postgres.get("used") + max_conn = postgres.get("max") + hottest = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {} + parts: list[str] = [] + if used is not None and max_conn is not None: + free = max_conn - used + if any(word in q for word in ("free", "available", "remaining", "remain", "left")): + parts.append(f"Postgres connections: {used:.0f} used / {max_conn:.0f} max ({free:.0f} free).") + else: + parts.append(f"Postgres connections: {used:.0f} used / {max_conn:.0f} max.") + if hottest.get("label"): + hot_val = hottest.get("value") + hot_val_str = _format_metric_value(str(hot_val), percent=False) if hot_val is not None else "" + parts.append(f"Hottest DB: {hottest.get('label')} ({hot_val_str}).") + if parts: + return _format_confidence(" ".join(parts), "high") + + if metric == "pods": + running = metrics.get("pods_running") + pending = metrics.get("pods_pending") + failed = metrics.get("pods_failed") + succeeded = metrics.get("pods_succeeded") + status_terms = ("running", "pending", "failed", "succeeded", "completed") + if ("most pods" in q or ("most" in q and "pod" in q and "node" in q)) and not nodes_in_query: + return _format_confidence( + "I don't have per-node pod counts in the snapshot.", + "medium", + ) + if "total" in q or "sum" in q: + values = [v for v in (running, pending, failed, succeeded) if isinstance(v, (int, float))] + if values: + return _format_confidence(f"Total pods: {sum(values):.0f}.", "high") + if "not running" in q or "not in running" in q or "non running" in q: + parts = [v for v in (pending, failed, succeeded) if isinstance(v, (int, float))] + if parts: + return _format_confidence(f"Pods not running: {sum(parts):.0f}.", "high") + if sum(1 for term in status_terms if term in q) > 1: + parts = [] + if "running" in q and running is not None: + parts.append(f"running {running:.0f}") + if "pending" in q and pending is not None: + parts.append(f"pending {pending:.0f}") + if "failed" in q and failed is not None: + parts.append(f"failed {failed:.0f}") + if ("succeeded" in q or "completed" in q) and succeeded is not None: + parts.append(f"succeeded {succeeded:.0f}") + if parts: + return _format_confidence(f"Pods: {', '.join(parts)}.", "high") + if "pending" in q and pending is not None: + return _format_confidence(f"Pending pods: {pending:.0f}.", "high") + if "failed" in q and failed 
is not None: + return _format_confidence(f"Failed pods: {failed:.0f}.", "high") + if "succeeded" in q or "completed" in q: + if succeeded is not None: + return _format_confidence(f"Succeeded pods: {succeeded:.0f}.", "high") + if "running" in q and running is not None: + return _format_confidence(f"Running pods: {running:.0f}.", "high") + parts = [] + if running is not None: + parts.append(f"running {running:.0f}") + if pending is not None: + parts.append(f"pending {pending:.0f}") + if failed is not None: + parts.append(f"failed {failed:.0f}") + if succeeded is not None: + parts.append(f"succeeded {succeeded:.0f}") + if parts: + return _format_confidence(f"Pods: {', '.join(parts)}.", "high") + + return "" + +def structured_answer( + prompt: str, + *, + inventory: list[dict[str, Any]], + metrics_summary: str, + snapshot: dict[str, Any] | None = None, + workloads: list[dict[str, Any]] | None = None, +) -> str: + q = normalize_query(prompt) + if not q: + return "" + + if workloads: + workload_resp = workload_answer(prompt, workloads) + if workload_resp: + return workload_resp + + snap_resp = snapshot_metric_answer(prompt, snapshot=snapshot, inventory=inventory) + if snap_resp: + return snap_resp + + tokens = _tokens(q) + op = _detect_operation(q) + metric = _detect_metric(q) + if op == "list" and metric in {"cpu", "ram", "net", "io"}: + op = "top" + entity = _detect_entity(q) + include_hw, exclude_hw = _detect_hardware_filters(q) + if entity is None and (include_hw or exclude_hw): + entity = "node" + nodes_in_query = _extract_titan_nodes(q) + only_workers = "worker" in q or "workers" in q + role_filters = _detect_role_filters(q) + only_ready: bool | None = None + if ( + "not ready" in q + or "notready" in q + or "not-ready" in q + or "unready" in q + or "down" in q + or "missing" in q + ): + only_ready = False + elif "ready" in q: + only_ready = True + + if entity == "node" and only_ready is not None and op != "count": + op = "status" + if entity == "node" and only_ready is not None and op == "count": + if not any(term in q for term in ("how many", "count", "number")): + op = "status" + + if not op and entity == "node": + op = "list" if (include_hw or exclude_hw or nodes_in_query) else "count" + + if entity == "node" and "total" in q and "ready" in q: + summary = _nodes_summary_line(inventory, snapshot) + if summary: + return _format_confidence(summary, "high") + + if entity == "node" and ("hardware mix" in q or "architecture" in q): + hw_line = _hardware_mix_line(inventory) + if hw_line: + return _format_confidence(hw_line, "high") + + if ( + entity == "node" + and op == "status" + and metric is None + and not (include_hw or exclude_hw or nodes_in_query or only_workers or role_filters) + ): + summary = _nodes_summary_line(inventory, snapshot) + if summary: + return _format_confidence(summary, "high") + + if entity == "node" and metric is None and any(word in q for word in ("hardware", "architecture", "class", "mix")): + hw_line = _hardware_mix_line(inventory) + if hw_line: + return _format_confidence(hw_line, "medium") + + if ( + entity == "node" + and any(term in q for term in ("arm64", "amd64")) + and any(term in q for term in ("mostly", "majority", "more")) + ): + arm64_count = len([n for n in inventory if n.get("arch") == "arm64"]) + amd64_count = len([n for n in inventory if n.get("arch") == "amd64"]) + if arm64_count or amd64_count: + majority = "arm64" if arm64_count >= amd64_count else "amd64" + return _format_confidence( + f"arm64 nodes: {arm64_count}, amd64 nodes: {amd64_count}. 
Mostly {majority}.", + "high", + ) + + if op == "top" and metric is None and not any(word in q for word in ("hardware", "architecture", "class")): + metric = "cpu" + + # Metrics-first when a metric or top operation is requested. + if metric or op == "top": + entry = _select_metric_entry(tokens, metric=metric, op=op) + if entry and isinstance(entry.get("exprs"), list) and entry["exprs"]: + expr = entry["exprs"][0] + if inventory: + scoped = _inventory_filter( + inventory, + include_hw=include_hw, + exclude_hw=exclude_hw, + only_workers=only_workers, + only_ready=None, + nodes_in_query=nodes_in_query, + ) + if scoped: + node_regex = "|".join([n["name"] for n in scoped]) + expr = _apply_node_filter(expr, node_regex) + res = vm_query(expr, timeout=20) + answer = "" + if op == "top" or "hottest" in (entry.get("panel_title") or "").lower(): + node, val = _primary_series_metric(res) + if node and val is not None: + percent = _metric_expr_uses_percent(entry) + rate = metric in {"net", "io"} + value_fmt = _format_metric_value(val or "", percent=percent, rate=rate) + metric_label = (metric or "").upper() + label = f"{metric_label} node" if metric_label else "node" + answer = f"Hottest {label}: {node} ({value_fmt})." + if not answer: + answer = _format_metric_answer(entry, res) + if answer: + scope_parts: list[str] = [] + if include_hw: + scope_parts.append(" and ".join(sorted(include_hw))) + if exclude_hw: + scope_parts.append(f"excluding {' and '.join(sorted(exclude_hw))}") + if only_workers: + scope_parts.append("worker") + if scope_parts: + scope = " ".join(scope_parts) + overall_note = "" + base_expr = entry["exprs"][0] + if inventory: + all_nodes = "|".join([n["name"] for n in inventory]) + if all_nodes: + base_expr = _apply_node_filter(base_expr, all_nodes) + base_res = vm_query(base_expr, timeout=20) + base_node, base_val = _primary_series_metric(base_res) + scoped_node, scoped_val = _primary_series_metric(res) + if base_node and scoped_node and base_node != scoped_node: + percent = _metric_expr_uses_percent(entry) + rate = metric in {"net", "io"} + base_val_fmt = _format_metric_value(base_val or "", percent=percent, rate=rate) + overall_note = f" Overall hottest node: {base_node} ({base_val_fmt})." + return _format_confidence(f"Among {scope} nodes, {answer}{overall_note}", "high") + return _format_confidence(answer, "high") + if metrics_summary: + return metrics_summary + + if entity != "node" or not inventory: + if any(word in q for word in METRIC_HINT_WORDS) and not metrics_summary: + return "I don't have data to answer that right now." 
+ return "" + + expected_workers = expected_worker_nodes_from_metrics() + filtered = _inventory_filter( + inventory, + include_hw=include_hw, + exclude_hw=exclude_hw, + only_workers=only_workers, + only_ready=only_ready if op in ("status", "count") else None, + nodes_in_query=nodes_in_query, + ) + if role_filters: + filtered = [ + node + for node in filtered + if role_filters.intersection(set(node.get("roles") or [])) + ] + names = [node["name"] for node in filtered] + + if op == "status": + scope_label = "nodes" + if include_hw: + scope_label = f"{' and '.join(sorted(include_hw))} nodes" + elif only_workers: + scope_label = "worker nodes" + if "missing" in q and ("ready" in q or "readiness" in q): + return _format_confidence( + f"Not ready {scope_label}: " + (", ".join(names) if names else "none") + ".", + "high", + ) + if "missing" in q and expected_workers: + missing = sorted(set(expected_workers) - {n["name"] for n in inventory}) + return _format_confidence( + "Missing nodes: " + (", ".join(missing) if missing else "none") + ".", + "high", + ) + if only_ready is False: + return _format_confidence( + f"Not ready {scope_label}: " + (", ".join(names) if names else "none") + ".", + "high", + ) + if only_ready is True: + return _format_confidence( + f"Ready {scope_label} ({len(names)}): " + (", ".join(names) if names else "none") + ".", + "high", + ) + + if op == "count": + scope_label = "nodes" + if include_hw: + scope_label = f"{' and '.join(sorted(include_hw))} nodes" + elif only_workers: + scope_label = "worker nodes" + if only_workers and "ready" in q and ("total" in q or "vs" in q or "versus" in q): + total_workers = _inventory_filter( + inventory, + include_hw=include_hw, + exclude_hw=exclude_hw, + only_workers=True, + only_ready=None, + nodes_in_query=nodes_in_query, + ) + ready_workers = _inventory_filter( + inventory, + include_hw=include_hw, + exclude_hw=exclude_hw, + only_workers=True, + only_ready=True, + nodes_in_query=nodes_in_query, + ) + return _format_confidence( + f"Worker nodes ready: {len(ready_workers)} / {len(total_workers)} total.", + "high", + ) + if expected_workers and ("expected" in q or "should" in q): + missing = sorted(set(expected_workers) - {n["name"] for n in inventory}) + msg = f"Grafana inventory expects {len(expected_workers)} worker nodes." + if missing: + msg += f" Missing: {', '.join(missing)}." 
+            return _format_confidence(msg, "high")
+        if only_ready is True:
+            return _format_confidence(f"Ready {scope_label}: {len(names)}.", "high")
+        if only_ready is False:
+            return _format_confidence(f"Not ready {scope_label}: {len(names)}.", "high")
+        if not (include_hw or exclude_hw or nodes_in_query or only_workers or role_filters):
+            return _format_confidence(f"Atlas has {len(names)} nodes.", "high")
+        return _format_confidence(f"Matching nodes: {len(names)}.", "high")
+
+    if op == "list":
+        if nodes_in_query:
+            parts = []
+            existing = {n["name"] for n in inventory}
+            for node in nodes_in_query:
+                parts.append(f"{node}: {'present' if node in existing else 'not present'}")
+            return _format_confidence("Node presence: " + ", ".join(parts) + ".", "high")
+        if not names:
+            return _format_confidence("Matching nodes: none.", "high")
+        shown = names[:30]
+        suffix = f", … (+{len(names) - 30} more)" if len(names) > 30 else ""
+        return _format_confidence("Matching nodes: " + ", ".join(shown) + suffix + ".", "high")
+
+    return ""
+
+
+def _nodes_summary_line(inventory: list[dict[str, Any]], snapshot: dict[str, Any] | None) -> str:
+    summary = snapshot.get("nodes_summary") if isinstance(snapshot, dict) else {}
+    nodes = snapshot.get("nodes") if isinstance(snapshot, dict) else {}
+    if not isinstance(nodes, dict):
+        # Guard against snapshots without a "nodes" mapping so the fallbacks below
+        # do not dereference None.
+        nodes = {}
+    total = summary.get("total") if isinstance(summary, dict) and summary.get("total") is not None else nodes.get("total")
+    ready = summary.get("ready") if isinstance(summary, dict) and summary.get("ready") is not None else nodes.get("ready")
+    not_ready = summary.get("not_ready") if isinstance(summary, dict) and summary.get("not_ready") is not None else nodes.get("not_ready")
+    if total is None:
+        total = len(inventory)
+        ready = len([n for n in inventory if n.get("ready") is True])
+        not_ready = len([n for n in inventory if n.get("ready") is False])
+    if total is None:
+        return ""
+    if not_ready:
+        names = []
+        summary_names = summary.get("not_ready_names") if isinstance(summary, dict) else []
+        if isinstance(summary_names, list):
+            names = [name for name in summary_names if isinstance(name, str)]
+        if not names and snapshot:
+            details = snapshot.get("nodes_detail") if isinstance(snapshot.get("nodes_detail"), list) else []
+            names = [node.get("name") for node in details if isinstance(node, dict) and node.get("ready") is False]
+            names = [name for name in names if isinstance(name, str) and name]
+        suffix = f" (not ready: {', '.join(names)})" if names else ""
+        return f"Atlas has {total} nodes; {ready} ready, {not_ready} not ready{suffix}."
+    return f"Atlas has {total} nodes and all are Ready."
+
+
+def _hardware_mix_line(inventory: list[dict[str, Any]]) -> str:
+    if not inventory:
+        return ""
+    groups = _group_nodes(inventory)
+    parts: list[str] = []
+    for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"):
+        nodes = groups.get(key) or []
+        if nodes:
+            parts.append(f"{key}={len(nodes)}")
+    if not parts:
+        return ""
+    return "Hardware mix includes " + ", ".join(parts) + "."
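For reference, a minimal sketch (illustrative only, not part of the patch) of how these inventory helpers phrase their answers. The module name `assistant` and the node names are assumptions, and `_group_nodes` is assumed to bucket nodes by the `hardware` field; the dict keys mirror the inventory entries that `_snapshot_inventory` builds later in this diff.

```python
# Illustrative sketch; module name "assistant" and node names are hypothetical.
from assistant import _hardware_mix_line, _nodes_summary_line

inventory = [
    {"name": "titan-a", "arch": "arm64", "hardware": "rpi5", "roles": ["worker"], "is_worker": True, "ready": True},
    {"name": "titan-b", "arch": "arm64", "hardware": "rpi4", "roles": ["worker"], "is_worker": True, "ready": False},
    {"name": "titan-c", "arch": "amd64", "hardware": "amd64", "roles": ["control-plane"], "is_worker": False, "ready": True},
]

# Counts per hardware class in the fixed key order above
# (assuming _group_nodes buckets by the "hardware" field):
print(_hardware_mix_line(inventory))
# -> "Hardware mix includes rpi5=1, rpi4=1, amd64=1."

# With no snapshot totals available, the summary falls back to the inventory:
print(_nodes_summary_line(inventory, None))
# -> "Atlas has 3 nodes; 2 ready, 1 not ready."
```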
+ + +def _os_mix_line(snapshot: dict[str, Any] | None) -> str: + if not snapshot: + return "" + details = snapshot.get("nodes_detail") if isinstance(snapshot.get("nodes_detail"), list) else [] + counts: dict[str, int] = collections.Counter() + for node in details: + if not isinstance(node, dict): + continue + os_name = (node.get("os") or "").strip() + if os_name: + counts[os_name] += 1 + if not counts or (len(counts) == 1 and "linux" in counts): + return "" + parts = [f"{os_name}={count}" for os_name, count in sorted(counts.items(), key=lambda item: (-item[1], item[0]))] + return "OS mix: " + ", ".join(parts[:5]) + "." + + +def _pods_summary_line(metrics: dict[str, Any]) -> str: + if not metrics: + return "" + running = metrics.get("pods_running") + pending = metrics.get("pods_pending") + failed = metrics.get("pods_failed") + succeeded = metrics.get("pods_succeeded") + if running is None and pending is None and failed is None and succeeded is None: + return "" + parts: list[str] = [] + if running is not None: + parts.append(f"{running:.0f} running") + if pending is not None: + parts.append(f"{pending:.0f} pending") + if failed is not None: + parts.append(f"{failed:.0f} failed") + if succeeded is not None: + parts.append(f"{succeeded:.0f} succeeded") + return "There are " + ", ".join(parts) + " pods." + + +def _postgres_summary_line(metrics: dict[str, Any]) -> str: + if not metrics: + return "" + postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {} + if not postgres: + return "" + used = postgres.get("used") + max_conn = postgres.get("max") + hottest = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {} + parts: list[str] = [] + if used is not None and max_conn is not None: + parts.append(f"{used:.0f}/{max_conn:.0f} connections") + if hottest.get("label"): + hot_val = hottest.get("value") + hot_val_str = _format_metric_value(str(hot_val), percent=False) if hot_val is not None else "" + parts.append(f"hottest {hottest.get('label')} ({hot_val_str})") + if not parts: + return "" + return "Postgres is at " + ", ".join(parts) + "." + + +def _hottest_summary_line(metrics: dict[str, Any]) -> str: + if not metrics: + return "" + hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {} + if not hottest: + return "" + parts: list[str] = [] + for key in ("cpu", "ram", "net", "io"): + entry = hottest.get(key) if isinstance(hottest.get(key), dict) else {} + node = entry.get("node") + value = entry.get("value") + if node and value is not None: + value_fmt = _format_metric_value( + str(value), + percent=key in ("cpu", "ram"), + rate=key in ("net", "io"), + ) + parts.append(f"{key.upper()} {node} ({value_fmt})") + if not parts: + return "" + return "Hot spots: " + "; ".join(parts) + "." 
+ + +_FOLLOWUP_HINTS = ( + "what about", + "how about", + "and what", + "and how", + "tell me more", + "anything else", + "something else", + "that one", + "those", + "them", + "it", + "this", + "that", + "else", + "another", + "again", +) + + +def _is_followup_query(query: str) -> bool: + q = normalize_query(query) + if not q: + return False + if any(hint in q for hint in _FOLLOWUP_HINTS): + return True + if len(q.split()) <= 3 and not any(word in q for word in _INSIGHT_HINT_WORDS): + return True + return False + + +def _is_subjective_query(query: str) -> bool: + q = normalize_query(query) + if not q: + return False + return any(word in q for word in _INSIGHT_HINT_WORDS) or any( + phrase in q + for phrase in ( + "what do you think", + "your favorite", + "your favourite", + "your opinion", + ) + ) + + +def _is_overview_query(query: str) -> bool: + q = normalize_query(query) + if not q: + return False + return any(word in q for word in _OVERVIEW_HINT_WORDS) + + +def _doc_intent(query: str) -> bool: + q = normalize_query(query) + if not q: + return False + return any( + phrase in q + for phrase in ( + "runbook", + "documentation", + "docs", + "guide", + "how do i", + "how to", + "instructions", + "playbook", + "next step", + "next steps", + "what should", + "what do i", + "what to do", + "troubleshoot", + "triage", + "recover", + "remediate", + ) + ) + + +def cluster_overview_answer( + prompt: str, + *, + inventory: list[dict[str, Any]], + snapshot: dict[str, Any] | None, +) -> str: + if not inventory and not snapshot: + return "" + q = normalize_query(prompt) + metrics = _snapshot_metrics(snapshot) + sentences: list[str] = [] + + nodes_line = _nodes_summary_line(inventory, snapshot) + if nodes_line: + sentences.append(nodes_line) + + wants_overview = _is_overview_query(q) or any(word in q for word in ("atlas", "cluster", "titan", "lab")) + wants_hardware = any(word in q for word in ("hardware", "architecture", "nodes", "node")) or wants_overview + wants_metrics = any( + word in q + for word in ( + "status", + "health", + "overview", + "summary", + "pods", + "postgres", + "connections", + "hottest", + "cpu", + "ram", + "memory", + "net", + "network", + "io", + "disk", + "busy", + "load", + "usage", + "utilization", + ) + ) or wants_overview + + if wants_hardware: + hw_line = _hardware_mix_line(inventory) + if hw_line: + sentences.append(hw_line) + os_line = _os_mix_line(snapshot) + if os_line: + sentences.append(os_line) + + if wants_metrics: + pods_line = _pods_summary_line(metrics) + if pods_line: + sentences.append(pods_line) + postgres_line = _postgres_summary_line(metrics) + if postgres_line: + sentences.append(postgres_line) + hottest_line = _hottest_summary_line(metrics) + if hottest_line: + sentences.append(hottest_line) + + if not sentences: + return "" + if len(sentences) > 3 and not wants_overview: + sentences = sentences[:3] + return "Based on the latest snapshot, " + " ".join(sentences) + + +def cluster_answer( + prompt: str, + *, + inventory: list[dict[str, Any]], + snapshot: dict[str, Any] | None, + workloads: list[dict[str, Any]] | None, + history_lines: list[str] | None = None, +) -> str: + metrics_summary = snapshot_context(prompt, snapshot) + structured = structured_answer( + prompt, + inventory=inventory, + metrics_summary=metrics_summary, + snapshot=snapshot, + workloads=workloads, + ) + if structured: + return structured + + q = normalize_query(prompt) + workload_target = _workload_query_target(prompt) + if workload_target and any(word in q for word in ("where", "run", 
"running", "host", "node")): + return _format_confidence( + f"I don't have workload placement data for {workload_target} in the current snapshot.", + "low", + ) + + overview = cluster_overview_answer(prompt, inventory=inventory, snapshot=snapshot) + if overview: + kb_titles = kb_retrieve_titles(prompt, limit=4) if _doc_intent(prompt) else "" + if kb_titles: + overview = overview + "\n" + kb_titles + return _format_confidence(overview, "medium") + + kb_titles = kb_retrieve_titles(prompt, limit=4) + if kb_titles: + return _format_confidence(kb_titles, "low") + + if metrics_summary: + return _format_confidence(metrics_summary, "low") + + return "" + +def _metric_tokens(entry: dict[str, Any]) -> str: + parts: list[str] = [] + for key in ("panel_title", "dashboard", "description"): + val = entry.get(key) + if isinstance(val, str) and val: + parts.append(val.lower()) + tags = entry.get("tags") + if isinstance(tags, list): + parts.extend(str(t).lower() for t in tags if t) + return " ".join(parts) + +def metrics_lookup(query: str, limit: int = 3) -> list[dict[str, Any]]: + q_tokens = _tokens(query) + if not q_tokens or not _METRIC_INDEX: + return [] + scored: list[tuple[int, dict[str, Any]]] = [] + for entry in _METRIC_INDEX: + if not isinstance(entry, dict): + continue + hay = _metric_tokens(entry) + if not hay: + continue + score = 0 + for t in set(q_tokens): + if t in hay: + score += 2 if t in (entry.get("panel_title") or "").lower() else 1 + if score: + scored.append((score, entry)) + scored.sort(key=lambda item: item[0], reverse=True) + return [entry for _, entry in scored[:limit]] + +def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]: + if not allow_tools: + return "", "" + lower = (prompt or "").lower() + if not any(word in lower for word in METRIC_HINT_WORDS): + return "", "" + matches = metrics_lookup(prompt, limit=1) + if not matches: + return "", "" + entry = matches[0] + dashboard = entry.get("dashboard") or "dashboard" + panel = entry.get("panel_title") or "panel" + exprs = entry.get("exprs") if isinstance(entry.get("exprs"), list) else [] + if not exprs: + return "", "" + rendered_parts: list[str] = [] + for expr in exprs[:2]: + res = vm_query(expr, timeout=20) + rendered = vm_render_result(res, limit=10) + if rendered: + rendered_parts.append(rendered) + if not rendered_parts: + return "", "" + summary = "\n".join(rendered_parts) + context = f"Metrics (from {dashboard} / {panel}):\n{summary}" + return context, "" + def catalog_hints(query: str) -> tuple[str, list[tuple[str, str]]]: q = (query or "").strip() if not q or not KB.get("catalog"): @@ -295,6 +2278,73 @@ def k8s_get(path: str, timeout: int = 8) -> dict: raw = resp.read() return json.loads(raw.decode()) if raw else {} +def _ariadne_state(timeout: int = 5) -> dict | None: + if not ARIADNE_STATE_URL: + return None + headers = {} + if ARIADNE_STATE_TOKEN: + headers["X-Internal-Token"] = ARIADNE_STATE_TOKEN + r = request.Request(ARIADNE_STATE_URL, headers=headers, method="GET") + try: + with request.urlopen(r, timeout=timeout) as resp: + raw = resp.read() + payload = json.loads(raw.decode()) if raw else {} + return payload if isinstance(payload, dict) else None + except Exception: + return None + + +_SNAPSHOT_CACHE: dict[str, Any] = {"payload": None, "ts": 0.0} + + +def _snapshot_state() -> dict[str, Any] | None: + now = time.monotonic() + cached = _SNAPSHOT_CACHE.get("payload") + ts = _SNAPSHOT_CACHE.get("ts") or 0.0 + if cached and now - ts < max(5, SNAPSHOT_TTL_SEC): + return cached + payload = 
_ariadne_state(timeout=10) + if isinstance(payload, dict) and payload: + _SNAPSHOT_CACHE["payload"] = payload + _SNAPSHOT_CACHE["ts"] = now + return payload + return cached if isinstance(cached, dict) else None + + +def _snapshot_inventory(snapshot: dict[str, Any] | None) -> list[dict[str, Any]]: + if not snapshot: + return [] + items = snapshot.get("nodes_detail") + if not isinstance(items, list): + return [] + inventory: list[dict[str, Any]] = [] + for node in items: + if not isinstance(node, dict): + continue + labels = node.get("labels") if isinstance(node.get("labels"), dict) else {} + name = node.get("name") or "" + if not name: + continue + hardware = node.get("hardware") or _hardware_class(labels) + inventory.append( + { + "name": name, + "arch": node.get("arch") or labels.get("kubernetes.io/arch") or labels.get("beta.kubernetes.io/arch") or "", + "hardware": hardware, + "roles": node.get("roles") or [], + "is_worker": node.get("is_worker") is True, + "ready": node.get("ready") is True, + } + ) + return sorted(inventory, key=lambda item: item["name"]) + + +def _snapshot_workloads(snapshot: dict[str, Any] | None) -> list[dict[str, Any]]: + if not snapshot: + return [] + workloads = snapshot.get("workloads") + return workloads if isinstance(workloads, list) else [] + def k8s_pods(namespace: str) -> list[dict]: data = k8s_get(f"/api/v1/namespaces/{parse.quote(namespace)}/pods?limit=500") items = data.get("items") or [] @@ -404,6 +2454,86 @@ def vm_render_result(res: dict | None, limit: int = 12) -> str: out.append(f"- {labels}: {val}") return "\n".join(out) +def _parse_metric_lines(summary: str) -> dict[str, str]: + parsed: dict[str, str] = {} + for line in (summary or "").splitlines(): + line = line.strip() + if not line.startswith("-"): + continue + try: + label, value = line.lstrip("-").split(":", 1) + except ValueError: + continue + parsed[label.strip()] = value.strip() + return parsed + +def _metrics_fallback_summary(panel: str, summary: str) -> str: + parsed = _parse_metric_lines(summary) + panel_l = (panel or "").lower() + if parsed: + items = list(parsed.items()) + if len(items) == 1: + label, value = items[0] + return f"{panel}: {label} = {value}." + compact = "; ".join(f"{k}={v}" for k, v in items) + return f"{panel}: {compact}." 
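+    # No label/value lines could be parsed from the summary: echo the raw panel text instead.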
+ if panel_l: + return f"{panel}: {summary}" + return summary + +def _node_ready_status(node: dict) -> bool | None: + conditions = node.get("status", {}).get("conditions") or [] + for cond in conditions if isinstance(conditions, list) else []: + if cond.get("type") == "Ready": + if cond.get("status") == "True": + return True + if cond.get("status") == "False": + return False + return None + return None + +def _node_is_worker(node: dict) -> bool: + labels = (node.get("metadata") or {}).get("labels") or {} + if labels.get("node-role.kubernetes.io/control-plane") is not None: + return False + if labels.get("node-role.kubernetes.io/master") is not None: + return False + if labels.get("node-role.kubernetes.io/worker") is not None: + return True + return True + +def worker_nodes_status(inventory: list[dict[str, Any]] | None = None) -> tuple[list[str], list[str]]: + if inventory is None: + inventory = node_inventory() + ready_nodes = [n["name"] for n in inventory if n.get("is_worker") and n.get("ready") is True] + not_ready_nodes = [n["name"] for n in inventory if n.get("is_worker") and n.get("ready") is False] + return (sorted(ready_nodes), sorted(not_ready_nodes)) + +def expected_worker_nodes_from_metrics() -> list[str]: + for entry in _METRIC_INDEX: + panel = (entry.get("panel_title") or "").lower() + if "worker nodes ready" not in panel: + continue + exprs = entry.get("exprs") if isinstance(entry.get("exprs"), list) else [] + for expr in exprs: + if not isinstance(expr, str): + continue + match = NODE_REGEX.search(expr) + if not match: + continue + raw = match.group(1) + nodes = [n.strip() for n in raw.split("|") if n.strip()] + return sorted(nodes) + return [] + +def _context_fallback(context: str) -> str: + if not context: + return "" + trimmed = context.strip() + if len(trimmed) > MAX_TOOL_CHARS: + trimmed = trimmed[: MAX_TOOL_CHARS - 3].rstrip() + "..." 
+ return "Here is what I found:\n" + trimmed + def vm_top_restarts(hours: int = 1) -> str: q = f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{hours}h])))" res = vm_query(q) @@ -442,6 +2572,1832 @@ def vm_cluster_snapshot() -> str: parts.append(pr) return "\n".join(parts).strip() +def _strip_code_fence(text: str) -> str: + cleaned = (text or "").strip() + match = CODE_FENCE_RE.match(cleaned) + if match: + return match.group(1).strip() + return cleaned + +def _normalize_reply(value: Any) -> str: + if isinstance(value, dict): + for key in ("content", "response", "reply", "message"): + if key in value: + return _normalize_reply(value[key]) + for v in value.values(): + if isinstance(v, (str, dict, list)): + return _normalize_reply(v) + return json.dumps(value, ensure_ascii=False) + if isinstance(value, list): + parts = [_normalize_reply(item) for item in value] + return " ".join(p for p in parts if p) + if value is None: + return "" + text = _strip_code_fence(str(value)) + if text.startswith("{") and text.endswith("}"): + try: + return _normalize_reply(json.loads(text)) + except Exception: + return _ensure_confidence(text) + return _ensure_confidence(text) + + +def _history_payload_lines(history_payload: list[Any]) -> list[str]: + lines: list[str] = [] + if not isinstance(history_payload, list): + return lines + for item in history_payload[-12:]: + if isinstance(item, dict): + for key in ("content", "message", "text", "prompt", "question", "body", "answer", "reply", "response"): + val = item.get(key) + if isinstance(val, str) and val.strip(): + lines.append(val.strip()) + elif isinstance(item, str) and item.strip(): + lines.append(item.strip()) + return [line for line in lines if line] + + +def _append_history_context(context: str, history_lines: list[str]) -> str: + lines = [line.strip() for line in history_lines if isinstance(line, str) and line.strip()] + if not lines: + return context + snippet = "\n".join(lines[-6:]) + combined = context + "\nRecent chat:\n" + snippet if context else "Recent chat:\n" + snippet + if len(combined) > MAX_CONTEXT_CHARS: + combined = combined[: MAX_CONTEXT_CHARS - 3].rstrip() + "..." + return combined + + +class ThoughtState: + def __init__(self, total_steps: int = 0): + self._lock = threading.Lock() + self.stage = "starting" + self.note = "" + self.step = 0 + self.total_steps = total_steps + + def update(self, stage: str, *, note: str = "", step: int | None = None) -> None: + with self._lock: + self.stage = stage + if note: + self.note = note + if step is not None: + self.step = step + + def status_line(self) -> str: + with self._lock: + stage = self.stage + note = self.note + step = self.step + total = self.total_steps + step_part = f"{step}/{total}" if total else str(step) if step else "" + detail = f"Stage {step_part}: {stage}".strip() + if note: + return f"Still thinking ({detail}). Latest insight: {note}" + return f"Still thinking ({detail})." + + +def _ollama_json_call( + prompt: str, + *, + context: str, + retries: int = 2, + model: str | None = None, +) -> dict[str, Any]: + system = ( + "System: You are Atlas, a reasoning assistant. " + "Return strict JSON only (no code fences, no trailing commentary). " + "If you cannot comply, return {}. " + "Only use facts from the provided context. " + "If you make an inference, label it as 'inference' in the JSON." 
+ ) + last_exc: Exception | None = None + for attempt in range(max(1, retries + 1)): + try: + raw = _ollama_call( + ("json", "internal"), + prompt, + context=context, + use_history=False, + system_override=system, + model=model, + ) + cleaned = _strip_code_fence(raw).strip() + if cleaned.startswith("{") and cleaned.endswith("}"): + return json.loads(cleaned) + last = json.loads(_strip_code_fence(cleaned)) + if isinstance(last, dict): + return last + except Exception as exc: # noqa: BLE001 + last_exc = exc + time.sleep(min(2, 2 ** attempt)) + if last_exc: + return {} + return {} + + +def _fact_pack_lines( + prompt: str, + *, + inventory: list[dict[str, Any]], + snapshot: dict[str, Any] | None, + workloads: list[dict[str, Any]] | None, +) -> list[str]: + raw = facts_context(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads) + lines: list[str] = [] + for line in raw.splitlines(): + trimmed = line.strip() + if not trimmed or trimmed.lower().startswith("facts"): + continue + lines.append(trimmed) + if _knowledge_intent(prompt) or _doc_intent(prompt) or _is_overview_query(prompt): + kb_titles = kb_retrieve_titles(prompt, limit=4) + if kb_titles: + for kb_line in kb_titles.splitlines(): + if kb_line.strip(): + lines.append(kb_line.strip()) + return lines + + +def _fact_pack_text(lines: list[str], fact_meta: dict[str, dict[str, Any]]) -> str: + labeled: list[str] = [] + for idx, line in enumerate(lines): + fid = f"F{idx + 1}" + tags = fact_meta.get(fid, {}).get("tags") or [] + tag_text = f" [tags: {', '.join(tags)}]" if tags else "" + labeled.append(f"{fid}{tag_text}: {line}") + return "Fact pack:\n" + "\n".join(labeled) + + +def _tool_fact_lines(prompt: str, *, allow_tools: bool) -> list[str]: + if not allow_tools: + return [] + metrics_context, _ = metrics_query_context(prompt, allow_tools=True) + lines: list[str] = [] + if metrics_context: + for line in metrics_context.splitlines(): + trimmed = line.strip() + if trimmed: + lines.append(f"tool_metrics: {trimmed}") + return lines + + +_ALLOWED_INSIGHT_TAGS = { + "availability", + "architecture", + "database", + "hardware", + "inventory", + "node_detail", + "os", + "pods", + "utilization", + "workloads", + "workers", +} + +_DYNAMIC_TAGS = {"availability", "database", "pods", "utilization", "workloads"} +_INVENTORY_TAGS = {"hardware", "architecture", "inventory", "workers", "node_detail", "os"} +_SUBJECTIVE_TAG_PRIORITY = ( + "utilization", + "database", + "pods", + "workloads", + "availability", + "hardware", + "inventory", + "architecture", + "node_detail", + "os", +) + + +def _fact_line_tags(line: str) -> set[str]: + text = (line or "").lower() + tags: set[str] = set() + if any(key in text for key in ("nodes_total", "ready", "not_ready", "workers_ready", "workers_not_ready")): + tags.add("availability") + if "nodes_by_arch" in text or "arch " in text or "architecture" in text: + tags.add("architecture") + if any(key in text for key in ("rpi", "jetson", "amd64", "arm64", "non_raspberry_pi")): + tags.update({"hardware", "inventory"}) + if "control_plane_nodes" in text or "control_plane_by_hardware" in text or "worker_nodes" in text: + tags.add("inventory") + if any(key in text for key in ("hottest_", "lowest_", "node_usage", "cpu=", "ram=", "net=", "io=")): + tags.add("utilization") + if "postgres_" in text or "postgres connections" in text: + tags.add("database") + if "pods_" in text or "pod phases" in text or "restarts" in text: + tags.add("pods") + if "namespace" in text: + tags.add("workloads") + if "workloads" in text or 
"primary_node" in text or "workload_" in text: + tags.add("workloads") + if "node_details" in text: + tags.add("node_detail") + if "os mix" in text or "os" in text: + tags.add("os") + return tags & _ALLOWED_INSIGHT_TAGS + + +def _fact_pack_meta(lines: list[str]) -> dict[str, dict[str, Any]]: + meta: dict[str, dict[str, Any]] = {} + for idx, line in enumerate(lines): + fid = f"F{idx + 1}" + tags = sorted(_fact_line_tags(line)) + meta[fid] = {"tags": tags} + return meta + + +def _history_tags(history_lines: list[str]) -> set[str]: + tags: set[str] = set() + for line in history_lines[-6:]: + tags.update(_fact_line_tags(line)) + return tags & _ALLOWED_INSIGHT_TAGS + + +def _normalize_fraction(value: Any, *, default: float = 0.5) -> float: + if isinstance(value, (int, float)): + score = float(value) + if score > 1: + score = score / 100.0 + return max(0.0, min(1.0, score)) + return default + + +def _seed_insights( + lines: list[str], + fact_meta: dict[str, dict[str, Any]], + *, + limit: int = 6, +) -> list[dict[str, Any]]: + priority = [ + "utilization", + "database", + "pods", + "workloads", + "availability", + "hardware", + "architecture", + "inventory", + ] + seeds: list[dict[str, Any]] = [] + used_tags: set[str] = set() + for tag in priority: + for idx, line in enumerate(lines): + fid = f"F{idx + 1}" + tags = set(fact_meta.get(fid, {}).get("tags") or []) + if tag not in tags or fid in {s["fact_ids"][0] for s in seeds}: + continue + summary = line.lstrip("- ").strip() + seeds.append( + { + "summary": summary, + "fact_ids": [fid], + "relevance": 0.5, + "novelty": 0.5, + "rationale": "seeded from fact pack", + "tags": sorted(tags), + } + ) + used_tags.update(tags) + if len(seeds) >= limit: + return seeds + return seeds + + +def _insight_tags(insight: dict[str, Any], fact_meta: dict[str, dict[str, Any]]) -> set[str]: + tags: set[str] = set() + for fid in insight.get("fact_ids") if isinstance(insight.get("fact_ids"), list) else []: + tags.update(fact_meta.get(fid, {}).get("tags") or []) + raw_tags = insight.get("tags") if isinstance(insight.get("tags"), list) else [] + tags.update(t for t in raw_tags if isinstance(t, str)) + summary = insight.get("summary") or insight.get("claim") or "" + if isinstance(summary, str): + tags.update(_fact_line_tags(summary)) + return tags & _ALLOWED_INSIGHT_TAGS + + +def _insight_score( + insight: dict[str, Any], + *, + preference: str, + prefer_tags: set[str], + avoid_tags: set[str], + history_tags: set[str], + fact_meta: dict[str, dict[str, Any]], +) -> float: + base = _score_insight(insight, preference) + tags = _insight_tags(insight, fact_meta) + if prefer_tags and tags: + base += 0.15 * len(tags & prefer_tags) + if avoid_tags and tags: + base -= 0.12 * len(tags & avoid_tags) + if history_tags and tags: + base -= 0.08 * len(tags & history_tags) + if preference == "novelty": + if tags & _DYNAMIC_TAGS: + base += 0.12 + if tags & _INVENTORY_TAGS: + base -= 0.08 + return base + + +def _score_insight(insight: dict[str, Any], preference: str) -> float: + relevance = _normalize_fraction(insight.get("relevance"), default=0.6) + novelty = _normalize_fraction(insight.get("novelty"), default=0.5) + if preference == "novelty": + return novelty * 0.6 + relevance * 0.4 + return relevance * 0.6 + novelty * 0.4 + + +def _select_diverse_insights( + candidates: list[dict[str, Any]], + *, + preference: str, + prefer_tags: set[str], + avoid_tags: set[str], + history_tags: set[str], + fact_meta: dict[str, dict[str, Any]], + count: int = 2, +) -> list[dict[str, Any]]: + scored: 
list[tuple[float, dict[str, Any]]] = [] + for item in candidates: + tags = _insight_tags(item, fact_meta) + item["tags"] = sorted(tags) + score = _insight_score( + item, + preference=preference, + prefer_tags=prefer_tags, + avoid_tags=avoid_tags, + history_tags=history_tags, + fact_meta=fact_meta, + ) + scored.append((score, item)) + scored.sort(key=lambda pair: pair[0], reverse=True) + picked: list[dict[str, Any]] = [] + used_tags: set[str] = set() + for _, item in scored: + tags = set(item.get("tags") or []) + if used_tags and tags and tags <= used_tags and len(picked) < count: + continue + picked.append(item) + used_tags.update(tags) + if len(picked) >= count: + break + if len(picked) < count: + for _, item in scored: + if item in picked: + continue + picked.append(item) + if len(picked) >= count: + break + return picked + + +def _open_ended_system() -> str: + return ( + "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " + "Use ONLY the provided fact pack and recent chat as your evidence. " + "You may draw light inferences if you label them as such. " + "Write concise, human sentences with a helpful, calm tone (not a list). " + "Be willing to take a light stance; do not over-hedge. " + "If the question is subjective (cool/interesting/unconventional), pick a standout fact, explain why it stands out, " + "and use 2-3 sentences. " + "If the question asks for a list, embed the list inline in a sentence (comma-separated). " + "If the question is ambiguous, pick a reasonable interpretation and state it briefly. " + "Avoid repeating the exact same observation as the last response if possible; vary across metrics, workload, or hardware details. " + "Always include at least one substantive answer sentence before the score lines. " + "Do not mention fact IDs or internal labels (e.g., F1/F2) in your response. " + "When the fact pack includes hottest_cpu/ram/net/io lines, use them to answer hottest/busiest node questions. " + "When the fact pack includes postgres_hottest_db, use it for questions about the busiest database. " + "Do not convert counts into percentages or claim 100% unless a fact explicitly states a percentage. " + "Do not invent numbers or facts. " + "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), HallucinationRisk (low|medium|high)." 
+ ) + + +def _ollama_call_safe( + hist_key, + prompt: str, + *, + context: str, + fallback: str, + system_override: str | None = None, + model: str | None = None, +) -> str: + try: + return _ollama_call( + hist_key, + prompt, + context=context, + use_history=False, + system_override=system_override, + model=model, + ) + except Exception: + return fallback + + +def _candidate_note(candidate: dict[str, Any]) -> str: + claim = str(candidate.get("focus") or candidate.get("answer") or "") + return claim[:160] + ("…" if len(claim) > 160 else "") + + +def _ensure_scores(answer: str) -> str: + text = answer.strip() + lines = [line.strip() for line in text.splitlines() if line.strip()] + score_map: dict[str, str] = {} + body_lines: list[str] = [] + + def _score_key(line: str) -> str: + cleaned = line.strip().lstrip("-•* ").strip() + return cleaned.lower() + + def _extract_value(line: str) -> str: + cleaned = line.strip().lstrip("-•* ").strip() + if ":" in cleaned: + return cleaned.split(":", 1)[1].strip() + parts = cleaned.split() + return parts[1] if len(parts) > 1 else "" + + def _record_score(key: str, value: str): + if not value: + return + value = value.strip().rstrip("%") + score_map.setdefault(key, value) + + for line in lines: + cleaned = line.strip().lstrip("-•* ").strip() + lowered = cleaned.lower() + if lowered.startswith("confidence,") or ( + "confidence" in lowered and "relevance" in lowered and "satisfaction" in lowered + ): + for key in ("confidence", "relevance", "satisfaction"): + match = re.search(rf"{key}\s*[:=]?\s*(\d{{1,3}}|high|medium|low)", lowered) + if match: + _record_score(key, match.group(1)) + risk_match = re.search(r"hallucination\s*risk\s*[:=]?\s*(low|medium|high)", lowered) + if risk_match: + _record_score("hallucinationrisk", risk_match.group(1)) + continue + if lowered.startswith("confidence"): + _record_score("confidence", _extract_value(cleaned)) + continue + if lowered.startswith("relevance"): + _record_score("relevance", _extract_value(cleaned)) + continue + if lowered.startswith("satisfaction"): + _record_score("satisfaction", _extract_value(cleaned)) + continue + if lowered.replace(" ", "").startswith("hallucinationrisk") or lowered.startswith( + "hallucination risk" + ): + _record_score("hallucinationrisk", _extract_value(cleaned)) + continue + cleaned_body = re.sub( + r"\bconfidence\s*:\s*(high|medium|low)\b\.?\s*", + "", + line, + flags=re.IGNORECASE, + ).strip() + cleaned_body = re.sub( + r"\bconfident\s*level\s*:\s*(high|medium|low)\b\.?\s*", + "", + cleaned_body, + flags=re.IGNORECASE, + ).strip() + cleaned_body = re.sub(r"\bF\d+\b", "", cleaned_body).strip() + if cleaned_body: + body_lines.append(cleaned_body) + + confidence = score_map.get("confidence") or "medium" + relevance = score_map.get("relevance") or "70" + satisfaction = score_map.get("satisfaction") or "70" + risk = score_map.get("hallucinationrisk") or "low" + + final_lines = body_lines + [ + f"Confidence: {confidence}", + f"Relevance: {relevance}", + f"Satisfaction: {satisfaction}", + f"HallucinationRisk: {risk}", + ] + return "\n".join(final_lines) + + +def _open_ended_plan( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], + focus_tags: set[str], + avoid_tags: set[str], + count: int, + state: ThoughtState | None, + step: int, + model: str | None, +) -> list[dict[str, Any]]: + if state: + state.update("planning", step=step, note="mapping angles") + count = max(1, count) + focus_hint = ", ".join(sorted(focus_tags)) if focus_tags else "any" + avoid_hint = ", 
".join(sorted(avoid_tags)) if avoid_tags else "none" + prompt_text = ( + "Analyze the question and propose up to " + f"{count} distinct answer angles that can be supported by the fact pack. " + "Keep them diverse (e.g., metrics, hardware, workload placement, recent changes). " + "If the question is subjective, propose at least one angle that surfaces a standout detail. " + f"Prefer angles that align with these tags: {focus_hint}. " + f"Avoid angles that overlap these tags if possible: {avoid_hint}. " + "Avoid repeating the same angle as the most recent response if possible. " + "Return JSON: {\"angles\":[{\"focus\":\"...\",\"reason\":\"...\",\"tags\":[\"tag\"],\"priority\":1-5}]}." + ) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call( + prompt_text + f" Question: {prompt}", + context=context, + model=model, + ) + angles = result.get("angles") if isinstance(result, dict) else None + cleaned: list[dict[str, Any]] = [] + seen: set[str] = set() + if isinstance(angles, list): + for item in angles: + if not isinstance(item, dict): + continue + focus = str(item.get("focus") or "").strip() + if not focus or focus.lower() in seen: + continue + seen.add(focus.lower()) + priority = item.get("priority") + if not isinstance(priority, (int, float)): + priority = 3 + tags = _sanitize_focus_tags(item.get("tags") or []) + cleaned.append( + { + "focus": focus, + "reason": str(item.get("reason") or ""), + "tags": tags, + "priority": int(max(1, min(5, priority))), + } + ) + if not cleaned: + cleaned = [{"focus": "Direct answer", "reason": "Default fallback", "priority": 3}] + cleaned.sort(key=lambda item: item.get("priority", 3), reverse=True) + if state: + state.update("planning", step=1, note=_candidate_note(cleaned[0])) + return cleaned + + +def _sanitize_focus_tags(raw_tags: list[Any]) -> list[str]: + tags: list[str] = [] + for tag in raw_tags: + if not isinstance(tag, str): + continue + tag = tag.strip() + if tag in _ALLOWED_INSIGHT_TAGS and tag not in tags: + tags.append(tag) + return tags + + +def _open_ended_interpret( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], + state: ThoughtState | None, + model: str | None, +) -> dict[str, Any]: + if state: + state.update("interpreting", step=1, note="reading question") + allowed_tags = ", ".join(sorted(_ALLOWED_INSIGHT_TAGS)) + prompt_text = ( + "Classify how to answer the question using only the fact pack. " + "Return JSON: {\"style\":\"objective|subjective\"," + "\"tone\":\"neutral|curious|enthusiastic\"," + "\"focus_tags\":[\"tag\"]," + "\"focus_label\":\"short phrase\"," + "\"allow_list\":true|false}. " + "Use allow_list=true only if the question explicitly asks for names or lists. " + f"Only use tags from: {allowed_tags}." 
+ ) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call( + prompt_text + f" Question: {prompt}", + context=context, + model=model, + ) + if not isinstance(result, dict): + result = {} + style = str(result.get("style") or "").strip().lower() + if style not in ("objective", "subjective"): + style = "subjective" if _is_subjective_query(prompt) else "objective" + tone = str(result.get("tone") or "neutral").strip().lower() + if tone not in ("neutral", "curious", "enthusiastic"): + tone = "neutral" + focus_tags = _sanitize_focus_tags(result.get("focus_tags") or []) + focus_label = str(result.get("focus_label") or "").strip() + allow_list = result.get("allow_list") + if not isinstance(allow_list, bool): + q = normalize_query(prompt) + allow_list = any(phrase in q for phrase in ("list", "which", "what are", "names")) + return { + "style": style, + "tone": tone, + "focus_tags": focus_tags, + "focus_label": focus_label, + "allow_list": allow_list, + } + + +def _preferred_tags_for_prompt(prompt: str) -> set[str]: + q = normalize_query(prompt) + tokens = set(_tokens(prompt)) + tags: set[str] = set() + if tokens & {"cpu", "ram", "memory", "net", "network", "io", "disk", "hottest", "busy", "usage", "utilization", "load"}: + tags.add("utilization") + if tokens & {"postgres", "database", "db", "connections"}: + tags.add("database") + if tokens & {"pod", "pods", "deployment", "job", "cronjob"}: + tags.add("pods") + if tokens & {"workload", "service", "namespace"}: + tags.add("workloads") + if tokens & {"ready", "down", "unreachable", "availability"} or "not ready" in q: + tags.add("availability") + if tokens & {"node", "nodes", "hardware", "arch", "architecture", "rpi", "jetson", "amd64", "arm64", "worker", "control-plane"}: + tags.update({"hardware", "inventory", "architecture"}) + return tags & _ALLOWED_INSIGHT_TAGS + + +def _primary_tags_for_prompt(prompt: str) -> set[str]: + q = normalize_query(prompt) + tokens = set(_tokens(prompt)) + if tokens & {"cpu", "ram", "memory", "net", "network", "io", "disk", "hottest", "busy", "usage", "utilization", "load"}: + return {"utilization"} + if tokens & {"postgres", "database", "db", "connections"}: + return {"database"} + if tokens & {"pod", "pods", "deployment", "job", "cronjob"}: + return {"pods"} + if tokens & {"workload", "service", "namespace"}: + return {"workloads"} + if tokens & {"ready", "down", "unreachable", "availability"} or "not ready" in q: + return {"availability"} + if tokens & {"node", "nodes", "hardware", "arch", "architecture", "rpi", "jetson", "amd64", "arm64", "worker", "control-plane"}: + return {"hardware", "inventory", "architecture"} + return set() + + +_TAG_KEYWORDS: dict[str, tuple[str, ...]] = { + "utilization": ("cpu", "ram", "memory", "net", "network", "io", "disk", "usage", "utilization", "hottest", "busy"), + "database": ("postgres", "db", "database", "connections"), + "pods": ("pod", "pods", "deployment", "daemonset", "job", "cron", "workload"), + "hardware": ("hardware", "architecture", "arch", "rpi", "raspberry", "jetson", "amd64", "arm64", "node", "nodes"), + "availability": ("ready", "not ready", "unready", "down", "missing"), + "workloads": ("workload", "service", "namespace", "app"), + "os": ("os", "kernel", "kubelet", "containerd", "runtime"), +} + + +def _tags_from_text(text: str) -> set[str]: + q = normalize_query(text) + if not q: + return set() + tokens = set(_tokens(text)) + tags: set[str] = set() + for tag, keywords in _TAG_KEYWORDS.items(): + if any(word in tokens for word in 
keywords): + tags.add(tag) + return tags & _ALLOWED_INSIGHT_TAGS + + +def _history_focus_tags(history_lines: list[str]) -> set[str]: + if not history_lines: + return set() + recent = " ".join(line for line in history_lines[-6:] if isinstance(line, str)) + return _tags_from_text(recent) + + +def _open_ended_insights( + prompt: str, + *, + fact_pack: str, + fact_meta: dict[str, dict[str, Any]], + history_lines: list[str], + count: int, + state: ThoughtState | None, + step: int, + model: str | None, +) -> list[dict[str, Any]]: + if state: + state.update("analyzing", step=step, note="scouting insights") + count = max(1, count) + allowed_tags = ", ".join(sorted(_ALLOWED_INSIGHT_TAGS)) + prompt_text = ( + "Review the fact pack and propose up to " + f"{count} insights that could answer the question. " + "Each insight should be grounded in the facts. " + "Return JSON: {\"insights\":[{\"summary\":\"...\",\"fact_ids\":[\"F1\"]," + "\"relevance\":0-1,\"novelty\":0-1,\"tags\":[\"tag\"],\"rationale\":\"...\"}]}. " + f"Only use tags from: {allowed_tags}." + ) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call( + prompt_text + f" Question: {prompt}", + context=context, + model=model, + ) + insights = result.get("insights") if isinstance(result, dict) else None + cleaned: list[dict[str, Any]] = [] + valid_ids = set(fact_meta.keys()) + if isinstance(insights, list): + for item in insights: + if not isinstance(item, dict): + continue + summary = str(item.get("summary") or item.get("claim") or "").strip() + if not summary: + continue + raw_ids = item.get("fact_ids") if isinstance(item.get("fact_ids"), list) else [] + fact_ids = [fid for fid in raw_ids if isinstance(fid, str) and fid in valid_ids] + if not fact_ids: + continue + cleaned.append( + { + "summary": summary, + "fact_ids": fact_ids, + "relevance": _normalize_fraction(item.get("relevance"), default=0.6), + "novelty": _normalize_fraction(item.get("novelty"), default=0.5), + "rationale": str(item.get("rationale") or ""), + "tags": [t for t in (item.get("tags") or []) if isinstance(t, str)], + } + ) + if cleaned and state: + state.update("analyzing", note=_candidate_note(cleaned[0])) + return cleaned + + +def _rank_insights( + insights: list[dict[str, Any]], + *, + focus_tags: set[str], + avoid_tags: set[str], + count: int, +) -> list[dict[str, Any]]: + if not insights: + return [] + ranked: list[tuple[float, dict[str, Any]]] = [] + for insight in insights: + relevance = _normalize_fraction(insight.get("relevance"), default=0.6) + novelty = _normalize_fraction(insight.get("novelty"), default=0.5) + tags = set(insight.get("tags") or []) + score = relevance * 0.65 + novelty * 0.35 + if focus_tags and tags & focus_tags: + score += 0.1 + if avoid_tags and tags & avoid_tags: + score -= 0.2 + ranked.append((score, insight)) + ranked.sort(key=lambda item: item[0], reverse=True) + return [item for _, item in ranked[:count]] + + +def _fallback_fact_ids( + fact_meta: dict[str, dict[str, Any]], + *, + focus_tags: set[str], + avoid_tags: set[str], + count: int, +) -> list[str]: + if not fact_meta: + return [] + if focus_tags: + tagged = [ + fid + for fid, meta in fact_meta.items() + if focus_tags & set(meta.get("tags") or []) + ] + if avoid_tags: + tagged = [fid for fid in tagged if not (avoid_tags & set(fact_meta.get(fid, {}).get("tags") or []))] + if tagged: + return tagged[:count] + all_ids = list(fact_meta.keys()) + if avoid_tags: + filtered = [fid for fid in all_ids if not (avoid_tags & set(fact_meta.get(fid, 
{}).get("tags") or []))] + if filtered: + return filtered[:count] + return all_ids[:count] + + +def _open_ended_select_facts( + prompt: str, + *, + fact_pack: str, + fact_meta: dict[str, dict[str, Any]], + history_lines: list[str], + focus_tags: set[str], + avoid_tags: set[str], + avoid_fact_ids: list[str], + count: int, + subjective: bool, + state: ThoughtState | None, + step: int, + model: str | None, +) -> list[str]: + if state: + state.update("selecting facts", step=step, note="picking evidence") + focus_hint = ", ".join(sorted(focus_tags)) if focus_tags else "any" + avoid_tag_hint = ", ".join(sorted(avoid_tags)) if avoid_tags else "none" + avoid_hint = ", ".join(avoid_fact_ids) if avoid_fact_ids else "none" + prompt_text = ( + "Select the fact IDs that best answer the question. " + f"Pick up to {count} fact IDs. " + f"Focus tags: {focus_hint}. " + f"Avoid these tags if possible: {avoid_tag_hint}. " + f"Avoid these fact IDs: {avoid_hint}. " + "If the question is subjective, pick standout or unusual facts; " + "if objective, pick the minimal facts needed. " + "Return JSON: {\"fact_ids\":[\"F1\"...],\"note\":\"...\"}." + ) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call( + prompt_text + f" Question: {prompt}", + context=context, + model=model, + ) + fact_ids = result.get("fact_ids") if isinstance(result, dict) else None + selected: list[str] = [] + if isinstance(fact_ids, list): + for fid in fact_ids: + if isinstance(fid, str) and fid in fact_meta and fid not in selected: + selected.append(fid) + if len(selected) >= count: + break + if avoid_tags: + selected = [ + fid + for fid in selected + if not (avoid_tags & set(fact_meta.get(fid, {}).get("tags") or [])) + ] or selected + seed = _fallback_fact_ids( + fact_meta, + focus_tags=focus_tags, + avoid_tags=avoid_tags, + count=count, + ) + if selected: + for fid in seed: + if fid not in selected: + selected.append(fid) + if len(selected) >= count: + break + else: + selected = seed + return selected + + +def _normalize_score(value: Any, *, default: int = 60) -> int: + if isinstance(value, (int, float)): + return int(max(0, min(100, value))) + return default + + +def _confidence_score(value: Any) -> int: + text = str(value or "").strip().lower() + if text.startswith("high"): + return 85 + if text.startswith("low"): + return 35 + return 60 + + +def _risk_penalty(value: Any) -> int: + text = str(value or "").strip().lower() + if text.startswith("high"): + return 20 + if text.startswith("medium"): + return 10 + return 0 + + +def _open_ended_candidate( + prompt: str, + *, + focus: str, + fact_pack: str, + history_lines: list[str], + subjective: bool, + tone: str, + allow_list: bool, + state: ThoughtState | None, + step: int, + fact_hints: list[str] | None = None, + model: str | None = None, +) -> dict[str, Any]: + if state: + state.update("drafting", step=step, note=focus) + hint_text = "" + if fact_hints: + hint_text = " Prioritize these fact IDs if relevant: " + ", ".join(fact_hints) + "." + style_hint = ( + "Offer a brief opinion grounded in facts and explain why it stands out. " + if subjective + else "Answer directly and succinctly. " + ) + list_hint = ( + "If a list is requested, embed it inline in a sentence (comma-separated). " + if allow_list + else "Avoid bullet lists. " + ) + prompt_text = ( + "Using ONLY the fact pack, answer the question focusing on this angle: " + f"{focus}. " + f"Tone: {tone}. " + + style_hint + + list_hint + + "Write 2-4 sentences in plain prose." 
+ + hint_text + + " " + "If you infer, label it as inference. " + "List which fact pack IDs you used. " + "Return JSON: {\"answer\":\"...\",\"facts_used\":[\"F1\"],\"confidence\":\"high|medium|low\"," + "\"relevance\":0-100,\"satisfaction\":0-100,\"risk\":\"low|medium|high\"}." + ) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call( + prompt_text + f" Question: {prompt}", + context=context, + model=model, + ) + if not isinstance(result, dict): + result = {} + answer = str(result.get("answer") or "").strip() + if not answer: + answer = "I don't have enough data to answer that from the current snapshot." + facts_used = result.get("facts_used") + if not isinstance(facts_used, list): + facts_used = [] + candidate = { + "focus": focus, + "answer": answer, + "facts_used": facts_used, + "confidence": result.get("confidence", "medium"), + "relevance": _normalize_score(result.get("relevance"), default=60), + "satisfaction": _normalize_score(result.get("satisfaction"), default=60), + "risk": result.get("risk", "medium"), + } + candidate["score"] = _candidate_score(candidate) + return candidate + + +def _candidate_score(candidate: dict[str, Any]) -> float: + relevance = _normalize_score(candidate.get("relevance"), default=60) + satisfaction = _normalize_score(candidate.get("satisfaction"), default=60) + confidence = _confidence_score(candidate.get("confidence")) + score = relevance * 0.45 + satisfaction * 0.35 + confidence * 0.2 + if not candidate.get("facts_used"): + score -= 5 + return score - _risk_penalty(candidate.get("risk")) + + +def _select_candidates(candidates: list[dict[str, Any]], *, count: int) -> list[dict[str, Any]]: + if not candidates: + return [] + ranked = sorted(candidates, key=lambda item: item.get("score", 0), reverse=True) + picked: list[dict[str, Any]] = [] + seen_focus: set[str] = set() + for item in ranked: + focus = str(item.get("focus") or "").strip().lower() + if focus and focus in seen_focus: + continue + picked.append(item) + if focus: + seen_focus.add(focus) + if len(picked) >= count: + break + return picked or ranked[:count] + + +def _open_ended_synthesize( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], + candidates: list[dict[str, Any]], + subjective: bool, + tone: str, + allow_list: bool, + state: ThoughtState | None, + step: int, + model: str | None, + critique: str | None = None, +) -> str: + if state: + state.update("synthesizing", step=step, note="composing answer") + critique_block = f"\nCritique guidance: {critique}\n" if critique else "\n" + style_hint = ( + "If the question is subjective, share a light opinion grounded in facts and explain why it stands out. " + if subjective + else "Answer directly without extra caveats. " + ) + list_hint = ( + "If a list is requested, embed it inline in a sentence (comma-separated). " + if allow_list + else "Avoid bullet lists. " + ) + synth_prompt = ( + "Compose the final answer to the question using the candidate answers below. " + "Select the best 1-2 candidates, blend them if helpful, and keep 2-4 sentences. " + "Use only the fact pack as evidence. " + "If you infer, label it as inference. " + "Do not claim nodes are missing or not ready unless the fact pack explicitly lists " + "nodes_not_ready or expected_workers_missing. " + f"Tone: {tone}. " + + style_hint + + list_hint + + "Keep the tone conversational and answer the user's intent directly. " + "Avoid repeating the last response if possible. 
" + "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), " + "HallucinationRisk (low|medium|high).\n" + f"Question: {prompt}\n" + f"{critique_block}" + f"Candidates: {json.dumps(candidates, ensure_ascii=False)}" + ) + context = _append_history_context(fact_pack, history_lines) + reply = _ollama_call_safe( + ("open", "synth"), + synth_prompt, + context=context, + fallback="I don't have enough data to answer that.", + system_override=_open_ended_system(), + model=model, + ) + return _ensure_scores(reply) + + +def _open_ended_critique( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], + candidates: list[dict[str, Any]], + state: ThoughtState | None, + step: int, + model: str | None, +) -> str: + if state: + state.update("reviewing", step=step, note="quality check") + critique_prompt = ( + "Review the candidate answers against the fact pack. " + "Identify any missing important detail or risky inference and give one sentence of guidance. " + "Return JSON: {\"guidance\":\"...\",\"risk\":\"low|medium|high\"}." + ) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call( + critique_prompt + f" Question: {prompt} Candidates: {json.dumps(candidates, ensure_ascii=False)}", + context=context, + model=model, + ) + if isinstance(result, dict): + guidance = str(result.get("guidance") or "").strip() + if guidance: + return guidance + return "" + + +def _open_ended_multi( + prompt: str, + *, + fact_pack: str, + fact_lines: list[str], + fact_meta: dict[str, dict[str, Any]], + history_lines: list[str], + state: ThoughtState | None = None, +) -> str: + model = _model_for_mode("deep") + total_steps = _open_ended_total_steps("deep") + if state: + state.total_steps = total_steps + + interpretation = _open_ended_interpret( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + state=state, + model=model, + ) + style = interpretation.get("style") or "objective" + subjective = style == "subjective" or _is_subjective_query(prompt) + tone = str(interpretation.get("tone") or "").strip().lower() + if tone not in ("neutral", "curious", "enthusiastic"): + tone = "curious" if subjective else "neutral" + allow_list = bool(interpretation.get("allow_list")) + focus_tags = set(interpretation.get("focus_tags") or []) or _preferred_tags_for_prompt(prompt) + if not focus_tags and subjective: + focus_tags = set(_ALLOWED_INSIGHT_TAGS) + avoid_tags = _history_focus_tags(history_lines) if (subjective or _is_followup_query(prompt)) else set() + + angles = _open_ended_plan( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + focus_tags=focus_tags, + avoid_tags=avoid_tags, + count=5, + state=state, + step=2, + model=model, + ) + if state and avoid_tags: + state.update("planning", step=2, note=f"avoiding {', '.join(sorted(avoid_tags))}") + + insights = _open_ended_insights( + prompt, + fact_pack=fact_pack, + fact_meta=fact_meta, + history_lines=history_lines, + count=7, + state=state, + step=3, + model=model, + ) + ranked_insights = _rank_insights( + insights, + focus_tags=focus_tags, + avoid_tags=avoid_tags, + count=3, + ) + + candidates: list[dict[str, Any]] = [] + step = 4 + for insight in ranked_insights: + candidates.append( + _open_ended_candidate( + prompt, + focus=insight.get("summary") or "insight", + fact_pack=fact_pack, + history_lines=history_lines, + subjective=subjective, + tone=str(tone), + allow_list=allow_list, + state=state, + step=step, + fact_hints=insight.get("fact_ids") or [], + model=model, + ) + ) + step += 1 + if not 
candidates and angles: + for angle in angles[:2]: + angle_tags = set(angle.get("tags") or []) or _tags_from_text(angle.get("focus") or "") + fact_ids = _open_ended_select_facts( + prompt, + fact_pack=fact_pack, + fact_meta=fact_meta, + history_lines=history_lines, + focus_tags=angle_tags or focus_tags, + avoid_tags=avoid_tags, + avoid_fact_ids=[], + count=4, + subjective=subjective, + state=state, + step=step, + model=model, + ) + candidates.append( + _open_ended_candidate( + prompt, + focus=angle.get("focus") or "alternate angle", + fact_pack=fact_pack, + history_lines=history_lines, + subjective=subjective, + tone=str(tone), + allow_list=allow_list, + state=state, + step=step, + fact_hints=fact_ids, + model=model, + ) + ) + step += 1 + if len(candidates) >= 2: + break + + if state: + state.update("evaluating", step=step, note="ranking candidates") + selected = _select_candidates(candidates, count=2) + step += 1 + critique = _open_ended_critique( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + candidates=selected or candidates, + state=state, + step=step, + model=model, + ) + step += 1 + reply = _open_ended_synthesize( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + candidates=selected or candidates, + subjective=subjective, + tone=str(tone), + allow_list=allow_list, + state=state, + step=step, + model=model, + critique=critique, + ) + if state: + state.update("done", step=total_steps) + return reply + + +def _open_ended_total_steps(mode: str) -> int: + if mode == "fast": + return 2 + return 9 + + +def _fast_fact_lines( + fact_lines: list[str], + fact_meta: dict[str, dict[str, Any]], + *, + focus_tags: set[str], + avoid_tags: set[str], + primary_tags: set[str] | None = None, + limit: int = 10, +) -> list[str]: + if not fact_lines: + return [] + primary_tags = primary_tags or set() + scored: list[tuple[int, int, str]] = [] + priority_map = {tag: idx for idx, tag in enumerate(_SUBJECTIVE_TAG_PRIORITY)} + use_priority = not primary_tags and focus_tags == _ALLOWED_INSIGHT_TAGS + for idx, line in enumerate(fact_lines): + fid = f"F{idx + 1}" + tags = set(fact_meta.get(fid, {}).get("tags") or []) + if avoid_tags and (avoid_tags & tags): + continue + score = 0 + if primary_tags: + score += 4 * len(tags & primary_tags) + if focus_tags: + score += 2 * len(tags & focus_tags) + if use_priority and tags: + bonus = 0 + for tag in tags: + if tag in priority_map: + bonus = max(bonus, len(priority_map) - priority_map[tag]) + score += bonus + scored.append((score, idx, line)) + scored.sort(key=lambda item: (-item[0], item[1])) + selected: list[str] = [] + for score, _, line in scored: + if score <= 0 and selected: + break + if score > 0: + selected.append(line) + if len(selected) >= limit: + break + if not selected: + selected = [line for _, _, line in scored[:limit]] + elif len(selected) < limit: + for _, _, line in scored: + if line in selected: + continue + selected.append(line) + if len(selected) >= limit: + break + return selected + + +def _has_body_lines(answer: str) -> bool: + lines = [line.strip() for line in (answer or "").splitlines() if line.strip()] + for line in lines: + lowered = line.lower() + if lowered.startswith("confidence"): + continue + if lowered.startswith("relevance"): + continue + if lowered.startswith("satisfaction"): + continue + if lowered.startswith("hallucinationrisk"): + continue + if lowered.startswith("hallucination risk"): + continue + return True + return False + + +def _fallback_fact_answer(prompt: str, context: str) -> str: + facts: 
list[str] = [] + parsed_facts: list[tuple[str, str | None, str | None]] = [] + q = normalize_query(prompt) + tokens = set(_tokens(prompt)) + for line in (context or "").splitlines(): + trimmed = line.strip() + if not trimmed: + continue + if trimmed.startswith("F"): + match = re.match(r"^F\d+.*?\]:\s*(.*)$", trimmed) + if not match: + match = re.match(r"^F\d+:\s*(.*)$", trimmed) + if not match: + continue + fact = match.group(1).strip() + else: + if trimmed.lower().startswith("fact pack") or trimmed.lower().startswith("facts"): + continue + if trimmed.startswith("-"): + fact = trimmed.lstrip("-").strip() + else: + fact = trimmed + if fact.startswith("-"): + fact = fact.lstrip("-").strip() + if fact and (":" in fact or "=" in fact): + facts.append(fact) + key_match = re.match(r"^([\w\s/.-]+):\s*(.+)$", fact) + if not key_match: + key_match = re.match(r"^([\w\s/.-]+)=\s*(.+)$", fact) + if key_match: + parsed_facts.append((fact, key_match.group(1).strip(), key_match.group(2).strip())) + else: + parsed_facts.append((fact, None, None)) + if not facts: + return "" + + def _norm_key(text: str) -> str: + return normalize_query(text).replace(" ", "_") + + def _find_value(target: str) -> str | None: + for _fact, key, val in parsed_facts: + if key and _norm_key(key) == target: + return val + return None + + def _parse_counts(text: str) -> dict[str, int]: + counts: dict[str, int] = {} + for part in (text or "").split(","): + if "=" not in part: + continue + k, v = part.split("=", 1) + k = k.strip() + v = v.strip() + if not k or not v: + continue + try: + counts[k] = int(float(v)) + except ValueError: + continue + return counts + + def _parse_map(text: str) -> dict[str, str]: + mapping: dict[str, str] = {} + pattern = re.compile(r"(\w+)\s*=\s*([^=]+?)(?=(?:\s*,\s*\w+\s*=)|$)") + for match in pattern.finditer(text or ""): + mapping[match.group(1).strip()] = match.group(2).strip().strip(",") + return mapping + + list_intent = _is_list_prompt(prompt) or "name" in tokens + count_intent = _is_quantitative_prompt(prompt) and ("how many" in q or "count" in tokens or "number" in tokens) + hottest_intent = any(word in q for word in ("hottest", "highest", "most", "top", "busiest")) + metric = _detect_metric(q) + include_hw, _exclude_hw = _detect_hardware_filters(q) + + if hottest_intent and metric in {"cpu", "ram", "net", "io"}: + hottest_val = _find_value(f"hottest_{metric}") + if hottest_val: + return f"Hottest {metric} is {hottest_val}." + if hottest_intent and tokens & {"postgres", "database", "db", "connections"}: + hottest_db = _find_value("postgres_hottest_db") + if hottest_db: + return f"Hottest database is {hottest_db}." + + if count_intent and tokens & {"pods", "pod"}: + pending = _find_value("pods_pending") + failed = _find_value("pods_failed") + running = _find_value("pods_running") + succeeded = _find_value("pods_succeeded") + if "pending" in q and "failed" in q: + try: + total = float(pending or 0) + float(failed or 0) + return f"Pods pending or failed: {total:.0f}." + except ValueError: + pass + if "pending" in q and pending is not None: + return f"Pods pending is {pending}." + if "failed" in q and failed is not None: + return f"Pods failed is {failed}." + if "succeeded" in q and succeeded is not None: + return f"Pods succeeded is {succeeded}." + if "running" in q and running is not None: + return f"Pods running is {running}." 
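The count/map parsing in this fallback path is easiest to see with concrete values. Below is a simplified, self-contained restatement of the _parse_counts and _parse_map helpers defined above, exercised against hypothetical fact-pack values (the hardware names and node names are made up for illustration, not taken from the repo):

import re

def parse_counts(text: str) -> dict[str, int]:
    # "rpi5=6, rpi4=3" -> {"rpi5": 6, "rpi4": 3}
    counts: dict[str, int] = {}
    for part in (text or "").split(","):
        if "=" not in part:
            continue
        key, value = part.split("=", 1)
        key, value = key.strip(), value.strip()
        if not key or not value:
            continue
        try:
            counts[key] = int(float(value))
        except ValueError:
            continue
    return counts

def parse_map(text: str) -> dict[str, str]:
    # Same regex as _parse_map above: values may contain spaces, entries are
    # separated by ", <key>=".
    pattern = re.compile(r"(\w+)\s*=\s*([^=]+?)(?=(?:\s*,\s*\w+\s*=)|$)")
    return {m.group(1).strip(): m.group(2).strip().strip(",") for m in pattern.finditer(text or "")}

assert parse_counts("rpi5=6, rpi4=3") == {"rpi5": 6, "rpi4": 3}
assert parse_map("rpi5=node-a node-b, jetson=node-c") == {"rpi5": "node-a node-b", "jetson": "node-c"}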
+ + if count_intent and tokens & {"nodes", "node"} and "not ready" in q: + nodes_total = _find_value("nodes_total") + if nodes_total and "not_ready" in nodes_total: + match = re.search(r"not_ready=([0-9.]+)", nodes_total) + if match: + return f"Not ready nodes: {match.group(1)}." + + if count_intent and include_hw: + counts_line = _find_value("nodes_by_hardware_count") + if counts_line: + counts = _parse_counts(counts_line) + for hw in include_hw: + if hw in counts: + return f"{hw} nodes: {counts[hw]}." + for hw in include_hw: + hw_line = _find_value(hw) + if hw_line: + items = [item.strip() for item in hw_line.split(",") if item.strip()] + return f"{hw} nodes: {len(items)}." + + if list_intent and include_hw: + if "control" in q: + cp_by_hw = _find_value("control_plane_by_hardware") + if cp_by_hw: + mapping = _parse_map(cp_by_hw) + for hw in include_hw: + if hw in mapping: + return f"{hw} control-plane nodes: {mapping[hw]}." + cp_nodes = _find_value("control_plane_nodes") + if cp_nodes: + return f"Control-plane nodes: {cp_nodes}." + for hw in include_hw: + hw_line = _find_value(hw) + if hw_line: + return f"{hw} nodes: {hw_line}." + + if list_intent and "control" in q: + cp_nodes = _find_value("control_plane_nodes") + if cp_nodes: + return f"Control-plane nodes: {cp_nodes}." + + preferred = tokens & { + "node", + "nodes", + "pod", + "pods", + "postgres", + "db", + "database", + "namespace", + "workload", + "worker", + "workers", + "cpu", + "ram", + "memory", + "net", + "network", + "io", + "disk", + "connection", + "connections", + } + best_fact = "" + best_score = -1 + for fact in facts: + key_match = re.match(r"^([\w\s/.-]+):\s*(.+)$", fact) + if not key_match: + key_match = re.match(r"^([\w\s/.-]+)=\s*(.+)$", fact) + key_tokens: set[str] = set() + if key_match: + key_tokens = set(_tokens(key_match.group(1))) + score = len(tokens & set(_tokens(fact))) + 2 * len(tokens & key_tokens) + if preferred: + score += 3 * len(preferred & key_tokens) + if not (preferred & key_tokens): + score -= 1 + if list_intent and key_match and "count" in key_tokens: + score -= 3 + if score > best_score: + best_score = score + best_fact = fact + if best_score <= 0: + return "" + key_match = re.match(r"^([\w\s/.-]+):\s*(.+)$", best_fact) + if not key_match: + key_match = re.match(r"^([\w\s/.-]+)=\s*(.+)$", best_fact) + if key_match: + key = key_match.group(1).strip().replace("_", " ") + val = key_match.group(2).strip() + sentence = f"{key.capitalize()} is {val}" + else: + sentence = f"Based on the snapshot, {best_fact}" + if not sentence.endswith((".", "!", "?")): + sentence += "." 
+ return sentence + + +def _is_quantitative_prompt(prompt: str) -> bool: + q = normalize_query(prompt) + if not q: + return False + tokens = set(_tokens(prompt)) + if "how many" in q or "count" in tokens or "total" in tokens: + return True + if tokens & {"highest", "lowest", "hottest", "most", "least"}: + return True + return False + + +def _is_list_prompt(prompt: str) -> bool: + q = normalize_query(prompt) + if not q: + return False + if any(phrase in q for phrase in ("list", "names", "name", "show")): + return True + if any(phrase in q for phrase in ("which nodes", "what nodes", "what are the nodes")): + return True + return False + + +def _needs_full_fact_pack(prompt: str) -> bool: + q = normalize_query(prompt) + tokens = set(_tokens(prompt)) + if _is_quantitative_prompt(prompt) or _is_list_prompt(prompt): + return True + if tokens & {"workload", "pods", "namespace", "worker", "workers"}: + return True + if tokens & {"arch", "architecture", "hardware"}: + return True + if tokens & METRIC_HINT_WORDS: + return True + if _NAME_INDEX and tokens & _NAME_INDEX: + return True + if any(phrase in q for phrase in ("where does", "where is", "where are", "running", "run on", "hosted on", "primary node")): + return True + return False + + +def _open_ended_fast_single( + prompt: str, + *, + context: str, + history_lines: list[str] | None = None, + state: ThoughtState | None = None, + model: str, +) -> str: + if state: + state.update("drafting", step=1, note="summarizing") + working_context = _append_history_context(context, history_lines or []) if history_lines else context + reply = _ollama_call( + ("atlasbot_fast", "atlasbot_fast"), + prompt, + context=working_context, + use_history=False, + system_override=_open_ended_system(), + model=model, + ) + if not _has_body_lines(reply): + reply = _ollama_call( + ("atlasbot_fast", "atlasbot_fast"), + prompt + " Provide one clear sentence before the score lines.", + context=working_context, + use_history=False, + system_override=_open_ended_system(), + model=model, + ) + fallback = _fallback_fact_answer(prompt, context) + if fallback and (_is_quantitative_prompt(prompt) or not _has_body_lines(reply)): + reply = fallback + if not _has_body_lines(reply): + reply = "I don't have enough data in the current snapshot to answer that." 
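As a side note on the guard rails used here: the fast path re-prompts (and then falls back to the deterministic fact answer) whenever the model returns only the trailing score lines. A minimal sketch of that body-detection check, restated outside the bot module with hypothetical replies:

def has_body_lines(answer: str) -> bool:
    # Simplified restatement of _has_body_lines: a reply "has a body" if any
    # non-empty line is not one of the trailing score lines.
    score_prefixes = ("confidence", "relevance", "satisfaction", "hallucinationrisk", "hallucination risk")
    for line in (answer or "").splitlines():
        stripped = line.strip()
        if not stripped:
            continue
        if stripped.lower().startswith(score_prefixes):
            continue
        return True
    return False

# Hypothetical replies for illustration.
scores_only = "Confidence: high\nRelevance: 80\nSatisfaction: 75\nHallucinationRisk: low"
with_body = "Pods pending is 2.\n" + scores_only
assert not has_body_lines(scores_only)  # would trigger the re-prompt / fallback path
assert has_body_lines(with_body)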
+ if state: + state.update("done", step=_open_ended_total_steps("fast")) + return _ensure_scores(reply) + + +def _open_ended_fast( + prompt: str, + *, + fact_pack: str, + fact_lines: list[str], + fact_meta: dict[str, dict[str, Any]], + history_lines: list[str], + state: ThoughtState | None = None, +) -> str: + model = _model_for_mode("fast") + subjective = _is_subjective_query(prompt) + primary_tags = _primary_tags_for_prompt(prompt) + focus_tags = _preferred_tags_for_prompt(prompt) + if not focus_tags and subjective: + focus_tags = set(_ALLOWED_INSIGHT_TAGS) + avoid_tags = _history_focus_tags(history_lines) if (subjective or _is_followup_query(prompt)) else set() + selected_lines = _fast_fact_lines( + fact_lines, + fact_meta, + focus_tags=focus_tags, + avoid_tags=avoid_tags, + primary_tags=primary_tags, + ) + selected_meta = _fact_pack_meta(selected_lines) + selected_pack = _fact_pack_text(selected_lines, selected_meta) + if _needs_full_fact_pack(prompt) or not selected_lines: + selected_pack = fact_pack + if not subjective and _needs_full_fact_pack(prompt): + fallback = _fallback_fact_answer(prompt, fact_pack) + if fallback: + return _ensure_scores(fallback) + if state: + state.total_steps = _open_ended_total_steps("fast") + return _open_ended_fast_single( + prompt, + context=selected_pack, + history_lines=history_lines, + state=state, + model=model, + ) + + +def _open_ended_deep( + prompt: str, + *, + fact_pack: str, + fact_lines: list[str], + fact_meta: dict[str, dict[str, Any]], + history_lines: list[str], + state: ThoughtState | None = None, +) -> str: + return _open_ended_multi( + prompt, + fact_pack=fact_pack, + fact_lines=fact_lines, + fact_meta=fact_meta, + history_lines=history_lines, + state=state, + ) + + +def open_ended_answer( + prompt: str, + *, + inventory: list[dict[str, Any]], + snapshot: dict[str, Any] | None, + workloads: list[dict[str, Any]], + history_lines: list[str], + mode: str, + allow_tools: bool, + state: ThoughtState | None = None, +) -> str: + lines = _fact_pack_lines(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads) + if _knowledge_intent(prompt) or _doc_intent(prompt): + kb_detail = kb_retrieve(prompt) + if kb_detail: + for line in kb_detail.splitlines(): + if line.strip(): + lines.append(line.strip()) + tool_lines = _tool_fact_lines(prompt, allow_tools=allow_tools) + if tool_lines: + lines.extend(tool_lines) + if not lines: + return _ensure_scores("I don't have enough data to answer that.") + fact_meta = _fact_pack_meta(lines) + fact_pack = _fact_pack_text(lines, fact_meta) + if mode == "fast": + return _open_ended_fast( + prompt, + fact_pack=fact_pack, + fact_lines=lines, + fact_meta=fact_meta, + history_lines=history_lines, + state=state, + ) + return _open_ended_deep( + prompt, + fact_pack=fact_pack, + fact_lines=lines, + fact_meta=fact_meta, + history_lines=history_lines, + state=state, + ) + + +def _non_cluster_reply(prompt: str, *, history_lines: list[str], mode: str) -> str: + system = ( + "System: You are Atlas, a helpful general assistant. " + "Answer using common knowledge when possible, and say when you're unsure. " + "Be concise and avoid unnecessary caveats. " + "Respond in plain sentences (no lists unless asked). " + "End every response with a line: 'Confidence: high|medium|low'." 
+ ) + model = _model_for_mode(mode) + context = _append_history_context("", history_lines) if history_lines else "" + reply = _ollama_call( + ("general", "reply"), + prompt, + context=context, + use_history=False, + system_override=system, + model=model, + ) + reply = re.sub(r"\bconfidence\s*:\s*(high|medium|low)\b\.?\s*", "", reply, flags=re.IGNORECASE).strip() + return _ensure_scores(reply) + + +# Internal HTTP endpoint for cluster answers (website uses this). +class _AtlasbotHandler(BaseHTTPRequestHandler): + server_version = "AtlasbotHTTP/1.0" + + def _write_json(self, status: int, payload: dict[str, Any]): + body = json.dumps(payload).encode("utf-8") + self.send_response(status) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def _authorized(self) -> bool: + if not ATLASBOT_INTERNAL_TOKEN: + return True + token = self.headers.get("X-Internal-Token", "") + return token == ATLASBOT_INTERNAL_TOKEN + + def do_GET(self): # noqa: N802 + if self.path == "/health": + self._write_json(200, {"status": "ok"}) + return + self._write_json(404, {"error": "not_found"}) + + def do_POST(self): # noqa: N802 + if self.path != "/v1/answer": + self._write_json(404, {"error": "not_found"}) + return + if not self._authorized(): + self._write_json(401, {"error": "unauthorized"}) + return + try: + length = int(self.headers.get("Content-Length", "0")) + except ValueError: + length = 0 + raw = self.rfile.read(length) if length > 0 else b"" + try: + payload = json.loads(raw.decode("utf-8")) if raw else {} + except json.JSONDecodeError: + self._write_json(400, {"error": "invalid_json"}) + return + prompt = str(payload.get("prompt") or payload.get("question") or "").strip() + if not prompt: + self._write_json(400, {"error": "missing_prompt"}) + return + cleaned = _strip_bot_mention(prompt) + mode = str(payload.get("mode") or "deep").lower() + if mode in ("quick", "fast"): + mode = "fast" + elif mode in ("smart", "deep"): + mode = "deep" + else: + mode = "deep" + snapshot = _snapshot_state() + inventory = _snapshot_inventory(snapshot) or node_inventory_live() + workloads = _snapshot_workloads(snapshot) + history_payload = payload.get("history") or [] + history_lines = _history_payload_lines(history_payload) + history_cluster = _history_mentions_cluster( + history_lines, + inventory=inventory, + workloads=workloads, + ) + followup = _is_followup_query(cleaned) + cleaned_q = normalize_query(cleaned) + cluster_affinity = _is_cluster_query(cleaned, inventory=inventory, workloads=workloads) + subjective = _is_subjective_query(cleaned) + followup_affinity = any(word in cleaned_q for word in METRIC_HINT_WORDS) + contextual = history_cluster and (followup or followup_affinity) + cluster_query = cluster_affinity or contextual + context = "" + if cluster_query: + context = build_context( + cleaned, + allow_tools=True, + targets=[], + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + ) + if cluster_query: + answer = open_ended_answer( + cleaned, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + history_lines=history_lines, + mode=mode, + allow_tools=True, + state=None, + ) + else: + answer = _non_cluster_reply(cleaned, history_lines=history_lines, mode=mode) + self._write_json(200, {"answer": answer}) + + +def _start_http_server(): + server = HTTPServer(("0.0.0.0", ATLASBOT_HTTP_PORT), _AtlasbotHandler) + thread = threading.Thread(target=server.serve_forever, daemon=True) + 
thread.start() + # Conversation state. history = collections.defaultdict(list) # (room_id, sender|None) -> list[str] (short transcript) @@ -449,17 +4405,56 @@ history = collections.defaultdict(list) # (room_id, sender|None) -> list[str] ( def key_for(room_id: str, sender: str, is_dm: bool): return (room_id, None) if is_dm else (room_id, sender) -def build_context(prompt: str, *, allow_tools: bool, targets: list[tuple[str, str]]) -> str: + +def _history_mentions_cluster( + history_lines: list[str], + *, + inventory: list[dict[str, Any]] | None = None, + workloads: list[dict[str, Any]] | None = None, +) -> bool: + recent = [line for line in history_lines[-8:] if isinstance(line, str)] + for line in recent: + cleaned = normalize_query(line) + if not cleaned: + continue + if _is_cluster_query(cleaned, inventory=inventory, workloads=workloads): + return True + return False + +def build_context( + prompt: str, + *, + allow_tools: bool, + targets: list[tuple[str, str]], + inventory: list[dict[str, Any]] | None = None, + snapshot: dict[str, Any] | None = None, + workloads: list[dict[str, Any]] | None = None, +) -> str: parts: list[str] = [] - kb = kb_retrieve(prompt) - if kb: - parts.append(kb) + facts = facts_context(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads) + if facts: + parts.append(facts) + + snapshot_json = snapshot_compact_context( + prompt, + snapshot, + inventory=inventory, + workloads=workloads, + ) + if snapshot_json: + parts.append(snapshot_json) endpoints, edges = catalog_hints(prompt) if endpoints: parts.append(endpoints) + kb = kb_retrieve(prompt) + if not kb and _knowledge_intent(prompt): + kb = kb_retrieve_titles(prompt, limit=4) + if kb: + parts.append(kb) + if allow_tools: # Scope pod summaries to relevant namespaces/workloads when possible. prefixes_by_ns: dict[str, set[str]] = collections.defaultdict(set) @@ -478,44 +4473,352 @@ def build_context(prompt: str, *, allow_tools: bool, targets: list[tuple[str, st if flux_bad: parts.append("Flux (not ready):\n" + flux_bad) - p_l = (prompt or "").lower() - if any(w in p_l for w in METRIC_HINT_WORDS): - restarts = vm_top_restarts(1) - if restarts: - parts.append("VictoriaMetrics (top restarts 1h):\n" + restarts) - snap = vm_cluster_snapshot() - if snap: - parts.append("VictoriaMetrics (cluster snapshot):\n" + snap) - return "\n\n".join([p for p in parts if p]).strip() -def ollama_reply(hist_key, prompt: str, *, context: str) -> str: - try: - system = ( - "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " - "Be helpful, direct, and concise. " - "Prefer answering with exact repo paths and Kubernetes resource names. " - "Never include or request secret values." - ) - transcript_parts = [system] - if context: - transcript_parts.append("Context (grounded):\n" + context[:MAX_KB_CHARS]) - transcript_parts.extend(history[hist_key][-24:]) - transcript_parts.append(f"User: {prompt}") - transcript = "\n".join(transcript_parts) - payload = {"model": MODEL, "message": transcript} - headers = {"Content-Type": "application/json"} - if API_KEY: - headers["x-api-key"] = API_KEY - r = request.Request(OLLAMA_URL, data=json.dumps(payload).encode(), headers=headers) - with request.urlopen(r, timeout=20) as resp: - data = json.loads(resp.read().decode()) - reply = data.get("message") or data.get("response") or data.get("reply") or "I'm here to help." 
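For reference, the internal answer endpoint started above accepts a small JSON contract: POST /v1/answer with a prompt, an optional mode ("fast"/"quick" or "deep"/"smart"), optional history, and an X-Internal-Token header when a token is configured; it returns {"answer": "..."} and exposes GET /health. A minimal client sketch, assuming a placeholder in-cluster URL and token (the real service name, port, and token are deployment-specific and not taken from this repo):

import json
import os
from urllib import request

# Placeholder URL/token for illustration only.
ATLASBOT_URL = os.environ.get("ATLASBOT_URL", "http://atlasbot.comms.svc:8080")
token = os.environ.get("ATLASBOT_INTERNAL_TOKEN", "")

payload = {
    "prompt": "how many pods are pending?",
    "mode": "fast",
    # "history" is optional; its entries are normalized by _history_payload_lines.
}
req = request.Request(
    f"{ATLASBOT_URL}/v1/answer",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json", "X-Internal-Token": token},
)
with request.urlopen(req, timeout=60) as resp:
    print(json.loads(resp.read().decode("utf-8"))["answer"])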
+def snapshot_context(prompt: str, snapshot: dict[str, Any] | None) -> str: + if not snapshot: + return "" + metrics = _snapshot_metrics(snapshot) + workloads = _snapshot_workloads(snapshot) + q = normalize_query(prompt) + parts: list[str] = [] + nodes = snapshot.get("nodes") if isinstance(snapshot.get("nodes"), dict) else {} + if nodes.get("total") is not None: + parts.append( + f"Snapshot: nodes_total={nodes.get('total')}, ready={nodes.get('ready')}, not_ready={nodes.get('not_ready')}." + ) + if any(word in q for word in ("postgres", "connections", "db")): + postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {} + if postgres: + parts.append(f"Snapshot: postgres_connections={postgres}.") + if any(word in q for word in ("hottest", "cpu", "ram", "memory", "net", "network", "io", "disk")): + hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {} + if hottest: + parts.append(f"Snapshot: hottest_nodes={hottest}.") + if workloads and any(word in q for word in ("run", "running", "host", "node", "where", "which")): + match = _select_workload(prompt, workloads) + if match: + parts.append(f"Snapshot: workload={match}.") + return "\n".join(parts).strip() + +def _compact_nodes_detail(snapshot: dict[str, Any]) -> list[dict[str, Any]]: + details = snapshot.get("nodes_detail") if isinstance(snapshot.get("nodes_detail"), list) else [] + output: list[dict[str, Any]] = [] + for node in details: + if not isinstance(node, dict): + continue + name = node.get("name") + if not name: + continue + output.append( + { + "name": name, + "ready": node.get("ready"), + "hardware": node.get("hardware"), + "arch": node.get("arch"), + "roles": node.get("roles"), + "is_worker": node.get("is_worker"), + "os": node.get("os"), + "kernel": node.get("kernel"), + "kubelet": node.get("kubelet"), + "container_runtime": node.get("container_runtime"), + } + ) + return output + +def _compact_metrics(snapshot: dict[str, Any]) -> dict[str, Any]: + metrics = snapshot.get("metrics") if isinstance(snapshot.get("metrics"), dict) else {} + return { + "pods_running": metrics.get("pods_running"), + "pods_pending": metrics.get("pods_pending"), + "pods_failed": metrics.get("pods_failed"), + "pods_succeeded": metrics.get("pods_succeeded"), + "postgres_connections": metrics.get("postgres_connections"), + "hottest_nodes": metrics.get("hottest_nodes"), + "node_usage": metrics.get("node_usage"), + "top_restarts_1h": metrics.get("top_restarts_1h"), + } + +def snapshot_compact_context( + prompt: str, + snapshot: dict[str, Any] | None, + *, + inventory: list[dict[str, Any]] | None, + workloads: list[dict[str, Any]] | None, +) -> str: + if not snapshot: + return "" + compact = { + "collected_at": snapshot.get("collected_at"), + "nodes_summary": snapshot.get("nodes_summary"), + "expected_workers": expected_worker_nodes_from_metrics(), + "nodes_detail": _compact_nodes_detail(snapshot), + "workloads": _workloads_for_prompt(prompt, workloads or [], limit=40) if workloads else [], + "metrics": _compact_metrics(snapshot), + "flux": snapshot.get("flux"), + "errors": snapshot.get("errors"), + } + text = json.dumps(compact, ensure_ascii=False) + if len(text) > MAX_FACTS_CHARS: + text = text[: MAX_FACTS_CHARS - 3].rstrip() + "..." 
+ return "Cluster snapshot (JSON):\n" + text + + +def _knowledge_intent(prompt: str) -> bool: + q = normalize_query(prompt) + return any( + phrase in q + for phrase in ( + "what do you know", + "tell me about", + "interesting", + "overview", + "summary", + "describe", + "explain", + ) + ) + + +def _is_cluster_query( + prompt: str, + *, + inventory: list[dict[str, Any]] | None, + workloads: list[dict[str, Any]] | None, +) -> bool: + q = normalize_query(prompt) + if not q: + return False + if TITAN_NODE_RE.search(q): + return True + if any(word in q for word in CLUSTER_HINT_WORDS): + return True + if any(word in q for word in METRIC_HINT_WORDS): + return True + for host_match in HOST_RE.finditer(q): + host = host_match.group(1).lower() + if host.endswith("bstein.dev"): + return True + tokens = set(_tokens(q)) + if _NAME_INDEX and tokens & _NAME_INDEX: + return True + return False + + +def _inventory_summary(inventory: list[dict[str, Any]]) -> str: + if not inventory: + return "" + groups = _group_nodes(inventory) + total = len(inventory) + ready = [n for n in inventory if n.get("ready") is True] + not_ready = [n for n in inventory if n.get("ready") is False] + parts = [f"Atlas cluster: {total} nodes ({len(ready)} ready, {len(not_ready)} not ready)."] + for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"): + nodes = groups.get(key) or [] + if nodes: + parts.append(f"- {key}: {len(nodes)} nodes ({', '.join(nodes)})") + return "\n".join(parts) + + +def knowledge_summary(prompt: str, inventory: list[dict[str, Any]]) -> str: + parts: list[str] = [] + inv = _inventory_summary(inventory) + if inv: + parts.append(inv) + kb_titles = kb_retrieve_titles(prompt, limit=4) + if kb_titles: + parts.append(kb_titles) + summary = "\n".join(parts).strip() + return _format_confidence(summary, "medium") if summary else "" + +def _ollama_call( + hist_key, + prompt: str, + *, + context: str, + use_history: bool = True, + system_override: str | None = None, + model: str | None = None, +) -> str: + system = system_override or ( + "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " + "Be helpful, direct, and concise. " + "Use the provided context and facts as your source of truth. " + "If the context includes a cluster snapshot, treat the question as about the Atlas/Othrys cluster even if the prompt is ambiguous. " + "When a cluster snapshot is provided, never answer about unrelated meanings of 'Atlas' (maps, mythology, Apache Atlas, etc). " + "Treat 'hottest' as highest utilization (CPU/RAM/NET/IO) rather than temperature. " + "If you infer or synthesize, say 'Based on the snapshot' and keep it brief. " + "For subjective prompts (interesting, favorite, unconventional), pick one or two observations from the context, explain why they stand out in 1-2 sentences, and avoid repeating the same observation as the last response if you can. " + "Prefer exact repo paths and Kubernetes resource names when relevant. " + "Never include or request secret values. " + "Do not suggest commands unless explicitly asked. " + "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. " + "Translate metrics into natural language instead of echoing raw label/value pairs. " + "When providing counts or totals, use the exact numbers from the context; do not invent or truncate. " + "Avoid bare lists unless the user asked for a list; weave numbers into sentences. 
" + "Do not answer by only listing runbooks; if the question is about Atlas/Othrys, summarize the cluster first and mention docs only if useful. " + "If the question is not about Atlas/Othrys and no cluster context is provided, answer using general knowledge and say when you are unsure. " + "If the answer is not grounded in the provided context or tool data, say you do not know. " + "End every response with a line: 'Confidence: high|medium|low'." + ) + endpoint = _ollama_endpoint() + if not endpoint: + raise RuntimeError("ollama endpoint missing") + messages: list[dict[str, str]] = [{"role": "system", "content": system}] + if context: + messages.append({"role": "user", "content": "Context (grounded):\n" + context[:MAX_CONTEXT_CHARS]}) + if use_history: + messages.extend(_history_to_messages(history[hist_key][-24:])) + messages.append({"role": "user", "content": prompt}) + + model_name = model or MODEL + payload = {"model": model_name, "messages": messages, "stream": False} + headers = {"Content-Type": "application/json"} + if API_KEY: + headers["x-api-key"] = API_KEY + r = request.Request(endpoint, data=json.dumps(payload).encode(), headers=headers) + lock = _OLLAMA_LOCK if OLLAMA_SERIALIZE else None + if lock: + lock.acquire() + try: + try: + with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: + data = json.loads(resp.read().decode()) + except error.HTTPError as exc: + if exc.code == 404 and FALLBACK_MODEL and FALLBACK_MODEL != payload["model"]: + payload["model"] = FALLBACK_MODEL + r = request.Request(endpoint, data=json.dumps(payload).encode(), headers=headers) + with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: + data = json.loads(resp.read().decode()) + else: + raise + msg = data.get("message") if isinstance(data, dict) else None + if isinstance(msg, dict): + raw_reply = msg.get("content") + else: + raw_reply = data.get("response") or data.get("reply") or data + reply = _normalize_reply(raw_reply) or "I'm here to help." + if use_history: history[hist_key].append(f"Atlas: {reply}") - return reply - except Exception: - return "I’m here — but I couldn’t reach the model backend." + return reply + finally: + if lock: + lock.release() + +def ollama_reply( + hist_key, + prompt: str, + *, + context: str, + fallback: str = "", + use_history: bool = True, + model: str | None = None, +) -> str: + last_error = None + for attempt in range(max(1, OLLAMA_RETRIES + 1)): + try: + return _ollama_call( + hist_key, + prompt, + context=context, + use_history=use_history, + model=model, + ) + except Exception as exc: # noqa: BLE001 + last_error = exc + time.sleep(min(4, 2 ** attempt)) + if fallback: + if use_history: + history[hist_key].append(f"Atlas: {fallback}") + return fallback + return "I don't have enough data to answer that." 
+ +def ollama_reply_with_thinking( + token: str, + room: str, + hist_key, + prompt: str, + *, + context: str, + fallback: str, + use_history: bool = True, + model: str | None = None, +) -> str: + result: dict[str, str] = {"reply": ""} + done = threading.Event() + + def worker(): + result["reply"] = ollama_reply( + hist_key, + prompt, + context=context, + fallback=fallback, + use_history=use_history, + model=model, + ) + done.set() + + thread = threading.Thread(target=worker, daemon=True) + thread.start() + if not done.wait(2.0): + send_msg(token, room, "Thinking…") + prompt_hint = " ".join((prompt or "").split()) + if len(prompt_hint) > 160: + prompt_hint = prompt_hint[:157] + "…" + heartbeat = max(10, THINKING_INTERVAL_SEC) + next_heartbeat = time.monotonic() + heartbeat + while not done.wait(max(0, next_heartbeat - time.monotonic())): + if prompt_hint: + send_msg(token, room, f"Still thinking about: {prompt_hint} (gathering context)") + else: + send_msg(token, room, "Still thinking (gathering context)…") + next_heartbeat += heartbeat + thread.join(timeout=1) + return result["reply"] or fallback or "Model backend is busy. Try again in a moment." + + +def open_ended_with_thinking( + token: str, + room: str, + prompt: str, + *, + inventory: list[dict[str, Any]], + snapshot: dict[str, Any] | None, + workloads: list[dict[str, Any]], + history_lines: list[str], + mode: str, + allow_tools: bool, +) -> str: + result: dict[str, str] = {"reply": ""} + done = threading.Event() + total_steps = _open_ended_total_steps(mode) + state = ThoughtState(total_steps=total_steps) + + def worker(): + result["reply"] = open_ended_answer( + prompt, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + history_lines=history_lines, + mode=mode, + allow_tools=allow_tools, + state=state, + ) + done.set() + + thread = threading.Thread(target=worker, daemon=True) + thread.start() + if not done.wait(2.0): + send_msg(token, room, "Thinking…") + heartbeat = max(10, THINKING_INTERVAL_SEC) + next_heartbeat = time.monotonic() + heartbeat + while not done.wait(max(0, next_heartbeat - time.monotonic())): + send_msg(token, room, state.status_line()) + next_heartbeat += heartbeat + thread.join(timeout=1) + return result["reply"] or "Model backend is busy. Try again in a moment." def sync_loop(token: str, room_id: str): since = None @@ -569,7 +4872,11 @@ def sync_loop(token: str, room_id: str): if not (is_dm or mentioned): continue - # Only do live cluster/metrics introspection in DMs. + cleaned_body = _strip_bot_mention(body) + lower_body = cleaned_body.lower() + mode = _detect_mode_from_body(body, default="deep" if is_dm else "deep") + + # Only do live cluster introspection in DMs. allow_tools = is_dm promql = "" @@ -580,7 +4887,7 @@ def sync_loop(token: str, room_id: str): # Attempt to scope tools to the most likely workloads when hostnames are mentioned. 
targets: list[tuple[str, str]] = [] - for m in HOST_RE.finditer(body.lower()): + for m in HOST_RE.finditer(lower_body): host = m.group(1).lower() for ep in _HOST_INDEX.get(host, []): backend = ep.get("backend") or {} @@ -589,14 +4896,60 @@ def sync_loop(token: str, room_id: str): if isinstance(w, dict) and w.get("name"): targets.append((ns, str(w["name"]))) - context = build_context(body, allow_tools=allow_tools, targets=targets) + snapshot = _snapshot_state() + inventory = node_inventory_for_prompt(cleaned_body) + if not inventory: + inventory = _snapshot_inventory(snapshot) + workloads = _snapshot_workloads(snapshot) + history_cluster = _history_mentions_cluster( + history[hist_key], + inventory=inventory, + workloads=workloads, + ) + followup = _is_followup_query(cleaned_body) + cleaned_q = normalize_query(cleaned_body) + cluster_affinity = _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads) + subjective = _is_subjective_query(cleaned_body) + followup_affinity = any(word in cleaned_q for word in METRIC_HINT_WORDS) + contextual = history_cluster and (followup or followup_affinity) + cluster_query = cluster_affinity or contextual + context = "" + if cluster_query: + context = build_context( + cleaned_body, + allow_tools=allow_tools, + targets=targets, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + ) if allow_tools and promql: res = vm_query(promql, timeout=20) rendered = vm_render_result(res, limit=15) or "(no results)" extra = "VictoriaMetrics (PromQL result):\n" + rendered - context = (context + "\n\n" + extra).strip() if context else extra - reply = ollama_reply(hist_key, body, context=context) + send_msg(token, rid, extra) + continue + if cluster_query: + reply = open_ended_with_thinking( + token, + rid, + cleaned_body, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + history_lines=history[hist_key], + mode=mode if mode in ("fast", "deep") else "deep", + allow_tools=allow_tools, + ) + else: + reply = _non_cluster_reply( + cleaned_body, + history_lines=history[hist_key], + mode=mode if mode in ("fast", "deep") else "deep", + ) send_msg(token, rid, reply) + history[hist_key].append(f"Atlas: {reply}") + history[hist_key] = history[hist_key][-80:] def login_with_retry(): last_err = None @@ -610,6 +4963,7 @@ def login_with_retry(): def main(): load_kb() + _start_http_server() token = login_with_retry() try: room_id = resolve_alias(token, ROOM_ALIAS) diff --git a/services/comms/secretproviderclass.yaml b/services/comms/secretproviderclass.yaml index 69d4b2b..0a89552 100644 --- a/services/comms/secretproviderclass.yaml +++ b/services/comms/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "comms" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/comms" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/crypto/xmr-miner/secretproviderclass.yaml b/services/crypto/xmr-miner/secretproviderclass.yaml index a72097f..12e4ba1 100644 --- a/services/crypto/xmr-miner/secretproviderclass.yaml +++ b/services/crypto/xmr-miner/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "crypto" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/crypto" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/finance/actual-budget-deployment.yaml 
b/services/finance/actual-budget-deployment.yaml index 55186b2..637e9ae 100644 --- a/services/finance/actual-budget-deployment.yaml +++ b/services/finance/actual-budget-deployment.yaml @@ -90,6 +90,8 @@ spec: value: openid - name: ACTUAL_MULTIUSER value: "true" + - name: ACTUAL_USER_CREATION_MODE + value: login - name: ACTUAL_OPENID_DISCOVERY_URL value: https://sso.bstein.dev/realms/atlas - name: ACTUAL_OPENID_AUTHORIZATION_ENDPOINT @@ -128,6 +130,8 @@ spec: value: openid - name: ACTUAL_MULTIUSER value: "true" + - name: ACTUAL_USER_CREATION_MODE + value: login - name: ACTUAL_OPENID_DISCOVERY_URL value: https://sso.bstein.dev/realms/atlas - name: ACTUAL_OPENID_AUTHORIZATION_ENDPOINT diff --git a/services/finance/firefly-cronjob.yaml b/services/finance/firefly-cronjob.yaml index 6c4d507..9e5c852 100644 --- a/services/finance/firefly-cronjob.yaml +++ b/services/finance/firefly-cronjob.yaml @@ -6,6 +6,7 @@ metadata: namespace: finance spec: schedule: "0 3 * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 diff --git a/services/finance/kustomization.yaml b/services/finance/kustomization.yaml index e4c414f..1559f5c 100644 --- a/services/finance/kustomization.yaml +++ b/services/finance/kustomization.yaml @@ -9,7 +9,7 @@ resources: - finance-secrets-ensure-rbac.yaml - actual-budget-data-pvc.yaml - firefly-storage-pvc.yaml - - finance-secrets-ensure-job.yaml + - oneoffs/finance-secrets-ensure-job.yaml - actual-budget-deployment.yaml - firefly-deployment.yaml - firefly-user-sync-cronjob.yaml diff --git a/services/finance/finance-secrets-ensure-job.yaml b/services/finance/oneoffs/finance-secrets-ensure-job.yaml similarity index 83% rename from services/finance/finance-secrets-ensure-job.yaml rename to services/finance/oneoffs/finance-secrets-ensure-job.yaml index 67f06cb..e8c8f58 100644 --- a/services/finance/finance-secrets-ensure-job.yaml +++ b/services/finance/oneoffs/finance-secrets-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/finance/finance-secrets-ensure-job.yaml +# services/finance/oneoffs/finance-secrets-ensure-job.yaml +# One-off job for finance/finance-secrets-ensure-5. +# Purpose: finance secrets ensure 5 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
apiVersion: batch/v1 kind: Job metadata: name: finance-secrets-ensure-5 namespace: finance spec: + suspend: true backoffLimit: 1 ttlSecondsAfterFinished: 3600 template: diff --git a/services/finance/portal-rbac.yaml b/services/finance/portal-rbac.yaml index 2fb7ede..66eafea 100644 --- a/services/finance/portal-rbac.yaml +++ b/services/finance/portal-rbac.yaml @@ -29,3 +29,17 @@ subjects: - kind: ServiceAccount name: bstein-dev-home namespace: bstein-dev-home +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: ariadne-firefly-user-sync + namespace: finance +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: bstein-dev-home-firefly-user-sync +subjects: + - kind: ServiceAccount + name: ariadne + namespace: maintenance diff --git a/services/gitea/deployment.yaml b/services/gitea/deployment.yaml index 9dc0c87..da188c3 100644 --- a/services/gitea/deployment.yaml +++ b/services/gitea/deployment.yaml @@ -169,6 +169,8 @@ spec: value: "trace" - name: GITEA__service__REQUIRE_SIGNIN_VIEW value: "false" + - name: GITEA__webhook__ALLOWED_HOST_LIST + value: "ci.bstein.dev" - name: GITEA__server__PROXY_HEADERS value: "X-Forwarded-For, X-Forwarded-Proto, X-Forwarded-Host" - name: GITEA__session__COOKIE_SECURE diff --git a/services/harbor/helmrelease.yaml b/services/harbor/helmrelease.yaml index b0cbdbd..16b81a8 100644 --- a/services/harbor/helmrelease.yaml +++ b/services/harbor/helmrelease.yaml @@ -391,6 +391,16 @@ spec: $patch: delete - name: core-writable emptyDir: {} + - target: + kind: Ingress + name: harbor-ingress + patch: |- + - op: replace + path: /spec/rules/0/http/paths/2/backend/service/name + value: harbor-registry + - op: replace + path: /spec/rules/0/http/paths/2/backend/service/port/number + value: 5000 - target: kind: Deployment name: harbor-jobservice diff --git a/services/harbor/secretproviderclass.yaml b/services/harbor/secretproviderclass.yaml index 03fef95..636f6fa 100644 --- a/services/harbor/secretproviderclass.yaml +++ b/services/harbor/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "harbor" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/harbor" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/health/portal-rbac.yaml b/services/health/portal-rbac.yaml index cd9acd1..feb7441 100644 --- a/services/health/portal-rbac.yaml +++ b/services/health/portal-rbac.yaml @@ -8,7 +8,7 @@ rules: - apiGroups: ["batch"] resources: ["cronjobs"] verbs: ["get"] - resourceNames: ["wger-user-sync"] + resourceNames: ["wger-user-sync", "wger-admin-ensure"] - apiGroups: ["batch"] resources: ["jobs"] verbs: ["create", "get", "list", "watch"] @@ -29,3 +29,17 @@ subjects: - kind: ServiceAccount name: bstein-dev-home namespace: bstein-dev-home +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: ariadne-wger-user-sync + namespace: health +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: bstein-dev-home-wger-user-sync +subjects: + - kind: ServiceAccount + name: ariadne + namespace: maintenance diff --git a/services/health/wger-admin-ensure-cronjob.yaml b/services/health/wger-admin-ensure-cronjob.yaml index db178a3..a1063dd 100644 --- a/services/health/wger-admin-ensure-cronjob.yaml +++ b/services/health/wger-admin-ensure-cronjob.yaml @@ -8,6 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "15 3 * * *" + suspend: true concurrencyPolicy: Forbid 
successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 diff --git a/services/jenkins/cache-pvc.yaml b/services/jenkins/cache-pvc.yaml new file mode 100644 index 0000000..a9ed319 --- /dev/null +++ b/services/jenkins/cache-pvc.yaml @@ -0,0 +1,13 @@ +# services/jenkins/cache-pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: jenkins-cache-v2 + namespace: jenkins +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 20Gi + storageClassName: astreae diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index ac26350..c2144fa 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -18,7 +18,7 @@ data: logoutFromOpenIdProvider: true postLogoutRedirectUrl: "https://ci.bstein.dev" sendScopesInTokenRequest: true - rootURLFromRequest: true + rootURLFromRequest: false userNameField: "preferred_username" fullNameFieldName: "name" emailFieldName: "email" @@ -49,8 +49,15 @@ data: jobs: - script: | pipelineJob('harbor-arm-build') { - triggers { - scm('H/5 * * * *') + properties { + pipelineTriggers { + triggers { + scmTrigger { + scmpoll_spec('H/5 * * * *') + ignorePostCommitHooks(false) + } + } + } } definition { cpsScm { @@ -83,8 +90,15 @@ data: } } pipelineJob('ci-demo') { - triggers { - scm('H/1 * * * *') + properties { + pipelineTriggers { + triggers { + scmTrigger { + scmpoll_spec('H/1 * * * *') + ignorePostCommitHooks(false) + } + } + } } definition { cpsScm { @@ -102,8 +116,15 @@ data: } } pipelineJob('bstein-dev-home') { - triggers { - scm('H/2 * * * *') + properties { + pipelineTriggers { + triggers { + scmTrigger { + scmpoll_spec('H/2 * * * *') + ignorePostCommitHooks(false) + } + } + } } definition { cpsScm { @@ -120,9 +141,42 @@ data: } } } + pipelineJob('ariadne') { + properties { + pipelineTriggers { + triggers { + scmTrigger { + scmpoll_spec('H/2 * * * *') + ignorePostCommitHooks(false) + } + } + } + } + definition { + cpsScm { + scm { + git { + remote { + url('https://scm.bstein.dev/bstein/ariadne.git') + credentials('gitea-pat') + } + branches('*/master') + } + } + scriptPath('Jenkinsfile') + } + } + } pipelineJob('data-prepper') { - triggers { - scm('H/5 * * * *') + properties { + pipelineTriggers { + triggers { + scmTrigger { + scmpoll_spec('H/5 * * * *') + ignorePostCommitHooks(false) + } + } + } } definition { cpsScm { @@ -139,24 +193,39 @@ data: } } } - pipelineJob('titan-iac-quality-gate') { - triggers { - scm('H/5 * * * *') - } - definition { - cpsScm { - scm { + multibranchPipelineJob('titan-iac-quality-gate') { + branchSources { + branchSource { + source { git { - remote { - url('https://scm.bstein.dev/bstein/titan-iac.git') - credentials('gitea-pat') - } - branches('*/feature/vault-consumption') + id('titan-iac-quality-gate') + remote('https://scm.bstein.dev/bstein/titan-iac.git') + credentialsId('gitea-pat') } } + } + } + factory { + workflowBranchProjectFactory { scriptPath('ci/Jenkinsfile.titan-iac') } } + orphanedItemStrategy { + discardOldItems { + numToKeep(30) + } + } + triggers { + periodicFolderTrigger { + interval('12h') + } + } + configure { node -> + def webhookToken = System.getenv('TITAN_IAC_WEBHOOK_TOKEN') ?: '' + def triggers = node / 'triggers' + def webhook = triggers.appendNode('com.igalg.jenkins.plugins.mswt.trigger.ComputedFolderWebHookTrigger') + webhook.appendNode('token', webhookToken) + } } base.yaml: | jenkins: @@ -189,6 +258,11 @@ data: templates: - name: "default" namespace: "jenkins" + workspaceVolume: + dynamicPVC: + 
accessModes: "ReadWriteOnce" + requestsSize: "20Gi" + storageClassName: "astreae" containers: - name: "jnlp" args: "^${computer.jnlpmac} ^${computer.name}" @@ -217,3 +291,6 @@ data: crumbIssuer: standard: excludeClientIPFromCrumb: true + unclassified: + location: + url: "https://ci.bstein.dev/" diff --git a/services/jenkins/configmap-plugins.yaml b/services/jenkins/configmap-plugins.yaml index eabea13..1c43cfb 100644 --- a/services/jenkins/configmap-plugins.yaml +++ b/services/jenkins/configmap-plugins.yaml @@ -6,12 +6,17 @@ metadata: namespace: jenkins data: plugins.txt: | - kubernetes - workflow-aggregator - git - pipeline-utility-steps - configuration-as-code - configuration-as-code-support - oic-auth - job-dsl - simple-theme-plugin + kubernetes:4416.v2ea_b_5372da_a_e + workflow-aggregator:608.v67378e9d3db_1 + git:5.8.1 + pipeline-utility-steps:2.20.0 + configuration-as-code:2031.veb_a_fdda_b_3ffd + oic-auth:4.609.v9de140f63d01 + job-dsl:1.93 + simple-theme-plugin:230.v8b_fd91b_b_800c + workflow-multibranch:821.vc3b_4ea_780798 + branch-api:2.1268.v044a_87612da_8 + scm-api:724.v7d839074eb_5c + gitea:268.v75e47974c01d + gitea-checks:603.621.vc708da_fb_371d + multibranch-scan-webhook-trigger:1.0.11 diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index e846a8e..63f722b 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -22,23 +22,33 @@ spec: vault.hashicorp.com/role: "jenkins" vault.hashicorp.com/agent-inject-secret-jenkins-env: "kv/data/atlas/jenkins/jenkins-oidc" vault.hashicorp.com/agent-inject-template-jenkins-env: | - {{- with secret "kv/data/atlas/jenkins/jenkins-oidc" -}} + {{ with secret "kv/data/atlas/jenkins/jenkins-oidc" }} OIDC_CLIENT_ID={{ .Data.data.clientId }} OIDC_CLIENT_SECRET={{ .Data.data.clientSecret }} OIDC_AUTH_URL={{ .Data.data.authorizationUrl }} OIDC_TOKEN_URL={{ .Data.data.tokenUrl }} OIDC_USERINFO_URL={{ .Data.data.userInfoUrl }} OIDC_LOGOUT_URL={{ .Data.data.logoutUrl }} - {{- end }} - {{- with secret "kv/data/atlas/jenkins/harbor-robot-creds" -}} + {{ end }} + {{ with secret "kv/data/atlas/jenkins/harbor-robot-creds" }} + HARBOR_ROBOT_USERNAME={{ .Data.data.username }} + HARBOR_ROBOT_PASSWORD={{ .Data.data.password }} + {{ end }} + {{ with secret "kv/data/atlas/shared/harbor-pull" }} + {{- if and .Data.data.username .Data.data.password }} HARBOR_ROBOT_USERNAME={{ .Data.data.username }} HARBOR_ROBOT_PASSWORD={{ .Data.data.password }} {{- end }} - {{- with secret "kv/data/atlas/jenkins/gitea-pat" -}} + {{ end }} + {{ with secret "kv/data/atlas/jenkins/gitea-pat" }} GITEA_PAT_USERNAME={{ .Data.data.username }} GITEA_PAT_TOKEN={{ .Data.data.token }} - {{- end -}} - bstein.dev/restarted-at: "2026-01-19T00:25:00Z" + {{ end }} + {{ with secret "kv/data/atlas/jenkins/webhook-tokens" }} + TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }} + GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME={{ .Data.data.git_notify_bstein_dev_home }} + {{ end }} + bstein.dev/restarted-at: "2026-01-20T14:52:41Z" spec: serviceAccountName: jenkins nodeSelector: @@ -98,7 +108,9 @@ spec: containerPort: 50000 env: - name: JAVA_OPTS - value: "-Xms512m -Xmx2048m" + value: "-Xms512m -Xmx2048m -Duser.timezone=America/Chicago" + - name: TZ + value: "America/Chicago" - name: JENKINS_OPTS value: "--webroot=/var/jenkins_cache/war" - name: JENKINS_SLAVE_AGENT_PORT @@ -148,6 +160,8 @@ spec: mountPath: /config/jcasc - name: init-scripts mountPath: /usr/share/jenkins/ref/init.groovy.d + - name: init-scripts + mountPath: 
/var/jenkins_home/init.groovy.d - name: plugin-dir mountPath: /usr/share/jenkins/ref/plugins - name: tmp @@ -157,9 +171,11 @@ spec: persistentVolumeClaim: claimName: jenkins - name: jenkins-cache - emptyDir: {} + persistentVolumeClaim: + claimName: jenkins-cache-v2 - name: plugin-dir - emptyDir: {} + persistentVolumeClaim: + claimName: jenkins-plugins-v2 - name: plugins configMap: name: jenkins-plugins @@ -170,4 +186,5 @@ spec: configMap: name: jenkins-init-scripts - name: tmp - emptyDir: {} + emptyDir: + medium: Memory diff --git a/services/jenkins/kustomization.yaml b/services/jenkins/kustomization.yaml index acb6fb4..df51968 100644 --- a/services/jenkins/kustomization.yaml +++ b/services/jenkins/kustomization.yaml @@ -5,9 +5,14 @@ namespace: jenkins resources: - namespace.yaml - serviceaccount.yaml + - vault-serviceaccount.yaml - pvc.yaml + - cache-pvc.yaml + - plugins-pvc.yaml - configmap-jcasc.yaml - configmap-plugins.yaml + - secretproviderclass.yaml + - vault-sync-deployment.yaml - deployment.yaml - service.yaml - ingress.yaml @@ -16,6 +21,7 @@ configMapGenerator: - name: jenkins-init-scripts namespace: jenkins files: + - git-notify-token.groovy=scripts/git-notify-token.groovy - theme.groovy=scripts/theme.groovy options: disableNameSuffixHash: true diff --git a/services/jenkins/plugins-pvc.yaml b/services/jenkins/plugins-pvc.yaml new file mode 100644 index 0000000..06715eb --- /dev/null +++ b/services/jenkins/plugins-pvc.yaml @@ -0,0 +1,13 @@ +# services/jenkins/plugins-pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: jenkins-plugins-v2 + namespace: jenkins +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi + storageClassName: astreae diff --git a/services/jenkins/scripts/git-notify-token.groovy b/services/jenkins/scripts/git-notify-token.groovy new file mode 100644 index 0000000..336c918 --- /dev/null +++ b/services/jenkins/scripts/git-notify-token.groovy @@ -0,0 +1,41 @@ +import hudson.plugins.git.ApiTokenPropertyConfiguration +import hudson.Util +import java.nio.charset.StandardCharsets +import java.security.MessageDigest + + +def entries = [ + [env: 'GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME', name: 'gitea-bstein-dev-home'], +] + +entries.each { entry -> + def token = System.getenv(entry.env) + if (!token || token.trim().isEmpty()) { + println("Git notifyCommit token ${entry.env} missing; skipping") + return + } + + try { + def config = ApiTokenPropertyConfiguration.get() + if (config.hasMatchingApiToken(token)) { + println("Git notifyCommit token ${entry.name} already configured") + return + } + + def digest = MessageDigest.getInstance("SHA-256") + def hash = Util.toHexString(digest.digest(token.getBytes(StandardCharsets.US_ASCII))) + + def field = ApiTokenPropertyConfiguration.class.getDeclaredField("apiTokens") + field.setAccessible(true) + def tokens = field.get(config) + + def ctor = ApiTokenPropertyConfiguration.HashedApiToken.class.getDeclaredConstructor(String.class, String.class) + ctor.setAccessible(true) + tokens.add(ctor.newInstance(entry.name, hash)) + config.save() + + println("Added git notifyCommit access token ${entry.name}") + } catch (Throwable e) { + println("Failed to configure git notifyCommit token ${entry.name}: ${e.class.simpleName}: ${e.message}") + } +} diff --git a/services/jenkins/scripts/theme.groovy b/services/jenkins/scripts/theme.groovy index cf171f7..58755c0 100644 --- a/services/jenkins/scripts/theme.groovy +++ b/services/jenkins/scripts/theme.groovy @@ -1,15 +1,137 @@ import jenkins.model.Jenkins 
import org.codefirst.SimpleThemeDecorator +import org.jenkinsci.plugins.simpletheme.CssTextThemeElement def instance = Jenkins.get() def decorators = instance.getExtensionList(SimpleThemeDecorator.class) if (decorators?.size() > 0) { def theme = decorators[0] - theme.setCssUrl("https://jenkins-contrib-themes.github.io/jenkins-material-theme/dist/material-ocean.css") + def cssRules = """ +:root, +.app-theme-picker__picker[data-theme=none] { + --background: #0f1216 !important; + --header-background: #141922 !important; + --header-border: #2b313b !important; + --white: #141922 !important; + --black: #e6e9ef !important; + --very-light-grey: #171b21 !important; + --light-grey: #202734 !important; + --medium-grey: #2b313b !important; + --dark-grey: #0b0f14 !important; + --text-color: #e6e9ef !important; + --text-color-secondary: #a6adba !important; + --card-background: #171b21 !important; + --card-border-color: #2b313b !important; + --pane-header-bg: #1f252d !important; + --pane-header-border-color: #2b313b !important; + --pane-border-color: #2b313b !important; + --pane-text-color: #e6e9ef !important; + --pane-header-text-color: #e6e9ef !important; + --link-color: #8fb7ff !important; + --link-color--hover: #b0ccff !important; + --link-dark-color: #e6e9ef !important; + --link-dark-color--hover: #b0ccff !important; + --input-color: #151a20 !important; + --input-border: #2b313b !important; + --input-border-hover: #3a424d !important; + --button-background: #232a33 !important; + --button-background--hover: #2b313b !important; + --button-background--active: #323b46 !important; + --item-background--hover: #232a33 !important; + --item-background--active: #2b313b !important; + --accent-color: #8fb7ff !important; +} + +body, +#page-body, +#page-header, +#header, +#main-panel, +#main-panel-content, +#side-panel, +.top-sticker-inner, +.bottom-sticker-inner, +#breadcrumbBar, +#breadcrumbs { + background-color: var(--background) !important; + color: var(--text-color) !important; +} + +.jenkins-card, +.jenkins-section, +.jenkins-section__item, +#main-panel .jenkins-card, +#main-panel .jenkins-section { + background-color: var(--card-background) !important; + color: var(--text-color) !important; + border-color: var(--card-border-color) !important; +} + +table.pane, +table.pane td, +table.pane th, +#projectstatus td, +#projectstatus th { + background-color: var(--card-background) !important; + color: var(--text-color) !important; +} + +table.pane tr:nth-child(even) td, +#projectstatus tr:hover td { + background-color: #1f252d !important; +} + +input, +select, +textarea, +#search-box { + background-color: #151a20 !important; + color: var(--text-color) !important; + border-color: var(--input-border) !important; +} + +a, +a:visited, +a:link { + color: var(--link-color) !important; +} + +a:hover { + opacity: 0.85; +} + +#side-panel .task-link, +#breadcrumbs a, +#breadcrumbs, +#projectstatus th a { + color: var(--text-color-secondary) !important; +} + +.console-output, +.console-output pre, +pre, +code, +.CodeMirror { + background-color: #0c0f14 !important; + color: #d9dee7 !important; +} + +#footer { + background-color: var(--background) !important; + color: var(--text-color-secondary) !important; +} + +.jenkins_ver:after { + content: "atlas dark"; +} +""".stripIndent().trim() + + theme.setElements([new CssTextThemeElement(cssRules)]) + theme.setCssUrl("") + theme.setCssRules(cssRules) theme.setJsUrl("") - theme.setTheme("") - instance.save() + theme.save() println("Applied simple-theme-plugin dark theme") } else { 
println("simple-theme-plugin not installed; skipping theme configuration") diff --git a/services/jenkins/secretproviderclass.yaml b/services/jenkins/secretproviderclass.yaml new file mode 100644 index 0000000..a9d9dd5 --- /dev/null +++ b/services/jenkins/secretproviderclass.yaml @@ -0,0 +1,21 @@ +# services/jenkins/secretproviderclass.yaml +apiVersion: secrets-store.csi.x-k8s.io/v1 +kind: SecretProviderClass +metadata: + name: jenkins-vault + namespace: jenkins +spec: + provider: vault + parameters: + vaultAddress: "http://vault.vault.svc.cluster.local:8200" + roleName: "jenkins" + objects: | + - objectName: "harbor-pull__dockerconfigjson" + secretPath: "kv/data/atlas/shared/harbor-pull" + secretKey: "dockerconfigjson" + secretObjects: + - secretName: harbor-bstein-robot + type: kubernetes.io/dockerconfigjson + data: + - objectName: harbor-pull__dockerconfigjson + key: .dockerconfigjson diff --git a/services/jenkins/vault-serviceaccount.yaml b/services/jenkins/vault-serviceaccount.yaml new file mode 100644 index 0000000..8d31400 --- /dev/null +++ b/services/jenkins/vault-serviceaccount.yaml @@ -0,0 +1,6 @@ +# services/jenkins/vault-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: jenkins-vault-sync + namespace: jenkins diff --git a/services/jenkins/vault-sync-deployment.yaml b/services/jenkins/vault-sync-deployment.yaml new file mode 100644 index 0000000..6abcace --- /dev/null +++ b/services/jenkins/vault-sync-deployment.yaml @@ -0,0 +1,37 @@ +# services/jenkins/vault-sync-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: jenkins-vault-sync + namespace: jenkins +spec: + replicas: 1 + selector: + matchLabels: + app: jenkins-vault-sync + template: + metadata: + labels: + app: jenkins-vault-sync + spec: + serviceAccountName: jenkins-vault-sync + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" + containers: + - name: sync + image: alpine:3.20 + command: ["/bin/sh", "-c"] + args: + - "sleep infinity" + volumeMounts: + - name: vault-secrets + mountPath: /vault/secrets + readOnly: true + volumes: + - name: vault-secrets + csi: + driver: secrets-store.csi.k8s.io + readOnly: true + volumeAttributes: + secretProviderClass: jenkins-vault diff --git a/services/keycloak/deployment.yaml b/services/keycloak/deployment.yaml index 3d241c9..131169d 100644 --- a/services/keycloak/deployment.yaml +++ b/services/keycloak/deployment.yaml @@ -126,7 +126,7 @@ spec: - name: KC_EVENTS_LISTENERS value: jboss-logging,mailu-http - name: KC_SPI_EVENTS_LISTENER_MAILU-HTTP_ENDPOINT - value: http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events + value: http://ariadne.maintenance.svc.cluster.local/events ports: - containerPort: 8080 name: http diff --git a/services/keycloak/kustomization.yaml b/services/keycloak/kustomization.yaml index 6030a82..6027891 100644 --- a/services/keycloak/kustomization.yaml +++ b/services/keycloak/kustomization.yaml @@ -10,21 +10,21 @@ resources: - secretproviderclass.yaml - vault-sync-deployment.yaml - deployment.yaml - - realm-settings-job.yaml - - portal-admin-client-secret-ensure-job.yaml - - portal-e2e-client-job.yaml - - portal-e2e-target-client-job.yaml - - portal-e2e-token-exchange-permissions-job.yaml - - portal-e2e-token-exchange-test-job.yaml - - portal-e2e-execute-actions-email-test-job.yaml - - ldap-federation-job.yaml - - user-overrides-job.yaml - - mas-secrets-ensure-job.yaml - - synapse-oidc-secret-ensure-job.yaml - - logs-oidc-secret-ensure-job.yaml - - 
harbor-oidc-secret-ensure-job.yaml - - vault-oidc-secret-ensure-job.yaml - - actual-oidc-secret-ensure-job.yaml + - oneoffs/realm-settings-job.yaml + - oneoffs/portal-admin-client-secret-ensure-job.yaml + - oneoffs/portal-e2e-client-job.yaml + - oneoffs/portal-e2e-target-client-job.yaml + - oneoffs/portal-e2e-token-exchange-permissions-job.yaml + - oneoffs/portal-e2e-token-exchange-test-job.yaml + - oneoffs/portal-e2e-execute-actions-email-test-job.yaml + - oneoffs/ldap-federation-job.yaml + - oneoffs/user-overrides-job.yaml + - oneoffs/mas-secrets-ensure-job.yaml + - oneoffs/synapse-oidc-secret-ensure-job.yaml + - oneoffs/logs-oidc-secret-ensure-job.yaml + - oneoffs/harbor-oidc-secret-ensure-job.yaml + - oneoffs/vault-oidc-secret-ensure-job.yaml + - oneoffs/actual-oidc-secret-ensure-job.yaml - service.yaml - ingress.yaml generatorOptions: diff --git a/services/keycloak/actual-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml similarity index 83% rename from services/keycloak/actual-oidc-secret-ensure-job.yaml rename to services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml index 3dadb52..d4da1f1 100644 --- a/services/keycloak/actual-oidc-secret-ensure-job.yaml +++ b/services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/actual-oidc-secret-ensure-job.yaml +# services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml +# One-off job for sso/actual-oidc-secret-ensure-3. +# Purpose: actual oidc secret ensure 3 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: actual-oidc-secret-ensure-3 namespace: sso spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/harbor-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml similarity index 81% rename from services/keycloak/harbor-oidc-secret-ensure-job.yaml rename to services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml index 8eac50d..c368241 100644 --- a/services/keycloak/harbor-oidc-secret-ensure-job.yaml +++ b/services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/harbor-oidc-secret-ensure-job.yaml +# services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml +# One-off job for sso/harbor-oidc-secret-ensure-10. +# Purpose: harbor oidc secret ensure 10 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: - name: harbor-oidc-secret-ensure-9 + name: harbor-oidc-secret-ensure-10 namespace: sso spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/ldap-federation-job.yaml b/services/keycloak/oneoffs/ldap-federation-job.yaml similarity index 86% rename from services/keycloak/ldap-federation-job.yaml rename to services/keycloak/oneoffs/ldap-federation-job.yaml index 303fd9f..9e9a5f9 100644 --- a/services/keycloak/ldap-federation-job.yaml +++ b/services/keycloak/oneoffs/ldap-federation-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/ldap-federation-job.yaml +# services/keycloak/oneoffs/ldap-federation-job.yaml +# One-off job for sso/keycloak-ldap-federation-12. 
+# Purpose: keycloak ldap federation 12 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: - name: keycloak-ldap-federation-11 + name: keycloak-ldap-federation-12 namespace: sso spec: + suspend: true backoffLimit: 2 template: metadata: @@ -325,6 +330,54 @@ spec: if status not in (201, 204): raise SystemExit(f"Unexpected group mapper create status: {status}") + def ensure_user_attr_mapper(name: str, ldap_attr: str, user_attr: str): + mapper = None + for c in components: + if c.get("name") == name and c.get("parentId") == ldap_component_id: + mapper = c + break + + payload = { + "name": name, + "providerId": "user-attribute-ldap-mapper", + "providerType": "org.keycloak.storage.ldap.mappers.LDAPStorageMapper", + "parentId": ldap_component_id, + "config": { + "ldap.attribute": [ldap_attr], + "user.model.attribute": [user_attr], + "read.only": ["false"], + "always.read.value.from.ldap": ["false"], + "is.mandatory.in.ldap": ["false"], + }, + } + + if mapper: + payload["id"] = mapper["id"] + payload["parentId"] = mapper.get("parentId", payload["parentId"]) + print(f"Updating LDAP user mapper: {payload['id']} ({name})") + status, _, _ = http_json( + "PUT", + f"{base_url}/admin/realms/{realm}/components/{payload['id']}", + token, + payload, + ) + if status not in (200, 204): + raise SystemExit(f"Unexpected user mapper update status for {name}: {status}") + else: + print(f"Creating LDAP user mapper: {name}") + status, _, _ = http_json( + "POST", + f"{base_url}/admin/realms/{realm}/components", + token, + payload, + ) + if status not in (201, 204): + raise SystemExit(f"Unexpected user mapper create status for {name}: {status}") + + ensure_user_attr_mapper("openldap-email", "mail", "email") + ensure_user_attr_mapper("openldap-first-name", "givenName", "firstName") + ensure_user_attr_mapper("openldap-last-name", "sn", "lastName") + # Cleanup duplicate LDAP federation providers and their child components (mappers, etc). # Keep only the canonical provider we updated/created above. try: diff --git a/services/keycloak/logs-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml similarity index 94% rename from services/keycloak/logs-oidc-secret-ensure-job.yaml rename to services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml index 14e80df..bce9e5b 100644 --- a/services/keycloak/logs-oidc-secret-ensure-job.yaml +++ b/services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/logs-oidc-secret-ensure-job.yaml +# services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml +# One-off job for sso/logs-oidc-secret-ensure-10. +# Purpose: logs oidc secret ensure 10 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
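A minimal sketch of that resume-and-rerun flow for the one-off named above (the namespace and Job name come from the manifest; the Flux Kustomization name "keycloak" and the exact CLI access are assumptions):
  # flip spec.suspend to false in the manifest, commit, then force a reconcile of the owning Kustomization
  flux reconcile kustomization keycloak --with-source -n flux-system
  # watch the one-off run to completion; Job spec.suspend is mutable, so Flux can start it in place
  kubectl -n sso get job logs-oidc-secret-ensure-10 -w
  # afterwards set spec.suspend back to true in git so the next apply leaves the finished Job alone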
apiVersion: batch/v1 kind: Job metadata: name: logs-oidc-secret-ensure-10 namespace: sso spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/mas-secrets-ensure-job.yaml b/services/keycloak/oneoffs/mas-secrets-ensure-job.yaml similarity index 95% rename from services/keycloak/mas-secrets-ensure-job.yaml rename to services/keycloak/oneoffs/mas-secrets-ensure-job.yaml index 24c9e04..c3bd1be 100644 --- a/services/keycloak/mas-secrets-ensure-job.yaml +++ b/services/keycloak/oneoffs/mas-secrets-ensure-job.yaml @@ -1,4 +1,8 @@ -# services/keycloak/mas-secrets-ensure-job.yaml +# services/keycloak/oneoffs/mas-secrets-ensure-job.yaml +# One-off job for sso/mas-secrets-ensure. +# Purpose: mas secrets ensure (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: v1 kind: ServiceAccount metadata: @@ -13,6 +17,7 @@ metadata: name: mas-secrets-ensure-21 namespace: sso spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/portal-admin-client-secret-ensure-job.yaml b/services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml similarity index 96% rename from services/keycloak/portal-admin-client-secret-ensure-job.yaml rename to services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml index 90dd4b7..1d3e7f3 100644 --- a/services/keycloak/portal-admin-client-secret-ensure-job.yaml +++ b/services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/portal-admin-client-secret-ensure-job.yaml +# services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml +# One-off job for sso/keycloak-portal-admin-secret-ensure-4. +# Purpose: keycloak portal admin secret ensure 4 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-portal-admin-secret-ensure-4 namespace: sso spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/keycloak/portal-e2e-client-job.yaml b/services/keycloak/oneoffs/portal-e2e-client-job.yaml similarity index 97% rename from services/keycloak/portal-e2e-client-job.yaml rename to services/keycloak/oneoffs/portal-e2e-client-job.yaml index 4e0c006..274dd27 100644 --- a/services/keycloak/portal-e2e-client-job.yaml +++ b/services/keycloak/oneoffs/portal-e2e-client-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/portal-e2e-client-job.yaml +# services/keycloak/oneoffs/portal-e2e-client-job.yaml +# One-off job for sso/keycloak-portal-e2e-client-8. +# Purpose: keycloak portal e2e client 8 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
apiVersion: batch/v1 kind: Job metadata: name: keycloak-portal-e2e-client-8 namespace: sso spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/keycloak/portal-e2e-execute-actions-email-test-job.yaml b/services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml similarity index 89% rename from services/keycloak/portal-e2e-execute-actions-email-test-job.yaml rename to services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml index 35f79a6..518d839 100644 --- a/services/keycloak/portal-e2e-execute-actions-email-test-job.yaml +++ b/services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/portal-e2e-execute-actions-email-test-job.yaml +# services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml +# One-off job for sso/keycloak-portal-e2e-execute-actions-email-14. +# Purpose: keycloak portal e2e execute actions email 14 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-portal-e2e-execute-actions-email-14 namespace: sso spec: + suspend: true backoffLimit: 3 template: metadata: diff --git a/services/keycloak/portal-e2e-target-client-job.yaml b/services/keycloak/oneoffs/portal-e2e-target-client-job.yaml similarity index 95% rename from services/keycloak/portal-e2e-target-client-job.yaml rename to services/keycloak/oneoffs/portal-e2e-target-client-job.yaml index 196b48b..900d029 100644 --- a/services/keycloak/portal-e2e-target-client-job.yaml +++ b/services/keycloak/oneoffs/portal-e2e-target-client-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/portal-e2e-target-client-job.yaml +# services/keycloak/oneoffs/portal-e2e-target-client-job.yaml +# One-off job for sso/keycloak-portal-e2e-target-7. +# Purpose: keycloak portal e2e target 7 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-portal-e2e-target-7 namespace: sso spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/keycloak/portal-e2e-token-exchange-permissions-job.yaml b/services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml similarity index 97% rename from services/keycloak/portal-e2e-token-exchange-permissions-job.yaml rename to services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml index 647b8f9..0d41b47 100644 --- a/services/keycloak/portal-e2e-token-exchange-permissions-job.yaml +++ b/services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/portal-e2e-token-exchange-permissions-job.yaml +# services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml +# One-off job for sso/keycloak-portal-e2e-token-exchange-permissions-11. +# Purpose: keycloak portal e2e token exchange permissions 11 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
apiVersion: batch/v1 kind: Job metadata: name: keycloak-portal-e2e-token-exchange-permissions-11 namespace: sso spec: + suspend: true backoffLimit: 6 template: metadata: diff --git a/services/keycloak/portal-e2e-token-exchange-test-job.yaml b/services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml similarity index 89% rename from services/keycloak/portal-e2e-token-exchange-test-job.yaml rename to services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml index edd7555..eb05e09 100644 --- a/services/keycloak/portal-e2e-token-exchange-test-job.yaml +++ b/services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/portal-e2e-token-exchange-test-job.yaml +# services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml +# One-off job for sso/keycloak-portal-e2e-token-exchange-test-7. +# Purpose: keycloak portal e2e token exchange test 7 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-portal-e2e-token-exchange-test-7 namespace: sso spec: + suspend: true backoffLimit: 6 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/oneoffs/realm-settings-job.yaml similarity index 78% rename from services/keycloak/realm-settings-job.yaml rename to services/keycloak/oneoffs/realm-settings-job.yaml index f680200..ea88d83 100644 --- a/services/keycloak/realm-settings-job.yaml +++ b/services/keycloak/oneoffs/realm-settings-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/realm-settings-job.yaml +# services/keycloak/oneoffs/realm-settings-job.yaml +# One-off job for sso/keycloak-realm-settings-36. +# Purpose: keycloak realm settings 36 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: - name: keycloak-realm-settings-32 + name: keycloak-realm-settings-36 namespace: sso spec: + suspend: true backoffLimit: 0 template: metadata: @@ -331,6 +336,9 @@ spec: # Ensure basic realm groups exist for provisioning. ensure_group("dev") ensure_group("admin") + ensure_group("demo") + ensure_group("test") + ensure_group("vaultwarden_grandfathered") planka_group = ensure_group("planka-users") if planka_group and planka_group.get("id"): @@ -467,6 +475,126 @@ spec: if status not in (201, 204): raise SystemExit(f"Unexpected protocol mapper create response: {status}") + # Ensure mailu_email overrides email claim for service clients. 
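+ # (the set below lists Keycloak's built-in realm clients; they are skipped so their email claim is never overridden)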
+ excluded_email_clients = { + "account", + "account-console", + "admin-cli", + "security-admin-console", + "realm-management", + "broker", + } + status, clients = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/clients", + access_token, + ) + if status == 200 and isinstance(clients, list): + for client in clients: + if not isinstance(client, dict): + continue + if client.get("protocol") != "openid-connect": + continue + client_name = client.get("clientId") if isinstance(client.get("clientId"), str) else "" + if not client_name or client_name in excluded_email_clients: + continue + client_id = client.get("id") + if not client_id: + continue + email_mapper = { + "name": "mailu-email", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": False, + "config": { + "user.attribute": "mailu_email", + "claim.name": "email", + "jsonType.label": "String", + "id.token.claim": "true", + "access.token.claim": "true", + "userinfo.token.claim": "true", + "multivalued": "false", + "aggregate.attrs": "false", + }, + } + status, mappers = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models", + access_token, + ) + existing = None + if status == 200 and isinstance(mappers, list): + for item in mappers: + if isinstance(item, dict) and item.get("name") == email_mapper["name"]: + existing = item + break + if existing and existing.get("id"): + email_mapper["id"] = existing["id"] + status, _ = http_json( + "PUT", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models/{existing['id']}", + access_token, + email_mapper, + ) + if status not in (200, 204): + raise SystemExit(f"Unexpected mailu email mapper update response: {status}") + else: + status, _ = http_json( + "POST", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models", + access_token, + email_mapper, + ) + if status not in (201, 204): + raise SystemExit(f"Unexpected mailu email mapper create response: {status}") + + mailu_claim_mapper = { + "name": "mailu-email-claim", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": False, + "config": { + "user.attribute": "mailu_email", + "claim.name": "mailu_email", + "jsonType.label": "String", + "id.token.claim": "true", + "access.token.claim": "true", + "userinfo.token.claim": "true", + "multivalued": "false", + "aggregate.attrs": "false", + }, + } + status, mappers = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models", + access_token, + ) + existing_claim = None + if status == 200 and isinstance(mappers, list): + for item in mappers: + if isinstance(item, dict) and item.get("name") == mailu_claim_mapper["name"]: + existing_claim = item + break + if existing_claim and existing_claim.get("id"): + mailu_claim_mapper["id"] = existing_claim["id"] + status, _ = http_json( + "PUT", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models/{existing_claim['id']}", + access_token, + mailu_claim_mapper, + ) + if status not in (200, 204): + raise SystemExit(f"Unexpected mailu email claim mapper update response: {status}") + else: + status, _ = http_json( + "POST", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models", + access_token, + mailu_claim_mapper, + ) + if status not in (201, 204): + raise SystemExit(f"Unexpected mailu email claim mapper create response: {status}") + # Ensure MFA is on by default for 
newly-created users. status, required_actions = http_json( "GET", diff --git a/services/keycloak/synapse-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml similarity index 92% rename from services/keycloak/synapse-oidc-secret-ensure-job.yaml rename to services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml index e808e7e..15b7a31 100644 --- a/services/keycloak/synapse-oidc-secret-ensure-job.yaml +++ b/services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/synapse-oidc-secret-ensure-job.yaml +# services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml +# One-off job for sso/synapse-oidc-secret-ensure-10. +# Purpose: synapse oidc secret ensure 10 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: synapse-oidc-secret-ensure-10 namespace: sso spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/user-overrides-job.yaml b/services/keycloak/oneoffs/user-overrides-job.yaml similarity index 96% rename from services/keycloak/user-overrides-job.yaml rename to services/keycloak/oneoffs/user-overrides-job.yaml index 7623c84..0d52d6d 100644 --- a/services/keycloak/user-overrides-job.yaml +++ b/services/keycloak/oneoffs/user-overrides-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/user-overrides-job.yaml +# services/keycloak/oneoffs/user-overrides-job.yaml +# One-off job for sso/keycloak-user-overrides-9. +# Purpose: keycloak user overrides 9 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-user-overrides-9 namespace: sso spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/keycloak/vault-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml similarity index 83% rename from services/keycloak/vault-oidc-secret-ensure-job.yaml rename to services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml index 3aa3ca5..a76c52e 100644 --- a/services/keycloak/vault-oidc-secret-ensure-job.yaml +++ b/services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/vault-oidc-secret-ensure-job.yaml +# services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml +# One-off job for sso/vault-oidc-secret-ensure-8. +# Purpose: vault oidc secret ensure 8 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
apiVersion: batch/v1 kind: Job metadata: name: vault-oidc-secret-ensure-8 namespace: sso spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/scripts/harbor_oidc_secret_ensure.sh b/services/keycloak/scripts/harbor_oidc_secret_ensure.sh index 7187d34..c70caa2 100755 --- a/services/keycloak/scripts/harbor_oidc_secret_ensure.sh +++ b/services/keycloak/scripts/harbor_oidc_secret_ensure.sh @@ -29,7 +29,7 @@ CLIENT_QUERY="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \ CLIENT_ID="$(echo "$CLIENT_QUERY" | jq -r '.[0].id' 2>/dev/null || true)" if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then - create_payload='{"clientId":"harbor","enabled":true,"protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":false,"serviceAccountsEnabled":false,"redirectUris":["https://registry.bstein.dev/c/oidc/callback"],"webOrigins":["https://registry.bstein.dev"],"rootUrl":"https://registry.bstein.dev","baseUrl":"/"}' + create_payload='{"clientId":"harbor","enabled":true,"protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":true,"serviceAccountsEnabled":false,"redirectUris":["https://registry.bstein.dev/c/oidc/callback"],"webOrigins":["https://registry.bstein.dev"],"rootUrl":"https://registry.bstein.dev","baseUrl":"/"}' status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \ -H "Authorization: Bearer ${ACCESS_TOKEN}" \ -H 'Content-Type: application/json' \ @@ -49,6 +49,21 @@ if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then exit 1 fi +CLIENT_CONFIG="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}" || true)" +if [ -n "$CLIENT_CONFIG" ]; then + updated_config="$(echo "$CLIENT_CONFIG" | jq '.directAccessGrantsEnabled=true')" + status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \ + -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + -H 'Content-Type: application/json' \ + -d "${updated_config}" \ + "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}")" + if [ "$status" != "200" ] && [ "$status" != "204" ]; then + echo "Keycloak client update failed (status ${status})" >&2 + exit 1 + fi +fi + SCOPE_ID="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \ "$KC_URL/admin/realms/atlas/client-scopes?search=groups" | jq -r '.[] | select(.name=="groups") | .id' 2>/dev/null | head -n1 || true)" if [ -z "$SCOPE_ID" ] || [ "$SCOPE_ID" = "null" ]; then @@ -77,6 +92,26 @@ if ! echo "$DEFAULT_SCOPES" | jq -e '.[] | select(.name=="groups")' >/dev/null 2 fi fi +OFFLINE_SCOPE_ID="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + "$KC_URL/admin/realms/atlas/client-scopes?search=offline_access" | jq -r '.[] | select(.name=="offline_access") | .id' 2>/dev/null | head -n1 || true)" +if [ -n "$OFFLINE_SCOPE_ID" ] && [ "$OFFLINE_SCOPE_ID" != "null" ]; then + if ! echo "$DEFAULT_SCOPES" | jq -e '.[] | select(.name=="offline_access")' >/dev/null 2>&1 \ + && ! 
echo "$OPTIONAL_SCOPES" | jq -e '.[] | select(.name=="offline_access")' >/dev/null 2>&1; then + status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \ + -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes/${OFFLINE_SCOPE_ID}")" + if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then + status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \ + -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes/${OFFLINE_SCOPE_ID}")" + if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then + echo "Failed to attach offline_access scope to harbor (status ${status})" >&2 + exit 1 + fi + fi + fi +fi + CLIENT_SECRET="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \ "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/client-secret" | jq -r '.value' 2>/dev/null || true)" if [ -z "$CLIENT_SECRET" ] || [ "$CLIENT_SECRET" = "null" ]; then diff --git a/services/keycloak/secretproviderclass.yaml b/services/keycloak/secretproviderclass.yaml index 86cebd2..d4c094f 100644 --- a/services/keycloak/secretproviderclass.yaml +++ b/services/keycloak/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "sso" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/sso" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/logging/kustomization.yaml b/services/logging/kustomization.yaml index 08c73a8..dc48715 100644 --- a/services/logging/kustomization.yaml +++ b/services/logging/kustomization.yaml @@ -15,9 +15,9 @@ resources: - opensearch-dashboards-helmrelease.yaml - data-prepper-helmrelease.yaml - otel-collector-helmrelease.yaml - - opensearch-ism-job.yaml - - opensearch-dashboards-setup-job.yaml - - opensearch-observability-setup-job.yaml + - oneoffs/opensearch-ism-job.yaml + - oneoffs/opensearch-dashboards-setup-job.yaml + - oneoffs/opensearch-observability-setup-job.yaml - opensearch-prune-cronjob.yaml - fluent-bit-helmrelease.yaml - node-log-rotation-daemonset.yaml diff --git a/services/logging/opensearch-dashboards-setup-job.yaml b/services/logging/oneoffs/opensearch-dashboards-setup-job.yaml similarity index 88% rename from services/logging/opensearch-dashboards-setup-job.yaml rename to services/logging/oneoffs/opensearch-dashboards-setup-job.yaml index 06149d7..1d1a9b6 100644 --- a/services/logging/opensearch-dashboards-setup-job.yaml +++ b/services/logging/oneoffs/opensearch-dashboards-setup-job.yaml @@ -1,10 +1,15 @@ -# services/logging/opensearch-dashboards-setup-job.yaml +# services/logging/oneoffs/opensearch-dashboards-setup-job.yaml +# One-off job for logging/opensearch-dashboards-setup-4. +# Purpose: opensearch dashboards setup 4 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
apiVersion: batch/v1 kind: Job metadata: name: opensearch-dashboards-setup-4 namespace: logging spec: + suspend: true backoffLimit: 3 ttlSecondsAfterFinished: 3600 template: diff --git a/services/logging/opensearch-ism-job.yaml b/services/logging/oneoffs/opensearch-ism-job.yaml similarity index 91% rename from services/logging/opensearch-ism-job.yaml rename to services/logging/oneoffs/opensearch-ism-job.yaml index 3313571..476bca7 100644 --- a/services/logging/opensearch-ism-job.yaml +++ b/services/logging/oneoffs/opensearch-ism-job.yaml @@ -1,10 +1,15 @@ -# services/logging/opensearch-ism-job.yaml +# services/logging/oneoffs/opensearch-ism-job.yaml +# One-off job for logging/opensearch-ism-setup-5. +# Purpose: opensearch ism setup 5 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: opensearch-ism-setup-5 namespace: logging spec: + suspend: true backoffLimit: 3 ttlSecondsAfterFinished: 3600 template: diff --git a/services/logging/opensearch-observability-setup-job.yaml b/services/logging/oneoffs/opensearch-observability-setup-job.yaml similarity index 76% rename from services/logging/opensearch-observability-setup-job.yaml rename to services/logging/oneoffs/opensearch-observability-setup-job.yaml index e4590fb..6caa076 100644 --- a/services/logging/opensearch-observability-setup-job.yaml +++ b/services/logging/oneoffs/opensearch-observability-setup-job.yaml @@ -1,10 +1,15 @@ -# services/logging/opensearch-observability-setup-job.yaml +# services/logging/oneoffs/opensearch-observability-setup-job.yaml +# One-off job for logging/opensearch-observability-setup-2. +# Purpose: opensearch observability setup 2 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
apiVersion: batch/v1 kind: Job metadata: name: opensearch-observability-setup-2 namespace: logging spec: + suspend: true backoffLimit: 3 ttlSecondsAfterFinished: 3600 template: diff --git a/services/logging/opensearch-prune-cronjob.yaml b/services/logging/opensearch-prune-cronjob.yaml index 75e72db..dc0dffb 100644 --- a/services/logging/opensearch-prune-cronjob.yaml +++ b/services/logging/opensearch-prune-cronjob.yaml @@ -6,6 +6,7 @@ metadata: namespace: logging spec: schedule: "23 3 * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 diff --git a/services/logging/secretproviderclass.yaml b/services/logging/secretproviderclass.yaml index f5db15e..6ff642d 100644 --- a/services/logging/secretproviderclass.yaml +++ b/services/logging/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "logging" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/logging" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/mailu/helmrelease.yaml b/services/mailu/helmrelease.yaml index 7342141..2a7e6f5 100644 --- a/services/mailu/helmrelease.yaml +++ b/services/mailu/helmrelease.yaml @@ -219,6 +219,8 @@ spec: overrides: postfix.cf: | mynetworks = 127.0.0.0/8 [::1]/128 10.42.0.0/16 10.43.0.0/16 192.168.22.0/24 + recipient_canonical_maps = regexp:/overrides/recipient_canonical, socketmap:unix:/tmp/podop.socket:recipientmap + recipient_canonical_classes = envelope_recipient,header_recipient smtpd_delay_reject = yes smtpd_helo_required = yes smtpd_helo_restrictions = reject_invalid_helo_hostname, reject_non_fqdn_helo_hostname, reject_unknown_helo_hostname @@ -238,8 +240,10 @@ spec: smtpd_client_message_rate_limit = 100 smtpd_client_recipient_rate_limit = 200 smtpd_recipient_limit = 100 + recipient_canonical: | + /^double-bounce@mail\.bstein\.dev$/ double-bounce@bstein.dev podAnnotations: - bstein.dev/restarted-at: "2026-01-06T00:00:00Z" + bstein.dev/restarted-at: "2026-01-20T04:35:00Z" redis: enabled: true architecture: standalone @@ -335,8 +339,15 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} + {{- $access := index .Data.data "accesskey" -}} + {{- $secret := index .Data.data "secretkey" -}} + {{- if and $access $secret }} + export RELAYUSER="{{ $access }}" + export RELAYPASSWORD="{{ $secret }}" + {{- else }} export RELAYUSER="{{ index .Data.data "apikey" }}" export RELAYPASSWORD="{{ index .Data.data "apikey" }}" + {{- end }} {{ end }} spec: serviceAccountName: mailu-vault-sync @@ -397,8 +408,15 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} + {{- $access := index .Data.data "accesskey" -}} + {{- $secret := index .Data.data "secretkey" -}} + {{- if and $access $secret }} + export RELAYUSER="{{ $access }}" + export RELAYPASSWORD="{{ $secret }}" + {{- else }} export RELAYUSER="{{ index .Data.data "apikey" }}" export RELAYPASSWORD="{{ index .Data.data "apikey" }}" + {{- end }} {{ end }} spec: serviceAccountName: mailu-vault-sync @@ -441,6 +459,8 @@ spec: metadata: name: mailu-postfix spec: + strategy: + type: Recreate template: metadata: annotations: @@ -459,8 +479,15 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} + {{- $access := index .Data.data "accesskey" -}} + {{- $secret := index 
.Data.data "secretkey" -}} + {{- if and $access $secret }} + export RELAYUSER="{{ $access }}" + export RELAYPASSWORD="{{ $secret }}" + {{- else }} export RELAYUSER="{{ index .Data.data "apikey" }}" export RELAYPASSWORD="{{ index .Data.data "apikey" }}" + {{- end }} {{ end }} spec: serviceAccountName: mailu-vault-sync @@ -521,8 +548,15 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} + {{- $access := index .Data.data "accesskey" -}} + {{- $secret := index .Data.data "secretkey" -}} + {{- if and $access $secret }} + export RELAYUSER="{{ $access }}" + export RELAYPASSWORD="{{ $secret }}" + {{- else }} export RELAYUSER="{{ index .Data.data "apikey" }}" export RELAYPASSWORD="{{ index .Data.data "apikey" }}" + {{- end }} {{ end }} spec: serviceAccountName: mailu-vault-sync @@ -583,8 +617,15 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} + {{- $access := index .Data.data "accesskey" -}} + {{- $secret := index .Data.data "secretkey" -}} + {{- if and $access $secret }} + export RELAYUSER="{{ $access }}" + export RELAYPASSWORD="{{ $secret }}" + {{- else }} export RELAYUSER="{{ index .Data.data "apikey" }}" export RELAYPASSWORD="{{ index .Data.data "apikey" }}" + {{- end }} {{ end }} spec: serviceAccountName: mailu-vault-sync @@ -645,8 +686,15 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} + {{- $access := index .Data.data "accesskey" -}} + {{- $secret := index .Data.data "secretkey" -}} + {{- if and $access $secret }} + export RELAYUSER="{{ $access }}" + export RELAYPASSWORD="{{ $secret }}" + {{- else }} export RELAYUSER="{{ index .Data.data "apikey" }}" export RELAYPASSWORD="{{ index .Data.data "apikey" }}" + {{- end }} {{ end }} spec: serviceAccountName: mailu-vault-sync diff --git a/services/mailu/kustomization.yaml b/services/mailu/kustomization.yaml index 5c111eb..3e0494e 100644 --- a/services/mailu/kustomization.yaml +++ b/services/mailu/kustomization.yaml @@ -13,9 +13,8 @@ resources: - unbound-configmap.yaml - serverstransport.yaml - ingressroute.yaml - - mailu-sync-job.yaml + - oneoffs/mailu-sync-job.yaml - mailu-sync-cronjob.yaml - - mailu-sync-listener.yaml - front-lb.yaml configMapGenerator: @@ -31,10 +30,6 @@ configMapGenerator: - sync.py=scripts/mailu_sync.py options: disableNameSuffixHash: true - - name: mailu-sync-listener - namespace: mailu-mailserver - files: - - listener.py=scripts/mailu_sync_listener.py - name: mailu-vault-entrypoint namespace: mailu-mailserver files: diff --git a/services/mailu/mailu-sync-cronjob.yaml b/services/mailu/mailu-sync-cronjob.yaml index 1da1981..bbe9909 100644 --- a/services/mailu/mailu-sync-cronjob.yaml +++ b/services/mailu/mailu-sync-cronjob.yaml @@ -8,6 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "30 4 * * *" + suspend: true concurrencyPolicy: Forbid jobTemplate: spec: @@ -37,6 +38,9 @@ spec: {{- with secret "kv/data/atlas/mailu/mailu-initial-account-secret" -}}{{ .Data.data.password }}{{- end -}} spec: restartPolicy: OnFailure + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" serviceAccountName: mailu-vault-sync containers: - name: mailu-sync diff --git a/services/mailu/mailu-sync-listener.yaml b/services/mailu/mailu-sync-listener.yaml index cc98107..0644c5b 100644 --- a/services/mailu/mailu-sync-listener.yaml +++ b/services/mailu/mailu-sync-listener.yaml @@ 
-30,7 +30,7 @@ spec: app: mailu-sync-listener annotations: vault.hashicorp.com/agent-inject: "true" - atlas.bstein.dev/mailu-sync-rev: "2" + atlas.bstein.dev/mailu-sync-rev: "4" vault.hashicorp.com/role: "mailu-mailserver" vault.hashicorp.com/agent-inject-secret-mailu-db-secret__database: "kv/data/atlas/mailu/mailu-db-secret" vault.hashicorp.com/agent-inject-template-mailu-db-secret__database: | @@ -52,6 +52,9 @@ spec: {{- with secret "kv/data/atlas/mailu/mailu-initial-account-secret" -}}{{ .Data.data.password }}{{- end -}} spec: restartPolicy: Always + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" serviceAccountName: mailu-vault-sync containers: - name: listener diff --git a/services/mailu/mailu-sync-job.yaml b/services/mailu/oneoffs/mailu-sync-job.yaml similarity index 93% rename from services/mailu/mailu-sync-job.yaml rename to services/mailu/oneoffs/mailu-sync-job.yaml index 8589e9e..38648ac 100644 --- a/services/mailu/mailu-sync-job.yaml +++ b/services/mailu/oneoffs/mailu-sync-job.yaml @@ -1,10 +1,15 @@ -# services/mailu/mailu-sync-job.yaml +# services/mailu/oneoffs/mailu-sync-job.yaml +# One-off job for mailu-mailserver/mailu-sync-9. +# Purpose: mailu sync 9 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: mailu-sync-9 namespace: mailu-mailserver spec: + suspend: true template: metadata: annotations: diff --git a/services/mailu/scripts/mailu_sync.py b/services/mailu/scripts/mailu_sync.py index 001917a..71b0f5a 100644 --- a/services/mailu/scripts/mailu_sync.py +++ b/services/mailu/scripts/mailu_sync.py @@ -130,7 +130,9 @@ def kc_update_attributes(token, user, attributes): if not isinstance(current_attrs, dict): current_attrs = {} current_attrs.update(attributes) - resp = SESSION.put(user_url, headers=headers, json={"attributes": current_attrs}, timeout=20) + payload = _safe_update_payload(current_payload) + payload["attributes"] = current_attrs + resp = SESSION.put(user_url, headers=headers, json=payload, timeout=20) resp.raise_for_status() verify = SESSION.get( user_url, @@ -144,6 +146,34 @@ def kc_update_attributes(token, user, attributes): raise Exception(f"attribute not persisted for {user.get('email') or user['username']}") +def _safe_update_payload(user_payload: dict) -> dict: + payload: dict = {} + username = user_payload.get("username") + if isinstance(username, str): + payload["username"] = username + enabled = user_payload.get("enabled") + if isinstance(enabled, bool): + payload["enabled"] = enabled + email = user_payload.get("email") + if isinstance(email, str): + payload["email"] = email + email_verified = user_payload.get("emailVerified") + if isinstance(email_verified, bool): + payload["emailVerified"] = email_verified + first_name = user_payload.get("firstName") + if isinstance(first_name, str): + payload["firstName"] = first_name + last_name = user_payload.get("lastName") + if isinstance(last_name, str): + payload["lastName"] = last_name + actions = user_payload.get("requiredActions") + if isinstance(actions, list): + payload["requiredActions"] = [a for a in actions if isinstance(a, str)] + attrs = user_payload.get("attributes") + payload["attributes"] = attrs if isinstance(attrs, dict) else {} + return payload + + def random_password(): alphabet = string.ascii_letters + string.digits return "".join(secrets.choice(alphabet) for _ in range(24)) diff 
--git a/services/mailu/scripts/mailu_sync_listener.py b/services/mailu/scripts/mailu_sync_listener.py index 6ac0da7..4e31c81 100644 --- a/services/mailu/scripts/mailu_sync_listener.py +++ b/services/mailu/scripts/mailu_sync_listener.py @@ -39,12 +39,12 @@ def _run_sync_blocking() -> int: sync_done.set() -def _trigger_sync_async() -> bool: +def _trigger_sync_async(force: bool = False) -> bool: with lock: now = time() if sync_running: return False - if now - last_run < MIN_INTERVAL_SECONDS: + if not force and now - last_run < MIN_INTERVAL_SECONDS: return False thread = threading.Thread(target=_run_sync_blocking, daemon=True) @@ -64,15 +64,17 @@ class Handler(http.server.BaseHTTPRequestHandler): return wait = False + force = False if isinstance(payload, dict): wait = bool(payload.get("wait")) + force = bool(payload.get("force")) if wait: with lock: already_running = sync_running if not already_running: - _trigger_sync_async() + _trigger_sync_async(force=force) sync_done.wait(timeout=WAIT_TIMEOUT_SECONDS) with lock: @@ -87,7 +89,7 @@ class Handler(http.server.BaseHTTPRequestHandler): self.end_headers() return - _trigger_sync_async() + _trigger_sync_async(force=force) self.send_response(202) self.end_headers() diff --git a/services/mailu/secretproviderclass.yaml b/services/mailu/secretproviderclass.yaml index f58c69b..f9e281e 100644 --- a/services/mailu/secretproviderclass.yaml +++ b/services/mailu/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "mailu-mailserver" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/mailu-mailserver" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml new file mode 100644 index 0000000..fce1ded --- /dev/null +++ b/services/maintenance/ariadne-deployment.yaml @@ -0,0 +1,359 @@ +# services/maintenance/ariadne-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ariadne + namespace: maintenance +spec: + replicas: 1 + revisionHistoryLimit: 3 + selector: + matchLabels: + app: ariadne + template: + metadata: + labels: + app: ariadne + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + vault.hashicorp.com/agent-inject: "true" + vault.hashicorp.com/role: "maintenance" + vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db" + vault.hashicorp.com/agent-inject-template-ariadne-env.sh: | + {{ with secret "kv/data/atlas/maintenance/ariadne-db" }} + export ARIADNE_DATABASE_URL="{{ .Data.data.database_url }}" + {{ end }} + {{ with secret "kv/data/atlas/portal/atlas-portal-db" }} + export PORTAL_DATABASE_URL="{{ .Data.data.PORTAL_DATABASE_URL }}" + {{ end }} + {{ with secret "kv/data/atlas/portal/bstein-dev-home-keycloak-admin" }} + export KEYCLOAK_ADMIN_CLIENT_SECRET="{{ .Data.data.client_secret }}" + {{ end }} + {{ with secret "kv/data/atlas/nextcloud/nextcloud-db" }} + export NEXTCLOUD_DB_NAME="{{ .Data.data.database }}" + export NEXTCLOUD_DB_USER="{{ index .Data.data "db-username" }}" + export NEXTCLOUD_DB_PASSWORD="{{ index .Data.data "db-password" }}" + {{ end }} + {{ with secret "kv/data/atlas/nextcloud/nextcloud-admin" }} + export NEXTCLOUD_ADMIN_USER="{{ index .Data.data "admin-user" }}" + export NEXTCLOUD_ADMIN_PASSWORD="{{ index .Data.data "admin-password" }}" + {{ end }} + {{ with secret "kv/data/atlas/health/wger-admin" 
}} + export WGER_ADMIN_USERNAME="{{ .Data.data.username }}" + export WGER_ADMIN_PASSWORD="{{ .Data.data.password }}" + {{ end }} + {{ with secret "kv/data/atlas/finance/firefly-secrets" }} + export FIREFLY_CRON_TOKEN="{{ .Data.data.STATIC_CRON_TOKEN }}" + {{ end }} + {{ with secret "kv/data/atlas/mailu/mailu-db-secret" }} + export MAILU_DB_NAME="{{ .Data.data.database }}" + export MAILU_DB_USER="{{ .Data.data.username }}" + export MAILU_DB_PASSWORD="{{ .Data.data.password }}" + {{ end }} + {{ with secret "kv/data/atlas/mailu/mailu-initial-account-secret" }} + export SMTP_HOST="mailu-front.mailu-mailserver.svc.cluster.local" + export SMTP_PORT="587" + export SMTP_STARTTLS="true" + export SMTP_USE_TLS="false" + export SMTP_USERNAME="no-reply-portal@bstein.dev" + export SMTP_PASSWORD="{{ .Data.data.password }}" + export SMTP_FROM="no-reply-portal@bstein.dev" + export MAILU_SYSTEM_PASSWORD="{{ .Data.data.password }}" + {{ end }} + {{ with secret "kv/data/atlas/comms/mas-admin-client-runtime" }} + export COMMS_MAS_ADMIN_CLIENT_SECRET="{{ .Data.data.client_secret }}" + {{ end }} + {{ with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" }} + export COMMS_BOT_PASSWORD="{{ index .Data.data "bot-password" }}" + export COMMS_SEEDER_PASSWORD="{{ index .Data.data "seeder-password" }}" + {{ end }} + {{ with secret "kv/data/atlas/comms/synapse-admin" }} + export COMMS_SYNAPSE_ADMIN_TOKEN="{{ .Data.data.access_token }}" + {{ end }} + {{ with secret "kv/data/atlas/comms/synapse-db" }} + export COMMS_SYNAPSE_DB_PASSWORD="{{ .Data.data.POSTGRES_PASSWORD }}" + {{ end }} + {{ with secret "kv/data/atlas/vault/vault-oidc-config" }} + export VAULT_OIDC_DISCOVERY_URL="{{ .Data.data.discovery_url }}" + export VAULT_OIDC_CLIENT_ID="{{ .Data.data.client_id }}" + export VAULT_OIDC_CLIENT_SECRET="{{ .Data.data.client_secret }}" + export VAULT_OIDC_DEFAULT_ROLE="{{ .Data.data.default_role }}" + export VAULT_OIDC_SCOPES="{{ .Data.data.scopes }}" + export VAULT_OIDC_USER_CLAIM="{{ .Data.data.user_claim }}" + export VAULT_OIDC_GROUPS_CLAIM="{{ .Data.data.groups_claim }}" + export VAULT_OIDC_TOKEN_POLICIES="{{ .Data.data.token_policies }}" + export VAULT_OIDC_ADMIN_GROUP="{{ .Data.data.admin_group }}" + export VAULT_OIDC_ADMIN_POLICIES="{{ .Data.data.admin_policies }}" + export VAULT_OIDC_DEV_GROUP="{{ .Data.data.dev_group }}" + export VAULT_OIDC_DEV_POLICIES="{{ .Data.data.dev_policies }}" + export VAULT_OIDC_USER_GROUP="{{ .Data.data.user_group }}" + export VAULT_OIDC_USER_POLICIES="{{ .Data.data.user_policies }}" + export VAULT_OIDC_REDIRECT_URIS="{{ .Data.data.redirect_uris }}" + export VAULT_OIDC_BOUND_AUDIENCES="{{ .Data.data.bound_audiences }}" + {{- if .Data.data.bound_claims_type }} + export VAULT_OIDC_BOUND_CLAIMS_TYPE="{{ .Data.data.bound_claims_type }}" + {{- else }} + export VAULT_OIDC_BOUND_CLAIMS_TYPE="string" + {{- end }} + {{ end }} + spec: + serviceAccountName: ariadne + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" + containers: + - name: ariadne + image: registry.bstein.dev/bstein/ariadne:0.1.0-0 + imagePullPolicy: Always + command: ["/bin/sh", "-c"] + args: + - >- + . 
/vault/secrets/ariadne-env.sh + && exec uvicorn ariadne.app:app --host 0.0.0.0 --port 8080 + ports: + - name: http + containerPort: 8080 + env: + - name: KEYCLOAK_URL + value: https://sso.bstein.dev + - name: KEYCLOAK_REALM + value: atlas + - name: KEYCLOAK_CLIENT_ID + value: bstein-dev-home + - name: KEYCLOAK_ISSUER + value: https://sso.bstein.dev/realms/atlas + - name: KEYCLOAK_JWKS_URL + value: http://keycloak.sso.svc.cluster.local/realms/atlas/protocol/openid-connect/certs + - name: KEYCLOAK_ADMIN_URL + value: http://keycloak.sso.svc.cluster.local + - name: KEYCLOAK_ADMIN_REALM + value: atlas + - name: KEYCLOAK_ADMIN_CLIENT_ID + value: bstein-dev-home-admin + - name: PORTAL_PUBLIC_BASE_URL + value: https://bstein.dev + - name: ARIADNE_LOG_LEVEL + value: INFO + - name: ARIADNE_DB_POOL_MIN + value: "0" + - name: ARIADNE_DB_POOL_MAX + value: "5" + - name: ARIADNE_DB_CONNECT_TIMEOUT_SEC + value: "5" + - name: ARIADNE_DB_LOCK_TIMEOUT_SEC + value: "5" + - name: ARIADNE_DB_STATEMENT_TIMEOUT_SEC + value: "30" + - name: ARIADNE_DB_IDLE_IN_TX_TIMEOUT_SEC + value: "10" + - name: ARIADNE_RUN_MIGRATIONS + value: "false" + - name: PORTAL_ADMIN_USERS + value: bstein + - name: PORTAL_ADMIN_GROUPS + value: admin + - name: ACCOUNT_ALLOWED_GROUPS + value: dev,admin + - name: ALLOWED_FLAG_GROUPS + value: demo,test,vaultwarden_grandfathered + - name: DEFAULT_USER_GROUPS + value: dev + - name: MAILU_DOMAIN + value: bstein.dev + - name: MAILU_HOST + value: mail.bstein.dev + - name: MAILU_SYNC_URL + value: http://ariadne.maintenance.svc.cluster.local/events + - name: MAILU_EVENT_MIN_INTERVAL_SEC + value: "10" + - name: MAILU_SYSTEM_USERS + value: no-reply-portal@bstein.dev,no-reply-vaultwarden@bstein.dev + - name: MAILU_MAILBOX_WAIT_TIMEOUT_SEC + value: "180" + - name: MAILU_DB_HOST + value: postgres-service.postgres.svc.cluster.local + - name: MAILU_DB_PORT + value: "5432" + - name: NEXTCLOUD_NAMESPACE + value: nextcloud + - name: NEXTCLOUD_POD_LABEL + value: app=nextcloud + - name: NEXTCLOUD_CONTAINER + value: nextcloud + - name: NEXTCLOUD_EXEC_TIMEOUT_SEC + value: "120" + - name: NEXTCLOUD_URL + value: https://cloud.bstein.dev + - name: NEXTCLOUD_DB_HOST + value: postgres-service.postgres.svc.cluster.local + - name: NEXTCLOUD_DB_PORT + value: "5432" + - name: WGER_NAMESPACE + value: health + - name: WGER_USER_SYNC_WAIT_TIMEOUT_SEC + value: "90" + - name: WGER_POD_LABEL + value: app=wger + - name: WGER_CONTAINER + value: wger + - name: WGER_ADMIN_EMAIL + value: brad@bstein.dev + - name: FIREFLY_NAMESPACE + value: finance + - name: FIREFLY_USER_SYNC_WAIT_TIMEOUT_SEC + value: "90" + - name: FIREFLY_POD_LABEL + value: app=firefly + - name: FIREFLY_CONTAINER + value: firefly + - name: FIREFLY_CRON_BASE_URL + value: http://firefly.finance.svc.cluster.local/api/v1/cron + - name: FIREFLY_CRON_TIMEOUT_SEC + value: "30" + - name: VAULT_NAMESPACE + value: vault + - name: VAULT_ADDR + value: http://vault.vault.svc.cluster.local:8200 + - name: VAULT_K8S_ROLE + value: vault-admin + - name: VAULT_K8S_ROLE_TTL + value: 1h + - name: COMMS_NAMESPACE + value: comms + - name: COMMS_SYNAPSE_BASE + value: http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008 + - name: COMMS_AUTH_BASE + value: http://matrix-authentication-service.comms.svc.cluster.local:8080 + - name: COMMS_MAS_ADMIN_API_BASE + value: http://matrix-authentication-service.comms.svc.cluster.local:8081/api/admin/v1 + - name: COMMS_MAS_TOKEN_URL + value: http://matrix-authentication-service.comms.svc.cluster.local:8080/oauth2/token + - name: 
COMMS_MAS_ADMIN_CLIENT_ID + value: 01KDXMVQBQ5JNY6SEJPZW6Z8BM + - name: COMMS_SERVER_NAME + value: live.bstein.dev + - name: COMMS_ROOM_ALIAS + value: "#othrys:live.bstein.dev" + - name: COMMS_ROOM_NAME + value: Othrys + - name: COMMS_PIN_MESSAGE + value: "Invite guests: share https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join and choose 'Continue' -> 'Join as guest'." + - name: COMMS_SEEDER_USER + value: othrys-seeder + - name: COMMS_BOT_USER + value: atlasbot + - name: COMMS_SYNAPSE_DB_HOST + value: postgres-service.postgres.svc.cluster.local + - name: COMMS_SYNAPSE_DB_PORT + value: "5432" + - name: COMMS_SYNAPSE_DB_NAME + value: synapse + - name: COMMS_SYNAPSE_DB_USER + value: synapse + - name: COMMS_TIMEOUT_SEC + value: "30" + - name: COMMS_GUEST_STALE_DAYS + value: "14" + - name: VAULTWARDEN_NAMESPACE + value: vaultwarden + - name: VAULTWARDEN_POD_LABEL + value: app=vaultwarden + - name: VAULTWARDEN_POD_PORT + value: "80" + - name: VAULTWARDEN_SERVICE_HOST + value: vaultwarden-service.vaultwarden.svc.cluster.local + - name: VAULTWARDEN_ADMIN_SECRET_NAME + value: vaultwarden-admin + - name: VAULTWARDEN_ADMIN_SECRET_KEY + value: ADMIN_TOKEN + - name: VAULTWARDEN_ADMIN_SESSION_TTL_SEC + value: "900" + - name: VAULTWARDEN_ADMIN_RATE_LIMIT_BACKOFF_SEC + value: "600" + - name: VAULTWARDEN_RETRY_COOLDOWN_SEC + value: "1800" + - name: VAULTWARDEN_FAILURE_BAILOUT + value: "2" + - name: ARIADNE_PROVISION_POLL_INTERVAL_SEC + value: "5" + - name: ARIADNE_PROVISION_RETRY_COOLDOWN_SEC + value: "30" + - name: ARIADNE_SCHEDULE_TICK_SEC + value: "5" + - name: ARIADNE_SCHEDULE_MAILU_SYNC + value: "30 4 * * *" + - name: ARIADNE_SCHEDULE_NEXTCLOUD_SYNC + value: "0 5 * * *" + - name: ARIADNE_SCHEDULE_NEXTCLOUD_CRON + value: "*/5 * * * *" + - name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE + value: "30 4 * * *" + - name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC + value: "0 * * * *" + - name: ARIADNE_SCHEDULE_WGER_USER_SYNC + value: "0 5 * * *" + - name: ARIADNE_SCHEDULE_WGER_ADMIN + value: "15 3 * * *" + - name: ARIADNE_SCHEDULE_FIREFLY_USER_SYNC + value: "0 6 * * *" + - name: ARIADNE_SCHEDULE_FIREFLY_CRON + value: "0 3 * * *" + - name: ARIADNE_SCHEDULE_POD_CLEANER + value: "0 * * * *" + - name: ARIADNE_SCHEDULE_OPENSEARCH_PRUNE + value: "23 3 * * *" + - name: ARIADNE_SCHEDULE_IMAGE_SWEEPER + value: "30 4 * * 0" + - name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH + value: "0 * * * *" + - name: ARIADNE_SCHEDULE_VAULT_OIDC + value: "0 * * * *" + - name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME + value: "*/5 * * * *" + - name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE + value: "0 0 1 * *" + - name: ARIADNE_SCHEDULE_COMMS_RESET_ROOM + value: "0 0 1 1 *" + - name: ARIADNE_SCHEDULE_COMMS_SEED_ROOM + value: "*/10 * * * *" + - name: ARIADNE_SCHEDULE_CLUSTER_STATE + value: "*/15 * * * *" + - name: ARIADNE_CLUSTER_STATE_KEEP + value: "168" + - name: WELCOME_EMAIL_ENABLED + value: "true" + - name: K8S_API_TIMEOUT_SEC + value: "5" + - name: ARIADNE_VM_URL + value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428 + - name: ARIADNE_CLUSTER_STATE_VM_TIMEOUT_SEC + value: "5" + - name: OPENSEARCH_URL + value: http://opensearch-master.logging.svc.cluster.local:9200 + - name: OPENSEARCH_LIMIT_BYTES + value: "1099511627776" + - name: OPENSEARCH_INDEX_PATTERNS + value: kube-*,journald-*,trace-analytics-* + - name: METRICS_PATH + value: "/metrics" + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi + livenessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 10 + 
periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 10 diff --git a/services/maintenance/ariadne-rbac.yaml b/services/maintenance/ariadne-rbac.yaml new file mode 100644 index 0000000..33620d0 --- /dev/null +++ b/services/maintenance/ariadne-rbac.yaml @@ -0,0 +1,58 @@ +# services/maintenance/ariadne-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: ariadne-job-spawner +rules: + - apiGroups: ["batch"] + resources: + - jobs + - cronjobs + verbs: + - get + - list + - watch + - create + - apiGroups: [""] + resources: + - pods + verbs: + - get + - list + - watch + - delete + - apiGroups: [""] + resources: + - nodes + - namespaces + verbs: + - get + - list + - watch + - apiGroups: [""] + resources: + - pods/exec + verbs: + - get + - create + - apiGroups: ["kustomize.toolkit.fluxcd.io"] + resources: + - kustomizations + verbs: + - get + - list + - watch + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: ariadne-job-spawner +subjects: + - kind: ServiceAccount + name: ariadne + namespace: maintenance +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: ariadne-job-spawner diff --git a/services/maintenance/ariadne-service.yaml b/services/maintenance/ariadne-service.yaml new file mode 100644 index 0000000..9c93e1d --- /dev/null +++ b/services/maintenance/ariadne-service.yaml @@ -0,0 +1,13 @@ +# services/maintenance/ariadne-service.yaml +apiVersion: v1 +kind: Service +metadata: + name: ariadne + namespace: maintenance +spec: + selector: + app: ariadne + ports: + - name: http + port: 80 + targetPort: http diff --git a/services/maintenance/ariadne-serviceaccount.yaml b/services/maintenance/ariadne-serviceaccount.yaml new file mode 100644 index 0000000..9adcef7 --- /dev/null +++ b/services/maintenance/ariadne-serviceaccount.yaml @@ -0,0 +1,8 @@ +# services/maintenance/ariadne-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: ariadne + namespace: maintenance +imagePullSecrets: + - name: harbor-regcred diff --git a/services/maintenance/image-sweeper-cronjob.yaml b/services/maintenance/image-sweeper-cronjob.yaml index c94fcca..0039206 100644 --- a/services/maintenance/image-sweeper-cronjob.yaml +++ b/services/maintenance/image-sweeper-cronjob.yaml @@ -6,6 +6,7 @@ metadata: namespace: maintenance spec: schedule: "30 4 * * 0" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 2 failedJobsHistoryLimit: 2 diff --git a/services/maintenance/image.yaml b/services/maintenance/image.yaml new file mode 100644 index 0000000..fd28d90 --- /dev/null +++ b/services/maintenance/image.yaml @@ -0,0 +1,23 @@ +# services/maintenance/image.yaml +apiVersion: image.toolkit.fluxcd.io/v1beta2 +kind: ImageRepository +metadata: + name: ariadne + namespace: maintenance +spec: + image: registry.bstein.dev/bstein/ariadne + interval: 1m0s + secretRef: + name: harbor-regcred +--- +apiVersion: image.toolkit.fluxcd.io/v1beta2 +kind: ImagePolicy +metadata: + name: ariadne + namespace: maintenance +spec: + imageRepositoryRef: + name: ariadne + policy: + semver: + range: ">=0.1.0-0" diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index e53ed3c..19b2ba9 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -3,19 +3,30 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - namespace.yaml + - image.yaml + - 
secretproviderclass.yaml + - vault-serviceaccount.yaml + - vault-sync-deployment.yaml + - ariadne-serviceaccount.yaml + - ariadne-rbac.yaml - disable-k3s-traefik-serviceaccount.yaml - k3s-traefik-cleanup-rbac.yaml - node-nofile-serviceaccount.yaml - pod-cleaner-rbac.yaml + - ariadne-deployment.yaml + - oneoffs/ariadne-migrate-job.yaml + - ariadne-service.yaml - disable-k3s-traefik-daemonset.yaml - - k3s-traefik-cleanup-job.yaml + - oneoffs/k3s-traefik-cleanup-job.yaml - node-nofile-daemonset.yaml - k3s-agent-restart-daemonset.yaml - pod-cleaner-cronjob.yaml - node-image-sweeper-serviceaccount.yaml - node-image-sweeper-daemonset.yaml - image-sweeper-cronjob.yaml - +images: + - name: registry.bstein.dev/bstein/ariadne + newTag: 0.1.0-59 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance diff --git a/services/maintenance/oneoffs/ariadne-migrate-job.yaml b/services/maintenance/oneoffs/ariadne-migrate-job.yaml new file mode 100644 index 0000000..ecac68d --- /dev/null +++ b/services/maintenance/oneoffs/ariadne-migrate-job.yaml @@ -0,0 +1,50 @@ +# services/maintenance/oneoffs/ariadne-migrate-job.yaml +# One-off job for maintenance/ariadne-migrate-2. +# Purpose: ariadne migrate 2 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. +apiVersion: batch/v1 +kind: Job +metadata: + name: ariadne-migrate-2 + namespace: maintenance + annotations: + kustomize.toolkit.fluxcd.io/force: "true" +spec: + suspend: true + backoffLimit: 1 + ttlSecondsAfterFinished: 3600 + template: + metadata: + labels: + app: ariadne-migrate + annotations: + vault.hashicorp.com/agent-inject: "true" + vault.hashicorp.com/agent-pre-populate-only: "true" + vault.hashicorp.com/role: "maintenance" + vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db" + vault.hashicorp.com/agent-inject-template-ariadne-env.sh: | + {{ with secret "kv/data/atlas/maintenance/ariadne-db" }} + export ARIADNE_DATABASE_URL="{{ .Data.data.database_url }}" + {{ end }} + {{ with secret "kv/data/atlas/portal/atlas-portal-db" }} + export PORTAL_DATABASE_URL="{{ .Data.data.PORTAL_DATABASE_URL }}" + {{ end }} + spec: + serviceAccountName: ariadne + restartPolicy: Never + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" + containers: + - name: migrate + image: registry.bstein.dev/bstein/ariadne:0.1.0-0 + imagePullPolicy: Always + command: ["/bin/sh", "-c"] + args: + - >- + . /vault/secrets/ariadne-env.sh + && exec python -m ariadne.migrate + env: + - name: ARIADNE_RUN_MIGRATIONS + value: "true" diff --git a/services/maintenance/k3s-traefik-cleanup-job.yaml b/services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml similarity index 77% rename from services/maintenance/k3s-traefik-cleanup-job.yaml rename to services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml index d5d12a6..2c365a9 100644 --- a/services/maintenance/k3s-traefik-cleanup-job.yaml +++ b/services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml @@ -1,10 +1,15 @@ -# services/maintenance/k3s-traefik-cleanup-job.yaml +# services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml +# One-off job for maintenance/k3s-traefik-cleanup-2. +# Purpose: k3s traefik cleanup 2 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. 
+# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: k3s-traefik-cleanup-2 namespace: maintenance spec: + suspend: true backoffLimit: 1 template: spec: diff --git a/services/maintenance/pod-cleaner-cronjob.yaml b/services/maintenance/pod-cleaner-cronjob.yaml index e083c85..99d13f6 100644 --- a/services/maintenance/pod-cleaner-cronjob.yaml +++ b/services/maintenance/pod-cleaner-cronjob.yaml @@ -6,6 +6,7 @@ metadata: namespace: maintenance spec: schedule: "0 * * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 diff --git a/services/maintenance/secretproviderclass.yaml b/services/maintenance/secretproviderclass.yaml new file mode 100644 index 0000000..85df2af --- /dev/null +++ b/services/maintenance/secretproviderclass.yaml @@ -0,0 +1,21 @@ +# services/maintenance/secretproviderclass.yaml +apiVersion: secrets-store.csi.x-k8s.io/v1 +kind: SecretProviderClass +metadata: + name: maintenance-vault + namespace: maintenance +spec: + provider: vault + parameters: + vaultAddress: "http://vault.vault.svc.cluster.local:8200" + roleName: "maintenance" + objects: | + - objectName: "harbor-pull__dockerconfigjson" + secretPath: "kv/data/atlas/shared/harbor-pull" + secretKey: "dockerconfigjson" + secretObjects: + - secretName: harbor-regcred + type: kubernetes.io/dockerconfigjson + data: + - objectName: harbor-pull__dockerconfigjson + key: .dockerconfigjson diff --git a/services/maintenance/vault-serviceaccount.yaml b/services/maintenance/vault-serviceaccount.yaml new file mode 100644 index 0000000..f60b43e --- /dev/null +++ b/services/maintenance/vault-serviceaccount.yaml @@ -0,0 +1,6 @@ +# services/maintenance/vault-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: maintenance-vault-sync + namespace: maintenance diff --git a/services/maintenance/vault-sync-deployment.yaml b/services/maintenance/vault-sync-deployment.yaml new file mode 100644 index 0000000..edc0456 --- /dev/null +++ b/services/maintenance/vault-sync-deployment.yaml @@ -0,0 +1,34 @@ +# services/maintenance/vault-sync-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: maintenance-vault-sync + namespace: maintenance +spec: + replicas: 1 + selector: + matchLabels: + app: maintenance-vault-sync + template: + metadata: + labels: + app: maintenance-vault-sync + spec: + serviceAccountName: maintenance-vault-sync + containers: + - name: sync + image: alpine:3.20 + command: ["/bin/sh", "-c"] + args: + - "sleep infinity" + volumeMounts: + - name: vault-secrets + mountPath: /vault/secrets + readOnly: true + volumes: + - name: vault-secrets + csi: + driver: secrets-store.csi.k8s.io + readOnly: true + volumeAttributes: + secretProviderClass: maintenance-vault diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index af8a1c5..6f993d9 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))", + "expr": "(100 * (sum 
by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -89,7 +89,7 @@ }, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)", + "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) 
(kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -126,7 +126,7 @@ }, "targets": [ { - "expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})", + "expr": "label_replace(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{Hostname}}" } diff --git a/services/monitoring/dashboards/atlas-jobs.json b/services/monitoring/dashboards/atlas-jobs.json new file mode 100644 index 0000000..37b888d --- /dev/null +++ b/services/monitoring/dashboards/atlas-jobs.json @@ -0,0 +1,1253 @@ +{ + "uid": "atlas-jobs", + "title": "Atlas Jobs", + "folderUid": "atlas-internal", + "editable": true, + "panels": [ + { + "id": 1, + "type": "bargauge", + "title": "Ariadne Task Errors (range)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 2, + "type": "timeseries", + "title": "Ariadne Attempts / Failures", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 0 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", + "refId": "A", + "legendFormat": "Attempts" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", + "refId": "B", + "legendFormat": "Failures" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Attempts" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "green" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + 
} + } + }, + { + "id": 3, + "type": "bargauge", + "title": "One-off Job Pods (age hours)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 0 + }, + "targets": [ + { + "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", + "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + }, + { + "id": "limit", + "options": { + "limit": 12 + } + } + ] + }, + { + "id": 4, + "type": "stat", + "title": "Glue Jobs Stale (>36h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 7 + }, + "targets": [ + { + "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "stat", + "title": "Glue Jobs Missing Success", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 7 + }, + "targets": [ + { + "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 6, + "type": "stat", + "title": "Glue Jobs Suspended", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 7 + }, + "targets": [ + { + "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 7, + "type": "stat", + "title": "Ariadne Task Errors (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 8, + "type": "stat", + "title": "Ariadne Task Errors (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 9, + "type": "stat", + "title": "Ariadne Task Runs (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + 
"mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 10, + "type": "bargauge", + "title": "Ariadne Schedule Last Error (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 17 + }, + "targets": [ + { + "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 1 + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "green", + "value": 24 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 11, + "type": "bargauge", + "title": "Ariadne Schedule Last Success (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 17 + }, + "targets": [ + { + "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 12, + "type": "bargauge", + "title": "Glue Jobs Last Success (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 23 + }, + "targets": [ + { + "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", + "refId": "A", + "legendFormat": "{{namespace}}/{{cronjob}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": 
{ + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 13, + "type": "bargauge", + "title": "Glue Jobs Last Schedule (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 23 + }, + "targets": [ + { + "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", + "refId": "A", + "legendFormat": "{{namespace}}/{{cronjob}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 14, + "type": "bargauge", + "title": "Ariadne Task Errors (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 29 + }, + "targets": [ + { + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 15, + "type": "bargauge", + "title": "Ariadne Task Errors (30d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 29 + }, + "targets": [ + { + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 16, + "type": "bargauge", + 
"title": "Ariadne Access Requests", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 11 + }, + "targets": [ + { + "expr": "sort_desc(ariadne_access_requests_total)", + "refId": "A", + "legendFormat": "{{status}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 17, + "type": "stat", + "title": "Ariadne CI Coverage (%)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 8, + "y": 11 + }, + "targets": [ + { + "expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}", + "refId": "A", + "legendFormat": "{{branch}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 18, + "type": "table", + "title": "Ariadne CI Tests (latest)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 11 + }, + "targets": [ + { + "expr": "ariadne_ci_tests_total{repo=\"ariadne\"}", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + } + ], + "time": { + "from": "now-7d", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "jobs", + "glue" + ] +} diff --git a/services/monitoring/dashboards/atlas-nodes.json b/services/monitoring/dashboards/atlas-nodes.json index 2d60042..ea59579 100644 --- a/services/monitoring/dashboards/atlas-nodes.json +++ b/services/monitoring/dashboards/atlas-nodes.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})", + "expr": 
"sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], @@ -46,7 +46,7 @@ "unit": "none", "custom": { "displayMode": "auto", - "valueSuffix": "/19" + "valueSuffix": "/20" } }, "overrides": [] diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index c5f30d1..1f8635b 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -449,14 +449,14 @@ }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "min": 0, - "max": 19, + "max": 20, "thresholds": { "mode": "absolute", "steps": [ @@ -466,15 +466,15 @@ }, { "color": "orange", - "value": 17 - }, - { - "color": "yellow", "value": 18 }, { - "color": "green", + "color": "yellow", "value": 19 + }, + { + "color": "green", + "value": 20 } ] } @@ -795,8 +795,8 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 2, - "w": 5, + "h": 3, + "w": 4, "x": 0, "y": 8 }, @@ -862,9 +862,9 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 2, - "w": 5, - "x": 10, + "h": 3, + "w": 4, + "x": 8, "y": 8 }, "targets": [ @@ -967,9 +967,9 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 2, - "w": 5, - "x": 5, + "h": 3, + "w": 4, + "x": 4, "y": 8 }, "targets": [ @@ -1043,9 +1043,9 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 2, - "w": 5, - "x": 15, + "h": 3, + "w": 4, + "x": 12, "y": 8 }, "targets": [ @@ -1110,6 +1110,132 @@ } ] }, + { + "id": 34, + "type": "stat", + "title": "Postgres Connections Used", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 16, + "y": 8 + }, + "targets": [ + { + "expr": "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")", + "refId": "A", + "legendFormat": "{{conn}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "name_and_value" + } + }, + { + "id": 35, + "type": "stat", + "title": "Postgres Hottest Connections", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 20, + "y": 8 + }, + "targets": [ + { + "expr": "topk(1, sum by (datname) (pg_stat_activity_count))", + "refId": "A", + "legendFormat": 
"{{datname}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "name_and_value" + } + }, { "id": 23, "type": "stat", @@ -1119,10 +1245,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 3, "w": 6, "x": 0, - "y": 10 + "y": 11 }, "targets": [ { @@ -1194,10 +1320,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 3, "w": 6, "x": 6, - "y": 10 + "y": 11 }, "targets": [ { @@ -1269,10 +1395,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 3, "w": 6, "x": 12, - "y": 10 + "y": 11 }, "targets": [ { @@ -1336,10 +1462,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 3, "w": 6, "x": 18, - "y": 10 + "y": 11 }, "targets": [ { @@ -1394,6 +1520,302 @@ } ] }, + { + "id": 40, + "type": "bargauge", + "title": "One-off Job Pods (age hours)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 14 + }, + "targets": [ + { + "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", + "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + }, + { + "id": "limit", + "options": { + "limit": 8 + } + } + ] + }, + { + "id": 41, + "type": "timeseries", + "title": "Ariadne Attempts / Failures", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 14 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", + "refId": "A", + "legendFormat": "Attempts" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", + "refId": "B", + "legendFormat": "Failures" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Attempts" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "green" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } 
+ ] + } + ] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 42, + "type": "timeseries", + "title": "Ariadne Test Success Rate", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 14 + }, + "targets": [ + { + "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "max": 100 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 43, + "type": "bargauge", + "title": "Tests with Failures (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 14 + }, + "targets": [ + { + "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))", + "refId": "A", + "legendFormat": "{{result}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "error" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "yellow" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "failed" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, { "id": 11, "type": "piechart", @@ -1406,7 +1828,7 @@ "h": 9, "w": 8, "x": 0, - "y": 16 + "y": 20 }, "targets": [ { @@ -1475,11 +1897,11 @@ "h": 9, "w": 8, "x": 8, - "y": 16 + "y": 20 }, "targets": [ { - "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))", + "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) 
group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1544,7 +1966,7 @@ "h": 9, "w": 8, "x": 16, - "y": 16 + "y": 20 }, "targets": [ { @@ -1613,11 +2035,11 @@ "h": 12, "w": 12, "x": 0, - "y": 32 + "y": 36 }, "targets": [ { - "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", 
\"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1660,11 +2082,11 @@ "h": 12, "w": 12, "x": 12, - "y": 32 + "y": 36 }, "targets": [ { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1707,7 +2129,7 @@ "h": 10, "w": 12, "x": 0, - "y": 44 + "y": 48 }, "targets": [ { @@ -1744,7 +2166,7 @@ "h": 10, "w": 12, "x": 12, - "y": 44 + "y": 48 }, "targets": [ { @@ -1781,7 +2203,7 @@ "h": 10, "w": 12, "x": 0, - "y": 54 + "y": 58 }, "targets": [ { @@ -1832,11 +2254,11 @@ "h": 10, "w": 12, "x": 12, - "y": 54 + "y": 58 }, "targets": [ { - "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", + "expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -1913,7 +2335,7 @@ "h": 7, "w": 8, "x": 0, - "y": 25 + "y": 29 }, "targets": [ { @@ -1957,7 +2379,7 @@ "h": 7, "w": 8, "x": 8, - "y": 25 + "y": 29 }, "targets": [ { @@ -2001,7 +2423,7 @@ "h": 7, "w": 8, "x": 16, - "y": 25 + "y": 29 }, "targets": [ { @@ -2045,7 +2467,7 @@ "h": 16, "w": 12, "x": 0, - "y": 64 + "y": 68 }, "targets": [ { @@ -2093,11 +2515,11 @@ "h": 16, "w": 12, "x": 12, - "y": 64 + "y": 68 }, "targets": [ { - "expr": "topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))", "refId": "A", "legendFormat": "{{node}}" } diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json index adab84b..0c8104c 100644 --- a/services/monitoring/dashboards/atlas-pods.json +++ b/services/monitoring/dashboards/atlas-pods.json @@ -439,7 +439,7 @@ }, "targets": [ { - "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", + "expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))", "refId": "A", "legendFormat": "{{node}}", "instant": true 
@@ -520,7 +520,7 @@ }, "targets": [ { - "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 
0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)))))", + "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) 
(kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))", "refId": "A", "instant": true, "format": "table" diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-testing.json deleted file mode 100644 index 25cf3f8..0000000 --- a/services/monitoring/dashboards/atlas-testing.json +++ /dev/null @@ -1,339 +0,0 @@ -{ - "uid": "atlas-testing", - "title": "Atlas Testing", - "folderUid": "atlas-internal", - "editable": true, - "panels": [ - { - "id": 1, - "type": "stat", - "title": "Glue Jobs Stale (>36h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 0, - "y": 0 - }, - "targets": [ - { - "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 2 - }, - { - "color": "red", - "value": 3 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 2, - "type": "table", - "title": "Glue Jobs Missing Success", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 6, - "y": 0 - }, - "targets": [ - { - "expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - 
"showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 3, - "type": "table", - "title": "Glue Jobs Suspended", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 12, - "y": 0 - }, - "targets": [ - { - "expr": "(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 4, - "type": "table", - "title": "Glue Jobs Active Runs", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 0 - }, - "targets": [ - { - "expr": "(kube_cronjob_status_active and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 5, - "type": "table", - "title": "Glue Jobs Last Success (hours ago)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 4 - }, - "targets": [ - { - "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 6, - "type": "table", - "title": "Glue Jobs Last Schedule (hours ago)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 4 - }, - "targets": [ - { - "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - } - ], - "time": { - "from": "now-7d", - "to": "now" - }, - "annotations": { - "list": [] - }, - "schemaVersion": 39, - "style": "dark", - "tags": [ - "atlas", - "testing" - ] -} diff --git a/services/monitoring/dcgm-exporter.yaml 
b/services/monitoring/dcgm-exporter.yaml index 8760c9f..ff5aed5 100644 --- a/services/monitoring/dcgm-exporter.yaml +++ b/services/monitoring/dcgm-exporter.yaml @@ -50,6 +50,10 @@ spec: env: - name: DCGM_EXPORTER_KUBERNETES value: "true" + - name: KUBERNETES_VIRTUAL_GPUS + value: "true" + - name: NVIDIA_RESOURCE_NAMES + value: nvidia.com/gpu.shared securityContext: privileged: true resources: diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml index daa1e29..33ac739 100644 --- a/services/monitoring/grafana-alerting-config.yaml +++ b/services/monitoring/grafana-alerting-config.yaml @@ -145,7 +145,7 @@ data: model: intervalMs: 60000 maxDataPoints: 43200 - expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m] + expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\") legendFormat: '{{instance}}' datasource: type: prometheus @@ -175,11 +175,64 @@ data: type: last type: query noDataState: NoData - execErrState: Error + execErrState: OK annotations: - summary: "{{ $labels.instance }} CPU >90% for 10m" + summary: "{{ $labels.node }} CPU >90% for 10m" labels: severity: warning + - orgId: 1 + name: atlas-metrics + folder: Alerts + interval: 1m + rules: + - uid: victoria-metrics-down + title: "VictoriaMetrics unavailable (>30m)" + condition: C + for: "30m" + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: atlas-vm + model: + intervalMs: 60000 + maxDataPoints: 43200 + expr: sum(up{job="victoriametrics"}) + legendFormat: victoriametrics + datasource: + type: prometheus + uid: atlas-vm + - refId: B + datasourceUid: __expr__ + model: + expression: A + intervalMs: 60000 + maxDataPoints: 43200 + reducer: last + type: reduce + - refId: C + datasourceUid: __expr__ + model: + expression: B + intervalMs: 60000 + maxDataPoints: 43200 + type: threshold + conditions: + - evaluator: + params: [1] + type: lt + operator: + type: and + reducer: + type: last + type: query + noDataState: Alerting + execErrState: Alerting + annotations: + summary: "VictoriaMetrics is unavailable for >30m" + labels: + severity: critical - orgId: 1 name: maintenance folder: Alerts @@ -244,7 +297,7 @@ data: to: 0 datasourceUid: atlas-vm model: - expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) + expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) and on(cronjob) (kube_cronjob_spec_suspend{namespace="maintenance",cronjob="image-sweeper"} == 0) intervalMs: 60000 maxDataPoints: 43200 legendFormat: '{{cronjob}}' diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index d7950f2..3407963 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by 
(namespace)) or on() vector(0)) == 0))", + "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -98,7 +98,7 @@ data: }, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)", + "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) 
(kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -135,7 +135,7 @@ data: }, "targets": [ { - "expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})", + "expr": "label_replace(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{Hostname}}" } diff --git a/services/monitoring/grafana-dashboard-jobs.yaml b/services/monitoring/grafana-dashboard-jobs.yaml new file mode 100644 index 0000000..b16c9cb --- /dev/null +++ b/services/monitoring/grafana-dashboard-jobs.yaml @@ -0,0 +1,1262 @@ +# services/monitoring/grafana-dashboard-jobs.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-jobs + labels: + grafana_dashboard: "1" +data: + atlas-jobs.json: | + { + "uid": "atlas-jobs", + "title": "Atlas Jobs", + "folderUid": "atlas-internal", + "editable": true, + "panels": [ + { + "id": 1, + "type": "bargauge", + "title": "Ariadne Task Errors (range)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 2, + "type": "timeseries", + "title": "Ariadne Attempts / Failures", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 0 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", + "refId": "A", + "legendFormat": "Attempts" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", + "refId": "B", + "legendFormat": "Failures" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Attempts" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "green" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": 
"color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 3, + "type": "bargauge", + "title": "One-off Job Pods (age hours)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 0 + }, + "targets": [ + { + "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", + "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + }, + { + "id": "limit", + "options": { + "limit": 12 + } + } + ] + }, + { + "id": 4, + "type": "stat", + "title": "Glue Jobs Stale (>36h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 7 + }, + "targets": [ + { + "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "stat", + "title": "Glue Jobs Missing Success", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 7 + }, + "targets": [ + { + "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) 
(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 6, + "type": "stat", + "title": "Glue Jobs Suspended", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 7 + }, + "targets": [ + { + "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 7, + "type": "stat", + "title": "Ariadne Task Errors (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 8, + "type": "stat", + "title": "Ariadne Task Errors (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 9, + "type": "stat", + "title": "Ariadne Task Runs (1h)", + "datasource": { + "type": "prometheus", + "uid": 
"atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 10, + "type": "bargauge", + "title": "Ariadne Schedule Last Error (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 17 + }, + "targets": [ + { + "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 1 + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "green", + "value": 24 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 11, + "type": "bargauge", + "title": "Ariadne Schedule Last Success (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 17 + }, + "targets": [ + { + "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 12, + "type": "bargauge", + "title": "Glue Jobs Last Success (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 23 + }, + "targets": [ + { + "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", + "refId": "A", + "legendFormat": "{{namespace}}/{{cronjob}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + 
"color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 13, + "type": "bargauge", + "title": "Glue Jobs Last Schedule (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 23 + }, + "targets": [ + { + "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", + "refId": "A", + "legendFormat": "{{namespace}}/{{cronjob}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 14, + "type": "bargauge", + "title": "Ariadne Task Errors (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 29 + }, + "targets": [ + { + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 15, + "type": "bargauge", + "title": "Ariadne Task Errors (30d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 29 + }, + "targets": [ + { + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + 
"calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 16, + "type": "bargauge", + "title": "Ariadne Access Requests", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 11 + }, + "targets": [ + { + "expr": "sort_desc(ariadne_access_requests_total)", + "refId": "A", + "legendFormat": "{{status}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 17, + "type": "stat", + "title": "Ariadne CI Coverage (%)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 8, + "y": 11 + }, + "targets": [ + { + "expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}", + "refId": "A", + "legendFormat": "{{branch}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 18, + "type": "table", + "title": "Ariadne CI Tests (latest)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 11 + }, + "targets": [ + { + "expr": "ariadne_ci_tests_total{repo=\"ariadne\"}", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + } + ], + "time": { + "from": "now-7d", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "jobs", + "glue" + ] + } diff --git a/services/monitoring/grafana-dashboard-nodes.yaml b/services/monitoring/grafana-dashboard-nodes.yaml index f0f1982..98123b9 100644 --- a/services/monitoring/grafana-dashboard-nodes.yaml +++ b/services/monitoring/grafana-dashboard-nodes.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})", + 
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], @@ -55,7 +55,7 @@ data: "unit": "none", "custom": { "displayMode": "auto", - "valueSuffix": "/19" + "valueSuffix": "/20" } }, "overrides": [] diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 8ad7523..fdfe1a7 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -458,14 +458,14 @@ data: }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "min": 0, - "max": 19, + "max": 20, "thresholds": { "mode": "absolute", "steps": [ @@ -475,15 +475,15 @@ data: }, { "color": "orange", - "value": 17 - }, - { - "color": "yellow", "value": 18 }, { - "color": "green", + "color": "yellow", "value": 19 + }, + { + "color": "green", + "value": 20 } ] } @@ -804,8 +804,8 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 2, - "w": 5, + "h": 3, + "w": 4, "x": 0, "y": 8 }, @@ -871,9 +871,9 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 2, - "w": 5, - "x": 10, + "h": 3, + "w": 4, + "x": 8, "y": 8 }, "targets": [ @@ -976,9 +976,9 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 2, - "w": 5, - "x": 5, + "h": 3, + "w": 4, + "x": 4, "y": 8 }, "targets": [ @@ -1052,9 +1052,9 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 2, - "w": 5, - "x": 15, + "h": 3, + "w": 4, + "x": 12, "y": 8 }, "targets": [ @@ -1119,6 +1119,132 @@ data: } ] }, + { + "id": 34, + "type": "stat", + "title": "Postgres Connections Used", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 16, + "y": 8 + }, + "targets": [ + { + "expr": "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")", + "refId": "A", + "legendFormat": "{{conn}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "name_and_value" + } + }, + { + "id": 35, + "type": "stat", + "title": "Postgres Hottest Connections", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 20, + "y": 8 + }, + "targets": [ + { + "expr": "topk(1, sum by (datname) 
(pg_stat_activity_count))", + "refId": "A", + "legendFormat": "{{datname}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "name_and_value" + } + }, { "id": 23, "type": "stat", @@ -1128,10 +1254,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 3, "w": 6, "x": 0, - "y": 10 + "y": 11 }, "targets": [ { @@ -1203,10 +1329,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 3, "w": 6, "x": 6, - "y": 10 + "y": 11 }, "targets": [ { @@ -1278,10 +1404,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 3, "w": 6, "x": 12, - "y": 10 + "y": 11 }, "targets": [ { @@ -1345,10 +1471,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 3, "w": 6, "x": 18, - "y": 10 + "y": 11 }, "targets": [ { @@ -1403,6 +1529,302 @@ data: } ] }, + { + "id": 40, + "type": "bargauge", + "title": "One-off Job Pods (age hours)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 14 + }, + "targets": [ + { + "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", + "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + }, + { + "id": "limit", + "options": { + "limit": 8 + } + } + ] + }, + { + "id": 41, + "type": "timeseries", + "title": "Ariadne Attempts / Failures", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 14 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", + "refId": "A", + "legendFormat": "Attempts" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", + "refId": "B", + "legendFormat": "Failures" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Attempts" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "green" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + 
"properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 42, + "type": "timeseries", + "title": "Ariadne Test Success Rate", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 14 + }, + "targets": [ + { + "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "max": 100 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 43, + "type": "bargauge", + "title": "Tests with Failures (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 14 + }, + "targets": [ + { + "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))", + "refId": "A", + "legendFormat": "{{result}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "error" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "yellow" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "failed" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, { "id": 11, "type": "piechart", @@ -1415,7 +1837,7 @@ data: "h": 9, "w": 8, "x": 0, - "y": 16 + "y": 20 }, "targets": [ { @@ -1484,11 +1906,11 @@ data: "h": 9, "w": 8, "x": 8, - "y": 16 + "y": 20 }, "targets": [ { - "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))", + "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() 
(kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1553,7 +1975,7 @@ data: "h": 9, "w": 8, "x": 16, - "y": 16 + "y": 20 }, "targets": [ { @@ -1622,11 +2044,11 @@ data: "h": 12, "w": 12, "x": 0, - "y": 32 + "y": 36 }, "targets": [ { - "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() 
label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1669,11 +2091,11 @@ data: "h": 12, "w": 12, "x": 12, - "y": 32 + "y": 36 }, "targets": [ { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1716,7 +2138,7 @@ data: "h": 10, "w": 12, "x": 0, - "y": 44 + "y": 48 }, "targets": [ { @@ -1753,7 +2175,7 @@ data: "h": 10, "w": 12, "x": 12, - "y": 44 + "y": 48 }, "targets": [ { @@ -1790,7 +2212,7 @@ data: "h": 10, "w": 12, "x": 0, - "y": 54 + "y": 58 }, "targets": [ { @@ -1841,11 +2263,11 @@ data: "h": 10, "w": 12, "x": 12, - "y": 54 + "y": 58 }, "targets": [ { - "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", + "expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -1922,7 +2344,7 @@ data: "h": 7, "w": 8, "x": 0, - "y": 25 + "y": 29 }, "targets": [ { @@ -1966,7 +2388,7 @@ data: "h": 7, "w": 8, "x": 8, - "y": 25 + "y": 29 }, "targets": [ { @@ -2010,7 +2432,7 @@ data: "h": 7, "w": 8, "x": 16, - "y": 25 + "y": 29 }, "targets": [ { @@ -2054,7 +2476,7 @@ data: "h": 16, "w": 12, "x": 0, - "y": 64 + "y": 68 }, "targets": [ { @@ -2102,11 +2524,11 @@ data: "h": 16, "w": 12, "x": 12, - "y": 64 + "y": 68 }, "targets": [ { - "expr": "topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))", "refId": "A", "legendFormat": "{{node}}" } diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml index f537d4c..1461eac 100644 --- a/services/monitoring/grafana-dashboard-pods.yaml +++ 
b/services/monitoring/grafana-dashboard-pods.yaml @@ -448,7 +448,7 @@ data: }, "targets": [ { - "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", + "expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -529,7 +529,7 @@ data: }, "targets": [ { - "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by 
(node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)))))", + "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 
+ 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))", "refId": "A", "instant": true, "format": "table" diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-testing.yaml deleted file mode 100644 index 80a7043..0000000 --- a/services/monitoring/grafana-dashboard-testing.yaml +++ /dev/null @@ -1,348 +0,0 @@ -# services/monitoring/grafana-dashboard-testing.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - name: grafana-dashboard-testing - labels: - grafana_dashboard: "1" -data: - atlas-testing.json: | - { - "uid": "atlas-testing", - "title": "Atlas Testing", - "folderUid": "atlas-internal", - "editable": true, - "panels": [ - { - "id": 1, - "type": "stat", - "title": "Glue Jobs Stale (>36h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 0, - "y": 0 - }, - "targets": [ - { - "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 2 - }, - { - "color": "red", - "value": 3 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 2, - "type": "table", - "title": "Glue Jobs Missing Success", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 6, - 
"y": 0 - }, - "targets": [ - { - "expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 3, - "type": "table", - "title": "Glue Jobs Suspended", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 12, - "y": 0 - }, - "targets": [ - { - "expr": "(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 4, - "type": "table", - "title": "Glue Jobs Active Runs", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 0 - }, - "targets": [ - { - "expr": "(kube_cronjob_status_active and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 5, - "type": "table", - "title": "Glue Jobs Last Success (hours ago)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 4 - }, - "targets": [ - { - "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 6, - "type": "table", - "title": "Glue Jobs Last Schedule (hours ago)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 4 - }, - "targets": [ - { - "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "custom": { - "filterable": true - } - }, - "overrides": 
[] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - } - ], - "time": { - "from": "now-7d", - "to": "now" - }, - "annotations": { - "list": [] - }, - "schemaVersion": 39, - "style": "dark", - "tags": [ - "atlas", - "testing" - ] - } diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 304de05..6651738 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -286,6 +286,7 @@ spec: podAnnotations: vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "monitoring" + monitoring.bstein.dev/restart-rev: "1" vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin" vault.hashicorp.com/agent-inject-template-grafana-env.sh: | {{ with secret "kv/data/atlas/monitoring/grafana-admin" }} @@ -339,10 +340,10 @@ spec: GF_AUTH_ANONYMOUS_ORG_NAME: "Overview" GF_AUTH_ANONYMOUS_ORG_ROLE: "Viewer" GF_SMTP_ENABLED: "true" - GF_SMTP_HOST: "mail.bstein.dev:587" - GF_SMTP_FROM: "no-reply-grafana@bstein.dev" + GF_SMTP_HOST: "smtp.postmarkapp.com:587" + GF_SMTP_FROM_ADDRESS: "no-reply-grafana@bstein.dev" GF_SMTP_FROM_NAME: "Atlas Grafana" - GRAFANA_ALERT_EMAILS: "alerts@bstein.dev" + GRAFANA_ALERT_EMAILS: "brad@bstein.dev" GF_SECURITY_ALLOW_EMBEDDING: "true" GF_AUTH_GENERIC_OAUTH_ENABLED: "true" GF_AUTH_GENERIC_OAUTH_NAME: "Keycloak" @@ -354,6 +355,8 @@ spec: GF_AUTH_GENERIC_OAUTH_ROLE_ATTRIBUTE_PATH: "contains(groups, 'admin') && 'Admin' || 'Viewer'" GF_AUTH_GENERIC_OAUTH_USE_PKCE: "true" GF_AUTH_GENERIC_OAUTH_TLS_SKIP_VERIFY_INSECURE: "false" + GF_AUTH_GENERIC_OAUTH_ALLOW_INSECURE_EMAIL_LOOKUP: "true" + GF_AUTH_GENERIC_OAUTH_EMAIL_ATTRIBUTE_PATH: "email" GF_AUTH_SIGNOUT_REDIRECT_URL: "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/logout?redirect_uri=https://metrics.bstein.dev/" grafana.ini: server: @@ -469,14 +472,14 @@ spec: editable: true options: path: /var/lib/grafana/dashboards/mail - - name: testing + - name: jobs orgId: 1 folder: Atlas Internal type: file disableDeletion: false editable: true options: - path: /var/lib/grafana/dashboards/testing + path: /var/lib/grafana/dashboards/jobs dashboardsConfigMaps: overview: grafana-dashboard-overview overview-public: grafana-dashboard-overview @@ -486,7 +489,7 @@ spec: gpu: grafana-dashboard-gpu network: grafana-dashboard-network mail: grafana-dashboard-mail - testing: grafana-dashboard-testing + jobs: grafana-dashboard-jobs extraConfigmapMounts: - name: grafana-folders mountPath: /etc/grafana/provisioning/folders diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml index 8788b20..ba25c9f 100644 --- a/services/monitoring/jetson-tegrastats-exporter.yaml +++ b/services/monitoring/jetson-tegrastats-exporter.yaml @@ -17,6 +17,7 @@ spec: annotations: prometheus.io/scrape: "true" prometheus.io/port: "9100" + monitoring.bstein.dev/restart-rev: "7" spec: serviceAccountName: default hostPID: true @@ -44,6 +45,10 @@ spec: env: - name: JETSON_EXPORTER_PORT value: "9100" + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName volumeMounts: - name: script mountPath: /etc/tegrastats-exporter diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index 7d0b01b..23c1595 100644 --- a/services/monitoring/kustomization.yaml +++ 
b/services/monitoring/kustomization.yaml @@ -14,7 +14,7 @@ resources: - grafana-dashboard-network.yaml - grafana-dashboard-gpu.yaml - grafana-dashboard-mail.yaml - - grafana-dashboard-testing.yaml + - grafana-dashboard-jobs.yaml - dcgm-exporter.yaml - jetson-tegrastats-exporter.yaml - postmark-exporter-service.yaml @@ -23,7 +23,8 @@ resources: - grafana-alerting-config.yaml - grafana-folders.yaml - helmrelease.yaml - - grafana-org-bootstrap.yaml + - oneoffs/grafana-org-bootstrap.yaml + - oneoffs/grafana-user-dedupe-job.yaml configMapGenerator: - name: postmark-exporter-script diff --git a/services/monitoring/grafana-org-bootstrap.yaml b/services/monitoring/oneoffs/grafana-org-bootstrap.yaml similarity index 93% rename from services/monitoring/grafana-org-bootstrap.yaml rename to services/monitoring/oneoffs/grafana-org-bootstrap.yaml index f1d4075..6f824cc 100644 --- a/services/monitoring/grafana-org-bootstrap.yaml +++ b/services/monitoring/oneoffs/grafana-org-bootstrap.yaml @@ -1,10 +1,15 @@ -# services/monitoring/grafana-org-bootstrap.yaml +# services/monitoring/oneoffs/grafana-org-bootstrap.yaml +# One-off job for monitoring/grafana-org-bootstrap-3. +# Purpose: grafana org bootstrap 3 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: grafana-org-bootstrap-3 namespace: monitoring spec: + suspend: true backoffLimit: 2 template: metadata: diff --git a/services/monitoring/oneoffs/grafana-user-dedupe-job.yaml b/services/monitoring/oneoffs/grafana-user-dedupe-job.yaml new file mode 100644 index 0000000..8194f18 --- /dev/null +++ b/services/monitoring/oneoffs/grafana-user-dedupe-job.yaml @@ -0,0 +1,148 @@ +# services/monitoring/oneoffs/grafana-user-dedupe-job.yaml +# One-off job for monitoring/grafana-user-dedupe-api-v7. +# Purpose: grafana user dedupe api v7 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. +apiVersion: batch/v1 +kind: Job +metadata: + name: grafana-user-dedupe-api-v7 + namespace: monitoring +spec: + suspend: true + backoffLimit: 1 + template: + metadata: + annotations: + vault.hashicorp.com/agent-inject: "true" + vault.hashicorp.com/agent-pre-populate-only: "true" + vault.hashicorp.com/role: "monitoring" + vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin" + vault.hashicorp.com/agent-inject-template-grafana-env.sh: | + {{ with secret "kv/data/atlas/monitoring/grafana-admin" }} + export GRAFANA_USER="{{ index .Data.data "admin-user" }}" + export GRAFANA_PASSWORD="{{ index .Data.data "admin-password" }}" + {{ end }} + spec: + serviceAccountName: monitoring-vault-sync + automountServiceAccountToken: true + restartPolicy: Never + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/worker + operator: Exists + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: kubernetes.io/arch + operator: In + values: ["arm64"] + containers: + - name: dedupe + image: python:3.12-slim + command: + - /bin/sh + - -c + args: + - | + set -euo pipefail + for _ in $(seq 1 30); do + if [ -f /vault/secrets/grafana-env.sh ]; then + break + fi + sleep 1 + done + if [ ! 
-f /vault/secrets/grafana-env.sh ]; then + echo "Vault secret not available" + exit 1 + fi + . /vault/secrets/grafana-env.sh + grafana_url="${GRAFANA_URL}" + if [ -z "${grafana_url}" ]; then + echo "GRAFANA_URL is required" + exit 1 + fi + if [ -z "${GRAFANA_USER}" ] || [ -z "${GRAFANA_PASSWORD}" ]; then + echo "Grafana admin credentials missing" + exit 1 + fi + if [ -z "${GRAFANA_DEDUPE_EMAILS}" ]; then + echo "GRAFANA_DEDUPE_EMAILS is required" + exit 1 + fi + python - <<'PY' + import base64 + import json + import os + import urllib.parse + import urllib.error + import urllib.request + + grafana_url = os.environ["GRAFANA_URL"].rstrip("/") + user = os.environ["GRAFANA_USER"] + password = os.environ["GRAFANA_PASSWORD"] + lookups = [e.strip() for e in os.environ["GRAFANA_DEDUPE_EMAILS"].split(",") if e.strip()] + + token = base64.b64encode(f"{user}:{password}".encode("utf-8")).decode("utf-8") + headers = {"Authorization": f"Basic {token}"} + + def request(method: str, url: str): + req = urllib.request.Request(url, headers=headers, method=method) + try: + with urllib.request.urlopen(req, timeout=10) as resp: + return resp.status, resp.read() + except urllib.error.HTTPError as err: + body = err.read() + return err.code, body + + for _ in range(60): + status, _ = request("GET", f"{grafana_url}/api/health") + if status == 200: + break + else: + raise SystemExit("Grafana API did not become ready in time") + + for lookup in lookups: + search_url = f"{grafana_url}/api/users/search?query={urllib.parse.quote(lookup)}" + status, body = request("GET", search_url) + if status != 200: + print(f"search failed for {lookup}: status={status} body={body.decode('utf-8', errors='ignore')}") + continue + payload = json.loads(body) + users = payload.get("users", []) + matches = [ + user + for user in users + if user.get("email", "").lower() == lookup.lower() + or user.get("login", "").lower() == lookup.lower() + ] + if not matches: + print(f"no grafana user found for {lookup}") + continue + for user in matches: + user_id = user.get("id") + if not user_id: + continue + print(f"deleting grafana user {user_id} ({user.get('email')})") + delete_url = f"{grafana_url}/api/admin/users/{user_id}" + del_status, del_body = request("DELETE", delete_url) + if del_status not in (200, 202, 204): + print( + "delete failed for", + user_id, + "status", + del_status, + "body", + del_body.decode("utf-8", errors="ignore"), + ) + PY + echo "done" + env: + - name: GRAFANA_URL + value: http://grafana.monitoring.svc.cluster.local + - name: GRAFANA_DEDUPE_EMAILS + value: brad.stein@gmail.com,brad@bstein.dev diff --git a/services/monitoring/postmark-exporter-deployment.yaml b/services/monitoring/postmark-exporter-deployment.yaml index 6406224..98791d9 100644 --- a/services/monitoring/postmark-exporter-deployment.yaml +++ b/services/monitoring/postmark-exporter-deployment.yaml @@ -18,9 +18,9 @@ spec: prometheus.io/path: "/metrics" vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "monitoring" - vault.hashicorp.com/agent-inject-secret-postmark-env: "kv/data/atlas/monitoring/postmark-exporter" + vault.hashicorp.com/agent-inject-secret-postmark-env: "kv/data/atlas/shared/postmark-relay" vault.hashicorp.com/agent-inject-template-postmark-env: | - {{- with secret "kv/data/atlas/monitoring/postmark-exporter" -}} + {{- with secret "kv/data/atlas/shared/postmark-relay" -}} export POSTMARK_SERVER_TOKEN="{{ index .Data.data "apikey" }}" export POSTMARK_SERVER_TOKEN_FALLBACK="{{ index .Data.data "apikey" }}" {{- if index .Data.data 
"sending-limit" }} diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py index cd557e7..8b36111 100644 --- a/services/monitoring/scripts/jetson_tegrastats_exporter.py +++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py @@ -3,53 +3,59 @@ import os import re import socketserver import subprocess -import threading from time import time PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100")) -METRICS = { +NODE_NAME = os.environ.get("NODE_NAME") or os.uname().nodename +BASE_METRICS = { "gr3d_freq_percent": 0.0, "gpu_temp_c": 0.0, "cpu_temp_c": 0.0, "ram_used_mb": 0.0, "ram_total_mb": 0.0, "power_5v_in_mw": 0.0, + "log_line_len": 0.0, "last_scrape_ts": 0.0, } -LOCK = threading.Lock() -def parse_line(line: str): +def parse_line(line: str) -> dict: + line = line.strip() updates = {} - m = re.search(r"GR3D_FREQ\\s+(\\d+)%", line) + m = re.search(r"GR3D_FREQ\s+(\d+)%", line) if m: updates["gr3d_freq_percent"] = float(m.group(1)) - m = re.search(r"GPU@(\\d+(?:\\.\\d+)?)C", line) + m = re.search(r"GPU@(\d+(?:\.\d+)?)C", line) if m: updates["gpu_temp_c"] = float(m.group(1)) - m = re.search(r"CPU@(\\d+(?:\\.\\d+)?)C", line) + m = re.search(r"CPU@(\d+(?:\.\d+)?)C", line) if m: updates["cpu_temp_c"] = float(m.group(1)) - m = re.search(r"RAM\\s+(\\d+)/(\\d+)MB", line) + m = re.search(r"RAM\s+(\d+)/(\d+)MB", line) if m: updates["ram_used_mb"] = float(m.group(1)) updates["ram_total_mb"] = float(m.group(2)) - m = re.search(r"POM_5V_IN\\s+(\\d+)/(\\d+)", line) + m = re.search(r"(?:POM_5V_IN|VDD_IN)\s+(\d+)(?:mW)?/(\d+)(?:mW)?", line) if m: updates["power_5v_in_mw"] = float(m.group(1)) - with LOCK: - METRICS.update(updates) - METRICS["last_scrape_ts"] = time() + return updates -def run_tegrastats(): - proc = subprocess.Popen( - ["/host/usr/bin/tegrastats", "--interval", "1000"], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - bufsize=1, - ) - for line in proc.stdout: - parse_line(line) +def read_latest_line() -> str: + try: + proc = subprocess.Popen( + ["/host/usr/bin/tegrastats", "--interval", "1000"], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + line = proc.stdout.readline() + proc.terminate() + try: + proc.wait(timeout=1) + except subprocess.TimeoutExpired: + proc.kill() + return line + except OSError: + return "" class Handler(http.server.BaseHTTPRequestHandler): def do_GET(self): @@ -57,13 +63,18 @@ class Handler(http.server.BaseHTTPRequestHandler): self.send_response(404) self.end_headers() return - with LOCK: - metrics = METRICS.copy() + metrics = BASE_METRICS.copy() + line = read_latest_line() + if line: + metrics.update(parse_line(line)) + metrics["log_line_len"] = float(len(line)) + metrics["last_scrape_ts"] = time() out = [] + label = f'{{node="{NODE_NAME}"}}' for k, v in metrics.items(): out.append(f"# TYPE jetson_{k} gauge") - out.append(f"jetson_{k} {v}") - body = "\\n".join(out) + "\\n" + out.append(f"jetson_{k}{label} {v}") + body = "\n".join(out) + "\n" self.send_response(200) self.send_header("Content-Type", "text/plain; version=0.0.4") self.send_header("Content-Length", str(len(body))) @@ -74,7 +85,5 @@ class Handler(http.server.BaseHTTPRequestHandler): return if __name__ == "__main__": - t = threading.Thread(target=run_tegrastats, daemon=True) - t.start() with socketserver.TCPServer(("", PORT), Handler) as httpd: httpd.serve_forever() diff --git a/services/monitoring/secretproviderclass.yaml b/services/monitoring/secretproviderclass.yaml index 
8a6c5fb..350d6aa 100644 --- a/services/monitoring/secretproviderclass.yaml +++ b/services/monitoring/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "monitoring" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/monitoring" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/nextcloud-mail-sync/cronjob.yaml b/services/nextcloud-mail-sync/cronjob.yaml index 2073d76..6913b60 100644 --- a/services/nextcloud-mail-sync/cronjob.yaml +++ b/services/nextcloud-mail-sync/cronjob.yaml @@ -8,6 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "0 5 * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 3 failedJobsHistoryLimit: 1 diff --git a/services/nextcloud-mail-sync/portal-rbac.yaml b/services/nextcloud-mail-sync/portal-rbac.yaml index dc9a4e4..009b2e0 100644 --- a/services/nextcloud-mail-sync/portal-rbac.yaml +++ b/services/nextcloud-mail-sync/portal-rbac.yaml @@ -27,3 +27,16 @@ subjects: - kind: ServiceAccount name: bstein-dev-home namespace: bstein-dev-home +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: ariadne-nextcloud-mail-sync +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: bstein-dev-home-nextcloud-mail-sync +subjects: + - kind: ServiceAccount + name: ariadne + namespace: maintenance diff --git a/services/nextcloud/cronjob.yaml b/services/nextcloud/cronjob.yaml index cc0091b..58d8aa1 100644 --- a/services/nextcloud/cronjob.yaml +++ b/services/nextcloud/cronjob.yaml @@ -6,6 +6,7 @@ metadata: namespace: nextcloud spec: schedule: "*/5 * * * *" + suspend: true concurrencyPolicy: Forbid jobTemplate: spec: diff --git a/services/nextcloud/maintenance-cronjob.yaml b/services/nextcloud/maintenance-cronjob.yaml index d4008c7..177cc02 100644 --- a/services/nextcloud/maintenance-cronjob.yaml +++ b/services/nextcloud/maintenance-cronjob.yaml @@ -6,6 +6,7 @@ metadata: namespace: nextcloud spec: schedule: "30 4 * * *" + suspend: true concurrencyPolicy: Forbid jobTemplate: spec: diff --git a/services/pegasus/deployment.yaml b/services/pegasus/deployment.yaml index bc3db70..b6a1639 100644 --- a/services/pegasus/deployment.yaml +++ b/services/pegasus/deployment.yaml @@ -72,7 +72,7 @@ spec: containers: - name: pegasus - image: registry.bstein.dev/streaming/pegasus-vault:1.2.32 # {"$imagepolicy": "jellyfin:pegasus"} + image: registry.bstein.dev/streaming/pegasus-vault:1.2.32 # {"$imagepolicy": "jellyfin:pegasus:tag"} imagePullPolicy: Always env: - name: PEGASUS_MEDIA_ROOT diff --git a/services/pegasus/kustomization.yaml b/services/pegasus/kustomization.yaml index bef2b40..05c3baa 100644 --- a/services/pegasus/kustomization.yaml +++ b/services/pegasus/kustomization.yaml @@ -3,6 +3,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - configmap.yaml + - image.yaml - vault-serviceaccount.yaml - secretproviderclass.yaml - service.yaml diff --git a/services/pegasus/secretproviderclass.yaml b/services/pegasus/secretproviderclass.yaml index b4621a5..b8d1df9 100644 --- a/services/pegasus/secretproviderclass.yaml +++ b/services/pegasus/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "pegasus" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/jellyfin" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git 
a/services/vault/k8s-auth-config-cronjob.yaml b/services/vault/k8s-auth-config-cronjob.yaml index 29e8e80..5a2d682 100644 --- a/services/vault/k8s-auth-config-cronjob.yaml +++ b/services/vault/k8s-auth-config-cronjob.yaml @@ -8,6 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "*/15 * * * *" + suspend: false concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 @@ -33,6 +34,11 @@ spec: value: http://10.43.57.249:8200 - name: VAULT_K8S_ROLE value: vault-admin + - name: VAULT_TOKEN + valueFrom: + secretKeyRef: + name: vault-init + key: root_token - name: VAULT_K8S_TOKEN_REVIEWER_JWT_FILE value: /var/run/secrets/vault-token-reviewer/token - name: VAULT_K8S_ROLE_TTL diff --git a/services/vault/oidc-config-cronjob.yaml b/services/vault/oidc-config-cronjob.yaml index 013c9f3..4d317c5 100644 --- a/services/vault/oidc-config-cronjob.yaml +++ b/services/vault/oidc-config-cronjob.yaml @@ -8,6 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "*/15 * * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh index 202879f..0212180 100644 --- a/services/vault/scripts/vault_k8s_auth_configure.sh +++ b/services/vault/scripts/vault_k8s_auth_configure.sh @@ -193,8 +193,8 @@ path "kv/data/atlas/shared/*" { write_raw_policy "dev-kv" "${dev_kv_policy}" log "writing role vault-admin" vault_cmd write "auth/kubernetes/role/vault-admin" \ - bound_service_account_names="vault-admin" \ - bound_service_account_namespaces="vault" \ + bound_service_account_names="vault-admin,ariadne" \ + bound_service_account_namespaces="vault,maintenance" \ policies="vault-admin" \ ttl="${role_ttl}" @@ -203,40 +203,42 @@ write_policy_and_role "outline" "outline" "outline-vault" \ write_policy_and_role "planka" "planka" "planka-vault" \ "planka/* shared/postmark-relay" "" write_policy_and_role "bstein-dev-home" "bstein-dev-home" "bstein-dev-home,bstein-dev-home-vault-sync" \ - "portal/* shared/chat-ai-keys-runtime shared/portal-e2e-client shared/postmark-relay mailu/mailu-initial-account-secret harbor-pull/bstein-dev-home" "" + "portal/* shared/chat-ai-keys-runtime shared/portal-e2e-client shared/postmark-relay mailu/mailu-initial-account-secret shared/harbor-pull" "" write_policy_and_role "gitea" "gitea" "gitea-vault" \ "gitea/*" "" write_policy_and_role "vaultwarden" "vaultwarden" "vaultwarden-vault" \ "vaultwarden/* mailu/mailu-initial-account-secret" "" write_policy_and_role "sso" "sso" "sso-vault,sso-vault-sync,mas-secrets-ensure" \ - "sso/* portal/bstein-dev-home-keycloak-admin shared/keycloak-admin shared/portal-e2e-client shared/postmark-relay harbor-pull/sso" "" + "sso/* portal/bstein-dev-home-keycloak-admin shared/keycloak-admin shared/portal-e2e-client shared/postmark-relay shared/harbor-pull" "" write_policy_and_role "mailu-mailserver" "mailu-mailserver" "mailu-vault-sync" \ - "mailu/* shared/postmark-relay harbor-pull/mailu-mailserver" "" + "mailu/* shared/postmark-relay shared/harbor-pull" "" write_policy_and_role "harbor" "harbor" "harbor-vault-sync" \ - "harbor/* harbor-pull/harbor" "" + "harbor/* shared/harbor-pull" "" write_policy_and_role "nextcloud" "nextcloud" "nextcloud-vault" \ "nextcloud/* shared/keycloak-admin shared/postmark-relay" "" write_policy_and_role "comms" "comms" "comms-vault,atlasbot" \ - "comms/* shared/chat-ai-keys-runtime harbor-pull/comms" "" -write_policy_and_role "jenkins" 
"jenkins" "jenkins" \ - "jenkins/*" "" + "comms/* shared/chat-ai-keys-runtime shared/harbor-pull" "" +write_policy_and_role "jenkins" "jenkins" "jenkins,jenkins-vault-sync" \ + "jenkins/* shared/harbor-pull" "" write_policy_and_role "monitoring" "monitoring" "monitoring-vault-sync" \ - "monitoring/* shared/postmark-relay harbor-pull/monitoring" "" + "monitoring/* shared/postmark-relay shared/harbor-pull" "" write_policy_and_role "logging" "logging" "logging-vault-sync" \ - "logging/* harbor-pull/logging" "" + "logging/* shared/harbor-pull" "" write_policy_and_role "pegasus" "jellyfin" "pegasus-vault-sync" \ - "pegasus/* harbor-pull/jellyfin" "" + "pegasus/* shared/harbor-pull" "" write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \ - "crypto/* harbor-pull/crypto" "" + "crypto/* shared/harbor-pull" "" write_policy_and_role "health" "health" "health-vault-sync" \ "health/*" "" +write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \ + "maintenance/ariadne-db portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin vault/vault-oidc-config shared/harbor-pull" "" write_policy_and_role "finance" "finance" "finance-vault" \ "finance/* shared/postmark-relay" "" write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \ "" \ "finance/*" write_policy_and_role "longhorn" "longhorn-system" "longhorn-vault,longhorn-vault-sync" \ - "longhorn/* harbor-pull/longhorn" "" + "longhorn/* shared/harbor-pull" "" write_policy_and_role "postgres" "postgres" "postgres-vault" \ "postgres/postgres-db" "" write_policy_and_role "vault" "vault" "vault" \ @@ -251,4 +253,4 @@ write_policy_and_role "crypto-secrets" "crypto" "crypto-secrets-ensure" \ write_policy_and_role "comms-secrets" "comms" \ "comms-secrets-ensure,mas-db-ensure,mas-admin-client-secret-writer,othrys-synapse-signingkey-job" \ "" \ - "comms/turn-shared-secret comms/livekit-api comms/synapse-redis comms/synapse-macaroon comms/atlasbot-credentials-runtime comms/synapse-db comms/mas-db comms/mas-admin-client-runtime comms/mas-secrets-runtime comms/othrys-synapse-signingkey" + "comms/turn-shared-secret comms/livekit-api comms/synapse-redis comms/synapse-macaroon comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin comms/synapse-registration comms/mas-db comms/mas-admin-client-runtime comms/mas-secrets-runtime comms/othrys-synapse-signingkey" diff --git a/services/vaultwarden/ariadne-rbac.yaml b/services/vaultwarden/ariadne-rbac.yaml new file mode 100644 index 0000000..ee903ca --- /dev/null +++ b/services/vaultwarden/ariadne-rbac.yaml @@ -0,0 +1,28 @@ +# services/vaultwarden/ariadne-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: ariadne-vaultwarden-admin-reader + namespace: vaultwarden +rules: + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get"] + resourceNames: ["vaultwarden-admin"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: ariadne-vaultwarden-admin-reader + namespace: vaultwarden +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: ariadne-vaultwarden-admin-reader +subjects: + - kind: ServiceAccount + name: ariadne + namespace: maintenance diff --git 
a/services/vaultwarden/deployment.yaml b/services/vaultwarden/deployment.yaml index 2893a92..e1d888a 100644 --- a/services/vaultwarden/deployment.yaml +++ b/services/vaultwarden/deployment.yaml @@ -39,7 +39,7 @@ spec: node-role.kubernetes.io/worker: "true" containers: - name: vaultwarden - image: vaultwarden/server:1.33.2 + image: vaultwarden/server:1.35.2 command: ["/bin/sh", "-c"] args: - >- diff --git a/services/vaultwarden/kustomization.yaml b/services/vaultwarden/kustomization.yaml index c53cb1c..ca5ef26 100644 --- a/services/vaultwarden/kustomization.yaml +++ b/services/vaultwarden/kustomization.yaml @@ -5,6 +5,7 @@ namespace: vaultwarden resources: - namespace.yaml - serviceaccount.yaml + - ariadne-rbac.yaml - pvc.yaml - deployment.yaml - service.yaml
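
Editor's note on the pods dashboard expression in the grafana-dashboard-pods.yaml hunk above: the long chain of `(sum by (node) (kube_node_info{node="titan-XX"}) * 0 + 0.0NN)` terms is a per-node tie-break. Each node contributes a distinct epsilon, so the `== bool ... max by (namespace)` comparison picks exactly one dominant node per namespace even when two nodes carry the same pod share, and values such as 0.009000000000000001 or 0.018000000000000002 are the float artifacts you would get from computing those epsilons as i * 0.001. The sketch below shows one plausible way to generate that fragment; the node list is copied from the diff, but the generator itself is an assumption, not tooling that exists in this repository.

```python
# Sketch (assumption): generate the per-node epsilon tie-break fragment used in the
# pods dashboard query. Each node gets a distinct offset of i * 0.001; the float
# artifacts in the committed expression (e.g. 0.009000000000000001) are consistent
# with exactly this kind of arithmetic.
NODES = [
    "titan-0a", "titan-0b", "titan-0c", "titan-db", "titan-jh",
    "titan-04", "titan-05", "titan-06", "titan-07", "titan-08",
    "titan-09", "titan-10", "titan-11", "titan-20", "titan-21",
    "titan-12", "titan-13", "titan-14", "titan-15", "titan-16",
    "titan-17", "titan-18", "titan-19", "titan-22", "titan-24",
]


def node_epsilon_expr(nodes: list[str]) -> str:
    """Return the 'or'-chained PromQL fragment assigning a unique epsilon per node."""
    parts = [
        f'(sum by (node) (kube_node_info{{node="{node}"}}) * 0 + {i * 0.001})'
        for i, node in enumerate(nodes, start=1)
    ]
    return " or ".join(parts)


if __name__ == "__main__":
    print(node_epsilon_expr(NODES))
```

Regenerating the fragment from a node list like this keeps the dashboard JSON consistent when the cluster changes, which is presumably why titan-16 appears in both the regex-based node filters and the epsilon chain in this change.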
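
One detail in the jetson_tegrastats_exporter.py hunk that is easy to miss: the old patterns used `\\s` and `\\d` inside raw strings, i.e. a literal backslash followed by `s` or `d`, so they could not match real tegrastats output; the single-backslash patterns (and the `"\n".join` fix) are the substance of that change. A small, self-contained check of the corrected patterns against an illustrative tegrastats-style line is sketched below; the sample line is made up to exercise the regexes, not captured from a device, and field order and extra fields vary between Jetson models (POM_5V_IN on older boards, VDD_IN on Orin-class boards).

```python
# Sketch: exercise the corrected tegrastats regexes from the exporter diff against a
# synthetic sample line. This mirrors the parse_line logic in the patch for testing only.
import re

SAMPLE = "RAM 3067/7620MB GR3D_FREQ 42% CPU@51.5C GPU@49.0C VDD_IN 4952mW/4952mW"


def parse_line(line: str) -> dict:
    line = line.strip()
    updates = {}
    if m := re.search(r"GR3D_FREQ\s+(\d+)%", line):
        updates["gr3d_freq_percent"] = float(m.group(1))
    if m := re.search(r"GPU@(\d+(?:\.\d+)?)C", line):
        updates["gpu_temp_c"] = float(m.group(1))
    if m := re.search(r"CPU@(\d+(?:\.\d+)?)C", line):
        updates["cpu_temp_c"] = float(m.group(1))
    if m := re.search(r"RAM\s+(\d+)/(\d+)MB", line):
        updates["ram_used_mb"] = float(m.group(1))
        updates["ram_total_mb"] = float(m.group(2))
    if m := re.search(r"(?:POM_5V_IN|VDD_IN)\s+(\d+)(?:mW)?/(\d+)(?:mW)?", line):
        updates["power_5v_in_mw"] = float(m.group(1))
    return updates


if __name__ == "__main__":
    parsed = parse_line(SAMPLE)
    assert parsed["gr3d_freq_percent"] == 42.0
    assert parsed["ram_total_mb"] == 7620.0
    assert parsed["power_5v_in_mw"] == 4952.0
    print(parsed)
```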